diff --git a/doc/source/nfloat.rst b/doc/source/nfloat.rst index d332d7f00f..2621212b0a 100644 --- a/doc/source/nfloat.rst +++ b/doc/source/nfloat.rst @@ -317,5 +317,11 @@ Internal functions int _nfloat_cmpabs(nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx) int _nfloat_add_1(nfloat_ptr res, ulong x0, slong xexp, int xsgnbit, ulong y0, slong delta, gr_ctx_t ctx) int _nfloat_sub_1(nfloat_ptr res, ulong x0, slong xexp, int xsgnbit, ulong y0, slong delta, gr_ctx_t ctx) + int _nfloat_add_2(nfloat_ptr res, nn_srcptr xd, slong xexp, int xsgnbit, nn_srcptr yd, slong delta, gr_ctx_t ctx) + int _nfloat_sub_2(nfloat_ptr res, nn_srcptr xd, slong xexp, int xsgnbit, nn_srcptr yd, slong delta, gr_ctx_t ctx) + int _nfloat_add_3(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx) + int _nfloat_sub_3(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx) + int _nfloat_add_4(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx) + int _nfloat_sub_4(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx) int _nfloat_add_n(nfloat_ptr res, nn_srcptr xd, slong xexp, int xsgnbit, nn_srcptr yd, slong delta, slong nlimbs, gr_ctx_t ctx) int _nfloat_sub_n(nfloat_ptr res, nn_srcptr xd, slong xexp, int xsgnbit, nn_srcptr yd, slong delta, slong nlimbs, gr_ctx_t ctx) diff --git a/src/nfloat.h b/src/nfloat.h index 2c01f65403..5897abb802 100644 --- a/src/nfloat.h +++ b/src/nfloat.h @@ -291,6 +291,12 @@ int nfloat_im(nfloat_ptr res, nfloat_srcptr x, gr_ctx_t ctx); int _nfloat_add_1(nfloat_ptr res, ulong x0, slong xexp, int xsgnbit, ulong y0, slong delta, gr_ctx_t ctx); int _nfloat_sub_1(nfloat_ptr res, ulong x0, slong xexp, int xsgnbit, ulong y0, slong delta, gr_ctx_t ctx); +int _nfloat_add_2(nfloat_ptr res, nn_srcptr xd, slong xexp, int xsgnbit, nn_srcptr yd, slong delta, gr_ctx_t ctx); +int _nfloat_sub_2(nfloat_ptr res, nn_srcptr xd, slong xexp, int xsgnbit, nn_srcptr yd, slong delta, gr_ctx_t ctx); +int _nfloat_add_3(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx); +int _nfloat_sub_3(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx); +int _nfloat_add_4(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx); +int _nfloat_sub_4(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx); int _nfloat_add_n(nfloat_ptr res, nn_srcptr xd, slong xexp, int xsgnbit, nn_srcptr yd, slong delta, slong nlimbs, gr_ctx_t ctx); int _nfloat_sub_n(nfloat_ptr res, nn_srcptr xd, slong xexp, int xsgnbit, nn_srcptr yd, slong delta, slong nlimbs, gr_ctx_t ctx); diff --git a/src/nfloat/dot.c b/src/nfloat/dot.c index 20a6f8255d..65f9b8ec6a 100644 --- a/src/nfloat/dot.c +++ b/src/nfloat/dot.c @@ -32,6 +32,64 @@ (r1) = __r1; (r2) = __r2; (r3) = __r3; \ } while (0) +/* todo: define in longlong.h */ +#if FLINT_BITS == 64 && defined(__GNUC__) && defined(__AVX2__) + +#define add_sssssaaaaaaaaaa(s4,s3,s2,s1,s0, a4,a3,a2,a1,a0, b4,b3,b2,b1,b0) \ + __asm__ ("addq %14,%q4\n\tadcq %12,%q3\n\tadcq %10,%q2\n\tadcq %8,%q1\n\tadcq %6,%q0" \ + : "=r" (s4), "=&r" (s3), "=&r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((ulong)(a4)), "rme" ((ulong)(b4)), \ + "1" ((ulong)(a3)), "rme" ((ulong)(b3)), \ + "2" ((ulong)(a2)), "rme" ((ulong)(b2)), \ + "3" ((ulong)(a1)), "rme" ((ulong)(b1)), \ + "4" ((ulong)(a0)), "rme" ((ulong)(b0))) + + +#define sub_ddddmmmmssss(s3, s2, s1, s0, a3, a2, a1, a0, b3, b2, b1, b0) \ + __asm__ ("subq %11,%q3\n\tsbbq %9,%q2\n\tsbbq %7,%q1\n\tsbbq %5,%q0" \ + : "=r" (s3), "=&r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((ulong)(a3)), "rme" ((ulong)(b3)), \ + "1" ((ulong)(a2)), "rme" ((ulong)(b2)), \ + "2" ((ulong)(a1)), "rme" ((ulong)(b1)), \ + "3" ((ulong)(a0)), "rme" ((ulong)(b0))) + +#define sub_dddddmmmmmsssss(s4,s3,s2,s1,s0, a4,a3,a2,a1,a0, b4,b3,b2,b1,b0) \ + __asm__ ("subq %14,%q4\n\tsbbq %12,%q3\n\tsbbq %10,%q2\n\tsbbq %8,%q1\n\tsbbq %6,%q0" \ + : "=r" (s4), "=&r" (s3), "=&r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((ulong)(a4)), "rme" ((ulong)(b4)), \ + "1" ((ulong)(a3)), "rme" ((ulong)(b3)), \ + "2" ((ulong)(a2)), "rme" ((ulong)(b2)), \ + "3" ((ulong)(a1)), "rme" ((ulong)(b1)), \ + "4" ((ulong)(a0)), "rme" ((ulong)(b0))) +#else + +#define add_sssssaaaaaaaaaa(s4, s3, s2, s1, s0, a4, a3, a2, a1, a0, b4, b3, b2, b1, b0) \ + do { \ + ulong __t0 = 0; \ + add_ssssaaaaaaaa(__t0, s2, s1, s0, (ulong) 0, a2, a1, a0, (ulong) 0, b2, b1, b0); \ + add_ssaaaa(s4, s3, a4, a3, b4, b3); \ + add_ssaaaa(s4, s3, s4, s3, (ulong) 0, __t0); \ + } while (0) + + +#define sub_ddddmmmmssss(s3, s2, s1, s0, a3, a2, a1, a0, b3, b2, b1, b0) \ + do { \ + ulong __t1, __u1; \ + sub_dddmmmsss(__t1, s1, s0, (ulong) 0, a1, a0, (ulong) 0, b1, b0); \ + sub_ddmmss(__u1, s2, (ulong) 0, a2, (ulong) 0, b2); \ + sub_ddmmss(s3, s2, (a3) - (b3), s2, -__u1, -__t1); \ + } while (0) + +#define sub_dddddmmmmmsssss(s4, s3, s2, s1, s0, a4, a3, a2, a1, a0, b4, b3, b2, b1, b0) \ + do { \ + ulong __t2, __u2; \ + sub_ddddmmmmssss(__t2, s2, s1, s0, (ulong) 0, a2, a1, a0, (ulong) 0, b2, b1, b0); \ + sub_ddmmss(__u2, s3, (ulong) 0, a3, (ulong) 0, b3); \ + sub_ddmmss(s4, s3, (a4) - (b4), s3, -__u2, -__t2); \ + } while (0) + +#endif + int __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, slong sizeof_xstep, nfloat_srcptr y, slong sizeof_ystep, slong len, gr_ctx_t ctx) { @@ -467,12 +525,48 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src if (delta < FLINT_BITS) { t[0] = flint_mpn_mulhigh_n(t + 1, NFLOAT_D(xi), NFLOAT_D(yi), nlimbs); - mpn_rshift(t, t, nlimbs + 1, delta); - if (xsgnbit) - mpn_sub_n(s, s, t, nlimbs + 1); + if (nlimbs == 3) + { + if (xsgnbit) + sub_ddddmmmmssss(s[3], s[2], s[1], s[0], s[3], s[2], s[1], s[0], + t[3] >> delta, + (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)), + (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)), + (t[0] >> delta) | (t[1] << (FLINT_BITS - delta))); + else + add_ssssaaaaaaaa(s[3], s[2], s[1], s[0], s[3], s[2], s[1], s[0], + t[3] >> delta, + (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)), + (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)), + (t[0] >> delta) | (t[1] << (FLINT_BITS - delta))); + } + else if (nlimbs == 4) + { + if (xsgnbit) + sub_dddddmmmmmsssss(s[4], s[3], s[2], s[1], s[0], s[4], s[3], s[2], s[1], s[0], + t[4] >> delta, + (t[3] >> delta) | (t[4] << (FLINT_BITS - delta)), + (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)), + (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)), + (t[0] >> delta) | (t[1] << (FLINT_BITS - delta))); + else + add_sssssaaaaaaaaaa(s[4], s[3], s[2], s[1], s[0], s[4], s[3], s[2], s[1], s[0], + t[4] >> delta, + (t[3] >> delta) | (t[4] << (FLINT_BITS - delta)), + (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)), + (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)), + (t[0] >> delta) | (t[1] << (FLINT_BITS - delta))); + } else - mpn_add_n(s, s, t, nlimbs + 1); + { + mpn_rshift(t, t, nlimbs + 1, delta); + + if (xsgnbit) + mpn_sub_n(s, s, t, nlimbs + 1); + else + mpn_add_n(s, s, t, nlimbs + 1); + } } else { diff --git a/src/nfloat/nfloat.c b/src/nfloat/nfloat.c index 9d44810f40..2f3bf0c725 100644 --- a/src/nfloat/nfloat.c +++ b/src/nfloat/nfloat.c @@ -19,6 +19,64 @@ #include "gr_generic.h" #include "gr_special.h" +/* todo: define in longlong.h */ +#if FLINT_BITS == 64 && defined(__GNUC__) && defined(__AVX2__) + +#define add_sssssaaaaaaaaaa(s4,s3,s2,s1,s0, a4,a3,a2,a1,a0, b4,b3,b2,b1,b0) \ + __asm__ ("addq %14,%q4\n\tadcq %12,%q3\n\tadcq %10,%q2\n\tadcq %8,%q1\n\tadcq %6,%q0" \ + : "=r" (s4), "=&r" (s3), "=&r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((ulong)(a4)), "rme" ((ulong)(b4)), \ + "1" ((ulong)(a3)), "rme" ((ulong)(b3)), \ + "2" ((ulong)(a2)), "rme" ((ulong)(b2)), \ + "3" ((ulong)(a1)), "rme" ((ulong)(b1)), \ + "4" ((ulong)(a0)), "rme" ((ulong)(b0))) + + +#define sub_ddddmmmmssss(s3, s2, s1, s0, a3, a2, a1, a0, b3, b2, b1, b0) \ + __asm__ ("subq %11,%q3\n\tsbbq %9,%q2\n\tsbbq %7,%q1\n\tsbbq %5,%q0" \ + : "=r" (s3), "=&r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((ulong)(a3)), "rme" ((ulong)(b3)), \ + "1" ((ulong)(a2)), "rme" ((ulong)(b2)), \ + "2" ((ulong)(a1)), "rme" ((ulong)(b1)), \ + "3" ((ulong)(a0)), "rme" ((ulong)(b0))) + +#define sub_dddddmmmmmsssss(s4,s3,s2,s1,s0, a4,a3,a2,a1,a0, b4,b3,b2,b1,b0) \ + __asm__ ("subq %14,%q4\n\tsbbq %12,%q3\n\tsbbq %10,%q2\n\tsbbq %8,%q1\n\tsbbq %6,%q0" \ + : "=r" (s4), "=&r" (s3), "=&r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((ulong)(a4)), "rme" ((ulong)(b4)), \ + "1" ((ulong)(a3)), "rme" ((ulong)(b3)), \ + "2" ((ulong)(a2)), "rme" ((ulong)(b2)), \ + "3" ((ulong)(a1)), "rme" ((ulong)(b1)), \ + "4" ((ulong)(a0)), "rme" ((ulong)(b0))) +#else + +#define add_sssssaaaaaaaaaa(s4, s3, s2, s1, s0, a4, a3, a2, a1, a0, b4, b3, b2, b1, b0) \ + do { \ + ulong __t0 = 0; \ + add_ssssaaaaaaaa(__t0, s2, s1, s0, (ulong) 0, a2, a1, a0, (ulong) 0, b2, b1, b0); \ + add_ssaaaa(s4, s3, a4, a3, b4, b3); \ + add_ssaaaa(s4, s3, s4, s3, (ulong) 0, __t0); \ + } while (0) + + +#define sub_ddddmmmmssss(s3, s2, s1, s0, a3, a2, a1, a0, b3, b2, b1, b0) \ + do { \ + ulong __t1, __u1; \ + sub_dddmmmsss(__t1, s1, s0, (ulong) 0, a1, a0, (ulong) 0, b1, b0); \ + sub_ddmmss(__u1, s2, (ulong) 0, a2, (ulong) 0, b2); \ + sub_ddmmss(s3, s2, (a3) - (b3), s2, -__u1, -__t1); \ + } while (0) + +#define sub_dddddmmmmmsssss(s4, s3, s2, s1, s0, a4, a3, a2, a1, a0, b4, b3, b2, b1, b0) \ + do { \ + ulong __t2, __u2; \ + sub_ddddmmmmssss(__t2, s2, s1, s0, (ulong) 0, a2, a1, a0, (ulong) 0, b2, b1, b0); \ + sub_ddmmss(__u2, s3, (ulong) 0, a3, (ulong) 0, b3); \ + sub_ddmmss(s4, s3, (a4) - (b4), s3, -__u2, -__t2); \ + } while (0) + +#endif + int nfloat_write(gr_stream_t out, nfloat_srcptr x, gr_ctx_t ctx) { @@ -787,34 +845,227 @@ _nfloat_add_2(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, } int -_nfloat_sub_1(nfloat_ptr res, ulong x0, slong xexp, int xsgnbit, ulong y0, slong delta, gr_ctx_t ctx) +_nfloat_add_3(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx) { - ulong u; - slong norm; + ulong x2, x1, x0, y2, y1, y0, s3, s2, s1, s0; - if (delta == 0) + NFLOAT_SGNBIT(res) = xsgnbit; + + x0 = x[0]; + x1 = x[1]; + x2 = x[2]; + + if (delta < FLINT_BITS) { - NFLOAT_SGNBIT(res) = n_signed_sub(&u, x0, y0) ^ xsgnbit; + y0 = y[0]; + y1 = y[1]; + y2 = y[2]; - if (u == 0) - return nfloat_zero(res, ctx); + if (delta != 0) + { + y0 = (y0 >> delta) | (y1 << (FLINT_BITS - delta)); + y1 = (y1 >> delta) | (y2 << (FLINT_BITS - delta)); + y2 = (y2 >> delta); + } + } + else if (delta < 2 * FLINT_BITS) + { + delta -= FLINT_BITS; + y0 = y[1]; + y1 = y[2]; + y2 = 0; + + if (delta != 0) + { + y0 = (y0 >> delta) | (y1 << (FLINT_BITS - delta)); + y1 = (y1 >> delta); + } + } + else if (delta < 3 * FLINT_BITS) + { + delta -= 2 * FLINT_BITS; + y0 = y[2] >> delta; + y1 = 0; + y2 = 0; + } + else + { + y0 = 0; + y1 = 0; + y2 = 0; + } + + add_ssssaaaaaaaa(s3, s2, s1, s0, 0, x2, x1, x0, 0, y2, y1, y0); + + if (s3 == 0) + { + NFLOAT_D(res)[0] = s0; + NFLOAT_D(res)[1] = s1; + NFLOAT_D(res)[2] = s2; + NFLOAT_EXP(res) = xexp; + } + else + { + NFLOAT_D(res)[0] = (s0 >> 1) | (s1 << (FLINT_BITS - 1)); + NFLOAT_D(res)[1] = (s1 >> 1) | (s2 << (FLINT_BITS - 1)); + NFLOAT_D(res)[2] = (s2 >> 1) | (UWORD(1) << (FLINT_BITS - 1)); + NFLOAT_EXP(res) = xexp + 1; + NFLOAT_HANDLE_OVERFLOW(res, ctx); + } + + return GR_SUCCESS; +} + +int +_nfloat_add_4(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx) +{ + ulong x3, x2, x1, x0, y3, y2, y1, y0, s4, s3, s2, s1, s0; + + NFLOAT_SGNBIT(res) = xsgnbit; + + x0 = x[0]; + x1 = x[1]; + x2 = x[2]; + x3 = x[3]; + + if (delta < FLINT_BITS) + { + y0 = y[0]; + y1 = y[1]; + y2 = y[2]; + y3 = y[3]; + + if (delta != 0) + { + y0 = (y0 >> delta) | (y1 << (FLINT_BITS - delta)); + y1 = (y1 >> delta) | (y2 << (FLINT_BITS - delta)); + y2 = (y2 >> delta) | (y3 << (FLINT_BITS - delta)); + y3 = (y3 >> delta); + } + } + else if (delta < 2 * FLINT_BITS) + { + delta -= FLINT_BITS; + y0 = y[1]; + y1 = y[2]; + y2 = y[3]; + y3 = 0; + + if (delta != 0) + { + y0 = (y0 >> delta) | (y1 << (FLINT_BITS - delta)); + y1 = (y1 >> delta) | (y2 << (FLINT_BITS - delta)); + y2 = (y2 >> delta); + } + } + else if (delta < 3 * FLINT_BITS) + { + delta -= 2 * FLINT_BITS; + y0 = y[2]; + y1 = y[3]; + y2 = 0; + y3 = 0; + + if (delta != 0) + { + y0 = (y0 >> delta) | (y1 << (FLINT_BITS - delta)); + y1 = (y1 >> delta); + } + } + else if (delta < 4 * FLINT_BITS) + { + delta -= 3 * FLINT_BITS; + y0 = y[3] >> delta; + y1 = 0; + y2 = 0; + y3 = 0; + } + else + { + y0 = 0; + y1 = 0; + y2 = 0; + y3 = 0; + } + + add_sssssaaaaaaaaaa(s4, s3, s2, s1, s0, 0, x3, x2, x1, x0, 0, y3, y2, y1, y0); + + if (s4 == 0) + { + NFLOAT_D(res)[0] = s0; + NFLOAT_D(res)[1] = s1; + NFLOAT_D(res)[2] = s2; + NFLOAT_D(res)[3] = s3; + NFLOAT_EXP(res) = xexp; + } + else + { + NFLOAT_D(res)[0] = (s0 >> 1) | (s1 << (FLINT_BITS - 1)); + NFLOAT_D(res)[1] = (s1 >> 1) | (s2 << (FLINT_BITS - 1)); + NFLOAT_D(res)[2] = (s2 >> 1) | (s3 << (FLINT_BITS - 1)); + NFLOAT_D(res)[3] = (s3 >> 1) | (UWORD(1) << (FLINT_BITS - 1)); + NFLOAT_EXP(res) = xexp + 1; + NFLOAT_HANDLE_OVERFLOW(res, ctx); + } + + return GR_SUCCESS; +} + +int +_nfloat_sub_1(nfloat_ptr res, mp_limb_t x0, slong xexp, int xsgnbit, mp_limb_t y0, slong delta, gr_ctx_t ctx) +{ + ulong u, u0; + slong norm = 0; + + if (delta <= 1) + { + if (delta == 0) + { + xsgnbit ^= n_signed_sub(&u, x0, y0); + + if (u == 0) + return nfloat_zero(res, ctx); + + norm = flint_clz(u); + u <<= norm; + xexp -= norm; + } + else + { + sub_ddmmss(u, u0, x0, 0, y0 >> delta, y0 << (FLINT_BITS - delta)); + + if (FLINT_UNLIKELY(u == 0)) + { + u = u0; + xexp -= FLINT_BITS; + } + else if (!LIMB_MSB_IS_SET(u)) + { + norm = flint_clz(u); + u = (u << norm) | (u0 >> (FLINT_BITS - norm)); + xexp -= norm; + } + } } else if (delta < FLINT_BITS) { u = x0 - (y0 >> delta); NFLOAT_SGNBIT(res) = xsgnbit; + + if (!LIMB_MSB_IS_SET(u)) + { + u <<= 1; + xexp--; + } } else { - NFLOAT_D(res)[0] = x0; - NFLOAT_EXP(res) = xexp; - NFLOAT_SGNBIT(res) = xsgnbit; - return GR_SUCCESS; + u = x0; } - norm = flint_clz(u); - NFLOAT_D(res)[0] = u << norm; - NFLOAT_EXP(res) = xexp - norm; + NFLOAT_EXP(res) = xexp; + NFLOAT_SGNBIT(res) = xsgnbit; + NFLOAT_D(res)[0] = u; NFLOAT_HANDLE_UNDERFLOW(res, ctx); return GR_SUCCESS; } @@ -822,7 +1073,7 @@ _nfloat_sub_1(nfloat_ptr res, ulong x0, slong xexp, int xsgnbit, ulong y0, slong int _nfloat_sub_2(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx) { - ulong x1, x0, y1, y0, s1, s0; + ulong x1, x0, y1, y0, s1, s0, sb; slong norm; NFLOAT_SGNBIT(res) = xsgnbit; @@ -830,11 +1081,11 @@ _nfloat_sub_2(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, x0 = x[0]; x1 = x[1]; - if (delta < 2 * FLINT_BITS) - { - y0 = y[0]; - y1 = y[1]; + y0 = y[0]; + y1 = y[1]; + if (delta <= 1) + { if (delta == 0) { if (x1 > y1 || (x1 == y1 && x0 >= y0)) @@ -843,57 +1094,407 @@ _nfloat_sub_2(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, if (s1 == 0 && s0 == 0) return nfloat_zero(res, ctx); - - NFLOAT_SGNBIT(res) = xsgnbit; } else { sub_ddmmss(s1, s0, y1, y0, x1, x0); + xsgnbit = !xsgnbit; + } - NFLOAT_SGNBIT(res) = !xsgnbit; + sb = 0; + } + else + { + sub_dddmmmsss(s1, s0, sb, + x1, x0, 0, + y1 >> 1, + (y0 >> 1) | (y1 << (FLINT_BITS - 1)), + y0 << (FLINT_BITS - 1)); + } + + if (FLINT_UNLIKELY(s1 == 0)) + { + if (s0 == 0) + { + s1 = sb; + s0 = 0; + xexp -= 2 * FLINT_BITS; + } + else + { + s1 = s0; + s0 = sb; + sb = 0; + xexp -= FLINT_BITS; } } - else if (delta < FLINT_BITS) + + if (!LIMB_MSB_IS_SET(s1)) + { + norm = flint_clz(s1); + s1 = (s1 << norm) | (s0 >> (FLINT_BITS - norm)); + s0 = (s0 << norm) | (sb >> (FLINT_BITS - norm)); + xexp -= norm; + } + } + else + { + if (delta < FLINT_BITS) { sub_ddmmss(s1, s0, x1, x0, y1 >> delta, (y0 >> delta) | (y1 << (FLINT_BITS - delta))); - NFLOAT_SGNBIT(res) = xsgnbit; } - else + else if (delta < 2 * FLINT_BITS) { + delta -= FLINT_BITS; sub_ddmmss(s1, s0, x1, x0, 0, y1 >> delta); - NFLOAT_SGNBIT(res) = xsgnbit; + } + else + { + s0 = x0; + s1 = x1; + } + + if (!LIMB_MSB_IS_SET(s1)) + { + s1 = (s1 << 1) | (s0 >> (FLINT_BITS - 1)); + s0 = (s0 << 1); + xexp--; } } - else + + NFLOAT_EXP(res) = xexp; + NFLOAT_SGNBIT(res) = xsgnbit; + NFLOAT_D(res)[0] = s0; + NFLOAT_D(res)[1] = s1; + NFLOAT_HANDLE_UNDERFLOW(res, ctx); + return GR_SUCCESS; +} + +int +_nfloat_sub_3(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx) +{ + ulong x2, x1, x0, y2, y1, y0, s2, s1, s0, sb; + slong norm; + + NFLOAT_SGNBIT(res) = xsgnbit; + + x0 = x[0]; + x1 = x[1]; + x2 = x[2]; + + if (delta <= 1) { - NFLOAT_D(res)[0] = x0; - NFLOAT_D(res)[1] = x1; - NFLOAT_EXP(res) = xexp; - NFLOAT_SGNBIT(res) = xsgnbit; - return GR_SUCCESS; - } + y0 = y[0]; + y1 = y[1]; + y2 = y[2]; + + if (delta == 0) + { + if (x2 > y2 || (x2 == y2 && (x1 > y1 || (x1 == y1 && x0 >= y0)))) + { + sub_dddmmmsss(s2, s1, s0, x2, x1, x0, y2, y1, y0); + + if (s2 == 0 && s1 == 0 && s0 == 0) + return nfloat_zero(res, ctx); + } + else + { + sub_dddmmmsss(s2, s1, s0, y2, y1, y0, x2, x1, x0); + xsgnbit = !xsgnbit; + } + + sb = 0; + } + else + { + sub_ddddmmmmssss(s2, s1, s0, sb, + x2, x1, x0, 0, + y2 >> 1, + (y1 >> 1) | (y2 << (FLINT_BITS - 1)), + (y0 >> 1) | (y1 << (FLINT_BITS - 1)), + y0 << (FLINT_BITS - 1)); + } + + if (FLINT_UNLIKELY(s2 == 0)) + { + if (s1 == 0) + { + if (s0 == 0) + { + s2 = sb; + s1 = 0; + s0 = 0; + sb = 0; + xexp -= 3 * FLINT_BITS; + } + else + { + s2 = s0; + s1 = sb; + s0 = 0; + sb = 0; + xexp -= 2 * FLINT_BITS; + } + } + else + { + s2 = s1; + s1 = s0; + s0 = sb; + sb = 0; + xexp -= FLINT_BITS; + } + } - if (s1 == 0) + if (!LIMB_MSB_IS_SET(s2)) + { + norm = flint_clz(s2); + s2 = (s2 << norm) | (s1 >> (FLINT_BITS - norm)); + s1 = (s1 << norm) | (s0 >> (FLINT_BITS - norm)); + s0 = (s0 << norm) | (sb >> (FLINT_BITS - norm)); + xexp -= norm; + } + } + else { - s1 = s0; - s0 = 0; - xexp -= FLINT_BITS; + if (delta < FLINT_BITS) + { + y0 = y[0]; + y1 = y[1]; + y2 = y[2]; + + y0 = (y0 >> delta) | (y1 << (FLINT_BITS - delta)); + y1 = (y1 >> delta) | (y2 << (FLINT_BITS - delta)); + y2 = (y2 >> delta); + } + else if (delta < 2 * FLINT_BITS) + { + delta -= FLINT_BITS; + y0 = y[1]; + y1 = y[2]; + y2 = 0; + + if (delta != 0) + { + y0 = (y0 >> delta) | (y1 << (FLINT_BITS - delta)); + y1 = (y1 >> delta); + } + } + else if (delta < 3 * FLINT_BITS) + { + delta -= 2 * FLINT_BITS; + y0 = y[2] >> delta; + y1 = 0; + y2 = 0; + } + else + { + y0 = 0; + y1 = 0; + y2 = 0; + } + + sub_dddmmmsss(s2, s1, s0, x2, x1, x0, y2, y1, y0); + + if (!LIMB_MSB_IS_SET(s2)) + { + s2 = (s2 << 1) | (s1 >> (FLINT_BITS - 1)); + s1 = (s1 << 1) | (s0 >> (FLINT_BITS - 1)); + s0 = (s0 << 1); + xexp--; + } } - norm = flint_clz(s1); + NFLOAT_EXP(res) = xexp; + NFLOAT_SGNBIT(res) = xsgnbit; + NFLOAT_D(res)[0] = s0; + NFLOAT_D(res)[1] = s1; + NFLOAT_D(res)[2] = s2; + NFLOAT_HANDLE_UNDERFLOW(res, ctx); + return GR_SUCCESS; +} + +int +_nfloat_sub_4(nfloat_ptr res, nn_srcptr x, slong xexp, int xsgnbit, nn_srcptr y, slong delta, gr_ctx_t ctx) +{ + ulong x3, x2, x1, x0, y3, y2, y1, y0, s3, s2, s1, s0, sb; + slong norm; - if (norm == 0) + NFLOAT_SGNBIT(res) = xsgnbit; + + x0 = x[0]; + x1 = x[1]; + x2 = x[2]; + x3 = x[3]; + + if (delta <= 1) { - NFLOAT_D(res)[0] = s0; - NFLOAT_D(res)[1] = s1; + y0 = y[0]; + y1 = y[1]; + y2 = y[2]; + y3 = y[3]; + + if (delta == 0) + { + if (x3 > y3 || (x3 == y3 && (x2 > y2 || (x2 == y2 && (x1 > y1 || (x1 == y1 && x0 >= y0)))))) + { + sub_ddddmmmmssss(s3, s2, s1, s0, x3, x2, x1, x0, y3, y2, y1, y0); + + if (s3 == 0 && s2 == 0 && s1 == 0 && s0 == 0) + return nfloat_zero(res, ctx); + } + else + { + sub_ddddmmmmssss(s3, s2, s1, s0, y3, y2, y1, y0, x3, x2, x1, x0); + xsgnbit = !xsgnbit; + } + + sb = 0; + } + else + { + sub_dddddmmmmmsssss(s3, s2, s1, s0, sb, + x3, x2, x1, x0, 0, + y3 >> 1, + (y2 >> 1) | (y3 << (FLINT_BITS - 1)), + (y1 >> 1) | (y2 << (FLINT_BITS - 1)), + (y0 >> 1) | (y1 << (FLINT_BITS - 1)), + y0 << (FLINT_BITS - 1)); + } + + if (FLINT_UNLIKELY(s3 == 0)) + { + if (s2 == 0) + { + if (s1 == 0) + { + if (s0 == 0) + { + s3 = sb; + s2 = 0; + s1 = 0; + s0 = 0; + sb = 0; + xexp -= 4 * FLINT_BITS; + } + else + { + s3 = s0; + s2 = sb; + s1 = 0; + s0 = 0; + sb = 0; + xexp -= 3 * FLINT_BITS; + } + } + else + { + s3 = s1; + s2 = s0; + s1 = sb; + s0 = 0; + sb = 0; + xexp -= 2 * FLINT_BITS; + } + } + else + { + s3 = s2; + s2 = s1; + s1 = s0; + s0 = sb; + sb = 0; + xexp -= FLINT_BITS; + } + } + + if (!LIMB_MSB_IS_SET(s3)) + { + norm = flint_clz(s3); + s3 = (s3 << norm) | (s2 >> (FLINT_BITS - norm)); + s2 = (s2 << norm) | (s1 >> (FLINT_BITS - norm)); + s1 = (s1 << norm) | (s0 >> (FLINT_BITS - norm)); + s0 = (s0 << norm) | (sb >> (FLINT_BITS - norm)); + xexp -= norm; + } } else { - NFLOAT_D(res)[0] = (s0 << norm) | (s1 >> (FLINT_BITS - norm)); - NFLOAT_D(res)[1] = s1 << norm; + if (delta < FLINT_BITS) + { + y0 = y[0]; + y1 = y[1]; + y2 = y[2]; + y3 = y[3]; + + y0 = (y0 >> delta) | (y1 << (FLINT_BITS - delta)); + y1 = (y1 >> delta) | (y2 << (FLINT_BITS - delta)); + y2 = (y2 >> delta) | (y3 << (FLINT_BITS - delta)); + y3 = (y3 >> delta); + } + else if (delta < 2 * FLINT_BITS) + { + delta -= FLINT_BITS; + y0 = y[1]; + y1 = y[2]; + y2 = y[3]; + y3 = 0; + + if (delta != 0) + { + y0 = (y0 >> delta) | (y1 << (FLINT_BITS - delta)); + y1 = (y1 >> delta) | (y2 << (FLINT_BITS - delta)); + y2 = (y2 >> delta); + } + } + else if (delta < 3 * FLINT_BITS) + { + delta -= 2 * FLINT_BITS; + y0 = y[2]; + y1 = y[3]; + y2 = 0; + y3 = 0; + + if (delta != 0) + { + y0 = (y0 >> delta) | (y1 << (FLINT_BITS - delta)); + y1 = (y1 >> delta); + } + } + else if (delta < 4 * FLINT_BITS) + { + delta -= 3 * FLINT_BITS; + y0 = y[3] >> delta; + y1 = 0; + y2 = 0; + y3 = 0; + } + else + { + y0 = 0; + y1 = 0; + y2 = 0; + y3 = 0; + } + + sub_ddddmmmmssss(s3, s2, s1, s0, x3, x2, x1, x0, y3, y2, y1, y0); + + if (!LIMB_MSB_IS_SET(s3)) + { + s3 = (s3 << 1) | (s2 >> (FLINT_BITS - 1)); + s2 = (s2 << 1) | (s1 >> (FLINT_BITS - 1)); + s1 = (s1 << 1) | (s0 >> (FLINT_BITS - 1)); + s0 = (s0 << 1); + xexp--; + } } - NFLOAT_EXP(res) = xexp - norm; + NFLOAT_EXP(res) = xexp; + NFLOAT_SGNBIT(res) = xsgnbit; + NFLOAT_D(res)[0] = s0; + NFLOAT_D(res)[1] = s1; + NFLOAT_D(res)[2] = s2; + NFLOAT_D(res)[3] = s3; NFLOAT_HANDLE_UNDERFLOW(res, ctx); return GR_SUCCESS; } @@ -952,20 +1553,82 @@ _nfloat_sub_n(nfloat_ptr res, nn_srcptr xd, slong xexp, int xsgnbit, nn_srcptr y { slong shift_limbs, shift_bits, n, norm; ulong t[NFLOAT_MAX_LIMBS]; + ulong sb; - if (delta == 0) + /* In some experiments, + + ~10% of additions have shift == 0 + ~10% of additions have shift == 1 + ~60% of additions have 2 <= shift < FLINT_BITS + ~20% of additions have shift >= FLINT_BITS + + The shift == 1 case is the only case other than delta == 0 where + we can have significant cancellation. In this case, we need to + subtract the shifted-out bit of y to guarantee that the output + approximates the exact difference with high relative accuracy. + */ + if (delta <= 1) { - NFLOAT_SGNBIT(res) = flint_mpn_signed_sub_n(NFLOAT_D(res), xd, yd, nlimbs) ^ xsgnbit; + if (delta == 0) + { + xsgnbit ^= flint_mpn_signed_sub_n(NFLOAT_D(res), xd, yd, nlimbs); + sb = 0; + } + else + { + mpn_rshift(t, yd, nlimbs, 1); + sb = yd[0] << (FLINT_BITS - 1); + mpn_sub_n(NFLOAT_D(res), xd, t, nlimbs); + mpn_sub_1(NFLOAT_D(res), NFLOAT_D(res), nlimbs, sb != 0); + } + + n = nlimbs; + MPN_NORM(NFLOAT_D(res), n); + + /* We have at least one full limb of cancellation */ + if (FLINT_UNLIKELY(n != nlimbs)) + { + if (n == 0) + { + if (sb == 0) + { + return nfloat_zero(res, ctx); + } + else + { + NFLOAT_D(res)[nlimbs - 1] = sb; + flint_mpn_zero(NFLOAT_D(res), nlimbs - 1); + xexp -= nlimbs * FLINT_BITS; + } + } + else + { + /* The copy is avoidable when we also have a shift, + but since this is an unlikely, branch, it's + not really worth optimizing. */ + flint_mpn_copyd(NFLOAT_D(res) + nlimbs - n, NFLOAT_D(res), n); + flint_mpn_zero(NFLOAT_D(res), nlimbs - n - 1); + NFLOAT_D(res)[nlimbs - n - 1] = sb; + sb = 0; + xexp -= (nlimbs - n) * FLINT_BITS; + } + } + + norm = flint_clz(NFLOAT_D(res)[nlimbs - 1]); + + if (norm) + { + mpn_lshift(NFLOAT_D(res), NFLOAT_D(res), nlimbs, norm); + NFLOAT_D(res)[0] |= (sb >> (FLINT_BITS - norm)); + xexp -= norm; + } } else { - NFLOAT_SGNBIT(res) = xsgnbit; - if (delta < FLINT_BITS) { mpn_rshift(t, yd, nlimbs, delta); mpn_sub_n(NFLOAT_D(res), xd, t, nlimbs); - NFLOAT_SGNBIT(res) = xsgnbit; } else if (delta < nlimbs * FLINT_BITS) { @@ -976,44 +1639,22 @@ _nfloat_sub_n(nfloat_ptr res, nn_srcptr xd, slong xexp, int xsgnbit, nn_srcptr y else mpn_rshift(t, yd + shift_limbs, nlimbs - shift_limbs, shift_bits); mpn_sub(NFLOAT_D(res), xd, nlimbs, t, nlimbs - shift_limbs); - NFLOAT_SGNBIT(res) = xsgnbit; } else { flint_mpn_copyi(NFLOAT_D(res), xd, nlimbs); - NFLOAT_EXP(res) = xexp; - NFLOAT_SGNBIT(res) = xsgnbit; - return GR_SUCCESS; } - } - - n = nlimbs; - MPN_NORM(NFLOAT_D(res), n); - if (n != nlimbs) - { - if (n == 0) - return nfloat_zero(res, ctx); - - norm = flint_clz(NFLOAT_D(res)[n - 1]); - if (norm) - mpn_lshift(NFLOAT_D(res) + nlimbs - n, NFLOAT_D(res), n, norm); - else - flint_mpn_copyd(NFLOAT_D(res) + nlimbs - n, NFLOAT_D(res), n); - - xexp -= (nlimbs - n) * FLINT_BITS + norm; - } - else - { - norm = flint_clz(NFLOAT_D(res)[nlimbs - 1]); - if (norm) - mpn_lshift(NFLOAT_D(res), NFLOAT_D(res), nlimbs, norm); - xexp -= norm; + if (!LIMB_MSB_IS_SET(NFLOAT_D(res)[nlimbs - 1])) + { + mpn_lshift(NFLOAT_D(res), NFLOAT_D(res), nlimbs, 1); + xexp--; + } } + NFLOAT_SGNBIT(res) = xsgnbit; NFLOAT_EXP(res) = xexp; NFLOAT_HANDLE_UNDERFLOW(res, ctx); - return GR_SUCCESS; } @@ -1058,6 +1699,10 @@ nfloat_add(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx) return _nfloat_add_1(res, NFLOAT_D(x)[0], xexp, xsgnbit, NFLOAT_D(y)[0], delta, ctx); else if (nlimbs == 2) return _nfloat_add_2(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, ctx); + else if (nlimbs == 3) + return _nfloat_add_3(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, ctx); + else if (nlimbs == 4) + return _nfloat_add_4(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, ctx); else return _nfloat_add_n(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, nlimbs, ctx); } @@ -1067,6 +1712,10 @@ nfloat_add(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx) return _nfloat_sub_1(res, NFLOAT_D(x)[0], xexp, xsgnbit, NFLOAT_D(y)[0], delta, ctx); else if (nlimbs == 2) return _nfloat_sub_2(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, ctx); + else if (nlimbs == 3) + return _nfloat_sub_3(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, ctx); + else if (nlimbs == 4) + return _nfloat_sub_4(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, ctx); else return _nfloat_sub_n(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, nlimbs, ctx); } @@ -1114,6 +1763,10 @@ nfloat_sub(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx) return _nfloat_add_1(res, NFLOAT_D(x)[0], xexp, xsgnbit, NFLOAT_D(y)[0], delta, ctx); else if (nlimbs == 2) return _nfloat_add_2(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, ctx); + else if (nlimbs == 3) + return _nfloat_add_3(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, ctx); + else if (nlimbs == 4) + return _nfloat_add_4(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, ctx); else return _nfloat_add_n(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, nlimbs, ctx); } @@ -1123,6 +1776,10 @@ nfloat_sub(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, gr_ctx_t ctx) return _nfloat_sub_1(res, NFLOAT_D(x)[0], xexp, xsgnbit, NFLOAT_D(y)[0], delta, ctx); else if (nlimbs == 2) return _nfloat_sub_2(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, ctx); + else if (nlimbs == 3) + return _nfloat_sub_3(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, ctx); + else if (nlimbs == 4) + return _nfloat_sub_4(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, ctx); else return _nfloat_sub_n(res, NFLOAT_D(x), xexp, xsgnbit, NFLOAT_D(y), delta, nlimbs, ctx); } @@ -1246,6 +1903,124 @@ _nfloat_vec_aors_2(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, int subtrac return status; } +int +_nfloat_vec_aors_3(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, int subtract, slong len, gr_ctx_t ctx) +{ + slong i, stride; + slong xexp, yexp, delta; + int xsgnbit, ysgnbit; + nfloat_srcptr xi, yi; + nfloat_ptr ri; + int status = GR_SUCCESS; + + stride = NFLOAT_HEADER_LIMBS + 3; + + for (i = 0; i < len; i++) + { + xi = (nn_srcptr) x + i * stride; + yi = (nn_srcptr) y + i * stride; + ri = (nn_ptr) res + i * stride; + + xexp = NFLOAT_EXP(xi); + yexp = NFLOAT_EXP(yi); + + if (yexp == NFLOAT_EXP_ZERO) + { + flint_mpn_copyi(ri, xi, NFLOAT_HEADER_LIMBS + 3); + continue; + } + + if (xexp == NFLOAT_EXP_ZERO) + { + flint_mpn_copyi(ri, yi, NFLOAT_HEADER_LIMBS + 3); + if (subtract) + NFLOAT_SGNBIT(ri) = !NFLOAT_SGNBIT(ri); + continue; + } + + xsgnbit = NFLOAT_SGNBIT(xi); + ysgnbit = NFLOAT_SGNBIT(yi) ^ subtract; + + delta = xexp - yexp; + + if (xsgnbit == ysgnbit) + { + if (delta >= 0) + status |= _nfloat_add_3(ri, NFLOAT_D(xi), xexp, xsgnbit, NFLOAT_D(yi), delta, ctx); + else + status |= _nfloat_add_3(ri, NFLOAT_D(yi), yexp, ysgnbit, NFLOAT_D(xi), -delta, ctx); + } + else + { + if (delta >= 0) + status |= _nfloat_sub_3(ri, NFLOAT_D(xi), xexp, xsgnbit, NFLOAT_D(yi), delta, ctx); + else + status |= _nfloat_sub_3(ri, NFLOAT_D(yi), yexp, ysgnbit, NFLOAT_D(xi), -delta, ctx); + } + } + + return status; +} + +int +_nfloat_vec_aors_4(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, int subtract, slong len, gr_ctx_t ctx) +{ + slong i, stride; + slong xexp, yexp, delta; + int xsgnbit, ysgnbit; + nfloat_srcptr xi, yi; + nfloat_ptr ri; + int status = GR_SUCCESS; + + stride = NFLOAT_HEADER_LIMBS + 4; + + for (i = 0; i < len; i++) + { + xi = (nn_srcptr) x + i * stride; + yi = (nn_srcptr) y + i * stride; + ri = (nn_ptr) res + i * stride; + + xexp = NFLOAT_EXP(xi); + yexp = NFLOAT_EXP(yi); + + if (yexp == NFLOAT_EXP_ZERO) + { + flint_mpn_copyi(ri, xi, NFLOAT_HEADER_LIMBS + 4); + continue; + } + + if (xexp == NFLOAT_EXP_ZERO) + { + flint_mpn_copyi(ri, yi, NFLOAT_HEADER_LIMBS + 4); + if (subtract) + NFLOAT_SGNBIT(ri) = !NFLOAT_SGNBIT(ri); + continue; + } + + xsgnbit = NFLOAT_SGNBIT(xi); + ysgnbit = NFLOAT_SGNBIT(yi) ^ subtract; + + delta = xexp - yexp; + + if (xsgnbit == ysgnbit) + { + if (delta >= 0) + status |= _nfloat_add_4(ri, NFLOAT_D(xi), xexp, xsgnbit, NFLOAT_D(yi), delta, ctx); + else + status |= _nfloat_add_4(ri, NFLOAT_D(yi), yexp, ysgnbit, NFLOAT_D(xi), -delta, ctx); + } + else + { + if (delta >= 0) + status |= _nfloat_sub_4(ri, NFLOAT_D(xi), xexp, xsgnbit, NFLOAT_D(yi), delta, ctx); + else + status |= _nfloat_sub_4(ri, NFLOAT_D(yi), yexp, ysgnbit, NFLOAT_D(xi), -delta, ctx); + } + } + + return status; +} + int _nfloat_vec_add(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx) { @@ -1262,6 +2037,10 @@ _nfloat_vec_add(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ return _nfloat_vec_aors_1(res, x, y, 0, len, ctx); if (nlimbs == 2) return _nfloat_vec_aors_2(res, x, y, 0, len, ctx); + if (nlimbs == 3) + return _nfloat_vec_aors_3(res, x, y, 0, len, ctx); + if (nlimbs == 4) + return _nfloat_vec_aors_4(res, x, y, 0, len, ctx); } stride = NFLOAT_CTX_DATA_NLIMBS(ctx); @@ -1294,6 +2073,10 @@ _nfloat_vec_sub(nfloat_ptr res, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ return _nfloat_vec_aors_1(res, x, y, 1, len, ctx); if (nlimbs == 2) return _nfloat_vec_aors_2(res, x, y, 1, len, ctx); + if (nlimbs == 3) + return _nfloat_vec_aors_3(res, x, y, 1, len, ctx); + if (nlimbs == 4) + return _nfloat_vec_aors_4(res, x, y, 1, len, ctx); } stride = NFLOAT_CTX_DATA_NLIMBS(ctx); diff --git a/src/nfloat/profile/p-vs_arf.c b/src/nfloat/profile/p-vs_arf.c index 17946c64b5..f9e5921367 100644 --- a/src/nfloat/profile/p-vs_arf.c +++ b/src/nfloat/profile/p-vs_arf.c @@ -45,16 +45,16 @@ int main() flint_printf(" _gr_vec_add _gr_vec_mul _gr_vec_mul_scalar _gr_vec_sum _gr_vec_product _gr_vec_dot\n"); - for (prec = 64; prec <= 2048; prec *= 2) + for (prec = 64; prec <= 2048; prec = prec < 256 ? prec + 64 : prec * 2) { flint_printf("prec = %wd\n", prec); - for (n = 3; n <= 100; n = (n == 3) ? 10 : n * 10) + for (n = 10; n <= 100; n *= 10) { for (which = 0; which < 2; which++) { flint_rand_t state; - flint_randinit(state); + flint_rand_init(state); if (which == 0) gr_ctx_init_real_float_arf(ctx, prec); @@ -127,7 +127,7 @@ int main() gr_heap_clear_vec(vec2, n, ctx); gr_heap_clear_vec(vec3, n, ctx); - flint_randclear(state); + flint_rand_clear(state); } flint_printf("n = %4wd ", n); diff --git a/src/nfloat/test/main.c b/src/nfloat/test/main.c index 328cfa3dda..8fbecaf297 100644 --- a/src/nfloat/test/main.c +++ b/src/nfloat/test/main.c @@ -11,12 +11,14 @@ /* Include functions *********************************************************/ +#include "t-add_sub_n.c" #include "t-nfloat.c" /* Array of test functions ***************************************************/ test_struct tests[] = { + TEST_FUNCTION(add_sub_n), TEST_FUNCTION(nfloat), }; diff --git a/src/nfloat/test/t-add_sub_n.c b/src/nfloat/test/t-add_sub_n.c new file mode 100644 index 0000000000..6d02f8f674 --- /dev/null +++ b/src/nfloat/test/t-add_sub_n.c @@ -0,0 +1,97 @@ +/* + Copyright (C) 2024 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "test_helpers.h" +#include "fmpq.h" +#include "arf.h" +#include "gr.h" +#include "gr_vec.h" +#include "gr_special.h" +#include "nfloat.h" + +TEST_FUNCTION_START(add_sub_n, state) +{ + gr_ctx_t ctx; + slong iter, prec; + gr_ptr a, b, r1, r2; + + /* test that add_X and sub_X specializations implement the same + algorithm as add_n and sub_n */ + for (prec = FLINT_BITS; prec <= 4 * FLINT_BITS; prec += FLINT_BITS) + { + nfloat_ctx_init(ctx, prec, 0); + GR_TMP_INIT4(a, b, r1, r2, ctx); + + for (iter = 0; iter < 100000 * flint_test_multiplier(); iter++) + { + GR_IGNORE(gr_randtest_not_zero(a, state, ctx)); + GR_IGNORE(gr_randtest_not_zero(b, state, ctx)); + + if (NFLOAT_EXP(a) < NFLOAT_EXP(b)) + nfloat_swap(a, b, ctx); + + if (n_randint(state, 2)) + NFLOAT_EXP(b) = NFLOAT_EXP(a) - 1; + + /* trigger special cases with increased probability */ + if (n_randint(state, 100) == 0) + flint_mpn_store(NFLOAT_D(b), prec / FLINT_BITS, ~UWORD(0)); + + GR_MUST_SUCCEED(_nfloat_add_n(r1, NFLOAT_D(a), NFLOAT_EXP(a), NFLOAT_SGNBIT(a), NFLOAT_D(b), NFLOAT_EXP(a) - NFLOAT_EXP(b), prec / FLINT_BITS, ctx)); + + if (prec == FLINT_BITS) + GR_MUST_SUCCEED(_nfloat_add_1(r2, NFLOAT_D(a)[0], NFLOAT_EXP(a), NFLOAT_SGNBIT(a), NFLOAT_D(b)[0], NFLOAT_EXP(a) - NFLOAT_EXP(b), ctx)); + else if (prec == 2 * FLINT_BITS) + GR_MUST_SUCCEED(_nfloat_add_2(r2, NFLOAT_D(a), NFLOAT_EXP(a), NFLOAT_SGNBIT(a), NFLOAT_D(b), NFLOAT_EXP(a) - NFLOAT_EXP(b), ctx)); + else if (prec == 3 * FLINT_BITS) + GR_MUST_SUCCEED(_nfloat_add_3(r2, NFLOAT_D(a), NFLOAT_EXP(a), NFLOAT_SGNBIT(a), NFLOAT_D(b), NFLOAT_EXP(a) - NFLOAT_EXP(b), ctx)); + else if (prec == 4 * FLINT_BITS) + GR_MUST_SUCCEED(_nfloat_add_4(r2, NFLOAT_D(a), NFLOAT_EXP(a), NFLOAT_SGNBIT(a), NFLOAT_D(b), NFLOAT_EXP(a) - NFLOAT_EXP(b), ctx)); + + if (gr_equal(r1, r2, ctx) != T_TRUE) + { + flint_printf("FAIL: add_%wd\n", prec / FLINT_BITS); + flint_printf("a = "); gr_println(a, ctx); + flint_printf("b = "); gr_println(b, ctx); + flint_printf("r1 = "); gr_println(r1, ctx); + flint_printf("r2 = "); gr_println(r2, ctx); + flint_abort(); + } + + GR_MUST_SUCCEED(_nfloat_sub_n(r1, NFLOAT_D(a), NFLOAT_EXP(a), NFLOAT_SGNBIT(a), NFLOAT_D(b), NFLOAT_EXP(a) - NFLOAT_EXP(b), prec / FLINT_BITS, ctx)); + + if (prec == FLINT_BITS) + GR_MUST_SUCCEED(_nfloat_sub_1(r2, NFLOAT_D(a)[0], NFLOAT_EXP(a), NFLOAT_SGNBIT(a), NFLOAT_D(b)[0], NFLOAT_EXP(a) - NFLOAT_EXP(b), ctx)); + else if (prec == 2 * FLINT_BITS) + GR_MUST_SUCCEED(_nfloat_sub_2(r2, NFLOAT_D(a), NFLOAT_EXP(a), NFLOAT_SGNBIT(a), NFLOAT_D(b), NFLOAT_EXP(a) - NFLOAT_EXP(b), ctx)); + else if (prec == 3 * FLINT_BITS) + GR_MUST_SUCCEED(_nfloat_sub_3(r2, NFLOAT_D(a), NFLOAT_EXP(a), NFLOAT_SGNBIT(a), NFLOAT_D(b), NFLOAT_EXP(a) - NFLOAT_EXP(b), ctx)); + else if (prec == 4 * FLINT_BITS) + GR_MUST_SUCCEED(_nfloat_sub_4(r2, NFLOAT_D(a), NFLOAT_EXP(a), NFLOAT_SGNBIT(a), NFLOAT_D(b), NFLOAT_EXP(a) - NFLOAT_EXP(b), ctx)); + + if (gr_equal(r1, r2, ctx) != T_TRUE) + { + flint_printf("FAIL: sub_%wd\n", prec / FLINT_BITS); + flint_printf("a = "); gr_println(a, ctx); + flint_printf("b = "); gr_println(b, ctx); + flint_printf("r1 = "); gr_println(r1, ctx); + flint_printf("r2 = "); gr_println(r2, ctx); + flint_abort(); + } + } + + GR_TMP_CLEAR4(a, b, r1, r2, ctx); + gr_ctx_clear(ctx); + } + + + TEST_FUNCTION_END(state); +} diff --git a/src/nfloat/test/t-nfloat.c b/src/nfloat/test/t-nfloat.c index b808531d63..ac275b76ee 100644 --- a/src/nfloat/test/t-nfloat.c +++ b/src/nfloat/test/t-nfloat.c @@ -10,10 +10,10 @@ */ #include "test_helpers.h" +#include "fmpq.h" +#include "arf.h" #include "gr_vec.h" #include "gr_special.h" -#include "arf.h" -#include "fmpq.h" #include "nfloat.h" int @@ -607,9 +607,17 @@ TEST_FUNCTION_START(nfloat, state) tol = gr_heap_init(ctx2); GR_IGNORE(gr_one(tol, ctx2)); - GR_IGNORE(gr_mul_2exp_si(tol, tol, -prec + 1, ctx2)); + GR_IGNORE(gr_mul_2exp_si(tol, tol, -prec + 2, ctx2)); + + reps = (prec <= 256 ? 10000 : 1) * flint_test_multiplier(); + + for (i = 0; i < reps; i++) + { + gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_add, ctx2, tol, state, 0); + gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_sub, ctx2, tol, state, 0); + } - reps = (prec <= 128 ? 1000 : 1) * flint_test_multiplier(); + reps = (prec <= 256 ? 1000 : 1) * flint_test_multiplier(); for (i = 0; i < reps; i++) {