From 8b77cc49144930777b494f7735a9ebd79a62e820 Mon Sep 17 00:00:00 2001 From: Naoki Shibata Date: Tue, 1 Oct 2019 08:21:06 +0900 Subject: [PATCH] Update fmod algorithm This patch will update the algorithm for fmod to the version written in the paper. --- src/libm/sleefdp.c | 27 +++++++++++++++------------ src/libm/sleefsimddp.c | 40 +++++++++++++++++++++++----------------- src/libm/sleefsimdsp.c | 8 ++++++-- src/libm/sleefsp.c | 6 ++++-- 4 files changed, 48 insertions(+), 33 deletions(-) diff --git a/src/libm/sleefdp.c b/src/libm/sleefdp.c index dcdba4f4..1c5e79ff 100644 --- a/src/libm/sleefdp.c +++ b/src/libm/sleefdp.c @@ -2408,22 +2408,25 @@ static INLINE CONST double ptrunc(double x) { } EXPORT CONST double xfmod(double x, double y) { - double nu = fabsk(x), de = fabsk(y), s = 1, q; - if (de < DBL_MIN) { nu *= 1ULL << 54; de *= 1ULL << 54; s = 1.0 / (1ULL << 54); } - Sleef_double2 r = dd(nu, 0); - double rde = toward0(1.0 / de); - - for(int i=0;i < 21;i++) { // ceil(log2(DBL_MAX) / 51) + 1 - q = (de+de > r.x && r.x >= de) ? 1 : (toward0(r.x) * rde); - r = ddnormalize_d2_d2(ddadd2_d2_d2_d2(r, ddmul_d2_d_d(removelsb(ptrunc(q)), -de))); - if (r.x < de) break; + double n = fabsk(x), d = fabsk(y), s = 1, q; + if (d < DBL_MIN) { n *= 1ULL << 54; d *= 1ULL << 54; s = 1.0 / (1ULL << 54); } + Sleef_double2 r = dd(n, 0); + double rd = toward0(1.0 / d); + + for(int i=0;i < 21;i++) { // ceil(log2(DBL_MAX) / 52) + q = removelsb(ptrunc(toward0(r.x) * rd)); + q = (3*d > r.x && r.x > d) ? 2 : q; + q = (2*d > r.x && r.x > d) ? 1 : q; + q = r.x == d ? (r.y >= 0 ? 1 : 0) : q; + r = ddnormalize_d2_d2(ddadd2_d2_d2_d2(r, ddmul_d2_d_d(q, -d))); + if (r.x < d) break; } double ret = r.x * s; - if (r.x + r.y == de) ret = 0; + if (r.x + r.y == d) ret = 0; ret = mulsign(ret, x); - if (nu < de) ret = x; - if (de == 0) ret = SLEEF_NAN; + if (n < d) ret = x; + if (d == 0) ret = SLEEF_NAN; return ret; } diff --git a/src/libm/sleefsimddp.c b/src/libm/sleefsimddp.c index 8827e9c9..212f4ce2 100644 --- a/src/libm/sleefsimddp.c +++ b/src/libm/sleefsimddp.c @@ -3298,30 +3298,36 @@ static INLINE CONST VECTOR_CC vdouble vptrunc(vdouble x) { // round to integer t /* TODO AArch64: potential optimization by using `vfmad_lane_f64` */ EXPORT CONST VECTOR_CC vdouble xfmod(vdouble x, vdouble y) { - vdouble nu = vabs_vd_vd(x), de = vabs_vd_vd(y), s = vcast_vd_d(1), q; - vopmask o = vlt_vo_vd_vd(de, vcast_vd_d(DBL_MIN)); - nu = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(nu, vcast_vd_d(1ULL << 54)), nu); - de = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(de, vcast_vd_d(1ULL << 54)), de); + vdouble n = vabs_vd_vd(x), d = vabs_vd_vd(y), s = vcast_vd_d(1), q; + vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN)); + n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(1ULL << 54)), n); + d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(1ULL << 54)), d); s = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(s , vcast_vd_d(1.0 / (1ULL << 54))), s); - vdouble rde = vtoward0(vrec_vd_vd(de)); - vdouble2 r = vcast_vd2_vd_vd(nu, vcast_vd_d(0)); - - for(int i=0;i<21;i++) { // ceil(log2(DBL_MAX) / 51) + 1 - q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vadd_vd_vd_vd(de, de), r.x), - vge_vo_vd_vd(r.x, de)), - vcast_vd_d(1), vmul_vd_vd_vd(vtoward0(r.x), rde)); - q = vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(vptrunc(q)), vcast_vm_i_i(0xffffffff, 0xfffffffe))); - r = ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd2(r, ddmul_vd2_vd_vd(q, vneg_vd_vd(de)))); - if (vtestallones_i_vo64(vlt_vo_vd_vd(r.x, de))) break; + vdouble2 r = vcast_vd2_vd_vd(n, vcast_vd_d(0)); + vdouble rd = vtoward0(vrec_vd_vd(d)); + + for(int i=0;i<21;i++) { // ceil(log2(DBL_MAX) / 52) + q = vptrunc(vmul_vd_vd_vd(vtoward0(r.x), rd)); +#ifndef ENABLE_FMA_DP + q = vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(q), vcast_vm_i_i(0xffffffff, 0xfffffffe))); +#endif + q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vmul_vd_vd_vd(vcast_vd_d(3), d), r.x), + vge_vo_vd_vd(r.x, d)), + vcast_vd_d(2), q); + q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vadd_vd_vd_vd(d, d), r.x), + vge_vo_vd_vd(r.x, d)), + vcast_vd_d(1), q); + r = ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd2(r, ddmul_vd2_vd_vd(q, vneg_vd_vd(d)))); + if (vtestallones_i_vo64(vlt_vo_vd_vd(r.x, d))) break; } vdouble ret = vmul_vd_vd_vd(r.x, s); - ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vadd_vd_vd_vd(r.x, r.y), de), vcast_vd_d(0), ret); + ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vadd_vd_vd_vd(r.x, r.y), d), vcast_vd_d(0), ret); ret = vmulsign_vd_vd_vd(ret, x); - ret = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(nu, de), x, ret); - ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(de, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), ret); + ret = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(n, d), x, ret); + ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), ret); return ret; } diff --git a/src/libm/sleefsimdsp.c b/src/libm/sleefsimdsp.c index 3a1fa480..9f661c85 100644 --- a/src/libm/sleefsimdsp.c +++ b/src/libm/sleefsimdsp.c @@ -2902,9 +2902,13 @@ EXPORT CONST VECTOR_CC vfloat xfmodf(vfloat x, vfloat y) { vfloat2 r = vcast_vf2_vf_vf(nu, vcast_vf_f(0)); for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1 - q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vadd_vf_vf_vf(de, de), r.x), + q = vptruncf(vmul_vf_vf_vf(vtoward0f(r.x), rde)); + q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vmul_vf_vf_vf(vcast_vf_f(3), de), r.x), vge_vo_vf_vf(r.x, de)), - vcast_vf_f(1), vmul_vf_vf_vf(vtoward0f(r.x), rde)); + vcast_vf_f(2), q); + q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vmul_vf_vf_vf(vcast_vf_f(2), de), r.x), + vge_vo_vf_vf(r.x, de)), + vcast_vf_f(1), q); r = dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf2(r, dfmul_vf2_vf_vf(vptruncf(q), vneg_vf_vf(de)))); if (vtestallones_i_vo32(vlt_vo_vf_vf(r.x, de))) break; } diff --git a/src/libm/sleefsp.c b/src/libm/sleefsp.c index 7d580119..c360b2c6 100644 --- a/src/libm/sleefsp.c +++ b/src/libm/sleefsp.c @@ -2020,8 +2020,10 @@ EXPORT CONST float xfmodf(float x, float y) { float rde = toward0f(1.0f / de); for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1 - q = (de+de > r.x && r.x >= de) ? 1.0f : (toward0f(r.x) * rde); - r = dfnormalize_f2_f2(dfadd2_f2_f2_f2(r, dfmul_f2_f_f(ptruncf(q), -de))); + q = ptruncf(toward0f(r.x) * rde); + q = (3*de > r.x && r.x >= de) ? 2 : q; + q = (2*de > r.x && r.x >= de) ? 1 : q; + r = dfnormalize_f2_f2(dfadd2_f2_f2_f2(r, dfmul_f2_f_f(q, -de))); if (r.x < de) break; }