pxfm/tangent/
atanpi.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 7/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::common::{dd_fmla, dyad_fmla, f_fmla};
30use crate::double_double::DoubleDouble;
31use crate::shared_eval::poly_dd_3;
32use crate::tangent::atan::{ATAN_CIRCLE, ATAN_REDUCE};
33
/// 1/pi in double-double precision, constructed as `DoubleDouble::new(lo, hi)`:
/// hi = 0x1.45f306dc9c883p-2 is the double nearest to 1/pi and lo is the
/// (negative) correction term extending the accuracy to roughly 107 bits.
const ONE_OVER_PI_DD: DoubleDouble = DoubleDouble::new(
    f64::from_bits(0xbc76b01ec5417056),
    f64::from_bits(0x3fd45f306dc9c883),
);

/// Single-double approximation of 1/(3*pi), used for the x^3 correction
/// term in the tiny-argument and large-argument (asymptotic) paths.
const ONE_OVER_3PI: f64 = f64::from_bits(0x3fbb2995e7b7b604); // approximates 1/(3pi)
40
/// `atanpi(x)` for |x| < 2^-54, where atan(x) rounds identically to x and
/// the task reduces to correctly rounding x * (1/pi).
///
/// Two families of hard-to-round cases are handled explicitly before the
/// generic double-double multiplication by 1/pi.
fn atanpi_small(x: f64) -> f64 {
    // Preserve signed zero exactly: atanpi(+/-0) = +/-0.
    if x == 0. {
        return x;
    }
    // Hard-coded worst case: the fma folds in a tiny +/-(2^-600 * 2^-600)
    // product to steer the final rounding and raise the inexact flag
    // (the perturbation's sign differs between the two branches).
    if x.abs() == f64::from_bits(0x0015cba89af1f855) {
        return if x > 0. {
            f_fmla(
                f64::from_bits(0x9a70000000000000),
                f64::from_bits(0x1a70000000000000),
                f64::from_bits(0x0006f00f7cd3a40b),
            )
        } else {
            f_fmla(
                f64::from_bits(0x1a70000000000000),
                f64::from_bits(0x1a70000000000000),
                f64::from_bits(0x8006f00f7cd3a40b),
            )
        };
    }
    // generic worst case
    let mut v = x.to_bits();
    if (v & 0xfffffffffffff) == 0x59af9a1194efe
    // +/-0x1.59af9a1194efe*2^e
    {
        // This mantissa is hard to round for every biased exponent e > 2:
        // build the result from a fixed mantissa at exponent e-2 and nudge
        // it with the same +/-(2^-600)^2 fma trick as above.
        let e = v >> 52;
        if (e & 0x7ff) > 2 {
            v = ((e - 2) << 52) | 0xb824198b94a89;
            return if x > 0. {
                dd_fmla(
                    f64::from_bits(0x9a70000000000000),
                    f64::from_bits(0x1a70000000000000),
                    f64::from_bits(v),
                )
            } else {
                dd_fmla(
                    f64::from_bits(0x1a70000000000000),
                    f64::from_bits(0x1a70000000000000),
                    f64::from_bits(v),
                )
            };
        }
    }
    // Leading product with the high word of 1/pi.
    let h = x * ONE_OVER_PI_DD.hi;
    /* Assuming h = x*ONE_OVER_PI_DD.hi - e, the correction term is
    e + x * ONE_OVER_PIL, but we need to scale values to avoid underflow. */
    // 0x4690000000000000 is 2^106: the correction is computed scaled up by
    // 2^106 so it stays in normal range even for subnormal x.
    let mut corr = dd_fmla(
        x * f64::from_bits(0x4690000000000000),
        ONE_OVER_PI_DD.hi,
        -h * f64::from_bits(0x4690000000000000),
    );
    corr = dd_fmla(
        x * f64::from_bits(0x4690000000000000),
        ONE_OVER_PI_DD.lo,
        corr,
    );
    // now return h + corr * 2^-106

    dyad_fmla(corr, f64::from_bits(0x3950000000000000), h)
}
100
/// AVX/FMA variant of `atanpi_small`: identical logic, but every fused
/// multiply-add goes through hardware `f64::mul_add`.
///
/// # Safety
///
/// The caller must ensure the CPU supports the `avx` and `fma` features
/// (checked at runtime by the dispatcher in `f_atanpi`).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx", enable = "fma")]
unsafe fn atanpi_small_fma(x: f64) -> f64 {
    // Preserve signed zero exactly.
    if x == 0. {
        return x;
    }
    // Hard-coded worst case; see `atanpi_small` for the rounding-nudge trick.
    if x.abs() == f64::from_bits(0x0015cba89af1f855) {
        return if x > 0. {
            f64::mul_add(
                f64::from_bits(0x9a70000000000000),
                f64::from_bits(0x1a70000000000000),
                f64::from_bits(0x0006f00f7cd3a40b),
            )
        } else {
            f64::mul_add(
                f64::from_bits(0x1a70000000000000),
                f64::from_bits(0x1a70000000000000),
                f64::from_bits(0x8006f00f7cd3a40b),
            )
        };
    }
    // generic worst case
    let mut v = x.to_bits();
    if (v & 0xfffffffffffff) == 0x59af9a1194efe
    // +/-0x1.59af9a1194efe*2^e
    {
        // Hard-to-round mantissa: fixed result mantissa at exponent e-2.
        let e = v >> 52;
        if (e & 0x7ff) > 2 {
            v = ((e - 2) << 52) | 0xb824198b94a89;
            return if x > 0. {
                f64::mul_add(
                    f64::from_bits(0x9a70000000000000),
                    f64::from_bits(0x1a70000000000000),
                    f64::from_bits(v),
                )
            } else {
                f64::mul_add(
                    f64::from_bits(0x1a70000000000000),
                    f64::from_bits(0x1a70000000000000),
                    f64::from_bits(v),
                )
            };
        }
    }
    // Leading product with the high word of 1/pi.
    let h = x * ONE_OVER_PI_DD.hi;
    /* Assuming h = x*ONE_OVER_PI_DD.hi - e, the correction term is
    e + x * ONE_OVER_PIL, but we need to scale values to avoid underflow. */
    // 2^106 scaling keeps the correction in normal range for subnormal x.
    let mut corr = f64::mul_add(
        x * f64::from_bits(0x4690000000000000),
        ONE_OVER_PI_DD.hi,
        -h * f64::from_bits(0x4690000000000000),
    );
    corr = f64::mul_add(
        x * f64::from_bits(0x4690000000000000),
        ONE_OVER_PI_DD.lo,
        corr,
    );
    // now return h + corr * 2^-106

    f64::mul_add(corr, f64::from_bits(0x3950000000000000), h)
}
162
163/* Deal with the case where |x| is large:
164for x > 0, atanpi(x) = 1/2 - 1/pi * 1/x + 1/(3pi) * 1/x^3 + O(1/x^5)
165for x < 0, atanpi(x) = -1/2 - 1/pi * 1/x + 1/(3pi) * 1/x^3 + O(1/x^5).
166The next term 1/5*x^5/pi is smaller than 2^-107 * atanpi(x)
167when |x| > 0x1.bep20. */
fn atanpi_asympt(x: f64) -> f64 {
    // Leading term +/-1/2, carrying the sign of x.
    let h = f64::copysign(0.5, x);
    // approximate 1/x as yh + yl
    let rcy = DoubleDouble::from_quick_recip(x);
    let mut m = DoubleDouble::quick_mult(rcy, ONE_OVER_PI_DD);
    // m + l ~ 1/pi * 1/x
    m.hi = -m.hi;
    // Fold in the cubic term 1/(3pi) * (1/x)^3 (hi was negated just above,
    // hence the -m.lo to flip the low word consistently).
    m.lo = dd_fmla(ONE_OVER_3PI * rcy.hi, rcy.hi * rcy.hi, -m.lo);
    // m + l ~ - 1/pi * 1/x + 1/(3pi) * 1/x^3
    let vh = DoubleDouble::from_exact_add(h, m.hi);
    m.hi = vh.hi;
    m = DoubleDouble::from_exact_add(vh.lo, m.lo);
    if m.hi.abs() == f64::from_bits(0x3c80000000000000) {
        // this is 1/2 ulp(atan(x))
        // Exactly on a rounding boundary: nudge off the midpoint in the
        // direction indicated by the sign of hi*lo so the final add rounds
        // the right way.
        m.hi = if m.hi * m.lo > 0. {
            f64::copysign(f64::from_bits(0x3c80000000000001), m.hi)
        } else {
            f64::copysign(f64::from_bits(0x3c7fffffffffffff), m.hi)
        };
    }
    vh.hi + m.hi
}
190
#[inline]
/// `atanpi(x)` for tiny |x| (below ~0x1.c7p-27; see the caller
/// `as_atanpi_refine2`): uses atan(x) ~ x - x^3/3, i.e. computes
/// x * (1/pi) - x^3 * (1/(3*pi)) in double-double arithmetic.
fn atanpi_tiny(x: f64) -> f64 {
    let mut dx = DoubleDouble::quick_mult_f64(ONE_OVER_PI_DD, x);
    // Fold the cubic correction -x^3/(3pi) into the low word.
    dx.lo = dd_fmla(-ONE_OVER_3PI * x, x * x, dx.lo);
    dx.to_f64()
}
197
#[cold]
/// Slow-path refinement used when the fast path's rounding-interval test is
/// inconclusive: recomputes atan(x) to higher precision as a triple of
/// doubles, then multiplies by 1/pi.
///
/// `a` is the fast path's approximation of atan(x); it is only used to pick
/// the argument-reduction table entry.
fn as_atanpi_refine2(x: f64, a: f64) -> f64 {
    // Very large |x| (> 0x1.bep20): asymptotic expansion around +/-1/2.
    if x.abs() > f64::from_bits(0x413be00000000000) {
        return atanpi_asympt(x);
    }
    // Very small |x| (< 0x1.c7p-27): two-term series is sufficient.
    if x.abs() < f64::from_bits(0x3e4c700000000000) {
        return atanpi_tiny(x);
    }
    // Double-double coefficients for the odd atan series: ~(-1/3, 1/5, -1/7).
    const CH: [(u64, u64); 3] = [
        (0xbfd5555555555555, 0xbc75555555555555),
        (0x3fc999999999999a, 0xbc6999999999bcb8),
        (0xbfc2492492492492, 0xbc6249242093c016),
    ];
    // Lower-order single-double coefficients: ~(1/9, -1/11, 1/13, -1/15).
    const CL: [u64; 4] = [
        0x3fbc71c71c71c71c,
        0xbfb745d1745d1265,
        0x3fb3b13b115bcbc4,
        0xbfb1107c41ad3253,
    ];
    // Table index from the approximation `a`: 0x1.45f306dc9c883p6 ~ 256/pi,
    // so i ~ round(atan(x) * 256/pi), extracted from the float's bits.
    let phi = ((a.abs()) * f64::from_bits(0x40545f306dc9c883) + 256.5).to_bits();
    let i: i64 = ((phi >> (52 - 8)) & 0xff) as i64;
    let dh: DoubleDouble = if i == 128 {
        // atan(x) near pi/2: reduce via the reciprocal (atan(x) = pi/2 - atan(1/x)).
        DoubleDouble::from_quick_recip(-x)
    } else {
        // Tangent-difference reduction: dh ~ (x - ta) / (1 + x*ta), with ta
        // read from the reduction table and carrying the sign of x.
        let ta = f64::copysign(f64::from_bits(ATAN_REDUCE[i as usize].0), x);
        let dzta = DoubleDouble::from_exact_mult(x, ta);
        let zmta = x - ta;
        let v = 1. + dzta.hi;
        // ev recovers the rounding error of v = 1 + dzta.hi, plus the low part.
        let d = 1. - v;
        let ev = (d + dzta.hi) - ((d + v) - 1.) + dzta.lo;
        // r + rl ~ 1/(1 + x*ta): one correction step refines the low word.
        let r = 1.0 / v;
        let rl = f_fmla(-ev, r, dd_fmla(r, -v, 1.0)) * r;
        DoubleDouble::quick_mult_f64(DoubleDouble::new(rl, r), zmta)
    };
    // Evaluate the odd polynomial in dh (f ~ atan(dh) - dh).
    let d2 = DoubleDouble::quick_mult(dh, dh);
    let h4 = d2.hi * d2.hi;
    let h3 = DoubleDouble::quick_mult(dh, d2);

    let fl0 = f_fmla(d2.hi, f64::from_bits(CL[1]), f64::from_bits(CL[0]));
    let fl1 = f_fmla(d2.hi, f64::from_bits(CL[3]), f64::from_bits(CL[2]));

    let fl = d2.hi * f_fmla(h4, fl1, fl0);
    let mut f = poly_dd_3(d2, CH, fl);
    f = DoubleDouble::quick_mult(h3, f);
    // Recombine into ah + al + at ~ atan(x) (a triple of doubles).
    let (ah, mut al, mut at);
    if i == 0 {
        // No reduction applied: atan(x) ~ dh + f.
        ah = dh.hi;
        al = f.hi;
        at = f.lo;
    } else {
        let mut df = 0.;
        if i < 128 {
            df = f64::copysign(1.0, x) * f64::from_bits(ATAN_REDUCE[i as usize].1);
        }
        // i * pi/256 as a triple of doubles (0x1.921fb54442d00p-7 ~ pi/256),
        // signed like x.
        let id = f64::copysign(i as f64, x);
        ah = f64::from_bits(0x3f8921fb54442d00) * id;
        al = f64::from_bits(0x3c88469898cc5180) * id;
        at = f64::from_bits(0xb97fc8f8cbb5bf80) * id;
        let v0 = DoubleDouble::add(DoubleDouble::new(at, al), DoubleDouble::new(0., df));
        let v1 = DoubleDouble::add(v0, dh);
        let v2 = DoubleDouble::add(v1, f);
        al = v2.hi;
        at = v2.lo;
    }

    // Renormalize the triple, then convert atan(x) -> atan(x)/pi.
    let v2 = DoubleDouble::from_exact_add(ah, al);
    let v1 = DoubleDouble::from_exact_add(v2.lo, at);

    let z0 = DoubleDouble::quick_mult(DoubleDouble::new(v1.hi, v2.hi), ONE_OVER_PI_DD);
    // atanpi_end
    z0.to_f64()
}
270
#[inline(always)]
/// Generic atan(x)/pi core shared by the portable and AVX/FMA builds:
/// - `fma`: fused multiply-add
/// - `dd_fma`: fma with a mandatory software fallback when hardware FMA is absent
/// - `dd_mul`: double-double multiplication
/// - `atanpi_small`: handler for |x| < 2^-54
fn atanpi_gen_impl<
    Q: Fn(f64, f64, f64) -> f64,
    F: Fn(f64, f64, f64) -> f64,
    DdMul: Fn(DoubleDouble, DoubleDouble) -> DoubleDouble,
    AtanpiSmall: Fn(f64) -> f64,
>(
    x: f64,
    fma: Q,
    dd_fma: F,
    dd_mul: DdMul,
    atanpi_small: AtanpiSmall,
) -> f64 {
    // Polynomial coefficients for atan(h)/h: ~(1, -1/3, 1/5, -1/7).
    const CH: [u64; 4] = [
        0x3ff0000000000000,
        0xbfd555555555552b,
        0x3fc9999999069c20,
        0xbfc248d2c8444ac6,
    ];
    let t = x.to_bits();
    // Bit pattern of |x|.
    let at: u64 = t & 0x7fff_ffff_ffff_ffff;
    // Exponent-derived index into ATAN_CIRCLE; only used on the mid-range path.
    let mut i = (at >> 51).wrapping_sub(2030u64);
    if at < 0x3f7b21c475e6362au64 {
        // |x| < 0.006624
        if at < 0x3c90000000000000u64 {
            // |x| < 2^-54
            return atanpi_small(x);
        }
        if x == 0. {
            return x;
        }
        // Odd-series coefficients for atan(x) - x: ~(-1/3, 1/5, -1/7, 1/9).
        const CH2: [u64; 4] = [
            0xbfd5555555555555,
            0x3fc99999999998c1,
            0xbfc249249176aec0,
            0x3fbc711fd121ae80,
        ];
        let x2 = x * x;
        let x3 = x * x2;
        let x4 = x2 * x2;

        let f0 = fma(x2, f64::from_bits(CH2[3]), f64::from_bits(CH2[2]));
        let f1 = fma(x2, f64::from_bits(CH2[1]), f64::from_bits(CH2[0]));

        let f = x3 * dd_fma(x4, f0, f1);
        // begin_atanpi
        /* Here x+f approximates atan(x), with absolute error bounded by
        0x4.8p-52*f (see atan.c). After multiplying by 1/pi this error
        will be bounded by 0x1.6fp-52*f. For |x| < 0x1.b21c475e6362ap-8
        we have |f| < 2^-16*|x|, thus the error is bounded by
        0x1.6fp-52*2^-16*|x| < 0x1.6fp-68. */
        // multiply x + f by 1/pi
        let hy = dd_mul(DoubleDouble::new(f, x), ONE_OVER_PI_DD);
        /* The rounding error in muldd and the approximation error between
        1/pi and ONE_OVER_PIH + ONE_OVER_PIL are covered by the difference
        between 0x4.8p-52*pi and 0x1.6fp-52, which is > 2^-61.8. */
        // Rounding test: if adding and subtracting the error bound round to
        // the same double, that double is the correctly rounded result.
        let mut ub = hy.hi + dd_fma(f64::from_bits(0x3bb6f00000000000), x, hy.lo);
        let lb = hy.hi + dd_fma(f64::from_bits(0xbbb6f00000000000), x, hy.lo);
        if ub == lb {
            return ub;
        }
        // end_atanpi
        // Inconclusive: hand a fresh atan(x) approximation to the slow path.
        ub = (f + f * f64::from_bits(0x3cd2000000000000)) + x; // atanpi_specific, original value in atan
        return as_atanpi_refine2(x, ub);
    }
    // now |x| >= 0x1.b21c475e6362ap-8
    let h;
    let mut a: DoubleDouble;
    if at > 0x4062ded8e34a9035u64 {
        // |x| > 0x1.2ded8e34a9035p+7, atanpi|x| > 0.49789
        if at >= 0x43445f306dc9c883u64 {
            // |x| >= 0x1.45f306dc9c883p+53, atanpi|x| > 0.5 - 0x1p-55
            if at >= (0x7ffu64 << 52) {
                // case Inf or NaN
                if at == 0x7ffu64 << 52 {
                    // Inf
                    return f64::copysign(0.5, x);
                } // atanpi_specific
                return x + x; // NaN
            }
            // Result is the double just below +/-0.5 (0.5 - 2^-54, signed like x).
            return f64::copysign(0.5, x) - f64::copysign(f64::from_bits(0x3c70000000000000), x);
        }
        // Large |x|: atan(x) = +/-pi/2 + atan(-1/x); a holds +/-pi/2 in
        // double-double (0x1.921fb54442d18p0 + low word).
        h = -1.0 / x;
        a = DoubleDouble::new(
            f64::copysign(f64::from_bits(0x3c91a62633145c07), x),
            f64::copysign(f64::from_bits(0x3ff921fb54442d18), x),
        );
    } else {
        // 0x1.b21c475e6362ap-8 <= |x| <= 0x1.2ded8e34a9035p+7
        /* we need to deal with |x| = 1 separately since in this case
        h=0 below, and the error is measured in terms of multiple of h */
        if at == 0x3ff0000000000000 {
            // |x| = 1
            // atanpi(+/-1) = +/-1/4 exactly.
            return f64::copysign(f64::from_bits(0x3fd0000000000000), x);
        }
        // Map the mantissa through ATAN_CIRCLE (quadratic fit in fixed point)
        // to select the ATAN_REDUCE table slot.
        let u: u64 = t & 0x0007ffffffffffff;
        let ut = u >> (51 - 16);
        let ut2 = (ut * ut) >> 16;
        let vc = ATAN_CIRCLE[i as usize];
        i = (((vc[0] as u64).wrapping_shl(16)) + ut * (vc[1] as u64) - ut2 * (vc[2] as u64))
            >> (16 + 9);
        let va = ATAN_REDUCE[i as usize];
        let ta = f64::copysign(1.0, x) * f64::from_bits(va.0);
        let id = f64::copysign(1.0, x) * i as f64;
        // Tangent-difference reduction: h ~ tan(atan(x) - i*pi/256).
        h = (x - ta) / fma(x, ta, 1.);
        // a ~ i*pi/256 (0x1.921fb54442d00p-7 ~ pi/256) plus the table's
        // low-order correction, all signed like x.
        a = DoubleDouble::new(
            fma(
                f64::copysign(1.0, x),
                f64::from_bits(va.1),
                f64::from_bits(0x3c88469898cc5170) * id,
            ),
            f64::from_bits(0x3f8921fb54442d00) * id,
        );
    }
    let h2 = h * h;
    let h4 = h2 * h2;

    let f0 = fma(h2, f64::from_bits(CH[3]), f64::from_bits(CH[2]));
    let f1 = fma(h2, f64::from_bits(CH[1]), f64::from_bits(CH[0]));

    // f ~ atan(h)/h; fold h*f into the low word of a.
    let f = fma(h4, f0, f1);
    a.lo = dd_fma(h, f, a.lo);
    // begin_atanpi
    /* Now ah + al approximates atan(x) with error bounded by 0x3.fp-52*h
    (see atan.c), thus by 0x1.41p-52*h after multiplication by 1/pi.
    We normalize ah+al so that the rounding error in muldd is negligible
    below. */
    let e0 = h * f64::from_bits(0x3ccf800000000000); // original value in atan.c
    let ub0 = (a.lo + e0) + a.hi; // original value in atan.c
    a = DoubleDouble::from_exact_add(a.hi, a.lo);
    a = dd_mul(a, ONE_OVER_PI_DD);
    /* The rounding error in muldd() and the approximation error between 1/pi
    and ONE_OVER_PIH+ONE_OVER_PIL are absorbed when rounding up 0x3.fp-52*pi
    to 0x1.41p-52. */
    let e = h * f64::from_bits(0x3cb4100000000000); // atanpi_specific
    // end_atanpi
    // Rounding test; fall back to the slow refinement path if inconclusive.
    let ub = (a.lo + e) + a.hi;
    let lb = (a.lo - e) + a.hi;
    if ub == lb {
        return ub;
    }
    as_atanpi_refine2(x, ub0)
}
416
/// AVX/FMA specialization of `atanpi_gen_impl`: plugs in hardware
/// `f64::mul_add` and the FMA-based double-double multiply.
///
/// # Safety
///
/// The caller must ensure the CPU supports the `avx` and `fma` features
/// (checked at runtime by the dispatcher in `f_atanpi`).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx", enable = "fma")]
unsafe fn atanpi_fma_impl(x: f64) -> f64 {
    atanpi_gen_impl(
        x,
        f64::mul_add,
        f64::mul_add,
        DoubleDouble::quick_mult_fma,
        |x| unsafe { atanpi_small_fma(x) },
    )
}
428
429/// Computes atan(x)/pi
430///
431/// Max ULP 0.5
432pub fn f_atanpi(x: f64) -> f64 {
433    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
434    {
435        atanpi_gen_impl(x, f_fmla, dd_fmla, DoubleDouble::quick_mult, atanpi_small)
436    }
437    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
438    {
439        use std::sync::OnceLock;
440        static EXECUTOR: OnceLock<unsafe fn(f64) -> f64> = OnceLock::new();
441        let q = EXECUTOR.get_or_init(|| {
442            if std::arch::is_x86_feature_detected!("avx")
443                && std::arch::is_x86_feature_detected!("fma")
444            {
445                atanpi_fma_impl
446            } else {
447                fn def_atanpi(x: f64) -> f64 {
448                    atanpi_gen_impl(x, f_fmla, dd_fmla, DoubleDouble::quick_mult, atanpi_small)
449                }
450                def_atanpi
451            }
452        });
453        unsafe { q(x) }
454    }
455}
456
#[cfg(test)]
mod tests {

    use super::*;

    #[test]
    fn atanpi_test() {
        // Very large argument: result approaches 1/2 from below.
        assert_eq!(f_atanpi(119895888825197.97), 0.49999999999999734);
        // Near-subnormal tiny input: atanpi(x) ~ x/pi without underflow issues.
        assert_eq!(f_atanpi(1.5610674235815122E-307), 4.969031939254545e-308);
        // Small-|x| polynomial path, both signs (odd symmetry).
        assert_eq!(f_atanpi(0.00004577636903266291), 0.000014571070806516354);
        assert_eq!(f_atanpi(-0.00004577636903266291), -0.000014571070806516354);
        // Mid-range arguments through the table-based reduction.
        assert_eq!(f_atanpi(-0.4577636903266291), -0.13664770469904508);
        assert_eq!(f_atanpi(0.9876636903266291), 0.24802445507819193);
    }
}
471}