pxfm/tangent/
atan.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 7/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::common::{dd_fmla, dyad_fmla, f_fmla};
30use crate::double_double::DoubleDouble;
31use crate::shared_eval::poly_dd_3;
32
33pub(crate) static ATAN_CIRCLE: [[u16; 3]; 31] = [
34    [419, 81, 0],
35    [500, 81, 0],
36    [582, 163, 0],
37    [745, 163, 0],
38    [908, 326, 0],
39    [1234, 326, 0],
40    [1559, 651, 0],
41    [2210, 650, 1],
42    [2860, 1299, 3],
43    [4156, 1293, 4],
44    [5444, 2569, 24],
45    [7989, 2520, 32],
46    [10476, 4917, 168],
47    [15224, 4576, 200],
48    [19601, 8341, 838],
49    [27105, 6648, 731],
50    [33036, 10210, 1998],
51    [41266, 6292, 1117],
52    [46469, 7926, 2048],
53    [52375, 4038, 849],
54    [55587, 4591, 1291],
55    [58906, 2172, 479],
56    [60612, 2390, 688],
57    [62325, 1107, 247],
58    [63192, 1207, 349],
59    [64056, 556, 124],
60    [64491, 605, 175],
61    [64923, 278, 62],
62    [65141, 303, 88],
63    [65358, 139, 31],
64    [65467, 151, 44],
65];
66
67pub(crate) static ATAN_REDUCE: [(u64, u64); 129] = [
68    (0x0000000000000000, 0x0000000000000000),
69    (0x3f89224e047e368e, 0x3c1a3ca6c727c59d),
70    (0x3f992346247a91f0, 0x3bf138b0ef96a186),
71    (0x3fa2dbaae9a05db0, 0x3c436e7f8a3f5e42),
72    (0x3fa927278a3b1162, 0xbbfac986efb92662),
73    (0x3faf7495ea3f3783, 0x3c406ec8011ee816),
74    (0x3fb2e239ccff3831, 0xbc5858437d431332),
75    (0x3fb60b9f7597fdec, 0xbc3cebd13eb7c513),
76    (0x3fb936bb8c5b2da2, 0xbc5840cac0d81db5),
77    (0x3fbc63ce377fc802, 0x3c5400b0fdaa109e),
78    (0x3fbf93183a8db9e9, 0x3c40e04e06c86e72),
79    (0x3fc1626d85a91e70, 0x3c4f7ad829163ca7),
80    (0x3fc2fcac73a60640, 0xbc52680735ce2cd8),
81    (0x3fc4986a74cf4e57, 0xbc690559690b42e4),
82    (0x3fc635c990ce0d36, 0x3c591d29110b41aa),
83    (0x3fc7d4ec54fb5968, 0xbc4ea90e27182780),
84    (0x3fc975f5e0553158, 0xbc2dc82ac14e3e1c),
85    (0x3fcb1909efd8b762, 0xbc573a10fd13daaf),
86    (0x3fccbe4ceb4b4cf2, 0xbc63a7ffbeabda0b),
87    (0x3fce65e3f27c9f2a, 0xbc6db6627a24d523),
88    (0x3fd007fa758626ae, 0xbc645f97dd3099f6),
89    (0x3fd0de53475f3b3c, 0xbc66293f68741816),
90    (0x3fd1b6103d3597e9, 0xbc6ab240d40633e9),
91    (0x3fd28f459ecad74d, 0xbc2de34d14e832e0),
92    (0x3fd36a08355c63dc, 0x3c6af540d9fb4926),
93    (0x3fd4466d542bac92, 0x3c6da60fdbc82ac4),
94    (0x3fd5248ae1701b17, 0xbc792a601170138a),
95    (0x3fd604775fbb27df, 0xbc67f1fca1d5d15b),
96    (0x3fd6e649f7d78649, 0xbc64e223ea716c7b),
97    (0x3fd7ca1a832d0f84, 0x3c7b24c824ac51fc),
98    (0x3fd8b00196b3d022, 0x3c64314cd132ba43),
99    (0x3fd998188e816bf0, 0xbc711f1e0817879a),
100    (0x3fda827999fcef32, 0xbc6c3dea4dbad538),
101    (0x3fdb6f3fc8c61e5b, 0x3c660d1b780ee3eb),
102    (0x3fdc5e87185e67b6, 0xbc4ab5edb7dfa545),
103    (0x3fdd506c82a2c800, 0xbc68e1437048b5bd),
104    (0x3fde450e0d273e7a, 0xbc706951c97b050f),
105    (0x3fdf3c8ad985d9ee, 0xbc414af9522ab518),
106    (0x3fe01b819b5a7cf7, 0xbc7aba0d7d97d1f2),
107    (0x3fe09a4c59bd0d4d, 0x3c4095bc4ebc2c42),
108    (0x3fe11ab7190834ec, 0x3c8798826fa27774),
109    (0x3fe19cd3fe8e405d, 0x3c8008f6258fc98f),
110    (0x3fe220b5ef047825, 0xbc5462af7ceb7de6),
111    (0x3fe2a6709a74f289, 0xbc71184dfd78b472),
112    (0x3fe32e1889047ffd, 0x3c79141876dc40c5),
113    (0x3fe3b7c3289ed6f3, 0x3c8481c20189726c),
114    (0x3fe44386db9ce5db, 0x3c82e851bd025441),
115    (0x3fe4d17b087b265d, 0x3c713ada9b8bc419),
116    (0x3fe561b82ab7f990, 0xbc805b4c3c4cbee8),
117    (0x3fe5f457e4f4812e, 0xbc85619249bd96f1),
118    (0x3fe6897514751db6, 0xbc6b0a0fbcafc671),
119    (0x3fe7212be621be6d, 0xbc819ff2dc66da45),
120    (0x3fe7bb99ed2990cf, 0x3c81320449592d92),
121    (0x3fe858de3b716571, 0xbc81fddcd2f3da8e),
122    (0x3fe8f9197bf85eeb, 0x3c6d44a42e35cc97),
123    (0x3fe99c6e0f634394, 0xbc7585a178b4a18d),
124    (0x3fea43002ae42850, 0x3c6f95a531b3a970),
125    (0x3feaecf5f9ba35a6, 0xbc396c2d43ca3392),
126    (0x3feb9a77c18c1af2, 0xbc6a5bed94b05def),
127    (0x3fec4bb009e77983, 0x3c454509d2bff511),
128    (0x3fed00cbc7384d2e, 0xbc6b4c867cef300c),
129    (0x3fedb9fa89953fcf, 0xbc1ddfac663d6bc6),
130    (0x3fee776eafc91706, 0xbc7a510683ff7cb6),
131    (0x3fef395d9f0e3c92, 0x3c44fdcd8e4e8710),
132    (0x3ff0000000000000, 0x0000000000000000),
133    (0x3ff065c900aaf2d8, 0xbc8deec7fc9042ad),
134    (0x3ff0ce29d0883c99, 0xbc8395ae45e0657d),
135    (0x3ff139447e6a86ee, 0x3c8332cf301a97f3),
136    (0x3ff1a73d55278c4b, 0xbc86cc8c4b78213b),
137    (0x3ff2183b0c4573ff, 0x3c870a90841da57a),
138    (0x3ff28c66fdaf8f09, 0xbc6ba39bad450ee0),
139    (0x3ff303ed61109e20, 0xbc88692946d9f93c),
140    (0x3ff37efd8d87607e, 0x3c63b711bf765b58),
141    (0x3ff3fdca42847507, 0x3c7c21387985b081),
142    (0x3ff48089f8bf42cc, 0xbc87ddb19d3d0efc),
143    (0x3ff507773c537ead, 0xbc7f5e354cf971f3),
144    (0x3ff592d11142fa55, 0xbc700f0ad675330d),
145    (0x3ff622db63c8ecc2, 0xbc82c93f50ab2c0e),
146    (0x3ff6b7df86265200, 0x3c7bec391adc37d5),
147    (0x3ff7522cbdd428a8, 0xbc69686ddc9ffcf5),
148    (0x3ff7f218e25a7461, 0xbc78d16529514246),
149    (0x3ff89801106cc709, 0xbc8092f51e9c2803),
150    (0x3ff9444a7462122a, 0xbc807c06755404c4),
151    (0x3ff9f7632fa9e871, 0x3c802e0d43abc92b),
152    (0x3ffab1c35d8a74ea, 0x3c5d0184e48af6f7),
153    (0x3ffb73ee3c3ef16a, 0x3c773be957380bc2),
154    (0x3ffc3e738086bc0f, 0xbc702b6e26c84462),
155    (0x3ffd11f0dae40609, 0x3c525c4f3ffa6e1f),
156    (0x3ffdef13b73c1406, 0xbc5e302db3c6823f),
157    (0x3ffed69b4153a45d, 0x3c73207830326c0e),
158    (0x3fffc95abad6cf4a, 0xbc66308cee7927bf),
159    (0x4000641e192ceab3, 0xbc70147ebf0df4c5),
160    (0x4000ea21d716fbf7, 0xbc7168533cc41d8b),
161    (0x40017749711a6679, 0xbc652a0b0333e9c5),
162    (0x40020c36c6a7f38e, 0x3c68659eece35395),
163    (0x4002a99f50fd4f4f, 0x3c820fcad18cb36f),
164    (0x4003504f333f9de6, 0xbc752afdbd5a8c74),
165    (0x4004012ce2586a17, 0xbc79747a792907d7),
166    (0x4004bd3d87fe0650, 0x3c790c59393b52c8),
167    (0x400585aa4e1530fa, 0x3c7af6934f13a3a8),
168    (0x40065bc6cc825147, 0xbc48534dcab5ad3e),
169    (0x40074118e4b6a7c8, 0xbc7555aa8bfca9a1),
170    (0x400837626d70fdb8, 0xbc556b3fee9ca72b),
171    (0x400940ad30abc792, 0x3c54b3fdd4fdc06c),
172    (0x400a5f59e90600dd, 0x3c6285d367c55ddc),
173    (0x400b9633283b6d14, 0xbc48712976f17a16),
174    (0x400ce885653127e7, 0xbc3abe8ab65d49fc),
175    (0x400e5a3de972a377, 0x3c5cd9be81ad764b),
176    (0x400ff01305ecd8dc, 0x3c4742c2922656fa),
177    (0x4010d7dc7cff4c9e, 0xbc77c842978bee09),
178    (0x4011d0143e71565f, 0x3c67bc7dea7c3c03),
179    (0x4012e4ff1626b949, 0x3c4aefbe25b404e9),
180    (0x40141bfee2424771, 0xbc34bcfaaa95cb2c),
181    (0x40157be4eaa5e11b, 0x3c50fe741e4ec679),
182    (0x40170d751908c1b1, 0x3c5fe74a5b0ec709),
183    (0x4018dc25c117782b, 0x3c50ca1c19f710ef),
184    (0x401af73f4ca3310f, 0x3c52867b40ba77d6),
185    (0x401d7398d15e70db, 0x3c60fd4e0d4b1547),
186    (0x4020372fb36b87e2, 0x3c5c16c9ecc1621d),
187    (0x402208dbdae055ef, 0x3c56b81a36e75e8c),
188    (0x40244e6c595afdcc, 0xbc57c22045771848),
189    (0x4027398c57f3f1ad, 0x3c5970503be105c0),
190    (0x402b1d03c03d2f7f, 0xbc3f299d010aead2),
191    (0x403046e9fe60a77e, 0x3c5d2b61deff33ec),
192    (0x40345affed201b55, 0x3bf0e84d9567203a),
193    (0x403b267195b1ffae, 0xbbfad44b44b92653),
194    (0x40445e2455e4aaa7, 0xbc3296d577b5e21d),
195    (0x40545eed6854ce99, 0x3c02db53886013ca),
196    (0x0000000000000000, 0x0000000000000000),
197];
198
199#[cold]
200fn atan_refine(x: f64, a: f64) -> f64 {
201    const CH: [(u64, u64); 3] = [
202        (0xbfd5555555555555, 0xbc75555555555555),
203        (0x3fc999999999999a, 0xbc6999999999bcb8),
204        (0xbfc2492492492492, 0xbc6249242093c016),
205    ];
206    const CL: [u64; 4] = [
207        0x3fbc71c71c71c71c,
208        0xbfb745d1745d1265,
209        0x3fb3b13b115bcbc4,
210        0xbfb1107c41ad3253,
211    ];
212    let phi = f_fmla(a.abs(), f64::from_bits(0x40545f306dc9c883), 256.5).to_bits();
213    let i = ((phi >> (52 - 8)) & 0xff) as i64;
214    let h: DoubleDouble = if i == 128 {
215        DoubleDouble::from_quick_recip(-x)
216    } else {
217        let ta = f64::copysign(f64::from_bits(ATAN_REDUCE[i as usize].0), x);
218        let dzt = DoubleDouble::from_exact_mult(x, ta);
219        let zmta = x - ta;
220        let v = 1.0 + dzt.hi;
221        let d = 1.0 - v;
222        let ev = (d + dzt.hi) - ((d + v) - 1.0) + dzt.lo;
223        let r = 1.0 / v;
224        let rl = f_fmla(-ev, r, dd_fmla(r, -v, 1.0)) * r;
225        DoubleDouble::quick_mult_f64(DoubleDouble::new(rl, r), zmta)
226    };
227    let h2 = DoubleDouble::quick_mult(h, h);
228    let h3 = DoubleDouble::quick_mult(h, h2);
229    let h4 = h2.hi * h2.hi;
230
231    let zw0 = f_fmla(h2.hi, f64::from_bits(CL[3]), f64::from_bits(CL[2]));
232    let zw1 = f_fmla(h2.hi, f64::from_bits(CL[1]), f64::from_bits(CL[0]));
233    let zfl = f_fmla(h2.hi, zw1, h4 * zw0);
234
235    let mut f = poly_dd_3(h2, CH, zfl);
236    f = DoubleDouble::quick_mult(h3, f);
237    let (ah, mut az);
238    if i == 0 {
239        ah = h.hi;
240        az = f;
241    } else {
242        let mut df = 0.;
243        if i < 128 {
244            df = f64::copysign(1.0, x) * f64::from_bits(ATAN_REDUCE[i as usize].1);
245        }
246        let id = f64::copysign(i as f64, x);
247        ah = f64::from_bits(0x3f8921fb54442d00) * id;
248        az = DoubleDouble::new(
249            f64::from_bits(0xb97fc8f8cbb5bf80) * id,
250            f64::from_bits(0x3c88469898cc5180) * id,
251        );
252        az = DoubleDouble::add(az, DoubleDouble::new(0., df));
253        az = DoubleDouble::add(az, h);
254        az = DoubleDouble::add(az, f);
255    }
256    let v0 = DoubleDouble::from_exact_add(ah, az.hi);
257    let v1 = DoubleDouble::from_exact_add(v0.lo, az.lo);
258
259    v1.hi + v0.hi
260}
261
262/// fma - fma
263/// dd_fma - DD fma fallback
264/// dyad_fma - mandatory fma fallback
265#[inline(always)]
266fn atan_gen_impl<
267    Q: Fn(f64, f64, f64) -> f64,
268    F: Fn(f64, f64, f64) -> f64,
269    D: Fn(f64, f64, f64) -> f64,
270>(
271    x: f64,
272    fma: Q,
273    dd_fma: F,
274    dyad_fma: D,
275) -> f64 {
276    const CH: [u64; 4] = [
277        0x3ff0000000000000,
278        0xbfd555555555552b,
279        0x3fc9999999069c20,
280        0xbfc248d2c8444ac6,
281    ];
282    let t = x.to_bits();
283    let at = t & 0x7fffffffffffffff; // at encodes |x|
284    let mut i = (at.wrapping_shr(51) as i64).wrapping_sub(2030i64); // -2030 <= i <= 2065
285    if at < 0x3f7b21c475e6362au64 {
286        // |x| < 0x1.b21c475e6362ap-8
287        if at == 0 {
288            return x;
289        } // atan(+/-0) = +/-0
290        const CH2: [u64; 4] = [
291            0xbfd5555555555555,
292            0x3fc99999999998c1,
293            0xbfc249249176aec0,
294            0x3fbc711fd121ae80,
295        ];
296        if at < 0x3e40000000000000u64 {
297            // |x| < 0x1p-27
298            /* underflow when 0 < |x| < 2^-1022 or when |x| = 2^-1022
299            and rounding towards zero. */
300            return dyad_fma(f64::from_bits(0xbc90000000000000), x, x);
301        }
302        let x2 = x * x;
303        let x3 = x * x2;
304        let x4 = x2 * x2;
305
306        let w0 = fma(x2, f64::from_bits(CH2[3]), f64::from_bits(CH2[2]));
307        let w1 = fma(x2, f64::from_bits(CH2[1]), f64::from_bits(CH2[0]));
308
309        let f = x3 * fma(x4, w0, w1);
310        let ub = fma(f, f64::from_bits(0x3cd2000000000000), f) + x;
311        let lb = fma(-f, f64::from_bits(0x3cc4000000000000), f) + x;
312        // Ziv's accuracy test
313        if ub != lb {
314            return atan_refine(x, ub);
315        }
316        return ub;
317    }
318    let h;
319    let ah;
320    let mut al;
321    if at > 0x4062ded8e34a9035u64 {
322        // |x| > 0x4062ded8e34a9035
323        ah = f64::copysign(f64::from_bits(0x3ff921fb54442d18), x);
324        al = f64::copysign(f64::from_bits(0x3c91a62633145c07), x);
325        if at >= 0x434d02967c31cdb5u64 {
326            // |x| >= 1.63312e+16
327            if at > (0x7ffu64 << 52) {
328                return x + x;
329            } // NaN
330            return ah + al;
331        }
332        h = -1.0 / x;
333    } else {
334        // now 0.006624 <= |x| <= 150.964 thus 1<=i<=30
335        let u: u64 = t & 0x0007ffffffffffff;
336        let ut: u64 = u.wrapping_shr(51 - 16);
337        let ut2: u64 = (ut * ut).wrapping_shr(16);
338        let lc = ATAN_CIRCLE[i as usize];
339        i = (((lc[0] as u64)
340            .wrapping_shl(16)
341            .wrapping_add(ut * lc[1] as u64)
342            .wrapping_sub(ut2 * lc[2] as u64))
343            >> (16 + 9)) as i64;
344        let la = ATAN_REDUCE[i as usize];
345        let ta = f64::copysign(1.0, x) * f64::from_bits(la.0);
346        let id = f64::copysign(1.0, x) * i as f64;
347        al = dd_fma(
348            f64::copysign(1.0, x),
349            f64::from_bits(la.1),
350            f64::from_bits(0x3c88469898cc5170) * id,
351        );
352        h = (x - ta) / fma(x, ta, 1.0);
353        ah = f64::from_bits(0x3f8921fb54442d00) * id;
354    }
355    let h2 = h * h;
356    let h4 = h2 * h2;
357
358    let f0 = fma(h2, f64::from_bits(CH[3]), f64::from_bits(CH[2]));
359    let f1 = fma(h2, f64::from_bits(CH[1]), f64::from_bits(CH[0]));
360
361    let f = fma(h4, f0, f1);
362    al = dd_fma(h, f, al);
363    let ub = fma(h, f64::from_bits(0x3ccf800000000000), al) + ah;
364    let lb = fma(-h, f64::from_bits(0x3ccf800000000000), al) + ah;
365    // Ziv's accuracy test
366    if lb != ub {
367        return atan_refine(x, ub);
368    }
369    ub
370}
371
372#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
373#[target_feature(enable = "avx", enable = "fma")]
374unsafe fn atan_fma_impl(x: f64) -> f64 {
375    atan_gen_impl(x, f64::mul_add, f64::mul_add, f64::mul_add)
376}
377
378/// Computes atan in double precision
379///
380/// ULP 0.5
381pub fn f_atan(x: f64) -> f64 {
382    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
383    {
384        atan_gen_impl(x, f_fmla, dd_fmla, dyad_fmla)
385    }
386    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
387    {
388        use std::sync::OnceLock;
389        static EXECUTOR: OnceLock<unsafe fn(f64) -> f64> = OnceLock::new();
390        let q = EXECUTOR.get_or_init(|| {
391            if std::arch::is_x86_feature_detected!("avx")
392                && std::arch::is_x86_feature_detected!("fma")
393            {
394                atan_fma_impl
395            } else {
396                fn def_atan(x: f64) -> f64 {
397                    atan_gen_impl(x, f_fmla, dd_fmla, dyad_fmla)
398                }
399                def_atan
400            }
401        });
402        unsafe { q(x) }
403    }
404}
405
406#[cfg(test)]
407mod tests {
408    use super::*;
409
410    #[test]
411    fn atan_test() {
412        assert_eq!(f_atan(7.7877585082074305E-308), 7.78775850820743e-308);
413        assert_eq!(f_atan(0.0), 0.0);
414        assert_eq!(f_atan(1.0), 0.7853981633974483096156608458198);
415        assert_eq!(f_atan(35.9), 1.542948374599341097473183563168947);
416        assert_eq!(f_atan(-35.9), -1.542948374599341097473183563168947);
417        assert_eq!(f_atan(f64::INFINITY), 1.5707963267948966);
418    }
419}