pxfm/tangent/
atan2.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::common::f_fmla;
30use crate::double_double::DoubleDouble;
31use crate::dyadic_float::{DyadicFloat128, DyadicSign};
32use crate::polyeval::f_polyeval4;
33use crate::rounding::CpuRound;
34
35// atan(i/64) with i = 0..64, generated by Sollya with:
36// > for i from 0 to 64 do {
37//     a = round(atan(i/64), D, RN);
38//     b = round(atan(i/64) - a, D, RN);
39//     print("{", b, ",", a, "},");
40//   };
41pub(crate) static ATAN_I: [(u64, u64); 65] = [
42    (0x0000000000000000, 0x0000000000000000),
43    (0xbc2220c39d4dff50, 0x3f8fff555bbb729b),
44    (0xbc35ec431444912c, 0x3f9ffd55bba97625),
45    (0xbc086ef8f794f105, 0x3fa7fb818430da2a),
46    (0xbc3c934d86d23f1d, 0x3faff55bb72cfdea),
47    (0x3c5ac4ce285df847, 0x3fb3f59f0e7c559d),
48    (0xbc5cfb654c0c3d98, 0x3fb7ee182602f10f),
49    (0x3c5f7b8f29a05987, 0x3fbbe39ebe6f07c3),
50    (0xbc4cd37686760c17, 0x3fbfd5ba9aac2f6e),
51    (0xbc4b485914dacf8c, 0x3fc1e1fafb043727),
52    (0x3c661a3b0ce9281b, 0x3fc3d6eee8c6626c),
53    (0xbc5054ab2c010f3d, 0x3fc5c9811e3ec26a),
54    (0x3c5347b0b4f881ca, 0x3fc7b97b4bce5b02),
55    (0x3c4cf601e7b4348e, 0x3fc9a6a8e96c8626),
56    (0x3c217b10d2e0e5ab, 0x3fcb90d7529260a2),
57    (0x3c6c648d1534597e, 0x3fcd77d5df205736),
58    (0x3c68ab6e3cf7afbd, 0x3fcf5b75f92c80dd),
59    (0x3c762e47390cb865, 0x3fd09dc597d86362),
60    (0x3c630ca4748b1bf9, 0x3fd18bf5a30bf178),
61    (0xbc7077cdd36dfc81, 0x3fd278372057ef46),
62    (0xbc6963a544b672d8, 0x3fd362773707ebcc),
63    (0xbc75d5e43c55b3ba, 0x3fd44aa436c2af0a),
64    (0xbc62566480884082, 0x3fd530ad9951cd4a),
65    (0xbc7a725715711f00, 0x3fd614840309cfe2),
66    (0xbc7c63aae6f6e918, 0x3fd6f61941e4def1),
67    (0x3c769c885c2b249a, 0x3fd7d5604b63b3f7),
68    (0x3c7b6d0ba3748fa8, 0x3fd8b24d394a1b25),
69    (0x3c79e6c988fd0a77, 0x3fd98cd5454d6b18),
70    (0xbc724dec1b50b7ff, 0x3fda64eec3cc23fd),
71    (0x3c7ae187b1ca5040, 0x3fdb3a911da65c6c),
72    (0xbc7cc1ce70934c34, 0x3fdc0db4c94ec9f0),
73    (0xbc7a2cfa4418f1ad, 0x3fdcde53432c1351),
74    (0x3c7a2b7f222f65e2, 0x3fddac670561bb4f),
75    (0x3c70e53dc1bf3435, 0x3fde77eb7f175a34),
76    (0xbc6a3992dc382a23, 0x3fdf40dd0b541418),
77    (0xbc8b32c949c9d593, 0x3fe0039c73c1a40c),
78    (0xbc7d5b495f6349e6, 0x3fe0657e94db30d0),
79    (0x3c5974fa13b5404f, 0x3fe0c6145b5b43da),
80    (0xbc52bdaee1c0ee35, 0x3fe1255d9bfbd2a9),
81    (0x3c8c621cec00c301, 0x3fe1835a88be7c13),
82    (0xbc5928df287a668f, 0x3fe1e00babdefeb4),
83    (0x3c6c421c9f38224e, 0x3fe23b71e2cc9e6a),
84    (0xbc709e73b0c6c087, 0x3fe2958e59308e31),
85    (0x3c8c5d5e9ff0cf8d, 0x3fe2ee628406cbca),
86    (0x3c81021137c71102, 0x3fe345f01cce37bb),
87    (0xbc82304331d8bf46, 0x3fe39c391cd4171a),
88    (0x3c7ecf8b492644f0, 0x3fe3f13fb89e96f4),
89    (0xbc7f76d0163f79c8, 0x3fe445065b795b56),
90    (0x3c72419a87f2a458, 0x3fe4978fa3269ee1),
91    (0x3c84a33dbeb3796c, 0x3fe4e8de5bb6ec04),
92    (0xbc81bb74abda520c, 0x3fe538f57b89061f),
93    (0xbc75e5c9d8c5a950, 0x3fe587d81f732fbb),
94    (0x3c60028e4bc5e7ca, 0x3fe5d58987169b18),
95    (0xbc62b785350ee8c1, 0x3fe6220d115d7b8e),
96    (0xbc76ea6febe8bbba, 0x3fe66d663923e087),
97    (0xbc8a80386188c50e, 0x3fe6b798920b3d99),
98    (0xbc78c34d25aadef6, 0x3fe700a7c5784634),
99    (0x3c47b2a6165884a1, 0x3fe748978fba8e0f),
100    (0x3c8406a089803740, 0x3fe78f6bbd5d315e),
101    (0x3c8560821e2f3aa9, 0x3fe7d528289fa093),
102    (0xbc7bf76229d3b917, 0x3fe819d0b7158a4d),
103    (0x3c66b66e7fc8b8c3, 0x3fe85d69576cc2c5),
104    (0xbc855b9a5e177a1b, 0x3fe89ff5ff57f1f8),
105    (0xbc7ec182ab042f61, 0x3fe8e17aa99cc05e),
106    (0x3c81a62633145c07, 0x3fe921fb54442d18),
107];
108
109// Sage math:
110// print("pub(crate) static ATAN_RATIONAL_128: [DyadicFloat128; 65] = [")
111// for i in range(65):
112//     x = D3(i)/D3(64)
113//     v = atan(x)
114//     print_dyadic(v)
115//
116// print("];")
117pub(crate) static ATAN_RATIONAL_128: [DyadicFloat128; 65] = [
118    DyadicFloat128 {
119        sign: DyadicSign::Pos,
120        exponent: 0,
121        mantissa: 0x0_u128,
122    },
123    DyadicFloat128 {
124        sign: DyadicSign::Pos,
125        exponent: -134,
126        mantissa: 0xfffaaadd_db94d5bb_e78c5640_15f76048_u128,
127    },
128    DyadicFloat128 {
129        sign: DyadicSign::Pos,
130        exponent: -133,
131        mantissa: 0xffeaaddd_4bb12542_779d776d_da8c6214_u128,
132    },
133    DyadicFloat128 {
134        sign: DyadicSign::Pos,
135        exponent: -132,
136        mantissa: 0xbfdc0c21_86d14fcf_220e10d6_1df56ec7_u128,
137    },
138    DyadicFloat128 {
139        sign: DyadicSign::Pos,
140        exponent: -132,
141        mantissa: 0xffaaddb9_67ef4e36_cb2792dc_0e2e0d51_u128,
142    },
143    DyadicFloat128 {
144        sign: DyadicSign::Pos,
145        exponent: -131,
146        mantissa: 0x9facf873_e2aceb58_99c50bbf_08e6cdf6_u128,
147    },
148    DyadicFloat128 {
149        sign: DyadicSign::Pos,
150        exponent: -131,
151        mantissa: 0xbf70c130_17887460_93567e78_4cf83676_u128,
152    },
153    DyadicFloat128 {
154        sign: DyadicSign::Pos,
155        exponent: -131,
156        mantissa: 0xdf1cf5f3_783e1bef_71e5340b_30e5d9ef_u128,
157    },
158    DyadicFloat128 {
159        sign: DyadicSign::Pos,
160        exponent: -131,
161        mantissa: 0xfeadd4d5_617b6e32_c897989f_3e888ef8_u128,
162    },
163    DyadicFloat128 {
164        sign: DyadicSign::Pos,
165        exponent: -130,
166        mantissa: 0x8f0fd7d8_21b93725_bd375929_83a0af9a_u128,
167    },
168    DyadicFloat128 {
169        sign: DyadicSign::Pos,
170        exponent: -130,
171        mantissa: 0x9eb77746_331362c3_47619d25_0360fe85_u128,
172    },
173    DyadicFloat128 {
174        sign: DyadicSign::Pos,
175        exponent: -130,
176        mantissa: 0xae4c08f1_f6134efa_b54d3fef_0c2de994_u128,
177    },
178    DyadicFloat128 {
179        sign: DyadicSign::Pos,
180        exponent: -130,
181        mantissa: 0xbdcbda5e_72d81134_7b0b4f88_1c9c7488_u128,
182    },
183    DyadicFloat128 {
184        sign: DyadicSign::Pos,
185        exponent: -130,
186        mantissa: 0xcd35474b_643130e7_b00f3da1_a46eeb3b_u128,
187    },
188    DyadicFloat128 {
189        sign: DyadicSign::Pos,
190        exponent: -130,
191        mantissa: 0xdc86ba94_93051022_f621a5c1_cb552f03_u128,
192    },
193    DyadicFloat128 {
194        sign: DyadicSign::Pos,
195        exponent: -130,
196        mantissa: 0xebbeaef9_02b9b38c_91a2a68b_2fbd78e8_u128,
197    },
198    DyadicFloat128 {
199        sign: DyadicSign::Pos,
200        exponent: -130,
201        mantissa: 0xfadbafc9_6406eb15_6dc79ef5_f7a217e6_u128,
202    },
203    DyadicFloat128 {
204        sign: DyadicSign::Pos,
205        exponent: -129,
206        mantissa: 0x84ee2cbe_c31b12c5_c8e72197_0cabd3a3_u128,
207    },
208    DyadicFloat128 {
209        sign: DyadicSign::Pos,
210        exponent: -129,
211        mantissa: 0x8c5fad18_5f8bc130_ca4748b1_bf88298d_u128,
212    },
213    DyadicFloat128 {
214        sign: DyadicSign::Pos,
215        exponent: -129,
216        mantissa: 0x93c1b902_bf7a2df1_06459240_6fe1447a_u128,
217    },
218    DyadicFloat128 {
219        sign: DyadicSign::Pos,
220        exponent: -129,
221        mantissa: 0x9b13b9b8_3f5e5e69_c5abb498_d27af328_u128,
222    },
223    DyadicFloat128 {
224        sign: DyadicSign::Pos,
225        exponent: -129,
226        mantissa: 0xa25521b6_15784d45_43787549_88b8d9e3_u128,
227    },
228    DyadicFloat128 {
229        sign: DyadicSign::Pos,
230        exponent: -129,
231        mantissa: 0xa9856cca_8e6a4eda_99b7f77b_f7d9e8c1_u128,
232    },
233    DyadicFloat128 {
234        sign: DyadicSign::Pos,
235        exponent: -129,
236        mantissa: 0xb0a42018_4e7f0cb1_b51d51dc_200a0fc3_u128,
237    },
238    DyadicFloat128 {
239        sign: DyadicSign::Pos,
240        exponent: -129,
241        mantissa: 0xb7b0ca0f_26f78473_8aa32122_dcfe4483_u128,
242    },
243    DyadicFloat128 {
244        sign: DyadicSign::Pos,
245        exponent: -129,
246        mantissa: 0xbeab025b_1d9fbad3_910b8564_93411026_u128,
247    },
248    DyadicFloat128 {
249        sign: DyadicSign::Pos,
250        exponent: -129,
251        mantissa: 0xc59269ca_50d92b6d_a1746e91_f50a28de_u128,
252    },
253    DyadicFloat128 {
254        sign: DyadicSign::Pos,
255        exponent: -129,
256        mantissa: 0xcc66aa2a_6b58c33c_d9311fa1_4ed9b7c4_u128,
257    },
258    DyadicFloat128 {
259        sign: DyadicSign::Pos,
260        exponent: -129,
261        mantissa: 0xd327761e_611fe5b6_427c95e9_001e7136_u128,
262    },
263    DyadicFloat128 {
264        sign: DyadicSign::Pos,
265        exponent: -129,
266        mantissa: 0xd9d488ed_32e3635c_30f6394a_0806345d_u128,
267    },
268    DyadicFloat128 {
269        sign: DyadicSign::Pos,
270        exponent: -129,
271        mantissa: 0xe06da64a_764f7c67_c631ed96_798cb804_u128,
272    },
273    DyadicFloat128 {
274        sign: DyadicSign::Pos,
275        exponent: -129,
276        mantissa: 0xe6f29a19_609a84ba_60b77ce1_ca6dc2c8_u128,
277    },
278    DyadicFloat128 {
279        sign: DyadicSign::Pos,
280        exponent: -129,
281        mantissa: 0xed63382b_0dda7b45_6fe445ec_bc3a8d03_u128,
282    },
283    DyadicFloat128 {
284        sign: DyadicSign::Pos,
285        exponent: -129,
286        mantissa: 0xf3bf5bf8_bad1a21c_a7b837e6_86adf3fa_u128,
287    },
288    DyadicFloat128 {
289        sign: DyadicSign::Pos,
290        exponent: -129,
291        mantissa: 0xfa06e85a_a0a0be5c_66d23c7d_5dc8ecc2_u128,
292    },
293    DyadicFloat128 {
294        sign: DyadicSign::Pos,
295        exponent: -128,
296        mantissa: 0x801ce39e_0d205c99_a6d6c6c5_4d938596_u128,
297    },
298    DyadicFloat128 {
299        sign: DyadicSign::Pos,
300        exponent: -128,
301        mantissa: 0x832bf4a6_d9867e2a_4b6a09cb_61a515c1_u128,
302    },
303    DyadicFloat128 {
304        sign: DyadicSign::Pos,
305        exponent: -128,
306        mantissa: 0x8630a2da_da1ed065_d3e84ed5_013ca37e_u128,
307    },
308    DyadicFloat128 {
309        sign: DyadicSign::Pos,
310        exponent: -128,
311        mantissa: 0x892aecdf_de9547b5_094478fc_472b4afc_u128,
312    },
313    DyadicFloat128 {
314        sign: DyadicSign::Pos,
315        exponent: -128,
316        mantissa: 0x8c1ad445_f3e09b8c_439d8018_60205921_u128,
317    },
318    DyadicFloat128 {
319        sign: DyadicSign::Pos,
320        exponent: -128,
321        mantissa: 0x8f005d5e_f7f59f9b_5c835e16_65c43748_u128,
322    },
323    DyadicFloat128 {
324        sign: DyadicSign::Pos,
325        exponent: -128,
326        mantissa: 0x91db8f16_64f350e2_10e4f9c1_126e0220_u128,
327    },
328    DyadicFloat128 {
329        sign: DyadicSign::Pos,
330        exponent: -128,
331        mantissa: 0x94ac72c9_847186f6_18c4f393_f78a32f9_u128,
332    },
333    DyadicFloat128 {
334        sign: DyadicSign::Pos,
335        exponent: -128,
336        mantissa: 0x97731420_365e538b_abd3fe19_f1aeb6b3_u128,
337    },
338    DyadicFloat128 {
339        sign: DyadicSign::Pos,
340        exponent: -128,
341        mantissa: 0x9a2f80e6_71bdda20_4226f8e2_204ff3bd_u128,
342    },
343    DyadicFloat128 {
344        sign: DyadicSign::Pos,
345        exponent: -128,
346        mantissa: 0x9ce1c8e6_a0b8cdb9_f799c4e8_174cf11c_u128,
347    },
348    DyadicFloat128 {
349        sign: DyadicSign::Pos,
350        exponent: -128,
351        mantissa: 0x9f89fdc4_f4b7a1ec_f8b49264_4f0701e0_u128,
352    },
353    DyadicFloat128 {
354        sign: DyadicSign::Pos,
355        exponent: -128,
356        mantissa: 0xa22832db_cadaae08_92fe9c08_637af0e6_u128,
357    },
358    DyadicFloat128 {
359        sign: DyadicSign::Pos,
360        exponent: -128,
361        mantissa: 0xa4bc7d19_34f70924_19a87f2a_457dac9f_u128,
362    },
363    DyadicFloat128 {
364        sign: DyadicSign::Pos,
365        exponent: -128,
366        mantissa: 0xa746f2dd_b7602294_67b7d66f_2d74e019_u128,
367    },
368    DyadicFloat128 {
369        sign: DyadicSign::Pos,
370        exponent: -128,
371        mantissa: 0xa9c7abdc_4830f5c8_916a84b5_be7933f6_u128,
372    },
373    DyadicFloat128 {
374        sign: DyadicSign::Pos,
375        exponent: -128,
376        mantissa: 0xac3ec0fb_997dd6a1_a36273a5_6afa8ef4_u128,
377    },
378    DyadicFloat128 {
379        sign: DyadicSign::Pos,
380        exponent: -128,
381        mantissa: 0xaeac4c38_b4d8c080_14725e2f_3e52070a_u128,
382    },
383    DyadicFloat128 {
384        sign: DyadicSign::Pos,
385        exponent: -128,
386        mantissa: 0xb110688a_ebdc6f6a_43d65788_b9f6a7b5_u128,
387    },
388    DyadicFloat128 {
389        sign: DyadicSign::Pos,
390        exponent: -128,
391        mantissa: 0xb36b31c9_1f043691_59014174_4462f93a_u128,
392    },
393    DyadicFloat128 {
394        sign: DyadicSign::Pos,
395        exponent: -128,
396        mantissa: 0xb5bcc490_59ecc4af_f8f3cee7_5e3907d5_u128,
397    },
398    DyadicFloat128 {
399        sign: DyadicSign::Pos,
400        exponent: -128,
401        mantissa: 0xb8053e2b_c2319e73_cb2da552_10a4443d_u128,
402    },
403    DyadicFloat128 {
404        sign: DyadicSign::Pos,
405        exponent: -128,
406        mantissa: 0xba44bc7d_d470782f_654c2cb1_0942e386_u128,
407    },
408    DyadicFloat128 {
409        sign: DyadicSign::Pos,
410        exponent: -128,
411        mantissa: 0xbc7b5dea_e98af280_d4113006_e80fb290_u128,
412    },
413    DyadicFloat128 {
414        sign: DyadicSign::Pos,
415        exponent: -128,
416        mantissa: 0xbea94144_fd049aac_1043c5e7_55282e7d_u128,
417    },
418    DyadicFloat128 {
419        sign: DyadicSign::Pos,
420        exponent: -128,
421        mantissa: 0xc0ce85b8_ac526640_89dd62c4_6e92fa25_u128,
422    },
423    DyadicFloat128 {
424        sign: DyadicSign::Pos,
425        exponent: -128,
426        mantissa: 0xc2eb4abb_661628b5_b373fe45_c61bb9fb_u128,
427    },
428    DyadicFloat128 {
429        sign: DyadicSign::Pos,
430        exponent: -128,
431        mantissa: 0xc4ffaffa_bf8fbd54_8cb43d10_bc9e0221_u128,
432    },
433    DyadicFloat128 {
434        sign: DyadicSign::Pos,
435        exponent: -128,
436        mantissa: 0xc70bd54c_e602ee13_e7d54fbd_09f2be38_u128,
437    },
438    DyadicFloat128 {
439        sign: DyadicSign::Pos,
440        exponent: -128,
441        mantissa: 0xc90fdaa2_2168c234_c4c6628b_80dc1cd1_u128,
442    },
443];
444
445// Approximate atan(x) for |x| <= 2^-7.
446// Using degree-9 Taylor polynomial:
447//  P = x - x^3/3 + x^5/5 -x^7/7 + x^9/9;
448// Then the absolute error is bounded by:
449//   |atan(x) - P(x)| < |x|^11/11 < 2^(-7*11) / 11 < 2^-80.
450// And the relative error is bounded by:
451//   |(atan(x) - P(x))/atan(x)| < |x|^10 / 10 < 2^-73.
452// For x = x_hi + x_lo, fully expand the polynomial and drop any terms less than
453//   ulp(x_hi^3 / 3) gives us:
454// P(x) ~ x_hi - x_hi^3/3 + x_hi^5/5 - x_hi^7/7 + x_hi^9/9 +
455//        + x_lo * (1 - x_hi^2 + x_hi^4)
456// Since p.lo is ~ x^3/3, the relative error from rounding is bounded by:
457//   |(atan(x) - P(x))/atan(x)| < ulp(x^2) <= 2^(-14-52) = 2^-66.
458#[inline]
459pub(crate) fn atan_eval(x: DoubleDouble) -> DoubleDouble {
460    let p_hi = x.hi;
461    let x_hi_sq = x.hi * x.hi;
462    // c0 ~ x_hi^2 * 1/5 - 1/3
463    let c0 = f_fmla(
464        x_hi_sq,
465        f64::from_bits(0x3fc999999999999a),
466        f64::from_bits(0xbfd5555555555555),
467    );
468    // c1 ~ x_hi^2 * 1/9 - 1/7
469    let c1 = f_fmla(
470        x_hi_sq,
471        f64::from_bits(0x3fbc71c71c71c71c),
472        f64::from_bits(0xbfc2492492492492),
473    );
474    // x_hi^3
475    let x_hi_3 = x_hi_sq * x.hi;
476    // x_hi^4
477    let x_hi_4 = x_hi_sq * x_hi_sq;
478    // d0 ~ 1/3 - x_hi^2 / 5 + x_hi^4 / 7 - x_hi^6 / 9
479    let d0 = f_fmla(x_hi_4, c1, c0);
480    // x_lo - x_lo * x_hi^2 + x_lo * x_hi^4
481    let d1 = f_fmla(x_hi_4 - x_hi_sq, x.lo, x.lo);
482    // p.lo ~ -x_hi^3/3 + x_hi^5/5 - x_hi^7/7 + x_hi^9/9 +
483    //        + x_lo * (1 - x_hi^2 + x_hi^4)
484    let p_lo = f_fmla(x_hi_3, d0, d1);
485    DoubleDouble::new(p_lo, p_hi)
486}
487
488#[inline(always)]
489#[allow(unused)]
490pub(crate) fn atan_eval_fma(x: DoubleDouble) -> DoubleDouble {
491    let p_hi = x.hi;
492    let x_hi_sq = x.hi * x.hi;
493    // c0 ~ x_hi^2 * 1/5 - 1/3
494    let c0 = f64::mul_add(
495        x_hi_sq,
496        f64::from_bits(0x3fc999999999999a),
497        f64::from_bits(0xbfd5555555555555),
498    );
499    // c1 ~ x_hi^2 * 1/9 - 1/7
500    let c1 = f64::mul_add(
501        x_hi_sq,
502        f64::from_bits(0x3fbc71c71c71c71c),
503        f64::from_bits(0xbfc2492492492492),
504    );
505    // x_hi^3
506    let x_hi_3 = x_hi_sq * x.hi;
507    // x_hi^4
508    let x_hi_4 = x_hi_sq * x_hi_sq;
509    // d0 ~ 1/3 - x_hi^2 / 5 + x_hi^4 / 7 - x_hi^6 / 9
510    let d0 = f64::mul_add(x_hi_4, c1, c0);
511    // x_lo - x_lo * x_hi^2 + x_lo * x_hi^4
512    let d1 = f64::mul_add(x_hi_4 - x_hi_sq, x.lo, x.lo);
513    // p.lo ~ -x_hi^3/3 + x_hi^5/5 - x_hi^7/7 + x_hi^9/9 +
514    //        + x_lo * (1 - x_hi^2 + x_hi^4)
515    let p_lo = f64::mul_add(x_hi_3, d0, d1);
516    DoubleDouble::new(p_lo, p_hi)
517}
518
519#[inline]
520fn atan_eval_hard(x: DyadicFloat128) -> DyadicFloat128 {
521    // let x_hi_sq = x * x;
522    // c0 ~ x_hi^2 * 1/5 - 1/3
523    const C: [DyadicFloat128; 4] = [
524        DyadicFloat128 {
525            sign: DyadicSign::Neg,
526            exponent: -129,
527            mantissa: 0xaaaaaaaa_aaaaaaaa_aaaaaaaa_aaaaaaab_u128,
528        },
529        DyadicFloat128 {
530            sign: DyadicSign::Pos,
531            exponent: -130,
532            mantissa: 0xcccccccc_cccccccc_cccccccc_cccccccd_u128,
533        },
534        DyadicFloat128 {
535            sign: DyadicSign::Neg,
536            exponent: -130,
537            mantissa: 0x92492492_49249249_24924924_92492492_u128,
538        },
539        DyadicFloat128 {
540            sign: DyadicSign::Pos,
541            exponent: -131,
542            mantissa: 0xe38e38e3_8e38e38e_38e38e38_e38e38e4_u128,
543        },
544    ];
545    let dx2 = x * x;
546    // Taylor polynomial
547    // P = x - x^3/3 + x^5/5 -x^7/7 + x^9/9;
548    let p = f_polyeval4(dx2, C[0], C[1], C[2], C[3]);
549    x + dx2 * x * p
550}
551
552#[cold]
553#[inline(never)]
554pub(crate) fn atan2_hard(y: f64, x: f64) -> DyadicFloat128 {
555    /*
556       Sage math:
557       from sage.all import *
558
559       def format_dyadic_hex(value):
560           l = hex(value)[2:]
561           n = 8
562           x = [l[i:i + n] for i in range(0, len(l), n)]
563           return "0x" + "_".join(x) + "_u128"
564
565       def print_dyadic(value):
566           (s, m, e) = RealField(128)(value).sign_mantissa_exponent();
567           print("DyadicFloat128 {")
568           print(f"    sign: DyadicSign::{'Pos' if s >= 0 else 'Neg'},")
569           print(f"    exponent: {e},")
570           print(f"    mantissa: {format_dyadic_hex(m)},")
571           print("},")
572
573       D3 = RealField(157)
574
575       print("Minus Pi")
576       print_dyadic(-D3.pi())
577
578       print("\nPI over 2")
579       print_dyadic(D3.pi() / 2)
580
581       print("\nMinus PI over 2")
582       print_dyadic(-D3.pi() / 2)
583    */
584    static ZERO: DyadicFloat128 = DyadicFloat128 {
585        sign: DyadicSign::Pos,
586        exponent: 0,
587        mantissa: 0_u128,
588    };
589    static MINUS_PI: DyadicFloat128 = DyadicFloat128 {
590        sign: DyadicSign::Neg,
591        exponent: -126,
592        mantissa: 0xc90fdaa2_2168c234_c4c6628b_80dc1cd1_u128,
593    };
594    static PI_OVER_2: DyadicFloat128 = DyadicFloat128 {
595        sign: DyadicSign::Pos,
596        exponent: -127,
597        mantissa: 0xc90fdaa2_2168c234_c4c6628b_80dc1cd1_u128,
598    };
599    static MPI_OVER_2: DyadicFloat128 = DyadicFloat128 {
600        sign: DyadicSign::Neg,
601        exponent: -127,
602        mantissa: 0xc90fdaa2_2168c234_c4c6628b_80dc1cd1_u128,
603    };
604    static CONST_ADJ: [[[DyadicFloat128; 2]; 2]; 2] = [
605        [[ZERO, MPI_OVER_2], [ZERO, MPI_OVER_2]],
606        [[MINUS_PI, PI_OVER_2], [MINUS_PI, PI_OVER_2]],
607    ];
608
609    let x_sign = x.is_sign_negative() as usize;
610    let y_sign = y.is_sign_negative() as usize;
611    let x_bits = x.to_bits() & 0x7fff_ffff_ffff_ffff;
612    let y_bits = y.to_bits() & 0x7fff_ffff_ffff_ffff;
613    let x_abs = x_bits;
614    let y_abs = y_bits;
615    let recip = x_abs < y_abs;
616
617    let min_abs = if recip { x_abs } else { y_abs };
618    let max_abs = if !recip { x_abs } else { y_abs };
619    let min_exp = min_abs.wrapping_shr(52);
620    let max_exp = max_abs.wrapping_shr(52);
621
622    let mut num = f64::from_bits(min_abs);
623    let mut den = f64::from_bits(max_abs);
624
625    if max_exp > 0x7ffu64 - 128u64 || min_exp < 128u64 {
626        let scale_up = min_exp < 128u64;
627        let scale_down = max_exp > 0x7ffu64 - 128u64;
628        // At least one input is denormal, multiply both numerator and denominator
629        // by some large enough power of 2 to normalize denormal inputs.
630        if scale_up {
631            num *= f64::from_bits(0x43f0000000000000);
632            if !scale_down {
633                den *= f64::from_bits(0x43f0000000000000)
634            }
635        } else if scale_down {
636            den *= f64::from_bits(0x3bf0000000000000);
637            if !scale_up {
638                num *= f64::from_bits(0x3bf0000000000000);
639            }
640        }
641    }
642
643    static IS_NEG: [DyadicSign; 2] = [DyadicSign::Pos, DyadicSign::Neg];
644
645    let final_sign = IS_NEG[((x_sign != y_sign) != recip) as usize];
646    let const_term = CONST_ADJ[x_sign][y_sign][recip as usize];
647
648    let num = DyadicFloat128::new_from_f64(num);
649    let den = DyadicFloat128::new_from_f64(den);
650
651    let den_recip0 = den.reciprocal();
652
653    let mut k_product = num * den_recip0;
654    k_product.exponent += 6;
655
656    let mut k = k_product.round_to_nearest_f64();
657    let idx = k as u64;
658    // k = idx / 64
659    k *= f64::from_bits(0x3f90000000000000);
660
661    // Range reduction:
662    // atan(n/d) - atan(k/64) = atan((n/d - k/64) / (1 + (n/d) * (k/64)))
663    //                        = atan((n - d * k/64)) / (d + n * k/64))
664    let k_rational128 = DyadicFloat128::new_from_f64(k);
665    let num_k = num * k_rational128;
666    let den_k = den * k_rational128;
667
668    // num_dd = n - d * k
669    let num_rational128 = num - den_k;
670    // den_dd = d + n * k
671    let den_rational128 = den + num_k;
672
673    // q = (n - d * k) / (d + n * k)
674    let den_rational128_recip = den_rational128.reciprocal();
675    let q = num_rational128 * den_rational128_recip;
676
677    let p = atan_eval_hard(q);
678
679    let vl = ATAN_RATIONAL_128[idx as usize];
680    let mut r = p + vl + const_term;
681    r.sign = r.sign.mult(final_sign);
682
683    r
684}
685
686static IS_NEG: [f64; 2] = [1.0, -1.0];
687const ZERO: DoubleDouble = DoubleDouble::new(0.0, 0.0);
688const MZERO: DoubleDouble = DoubleDouble::new(-0.0, -0.0);
689const PI: DoubleDouble = DoubleDouble::new(
690    f64::from_bits(0x3ca1a62633145c07),
691    f64::from_bits(0x400921fb54442d18),
692);
693const MPI: DoubleDouble = DoubleDouble::new(
694    f64::from_bits(0xbca1a62633145c07),
695    f64::from_bits(0xc00921fb54442d18),
696);
697const PI_OVER_2: DoubleDouble = DoubleDouble::new(
698    f64::from_bits(0x3c91a62633145c07),
699    f64::from_bits(0x3ff921fb54442d18),
700);
701const MPI_OVER_2: DoubleDouble = DoubleDouble::new(
702    f64::from_bits(0xbc91a62633145c07),
703    f64::from_bits(0xbff921fb54442d18),
704);
705const PI_OVER_4: DoubleDouble = DoubleDouble::new(
706    f64::from_bits(0x3c81a62633145c07),
707    f64::from_bits(0x3fe921fb54442d18),
708);
709const THREE_PI_OVER_4: DoubleDouble = DoubleDouble::new(
710    f64::from_bits(0x3c9a79394c9e8a0a),
711    f64::from_bits(0x4002d97c7f3321d2),
712);
713
714// Adjustment for constant term:
715//   CONST_ADJ[x_sign][y_sign][recip]
716static CONST_ADJ: [[[DoubleDouble; 2]; 2]; 2] = [
717    [[ZERO, MPI_OVER_2], [MZERO, MPI_OVER_2]],
718    [[MPI, PI_OVER_2], [MPI, PI_OVER_2]],
719];
720
721// Exceptional cases:
722//   EXCEPT[y_except][x_except][x_is_neg]
723// with x_except & y_except:
724//   0: zero
725//   1: finite, non-zero
726//   2: infinity
727static EXCEPTS: [[[DoubleDouble; 2]; 3]; 3] = [
728    [[ZERO, PI], [ZERO, PI], [ZERO, PI]],
729    [[PI_OVER_2, PI_OVER_2], [ZERO, ZERO], [ZERO, PI]],
730    [
731        [PI_OVER_2, PI_OVER_2],
732        [PI_OVER_2, PI_OVER_2],
733        [PI_OVER_4, THREE_PI_OVER_4],
734    ],
735];
736
737#[inline(always)]
738fn atan2_gen_fma(y: f64, x: f64) -> f64 {
739    let x_sign = x.is_sign_negative() as usize;
740    let y_sign = y.is_sign_negative() as usize;
741    let x_bits = x.to_bits() & 0x7fff_ffff_ffff_ffff;
742    let y_bits = y.to_bits() & 0x7fff_ffff_ffff_ffff;
743    let x_abs = x_bits;
744    let y_abs = y_bits;
745    let recip = x_abs < y_abs;
746    let mut min_abs = if recip { x_abs } else { y_abs };
747    let mut max_abs = if !recip { x_abs } else { y_abs };
748    let mut min_exp = min_abs.wrapping_shr(52);
749    let mut max_exp = max_abs.wrapping_shr(52);
750
751    let mut num = f64::from_bits(min_abs);
752    let mut den = f64::from_bits(max_abs);
753
754    // Check for exceptional cases, whether inputs are 0, inf, nan, or close to
755    // overflow, or close to underflow.
756    if max_exp > 0x7ffu64 - 128u64 || min_exp < 128u64 {
757        if x.is_nan() || y.is_nan() {
758            return f64::NAN;
759        }
760        let x_except = if x == 0.0 {
761            0
762        } else if x.is_infinite() {
763            2
764        } else {
765            1
766        };
767        let y_except = if y == 0.0 {
768            0
769        } else if y.is_infinite() {
770            2
771        } else {
772            1
773        };
774
775        if (x_except != 1) || (y_except != 1) {
776            let r = EXCEPTS[y_except][x_except][x_sign];
777            return f_fmla(IS_NEG[y_sign], r.hi, IS_NEG[y_sign] * r.lo);
778        }
779        let scale_up = min_exp < 128u64;
780        let scale_down = max_exp > 0x7ffu64 - 128u64;
781        // At least one input is denormal, multiply both numerator and denominator
782        // by some large enough power of 2 to normalize denormal inputs.
783        // if scale_up || scale_down {
784        //     return atan2_hard(y, x).fast_as_f64();
785        // }
786        if scale_up {
787            num *= f64::from_bits(0x43f0000000000000);
788            if !scale_down {
789                den *= f64::from_bits(0x43f0000000000000);
790            }
791        } else if scale_down {
792            den *= f64::from_bits(0x3bf0000000000000);
793            if !scale_up {
794                num *= f64::from_bits(0x3bf0000000000000);
795            }
796        }
797
798        min_abs = num.to_bits();
799        max_abs = den.to_bits();
800        min_exp = min_abs.wrapping_shr(52);
801        max_exp = max_abs.wrapping_shr(52);
802    }
803    let final_sign = IS_NEG[((x_sign != y_sign) != recip) as usize];
804    let const_term = CONST_ADJ[x_sign][y_sign][recip as usize];
805    let exp_diff = max_exp - min_exp;
806    // We have the following bound for normalized n and d:
807    //   2^(-exp_diff - 1) < n/d < 2^(-exp_diff + 1).
808    if exp_diff > 54 {
809        return f_fmla(
810            final_sign,
811            const_term.hi,
812            final_sign * (const_term.lo + num / den),
813        );
814    }
815
816    let mut k = (64.0 * num / den).cpu_round();
817    let idx = k as u64;
818    // k = idx / 64
819    k *= f64::from_bits(0x3f90000000000000);
820
821    // Range reduction:
822    // atan(n/d) - atan(k/64) = atan((n/d - k/64) / (1 + (n/d) * (k/64)))
823    //                        = atan((n - d * k/64)) / (d + n * k/64))
824    let num_k = DoubleDouble::from_exact_mult(num, k);
825    let den_k = DoubleDouble::from_exact_mult(den, k);
826
827    // num_dd = n - d * k
828    let num_dd = DoubleDouble::from_exact_add(num - den_k.hi, -den_k.lo);
829    // den_dd = d + n * k
830    let mut den_dd = DoubleDouble::from_exact_add(den, num_k.hi);
831    den_dd.lo += num_k.lo;
832
833    // q = (n - d * k) / (d + n * k)
834    let q = DoubleDouble::div(num_dd, den_dd);
835    // p ~ atan(q)
836    let p = atan_eval(q);
837
838    let vl = ATAN_I[idx as usize];
839    let vlo = DoubleDouble::from_bit_pair(vl);
840    let mut r = DoubleDouble::add(const_term, DoubleDouble::add(vlo, p));
841
842    let err = f_fmla(
843        p.hi,
844        f64::from_bits(0x3bd0000000000000),
845        f64::from_bits(0x3c00000000000000),
846    );
847
848    let ub = r.hi + (r.lo + err);
849    let lb = r.hi + (r.lo - err);
850
851    if ub == lb {
852        r.hi *= final_sign;
853        r.lo *= final_sign;
854
855        return r.to_f64();
856    }
857    atan2_hard(y, x).fast_as_f64()
858}
859
860#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
861#[target_feature(enable = "avx", enable = "fma")]
862unsafe fn atan2_fma_impl(y: f64, x: f64) -> f64 {
863    let x_sign = x.is_sign_negative() as usize;
864    let y_sign = y.is_sign_negative() as usize;
865    let x_bits = x.to_bits() & 0x7fff_ffff_ffff_ffff;
866    let y_bits = y.to_bits() & 0x7fff_ffff_ffff_ffff;
867    let x_abs = x_bits;
868    let y_abs = y_bits;
869    let recip = x_abs < y_abs;
870    let mut min_abs = if recip { x_abs } else { y_abs };
871    let mut max_abs = if !recip { x_abs } else { y_abs };
872    let mut min_exp = min_abs.wrapping_shr(52);
873    let mut max_exp = max_abs.wrapping_shr(52);
874
875    let mut num = f64::from_bits(min_abs);
876    let mut den = f64::from_bits(max_abs);
877
878    // Check for exceptional cases, whether inputs are 0, inf, nan, or close to
879    // overflow, or close to underflow.
880    if max_exp > 0x7ffu64 - 128u64 || min_exp < 128u64 {
881        if x.is_nan() || y.is_nan() {
882            return f64::NAN;
883        }
884        let x_except = if x == 0.0 {
885            0
886        } else if x.is_infinite() {
887            2
888        } else {
889            1
890        };
891        let y_except = if y == 0.0 {
892            0
893        } else if y.is_infinite() {
894            2
895        } else {
896            1
897        };
898
899        if (x_except != 1) || (y_except != 1) {
900            let r = EXCEPTS[y_except][x_except][x_sign];
901            return f64::mul_add(IS_NEG[y_sign], r.hi, IS_NEG[y_sign] * r.lo);
902        }
903        let scale_up = min_exp < 128u64;
904        let scale_down = max_exp > 0x7ffu64 - 128u64;
905        // At least one input is denormal, multiply both numerator and denominator
906        // by some large enough power of 2 to normalize denormal inputs.
907        // if scale_up || scale_down {
908        //     return atan2_hard(y, x).fast_as_f64();
909        // }
910        if scale_up {
911            num *= f64::from_bits(0x43f0000000000000);
912            if !scale_down {
913                den *= f64::from_bits(0x43f0000000000000);
914            }
915        } else if scale_down {
916            den *= f64::from_bits(0x3bf0000000000000);
917            if !scale_up {
918                num *= f64::from_bits(0x3bf0000000000000);
919            }
920        }
921
922        min_abs = num.to_bits();
923        max_abs = den.to_bits();
924        min_exp = min_abs.wrapping_shr(52);
925        max_exp = max_abs.wrapping_shr(52);
926    }
927    let final_sign = IS_NEG[((x_sign != y_sign) != recip) as usize];
928    let const_term = CONST_ADJ[x_sign][y_sign][recip as usize];
929    let exp_diff = max_exp - min_exp;
930    // We have the following bound for normalized n and d:
931    //   2^(-exp_diff - 1) < n/d < 2^(-exp_diff + 1).
932    if exp_diff > 54 {
933        return f64::mul_add(
934            final_sign,
935            const_term.hi,
936            final_sign * (const_term.lo + num / den),
937        );
938    }
939
940    let mut k = (64.0 * num / den).round();
941    let idx = k as u64;
942    // k = idx / 64
943    k *= f64::from_bits(0x3f90000000000000);
944
945    // Range reduction:
946    // atan(n/d) - atan(k/64) = atan((n/d - k/64) / (1 + (n/d) * (k/64)))
947    //                        = atan((n - d * k/64)) / (d + n * k/64))
948    let num_k = DoubleDouble::from_exact_mult_fma(num, k);
949    let den_k = DoubleDouble::from_exact_mult_fma(den, k);
950
951    // num_dd = n - d * k
952    let num_dd = DoubleDouble::from_exact_add(num - den_k.hi, -den_k.lo);
953    // den_dd = d + n * k
954    let mut den_dd = DoubleDouble::from_exact_add(den, num_k.hi);
955    den_dd.lo += num_k.lo;
956
957    // q = (n - d * k) / (d + n * k)
958    let q = DoubleDouble::div_fma(num_dd, den_dd);
959    // p ~ atan(q)
960    let p = atan_eval_fma(q);
961
962    let vl = ATAN_I[idx as usize];
963    let vlo = DoubleDouble::from_bit_pair(vl);
964    let mut r = DoubleDouble::add(const_term, DoubleDouble::add(vlo, p));
965
966    let err = f64::mul_add(
967        p.hi,
968        f64::from_bits(0x3bd0000000000000),
969        f64::from_bits(0x3c00000000000000),
970    );
971
972    let ub = r.hi + (r.lo + err);
973    let lb = r.hi + (r.lo - err);
974
975    if ub == lb {
976        r.hi *= final_sign;
977        r.lo *= final_sign;
978
979        return r.to_f64();
980    }
981    atan2_hard(y, x).fast_as_f64()
982}
983
984/// Computes atan(x)
985///
986/// Max found ULP 0.5
987pub fn f_atan2(y: f64, x: f64) -> f64 {
988    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
989    {
990        atan2_gen_fma(y, x)
991    }
992    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
993    {
994        use std::sync::OnceLock;
995        static EXECUTOR: OnceLock<unsafe fn(f64, f64) -> f64> = OnceLock::new();
996        let q = EXECUTOR.get_or_init(|| {
997            if std::arch::is_x86_feature_detected!("avx")
998                && std::arch::is_x86_feature_detected!("fma")
999            {
1000                atan2_fma_impl
1001            } else {
1002                fn def_atan2pi(y: f64, x: f64) -> f64 {
1003                    atan2_gen_fma(y, x)
1004                }
1005                def_atan2pi
1006            }
1007        });
1008        unsafe { q(y, x) }
1009    }
1010}
1011
1012#[cfg(test)]
1013mod tests {
1014    use super::*;
1015
1016    #[test]
1017    fn test_atan2() {
1018        assert_eq!(
1019            f_atan2(0.05474853958030223, 0.9999995380640253),
1020            0.05469396182367716
1021        );
1022        assert_eq!(f_atan2(-5., 2.), -1.1902899496825317);
1023        assert_eq!(f_atan2(2., -5.), 2.761086276477428);
1024        assert_eq!(
1025            f_atan2(1.220342145227879E-321, 6.9806238698201653E-309),
1026            0.00000000000017481849301519772
1027        );
1028    }
1029}
pxfm/tangent/atan2.rs

pxfm/tangent/
atan2.rs