1use crate::common::f_fmla;
30use crate::double_double::DoubleDouble;
31use crate::shared_eval::poly_dekker_generic;
32
33static ATAN2F_TABLE: [(u64, u64); 32] = [
34 (0x3ff0000000000000, 0xba88c1dac5492248),
35 (0xbfd5555555555555, 0xbc755553bf3a2abe),
36 (0x3fc999999999999a, 0xbc699deed1ec9071),
37 (0xbfc2492492492492, 0xbc5fd99c8d18269a),
38 (0x3fbc71c71c71c717, 0xbc2651eee4c4d9d0),
39 (0xbfb745d1745d1649, 0xbc5632683d6c44a6),
40 (0x3fb3b13b13b11c63, 0x3c5bf69c1f8af41d),
41 (0xbfb11111110e6338, 0x3c23c3e431e8bb68),
42 (0x3fae1e1e1dc45c4a, 0xbc4be2db05c77bbf),
43 (0xbfaaf286b8164b4f, 0x3c2a4673491f0942),
44 (0x3fa86185e9ad4846, 0x3c4e12e32d79fcee),
45 (0xbfa642c6d5161fae, 0x3c43ce76c1ca03f0),
46 (0x3fa47ad6f277e5bf, 0xbc3abd8d85bdb714),
47 (0xbfa2f64a2ee8896d, 0x3c2ef87d4b615323),
48 (0x3fa1a6a2b31741b5, 0x3c1a5d9d973547ee),
49 (0xbfa07fbdad65e0a6, 0xbc265ac07f5d35f4),
50 (0x3f9ee9932a9a5f8b, 0x3c2f8b9623f6f55a),
51 (0xbf9ce8b5b9584dc6, 0x3c2fe5af96e8ea2d),
52 (0x3f9ac9cb288087b7, 0xbc3450cdfceaf5ca),
53 (0xbf984b025351f3e6, 0x3c2579561b0d73da),
54 (0x3f952f5b8ecdd52b, 0x3c3036bd2c6fba47),
55 (0xbf9163a8c44909dc, 0x3c318f735ffb9f16),
56 (0x3f8a400dce3eea6f, 0xbc2c90569c0c1b5c),
57 (0xbf81caa78ae6db3a, 0xbc24c60f8161ea09),
58 (0x3f752672453c0731, 0x3c1834efb598c338),
59 (0xbf65850c5be137cf, 0xbc0445fc150ca7f5),
60 (0x3f523eb98d22e1ca, 0xbbf388fbaf1d7830),
61 (0xbf38f4e974a40741, 0x3bd271198a97da34),
62 (0x3f1a5cf2e9cf76e5, 0xbbb887eb4a63b665),
63 (0xbef420c270719e32, 0x3b8efd595b27888b),
64 (0x3ec3ba2d69b51677, 0xbb64fb06829cdfc7),
65 (0xbe829b7e6f676385, 0xbb2a783b6de718fb),
66];
67
68const M: [f64; 2] = [0., 1.];
69const PI: f64 = f64::from_bits(0x400921fb54442d18);
70const PI2: f64 = f64::from_bits(0x3ff921fb54442d18);
71const PI2L: f64 = f64::from_bits(0x3c91a62633145c07);
72static OFF: [f64; 8] = [0.0, PI2, PI, PI2, -0.0, -PI2, -PI, -PI2];
73static OFFL: [f64; 8] = [0.0, PI2L, 2. * PI2L, PI2L, -0.0, -PI2L, -2. * PI2L, -PI2L];
74static SGN: [f64; 2] = [1., -1.];
75
76#[cold]
77fn atan2f_tiny(y: f32, x: f32) -> f32 {
78 let dy = y as f64;
79 let dx = x as f64;
80 let z = dy / dx;
81 let mut e = f_fmla(-z, x as f64, y as f64);
82 const C: f64 = f64::from_bits(0xbfd5555555555555); let zz = z * z;
85 let cz = C * z;
86 e = e / x as f64 + cz * zz;
87 let mut t = z.to_bits();
88 if (t & 0xfffffffu64) == 0 {
89 if z * e > 0. {
94 t = t.wrapping_add(1);
95 } else {
96 t = t.wrapping_sub(1);
97 }
98 }
99 f64::from_bits(t) as f32
100}
101
102#[allow(clippy::too_many_arguments)]
103#[cold]
104fn atan2f_refine(ay: u32, ax: u32, y: f32, x: f32, zy: f64, zx: f64, gt: usize, i: u32) -> f32 {
105 const PI: f64 = f64::from_bits(0x400921fb54442d18);
106 const PI2: f64 = f64::from_bits(0x3ff921fb54442d18);
107 const PI2L: f64 = f64::from_bits(0x3c91a62633145c07);
108 static OFF: [f64; 8] = [0.0, PI2, PI, PI2, -0.0, -PI2, -PI, -PI2];
109 static OFFL: [f64; 8] = [0.0, PI2L, 2. * PI2L, PI2L, -0.0, -PI2L, -2. * PI2L, -PI2L];
110 static SGN: [f64; 2] = [1., -1.];
111 if ay < ax && ((ax - ay) >> 23 >= 25) {
113 return atan2f_tiny(y, x);
114 }
115 let mut zh;
116 let mut zl;
117 if gt == 0 {
118 zh = zy / zx;
119 zl = f_fmla(zh, -zx, zy) / zx;
120 } else {
121 zh = zx / zy;
122 zl = f_fmla(zh, -zy, zx) / zy;
123 }
124 let z2 = DoubleDouble::quick_mult(DoubleDouble::new(zl, zh), DoubleDouble::new(zl, zh));
125 let mut p = poly_dekker_generic(z2, ATAN2F_TABLE);
126 zh *= SGN[gt];
127 zl *= SGN[gt];
128 p = DoubleDouble::quick_mult(DoubleDouble::new(zl, zh), p);
129 let sh = p.hi + OFF[i as usize];
130 let sl = ((OFF[i as usize] - sh) + p.hi) + p.lo + OFFL[i as usize];
131 let rf = sh as f32;
132 let th = rf as f64;
133 let dh = sh - th;
134 let mut tm: f64 = dh + sl;
135 let mut tth = th.to_bits();
136 if th + th * f64::from_bits(0x3c30000000000000) == th - th * f64::from_bits(0x3c30000000000000)
137 {
138 tth &= 0x7ffu64 << 52;
139 tth = tth.wrapping_sub(24 << 52);
140 if tm.abs() > f64::from_bits(tth) {
141 tm *= 1.25;
142 } else {
143 tm *= 0.75;
144 }
145 }
146 let r = th + tm;
147 r as f32
148}
149
150#[inline(always)]
151fn atan2f_gen_impl<Q: Fn(f64, f64, f64) -> f64>(y: f32, x: f32, fma: Q) -> f32 {
152 let tx = x.to_bits();
153 let ty = y.to_bits();
154 let ux = tx;
155 let uy = ty;
156 let ax = ux & 0x7fffffff;
157 let ay = uy & 0x7fffffff;
158 if ay >= (0xff << 23) || ax >= (0xff << 23) {
159 if ay > (0xff << 23) {
163 return x + y;
164 } if ax > (0xff << 23) {
166 return x + y;
167 } let yinf = ay == (0xff << 23);
169 let xinf = ax == (0xff << 23);
170 if yinf & xinf {
171 return if (ux >> 31) != 0 {
172 (f64::from_bits(0x4002d97c7f3321d2) * SGN[(uy >> 31) as usize]) as f32 } else {
174 (f64::from_bits(0x3fe921fb54442d18) * SGN[(uy >> 31) as usize]) as f32 };
176 }
177 if xinf {
178 return if (ux >> 31) != 0 {
179 (PI * SGN[(uy >> 31) as usize]) as f32
180 } else {
181 (0.0 * SGN[(uy >> 31) as usize]) as f32
182 };
183 }
184 if yinf {
185 return (PI2 * SGN[(uy >> 31) as usize]) as f32;
186 }
187 }
188 if ay == 0 {
189 if ax == 0 {
190 let i = (uy >> 31)
191 .wrapping_mul(4)
192 .wrapping_add((ux >> 31).wrapping_mul(2));
193 return if (ux >> 31) != 0 {
194 (OFF[i as usize] + OFFL[i as usize]) as f32
195 } else {
196 OFF[i as usize] as f32
197 };
198 }
199 if (ux >> 31) == 0 {
200 return (0.0 * SGN[(uy >> 31) as usize]) as f32;
201 }
202 }
203 let gt = (ay > ax) as usize;
204 let i = (uy >> 31)
205 .wrapping_mul(4)
206 .wrapping_add((ux >> 31).wrapping_mul(2))
207 .wrapping_add(gt as u32);
208
209 let zx = x as f64;
210 let zy = y as f64;
211 let mut z = fma(M[gt], zx, M[1usize.wrapping_sub(gt)] * zy)
212 / fma(M[gt], zy, M[1usize.wrapping_sub(gt)] * zx);
213 let mut r;
215
216 let d = ax as i32 - ay as i32;
217 if d < (27 << 23) && d > (-(27 << 23)) {
218 let z2 = z * z;
219 let z4 = z2 * z2;
220 let z8 = z4 * z4;
221 const CN: [u64; 7] = [
226 0x3ff0000000000000,
227 0x40040e0698f94c35,
228 0x400248c5da347f0d,
229 0x3fed873386572976,
230 0x3fc46fa40b20f1d0,
231 0x3f833f5e041eed0f,
232 0x3f1546bbf28667c5,
233 ];
234 const CD: [u64; 7] = [
235 0x3ff0000000000000,
236 0x4006b8b143a3f6da,
237 0x4008421201d18ed5,
238 0x3ff8221d086914eb,
239 0x3fd670657e3a07ba,
240 0x3fa0f4951fd1e72d,
241 0x3f4b3874b8798286,
242 ];
243
244 let mut cn0 = fma(z2, f64::from_bits(CN[1]), f64::from_bits(CN[0]));
245 let cn2 = fma(z2, f64::from_bits(CN[3]), f64::from_bits(CN[2]));
246 let mut cn4 = fma(z2, f64::from_bits(CN[5]), f64::from_bits(CN[4]));
247 let cn6 = f64::from_bits(CN[6]);
248 cn0 = fma(z4, cn2, cn0);
249 cn4 = fma(z4, cn6, cn4);
250 cn0 = fma(z8, cn4, cn0);
251 let mut cd0 = fma(z2, f64::from_bits(CD[1]), f64::from_bits(CD[0]));
252 let cd2 = fma(z2, f64::from_bits(CD[3]), f64::from_bits(CD[2]));
253 let mut cd4 = fma(z2, f64::from_bits(CD[5]), f64::from_bits(CD[4]));
254 let cd6 = f64::from_bits(CD[6]);
255 cd0 = fma(z4, cd2, cd0);
256 cd4 = fma(z4, cd6, cd4);
257 cd0 = fma(z8, cd4, cd0);
258 r = cn0 / cd0;
259 } else {
260 r = 1.;
261 }
262 z *= SGN[gt];
263 r = fma(z, r, OFF[i as usize]);
264 let res = r.to_bits();
265 if ((res.wrapping_add(8)) & 0xfffffff) <= 16 {
266 return atan2f_refine(ay, ax, y, x, zy, zx, gt, i);
267 }
268
269 r as f32
270}
271
272#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
273#[target_feature(enable = "avx", enable = "fma")]
274unsafe fn atan2f_fma_impl(y: f32, x: f32) -> f32 {
275 atan2f_gen_impl(y, x, f64::mul_add)
276}
277
278#[inline]
282pub fn f_atan2f(y: f32, x: f32) -> f32 {
283 #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
284 {
285 atan2f_gen_impl(y, x, f_fmla)
286 }
287 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
288 {
289 use std::sync::OnceLock;
290 static EXECUTOR: OnceLock<unsafe fn(f32, f32) -> f32> = OnceLock::new();
291 let q = EXECUTOR.get_or_init(|| {
292 if std::arch::is_x86_feature_detected!("avx")
293 && std::arch::is_x86_feature_detected!("fma")
294 {
295 atan2f_fma_impl
296 } else {
297 fn def_atan2f(y: f32, x: f32) -> f32 {
298 atan2f_gen_impl(y, x, f_fmla)
299 }
300 def_atan2f
301 }
302 });
303 unsafe { q(y, x) }
304 }
305}
306
307#[cfg(test)]
308mod tests {
309 use super::*;
310
311 #[test]
312 fn f_atan2_test() {
313 assert_eq!(
314 f_atan2f(
315 0.000000000000000000000000000000000000017907922,
316 170141180000000000000000000000000000000.
317 ),
318 0.
319 );
320 assert_eq!(f_atan2f(-3590000000., -15437000.), -1.5750962);
321 assert_eq!(f_atan2f(-5., 2.), -1.19029);
322 }
323}