1use crate::common::{f_fmla, fmla, pow2i, rintk};
30use crate::double_double::DoubleDouble;
31use crate::exponents::auxiliary::fast_ldexp;
32use crate::exponents::expf::{ExpfBackend, GenericExpfBackend};
33
34#[inline]
37pub const fn exp(d: f64) -> f64 {
38 const EXP_POLY_1_D: f64 = 2f64;
39 const EXP_POLY_2_D: f64 = 0.16666666666666674f64;
40 const EXP_POLY_3_D: f64 = -0.0027777777777777614f64;
41 const EXP_POLY_4_D: f64 = 6.613756613755705e-5f64;
42 const EXP_POLY_5_D: f64 = -1.6534391534392554e-6f64;
43 const EXP_POLY_6_D: f64 = 4.17535139757361979584e-8f64;
44
45 const L2_U: f64 = 0.693_147_180_559_662_956_511_601_805_686_950_683_593_75;
46 const L2_L: f64 = 0.282_352_905_630_315_771_225_884_481_750_134_360_255_254_120_68_e-12;
47 const R_LN2: f64 =
48 1.442_695_040_888_963_407_359_924_681_001_892_137_426_645_954_152_985_934_135_449_406_931;
49
50 let qf = rintk(d * R_LN2);
51 let q = qf as i32;
52
53 let mut r = fmla(qf, -L2_U, d);
54 r = fmla(qf, -L2_L, r);
55
56 let f = r * r;
57 let mut u = EXP_POLY_6_D;
59 u = fmla(u, f, EXP_POLY_5_D);
60 u = fmla(u, f, EXP_POLY_4_D);
61 u = fmla(u, f, EXP_POLY_3_D);
62 u = fmla(u, f, EXP_POLY_2_D);
63 u = fmla(u, f, EXP_POLY_1_D);
64 let u = 1f64 + 2f64 * r / (u - r);
65 let i2 = pow2i(q);
66 u * i2
67 }
74
75pub(crate) static EXP_REDUCE_T0: [(u64, u64); 64] = [
76 (0x0000000000000000, 0x3ff0000000000000),
77 (0xbc719083535b085e, 0x3ff02c9a3e778061),
78 (0x3c8d73e2a475b466, 0x3ff059b0d3158574),
79 (0x3c6186be4bb28500, 0x3ff0874518759bc8),
80 (0x3c98a62e4adc610a, 0x3ff0b5586cf9890f),
81 (0x3c403a1727c57b52, 0x3ff0e3ec32d3d1a2),
82 (0xbc96c51039449b3a, 0x3ff11301d0125b51),
83 (0xbc932fbf9af1369e, 0x3ff1429aaea92de0),
84 (0xbc819041b9d78a76, 0x3ff172b83c7d517b),
85 (0x3c8e5b4c7b4968e4, 0x3ff1a35beb6fcb75),
86 (0x3c9e016e00a2643c, 0x3ff1d4873168b9aa),
87 (0x3c8dc775814a8494, 0x3ff2063b88628cd6),
88 (0x3c99b07eb6c70572, 0x3ff2387a6e756238),
89 (0x3c82bd339940e9da, 0x3ff26b4565e27cdd),
90 (0x3c8612e8afad1256, 0x3ff29e9df51fdee1),
91 (0x3c90024754db41d4, 0x3ff2d285a6e4030b),
92 (0x3c86f46ad23182e4, 0x3ff306fe0a31b715),
93 (0x3c932721843659a6, 0x3ff33c08b26416ff),
94 (0xbc963aeabf42eae2, 0x3ff371a7373aa9cb),
95 (0xbc75e436d661f5e2, 0x3ff3a7db34e59ff7),
96 (0x3c8ada0911f09ebc, 0x3ff3dea64c123422),
97 (0xbc5ef3691c309278, 0x3ff4160a21f72e2a),
98 (0x3c489b7a04ef80d0, 0x3ff44e086061892d),
99 (0x3c73c1a3b69062f0, 0x3ff486a2b5c13cd0),
100 (0x3c7d4397afec42e2, 0x3ff4bfdad5362a27),
101 (0xbc94b309d25957e4, 0x3ff4f9b2769d2ca7),
102 (0xbc807abe1db13cac, 0x3ff5342b569d4f82),
103 (0x3c99bb2c011d93ac, 0x3ff56f4736b527da),
104 (0x3c96324c054647ac, 0x3ff5ab07dd485429),
105 (0x3c9ba6f93080e65e, 0x3ff5e76f15ad2148),
106 (0xbc9383c17e40b496, 0x3ff6247eb03a5585),
107 (0xbc9bb60987591c34, 0x3ff6623882552225),
108 (0xbc9bdd3413b26456, 0x3ff6a09e667f3bcd),
109 (0xbc6bbe3a683c88aa, 0x3ff6dfb23c651a2f),
110 (0xbc816e4786887a9a, 0x3ff71f75e8ec5f74),
111 (0xbc90245957316dd4, 0x3ff75feb564267c9),
112 (0xbc841577ee049930, 0x3ff7a11473eb0187),
113 (0x3c705d02ba15797e, 0x3ff7e2f336cf4e62),
114 (0xbc9d4c1dd41532d8, 0x3ff82589994cce13),
115 (0xbc9fc6f89bd4f6ba, 0x3ff868d99b4492ed),
116 (0x3c96e9f156864b26, 0x3ff8ace5422aa0db),
117 (0x3c85cc13a2e3976c, 0x3ff8f1ae99157736),
118 (0xbc675fc781b57ebc, 0x3ff93737b0cdc5e5),
119 (0xbc9d185b7c1b85d0, 0x3ff97d829fde4e50),
120 (0x3c7c7c46b071f2be, 0x3ff9c49182a3f090),
121 (0xbc9359495d1cd532, 0x3ffa0c667b5de565),
122 (0xbc9d2f6edb8d41e2, 0x3ffa5503b23e255d),
123 (0x3c90fac90ef7fd32, 0x3ffa9e6b5579fdbf),
124 (0x3c97a1cd345dcc82, 0x3ffae89f995ad3ad),
125 (0xbc62805e3084d708, 0x3ffb33a2b84f15fb),
126 (0xbc75584f7e54ac3a, 0x3ffb7f76f2fb5e47),
127 (0x3c823dd07a2d9e84, 0x3ffbcc1e904bc1d2),
128 (0x3c811065895048de, 0x3ffc199bdd85529c),
129 (0x3c92884dff483cac, 0x3ffc67f12e57d14b),
130 (0x3c7503cbd1e949dc, 0x3ffcb720dcef9069),
131 (0xbc9cbc3743797a9c, 0x3ffd072d4a07897c),
132 (0x3c82ed02d75b3706, 0x3ffd5818dcfba487),
133 (0x3c9c2300696db532, 0x3ffda9e603db3285),
134 (0xbc91a5cd4f184b5c, 0x3ffdfc97337b9b5f),
135 (0x3c839e8980a9cc90, 0x3ffe502ee78b3ff6),
136 (0xbc9e9c23179c2894, 0x3ffea4afa2a490da),
137 (0x3c9dc7f486a4b6b0, 0x3ffefa1bee615a27),
138 (0x3c99d3e12dd8a18a, 0x3fff50765b6e4540),
139 (0x3c874853f3a5931e, 0x3fffa7c1819e90d8),
140];
141
142pub(crate) static EXP_REDUCE_T1: [(u64, u64); 64] = [
143 (0x0000000000000000, 0x3ff0000000000000),
144 (0x3c9ae8e38c59c72a, 0x3ff000b175effdc7),
145 (0xbc57b5d0d58ea8f4, 0x3ff00162f3904052),
146 (0x3c94115cb6b16a8e, 0x3ff0021478e11ce6),
147 (0xbc8d7c96f201bb2e, 0x3ff002c605e2e8cf),
148 (0x3c984711d4c35ea0, 0x3ff003779a95f959),
149 (0xbc80484245243778, 0x3ff0042936faa3d8),
150 (0xbc94b237da2025fa, 0x3ff004dadb113da0),
151 (0xbc75e00e62d6b30e, 0x3ff0058c86da1c0a),
152 (0x3c9a1d6cedbb9480, 0x3ff0063e3a559473),
153 (0xbc94acf197a00142, 0x3ff006eff583fc3d),
154 (0xbc6eaf2ea42391a6, 0x3ff007a1b865a8ca),
155 (0x3c7da93f90835f76, 0x3ff0085382faef83),
156 (0xbc86a79084ab093c, 0x3ff00905554425d4),
157 (0x3c986364f8fbe8f8, 0x3ff009b72f41a12b),
158 (0xbc882e8e14e3110e, 0x3ff00a6910f3b6fd),
159 (0xbc84f6b2a7609f72, 0x3ff00b1afa5abcbf),
160 (0xbc7e1a258ea8f71a, 0x3ff00bcceb7707ec),
161 (0x3c74362ca5bc26f2, 0x3ff00c7ee448ee02),
162 (0x3c9095a56c919d02, 0x3ff00d30e4d0c483),
163 (0xbc6406ac4e81a646, 0x3ff00de2ed0ee0f5),
164 (0x3c9b5a6902767e08, 0x3ff00e94fd0398e0),
165 (0xbc991b2060859320, 0x3ff00f4714af41d3),
166 (0x3c8427068ab22306, 0x3ff00ff93412315c),
167 (0x3c9c1d0660524e08, 0x3ff010ab5b2cbd11),
168 (0xbc9e7bdfb3204be8, 0x3ff0115d89ff3a8b),
169 (0x3c8843aa8b9cbbc6, 0x3ff0120fc089ff63),
170 (0xbc734104ee7edae8, 0x3ff012c1fecd613b),
171 (0xbc72b6aeb6176892, 0x3ff0137444c9b5b5),
172 (0x3c7a8cd33b8a1bb2, 0x3ff01426927f5278),
173 (0x3c72edc08e5da99a, 0x3ff014d8e7ee8d2f),
174 (0x3c857ba2dc7e0c72, 0x3ff0158b4517bb88),
175 (0x3c9b61299ab8cdb8, 0x3ff0163da9fb3335),
176 (0xbc990565902c5f44, 0x3ff016f0169949ed),
177 (0x3c870fc41c5c2d54, 0x3ff017a28af25567),
178 (0x3c94b9a6e145d76c, 0x3ff018550706ab62),
179 (0xbc7008eff5142bfa, 0x3ff019078ad6a19f),
180 (0xbc977669f033c7de, 0x3ff019ba16628de2),
181 (0xbc909bb78eeead0a, 0x3ff01a6ca9aac5f3),
182 (0x3c9371231477ece6, 0x3ff01b1f44af9f9e),
183 (0x3c75e7626621eb5a, 0x3ff01bd1e77170b4),
184 (0xbc9bc72b100828a4, 0x3ff01c8491f08f08),
185 (0xbc6ce39cbbab8bbe, 0x3ff01d37442d5070),
186 (0x3c816996709da2e2, 0x3ff01de9fe280ac8),
187 (0xbc8c11f5239bf536, 0x3ff01e9cbfe113ef),
188 (0x3c8e1d4eb5edc6b4, 0x3ff01f4f8958c1c6),
189 (0xbc9afb99946ee3f0, 0x3ff020025a8f6a35),
190 (0xbc98f06d8a148a32, 0x3ff020b533856324),
191 (0xbc82bf310fc54eb6, 0x3ff02168143b0281),
192 (0xbc9c95a035eb4176, 0x3ff0221afcb09e3e),
193 (0xbc9491793e46834c, 0x3ff022cdece68c4f),
194 (0xbc73e8d0d9c49090, 0x3ff02380e4dd22ad),
195 (0xbc9314aa16278aa4, 0x3ff02433e494b755),
196 (0x3c848daf888e9650, 0x3ff024e6ec0da046),
197 (0x3c856dc8046821f4, 0x3ff02599fb483385),
198 (0x3c945b42356b9d46, 0x3ff0264d1244c719),
199 (0xbc7082ef51b61d7e, 0x3ff027003103b10e),
200 (0x3c72106ed0920a34, 0x3ff027b357854772),
201 (0xbc9fd4cf26ea5d0e, 0x3ff0286685c9e059),
202 (0xbc909f8775e78084, 0x3ff02919bbd1d1d8),
203 (0x3c564cbba902ca28, 0x3ff029ccf99d720a),
204 (0x3c94383ef231d206, 0x3ff02a803f2d170d),
205 (0x3c94a47a505b3a46, 0x3ff02b338c811703),
206 (0x3c9e471202234680, 0x3ff02be6e199c811),
207];
208
209#[inline]
211pub(crate) fn to_denormal(x: f64) -> f64 {
212 let mut ix = x.to_bits();
213 ix &= 0x000fffffffffffff;
214 f64::from_bits(ix)
215}
216
217#[inline]
218fn exp_poly_dd(z: DoubleDouble) -> DoubleDouble {
219 const C: [(u64, u64); 7] = [
220 (0x0000000000000000, 0x3ff0000000000000),
221 (0x39c712f72ecec2cf, 0x3fe0000000000000),
222 (0x3c65555555554d07, 0x3fc5555555555555),
223 (0x3c455194d28275da, 0x3fa5555555555555),
224 (0x3c012faa0e1c0f7b, 0x3f81111111111111),
225 (0xbbf4ba45ab25d2a3, 0x3f56c16c16da6973),
226 (0xbbc9091d845ecd36, 0x3f2a01a019eb7f31),
227 ];
228 let mut r = DoubleDouble::quick_mul_add(
229 DoubleDouble::from_bit_pair(C[6]),
230 z,
231 DoubleDouble::from_bit_pair(C[5]),
232 );
233 r = DoubleDouble::quick_mul_add(r, z, DoubleDouble::from_bit_pair(C[4]));
234 r = DoubleDouble::quick_mul_add(r, z, DoubleDouble::from_bit_pair(C[3]));
235 r = DoubleDouble::quick_mul_add(r, z, DoubleDouble::from_bit_pair(C[2]));
236 r = DoubleDouble::quick_mul_add(r, z, DoubleDouble::from_bit_pair(C[1]));
237 DoubleDouble::quick_mul_add_f64(r, z, f64::from_bits(0x3ff0000000000000))
238}
239
240#[cold]
241fn as_exp_accurate(x: f64, t: f64, tz: DoubleDouble, ie: i64) -> f64 {
242 let mut ix = x.to_bits();
243 if ((ix >> 52) & 0x7ff) < 0x3c9 {
244 return 1. + x;
245 }
246
247 const L2: DoubleDouble = DoubleDouble::new(
250 f64::from_bits(0x3d0718432a1b0e26),
251 f64::from_bits(0x3f262e42ff000000),
252 );
253 const L2LL: f64 = f64::from_bits(0x3999ff0342542fc3);
254 let dx = f_fmla(-L2.hi, t, x);
255 let dx_dd = DoubleDouble::quick_mult_f64(DoubleDouble::new(L2LL, L2.lo), t);
256 let dz = DoubleDouble::full_add_f64(dx_dd, dx);
257 let mut f = exp_poly_dd(dz);
258 f = DoubleDouble::quick_mult(dz, f);
259 if ix > 0xc086232bdd7abcd2u64 {
260 ix = 1i64.wrapping_sub(ie).wrapping_shl(52) as u64;
262 f = DoubleDouble::quick_mult(f, tz);
263 f = DoubleDouble::add(tz, f);
264
265 let new_f = DoubleDouble::from_exact_add(f64::from_bits(ix), f.hi);
266 f.lo += new_f.lo;
267 f.hi = to_denormal(f.hi + f.lo);
268 } else {
269 if tz.hi == 1.0 {
270 let fhe = DoubleDouble::from_exact_add(tz.hi, f.hi);
271 let fhl = DoubleDouble::from_exact_add(fhe.lo, f.lo);
272 f.hi = fhe.hi;
273 f.lo = fhl.hi;
274 ix = f.lo.to_bits();
275 if (ix & 0x000fffffffffffff) == 0 {
276 let v = fhl.lo.to_bits();
277 let d: i64 = (((((ix as i64) >> 63) ^ ((v as i64) >> 63)) as u64).wrapping_shl(1)
278 as i64)
279 .wrapping_add(1);
280 ix = ix.wrapping_add(d as u64);
281 f.lo = f64::from_bits(ix);
282 }
283 } else {
284 f = DoubleDouble::quick_mult(f, tz);
285 f = DoubleDouble::add(tz, f);
286 }
287 f = DoubleDouble::from_exact_add(f.hi, f.lo);
288 f.hi = fast_ldexp(f.hi, ie as i32);
289 }
290 f.hi
291}
292
293#[inline(always)]
294fn exp_gen<B: ExpfBackend>(x: f64, backend: B) -> f64 {
295 let mut ix = x.to_bits();
296 let aix = ix & 0x7fffffffffffffff;
297 if aix <= 0x3c90000000000000u64 {
299 return 1.0 + x;
301 }
302 if aix >= 0x40862e42fefa39f0u64 {
303 if aix > 0x7ff0000000000000u64 {
305 return x + x;
306 } if aix == 0x7ff0000000000000u64 {
308 return if (ix >> 63) != 0 {
310 0.0 } else {
312 x };
314 }
315 if (ix >> 63) == 0 {
316 let z = std::hint::black_box(f64::from_bits(0x7fe0000000000000));
318 return z * z;
319 }
320 if aix >= 0x40874910d52d3052u64 {
321 return f64::from_bits(0x18000000000000) * f64::from_bits(0x3c80000000000000);
323 }
324 }
325 const S: f64 = f64::from_bits(0x40b71547652b82fe);
326 let t = backend.round(x * S);
327 let jt: i64 = unsafe {
328 t.to_int_unchecked::<i64>() };
330 let i0: i64 = (jt >> 6) & 0x3f;
331 let i1 = jt & 0x3f;
332 let ie: i64 = jt >> 12;
333 let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i0 as usize]);
334 let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
335 let tz = backend.quick_mult(t0, t1);
336
337 const L2: DoubleDouble = DoubleDouble::new(
338 f64::from_bits(0x3d0718432a1b0e26),
339 f64::from_bits(0x3f262e42ff000000),
340 );
341
342 let dx = backend.fma(L2.lo, t, backend.fma(-L2.hi, t, x));
345 let dx2 = dx * dx;
346 const CH: [u64; 4] = [
347 0x3ff0000000000000,
348 0x3fe0000000000000,
349 0x3fc55555557e54ff,
350 0x3fa55555553a12f4,
351 ];
352
353 let pw0 = backend.fma(dx, f64::from_bits(CH[3]), f64::from_bits(CH[2]));
354 let pw1 = backend.fma(dx, f64::from_bits(CH[1]), f64::from_bits(CH[0]));
355
356 let p = backend.fma(dx2, pw0, pw1);
357 let mut f = DoubleDouble::new(backend.fma(tz.hi * dx, p, tz.lo), tz.hi);
358 const EPS: f64 = f64::from_bits(0x3c0833beace2b6fe);
359 if ix > 0xc086232bdd7abcd2u64 {
360 ix = 1u64.wrapping_sub(ie as u64).wrapping_shl(52);
362 let sums = DoubleDouble::from_exact_add(f64::from_bits(ix), f.hi);
363 f.hi = sums.hi;
364 f.lo += sums.lo;
365 let ub = f.hi + (f.lo + EPS);
366 let lb = f.hi + (f.lo - EPS);
367 if ub != lb {
368 return as_exp_accurate(x, t, tz, ie);
369 }
370 f.hi = to_denormal(lb);
371 } else {
372 let ub = f.hi + (f.lo + EPS);
373 let lb = f.hi + (f.lo - EPS);
374 if ub != lb {
375 return as_exp_accurate(x, t, tz, ie);
376 }
377 f.hi = fast_ldexp(lb, ie as i32);
378 }
379 f.hi
380}
381
382#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
383#[target_feature(enable = "avx", enable = "fma")]
384unsafe fn exp_fma_impl(x: f64) -> f64 {
385 use crate::exponents::expf::FmaBackend;
386 exp_gen(x, FmaBackend {})
387}
388
389pub fn f_exp(x: f64) -> f64 {
393 #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
394 {
395 exp_gen(x, GenericExpfBackend {})
396 }
397 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
398 {
399 use std::sync::OnceLock;
400 static EXECUTOR: OnceLock<unsafe fn(f64) -> f64> = OnceLock::new();
401 let q = EXECUTOR.get_or_init(|| {
402 if std::arch::is_x86_feature_detected!("avx")
403 && std::arch::is_x86_feature_detected!("fma")
404 {
405 exp_fma_impl
406 } else {
407 fn def_exp(x: f64) -> f64 {
408 exp_gen(x, GenericExpfBackend {})
409 }
410 def_exp
411 }
412 });
413 unsafe { q(x) }
414 }
415}
416
417#[cfg(test)]
418mod tests {
419 use super::*;
420
421 #[test]
422 fn exp_test() {
423 assert!(
424 (exp(0f64) - 1f64).abs() < 1e-8,
425 "Invalid result {}",
426 exp(0f64)
427 );
428 assert!(
429 (exp(5f64) - 148.4131591025766034211155800405522796f64).abs() < 1e-8,
430 "Invalid result {}",
431 exp(5f64)
432 );
433 }
434
435 #[test]
436 fn f_exp_test() {
437 assert_eq!(f_exp(0.000000014901161193847656), 1.0000000149011614);
438 assert_eq!(f_exp(0.), 1.);
439 assert_eq!(f_exp(5f64), 148.4131591025766034211155800405522796f64);
440 assert_eq!(f_exp(f64::INFINITY), f64::INFINITY);
441 assert_eq!(f_exp(f64::NEG_INFINITY), 0.);
442 assert!(f_exp(f64::NAN).is_nan());
443 }
444}