pxfm/exponents/
expf.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 4/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29#![allow(clippy::too_many_arguments)]
30use crate::common::{dd_fmla, dyad_fmla, f_fmla, f_fmlaf, fmlaf, pow2if, rintfk};
31use crate::double_double::DoubleDouble;
32use crate::rounding::{CpuFloor, CpuRound};
33
34const L2U_F: f32 = 0.693_145_751_953_125;
35const L2L_F: f32 = 1.428_606_765_330_187_045_e-6;
36const R_LN2_F: f32 = std::f32::consts::LOG2_E;
37
38/// Exp for given value for const context.
39/// This is simplified version just to make a good approximation on const context.
40#[inline]
41pub const fn expf(d: f32) -> f32 {
42    const EXP_POLY_1_S: f32 = 2f32;
43    const EXP_POLY_2_S: f32 = 0.16666707f32;
44    const EXP_POLY_3_S: f32 = -0.002775669f32;
45    let qf = rintfk(d * R_LN2_F);
46    let q = qf as i32;
47    let r = fmlaf(qf, -L2U_F, d);
48    let r = fmlaf(qf, -L2L_F, r);
49
50    let f = r * r;
51    // Poly for u = r*(exp(r)+1)/(exp(r)-1)
52    let mut u = EXP_POLY_3_S;
53    u = fmlaf(u, f, EXP_POLY_2_S);
54    u = fmlaf(u, f, EXP_POLY_1_S);
55    let u = 1f32 + 2f32 * r / (u - r);
56    let i2 = pow2if(q);
57    u * i2
58    // if d < -87f32 {
59    //     r = 0f32;
60    // }
61    // if d > 88f32 {
62    //     r = f32::INFINITY;
63    // }
64}
65
66// Lookup table for exp(m) with m = -104, ..., 102.
67//   -104 = floor(log(single precision's min denormal))
68//    103 = ceil(log(single precision's max bessel K(n) that will be used))
69// Table is generated with SageMath as follows:
70// for r in range(-104, 103):
71//     print(double_to_hex(RealField(180)(r).exp()) + ",")
72pub(crate) static EXP_M1: [u64; 207] = [
73    0x368f1e6b68529e33,
74    0x36a525be4e4e601d,
75    0x36bcbe0a45f75eb1,
76    0x36d3884e838aea68,
77    0x36ea8c1f14e2af5d,
78    0x37020a717e64a9bd,
79    0x3718851d84118908,
80    0x3730a9bdfb02d240,
81    0x3746a5bea046b42e,
82    0x375ec7f3b269efa8,
83    0x3774eafb87eab0f2,
84    0x378c6e2d05bbc000,
85    0x37a35208867c2683,
86    0x37ba425b317eeacd,
87    0x37d1d8508fa8246a,
88    0x37e840fbc08fdc8a,
89    0x38007b7112bc1ffe,
90    0x381666d0dad2961d,
91    0x382e726c3f64d0fe,
92    0x3844b0dc07cabf98,
93    0x385c1f2daf3b6a46,
94    0x38731c5957a47de2,
95    0x3889f96445648b9f,
96    0x38a1a6baeadb4fd1,
97    0x38b7fd974d372e45,
98    0x38d04da4d1452919,
99    0x38e62891f06b3450,
100    0x38fe1dd273aa8a4a,
101    0x3914775e0840bfdd,
102    0x392bd109d9d94bda,
103    0x3942e73f53fba844,
104    0x3959b138170d6bfe,
105    0x397175af0cf60ec5,
106    0x3987baee1bffa80b,
107    0x39a02057d1245ceb,
108    0x39b5eafffb34ba31,
109    0x39cdca23bae16424,
110    0x39e43e7fc88b8056,
111    0x39fb83bf23a9a9eb,
112    0x3a12b2b8dd05b318,
113    0x3a2969d47321e4cc,
114    0x3a41452b7723aed2,
115    0x3a5778fe2497184c,
116    0x3a6fe7116182e9cc,
117    0x3a85ae191a99585a,
118    0x3a9d775d87da854d,
119    0x3ab4063f8cc8bb98,
120    0x3acb374b315f87c1,
121    0x3ae27ec458c65e3c,
122    0x3af923372c67a074,
123    0x3b11152eaeb73c08,
124    0x3b2737c5645114b5,
125    0x3b3f8e6c24b5592e,
126    0x3b5571db733a9d61,
127    0x3b6d257d547e083f,
128    0x3b83ce9b9de78f85,
129    0x3b9aebabae3a41b5,
130    0x3bb24b6031b49bda,
131    0x3bc8dd5e1bb09d7e,
132    0x3be0e5b73d1ff53d,
133    0x3bf6f741de1748ec,
134    0x3c0f36bd37f42f3e,
135    0x3c2536452ee2f75c,
136    0x3c3cd480a1b74820,
137    0x3c539792499b1a24,
138    0x3c6aa0de4bf35b38,
139    0x3c82188ad6ae3303,
140    0x3c9898471fca6055,
141    0x3cb0b6c3afdde064,
142    0x3cc6b7719a59f0e0,
143    0x3cdee001eed62aa0,
144    0x3cf4fb547c775da8,
145    0x3d0c8464f7616468,
146    0x3d236121e24d3bba,
147    0x3d3a56e0c2ac7f75,
148    0x3d51e642baeb84a0,
149    0x3d6853f01d6d53ba,
150    0x3d80885298767e9a,
151    0x3d967852a7007e42,
152    0x3dae8a37a45fc32e,
153    0x3dc4c1078fe9228a,
154    0x3ddc3527e433fab1,
155    0x3df32b48bf117da2,
156    0x3e0a0db0d0ddb3ec,
157    0x3e21b48655f37267,
158    0x3e381056ff2c5772,
159    0x3e505a628c699fa1,
160    0x3e6639e3175a689d,
161    0x3e7e355bbaee85cb,
162    0x3e94875ca227ec38,
163    0x3eabe6c6fdb01612,
164    0x3ec2f6053b981d98,
165    0x3ed9c54c3b43bc8b,
166    0x3ef18354238f6764,
167    0x3f07cd79b5647c9b,
168    0x3f202cf22526545a,
169    0x3f35fc21041027ad,
170    0x3f4de16b9c24a98f,
171    0x3f644e51f113d4d6,
172    0x3f7b993fe00d5376,
173    0x3f92c155b8213cf4,
174    0x3fa97db0ccceb0af,
175    0x3fc152aaa3bf81cc,
176    0x3fd78b56362cef38,
177    0x3ff0000000000000,
178    0x4005bf0a8b145769,
179    0x401d8e64b8d4ddae,
180    0x403415e5bf6fb106,
181    0x404b4c902e273a58,
182    0x40628d389970338f,
183    0x407936dc5690c08f,
184    0x409122885aaeddaa,
185    0x40a749ea7d470c6e,
186    0x40bfa7157c470f82,
187    0x40d5829dcf950560,
188    0x40ed3c4488ee4f7f,
189    0x4103de1654d37c9a,
190    0x411b00b5916ac955,
191    0x413259ac48bf05d7,
192    0x4148f0ccafad2a87,
193    0x4160f2ebd0a80020,
194    0x417709348c0ea4f9,
195    0x418f4f22091940bd,
196    0x41a546d8f9ed26e1,
197    0x41bceb088b68e804,
198    0x41d3a6e1fd9eecfd,
199    0x41eab5adb9c43600,
200    0x420226af33b1fdc1,
201    0x4218ab7fb5475fb7,
202    0x4230c3d3920962c9,
203    0x4246c932696a6b5d,
204    0x425ef822f7f6731d,
205    0x42750bba3796379a,
206    0x428c9aae4631c056,
207    0x42a370470aec28ed,
208    0x42ba6b765d8cdf6d,
209    0x42d1f43fcc4b662c,
210    0x42e866f34a725782,
211    0x4300953e2f3a1ef7,
212    0x431689e221bc8d5b,
213    0x432ea215a1d20d76,
214    0x4344d13fbb1a001a,
215    0x435c4b334617cc67,
216    0x43733a43d282a519,
217    0x438a220d397972eb,
218    0x43a1c25c88df6862,
219    0x43b8232558201159,
220    0x43d0672a3c9eb871,
221    0x43e64b41c6d37832,
222    0x43fe4cf766fe49be,
223    0x44149767bc0483e3,
224    0x442bfc951eb8bb76,
225    0x444304d6aeca254b,
226    0x4459d97010884251,
227    0x44719103e4080b45,
228    0x4487e013cd114461,
229    0x44a03996528e074c,
230    0x44b60d4f6fdac731,
231    0x44cdf8c5af17ba3b,
232    0x44e45e3076d61699,
233    0x44fbaed16a6e0da7,
234    0x4512cffdfebde1a1,
235    0x4529919cabefcb69,
236    0x454160345c9953e3,
237    0x45579dbc9dc53c66,
238    0x45700c810d464097,
239    0x4585d009394c5c27,
240    0x459da57de8f107a8,
241    0x45b425982cf597cd,
242    0x45cb61e5ca3a5e31,
243    0x45e29bb825dfcf87,
244    0x45f94a90db0d6fe2,
245    0x46112fec759586fd,
246    0x46275c1dc469e3af,
247    0x463fbfd219c43b04,
248    0x4655936d44e1a146,
249    0x466d531d8a7ee79c,
250    0x4683ed9d24a2d51b,
251    0x469b15cfe5b6e17b,
252    0x46b268038c2c0e00,
253    0x46c9044a73545d48,
254    0x46e1002ab6218b38,
255    0x46f71b3540cbf921,
256    0x470f6799ea9c414a,
257    0x47255779b984f3eb,
258    0x473d01a210c44aa4,
259    0x4753b63da8e91210,
260    0x476aca8d6b0116b8,
261    0x478234de9e0c74e9,
262    0x4798bec7503ca477,
263    0x47b0d0eda9796b90,
264    0x47c6db0118477245,
265    0x47df1056dc7bf22d,
266    0x47f51c2cc3433801,
267    0x480cb108ffbec164,
268    0x48237f780991b584,
269    0x483a801c0ea8ac4d,
270    0x48520247cc4c46c1,
271    0x48687a0553328015,
272    0x4880a233dee4f9bb,
273    0x48969b7f55b808ba,
274    0x48aeba064644060a,
275    0x48c4e184933d9364,
276    0x48dc614fe2531841,
277    0x48f3494a9b171bf5,
278    0x490a36798b9d969b,
279    0x4921d03d8c0c04af,
280];
281
282// Lookup table for exp(m * 2^(-7)) with m = 0, ..., 127.
283// Table is generated with Sollya as follows:
284// > display = hexadecimal;
285// > for i from 0 to 127 do { D(exp(i / 128)); };
286pub(crate) static EXP_M2: [u64; 128] = [
287    0x3ff0000000000000,
288    0x3ff0202015600446,
289    0x3ff04080ab55de39,
290    0x3ff06122436410dd,
291    0x3ff08205601127ed,
292    0x3ff0a32a84e9c1f6,
293    0x3ff0c49236829e8c,
294    0x3ff0e63cfa7ab09d,
295    0x3ff1082b577d34ed,
296    0x3ff12a5dd543ccc5,
297    0x3ff14cd4fc989cd6,
298    0x3ff16f9157587069,
299    0x3ff192937074e0cd,
300    0x3ff1b5dbd3f68122,
301    0x3ff1d96b0eff0e79,
302    0x3ff1fd41afcba45e,
303    0x3ff2216045b6f5cd,
304    0x3ff245c7613b8a9b,
305    0x3ff26a7793f60164,
306    0x3ff28f7170a755fd,
307    0x3ff2b4b58b372c79,
308    0x3ff2da4478b620c7,
309    0x3ff3001ecf601af7,
310    0x3ff32645269ea829,
311    0x3ff34cb8170b5835,
312    0x3ff373783a722012,
313    0x3ff39a862bd3c106,
314    0x3ff3c1e2876834aa,
315    0x3ff3e98deaa11dcc,
316    0x3ff41188f42c3e32,
317    0x3ff439d443f5f159,
318    0x3ff462707b2bac21,
319    0x3ff48b5e3c3e8186,
320    0x3ff4b49e2ae5ac67,
321    0x3ff4de30ec211e60,
322    0x3ff50817263c13cd,
323    0x3ff5325180cfacf7,
324    0x3ff55ce0a4c58c7c,
325    0x3ff587c53c5a7af0,
326    0x3ff5b2fff3210fd9,
327    0x3ff5de9176045ff5,
328    0x3ff60a7a734ab0e8,
329    0x3ff636bb9a983258,
330    0x3ff663559cf1bc7c,
331    0x3ff690492cbf9433,
332    0x3ff6bd96fdd034a2,
333    0x3ff6eb3fc55b1e76,
334    0x3ff719443a03acb9,
335    0x3ff747a513dbef6a,
336    0x3ff776630c678bc1,
337    0x3ff7a57ede9ea23e,
338    0x3ff7d4f946f0ba8d,
339    0x3ff804d30347b546,
340    0x3ff8350cd30ac390,
341    0x3ff865a7772164c5,
342    0x3ff896a3b1f66a0e,
343    0x3ff8c802477b0010,
344    0x3ff8f9c3fd29beaf,
345    0x3ff92be99a09bf00,
346    0x3ff95e73e6b1b75e,
347    0x3ff99163ad4b1dcc,
348    0x3ff9c4b9b995509b,
349    0x3ff9f876d8e8c566,
350    0x3ffa2c9bda3a3e78,
351    0x3ffa61298e1e069c,
352    0x3ffa9620c6cb3374,
353    0x3ffacb82581eee54,
354    0x3ffb014f179fc3b8,
355    0x3ffb3787dc80f95f,
356    0x3ffb6e2d7fa5eb18,
357    0x3ffba540dba56e56,
358    0x3ffbdcc2cccd3c85,
359    0x3ffc14b431256446,
360    0x3ffc4d15e873c193,
361    0x3ffc85e8d43f7cd0,
362    0x3ffcbf2dd7d490f2,
363    0x3ffcf8e5d84758a9,
364    0x3ffd3311bc7822b4,
365    0x3ffd6db26d16cd67,
366    0x3ffda8c8d4a66969,
367    0x3ffde455df80e3c0,
368    0x3ffe205a7bdab73e,
369    0x3ffe5cd799c6a54e,
370    0x3ffe99ce2b397649,
371    0x3ffed73f240dc142,
372    0x3fff152b7a07bb76,
373    0x3fff539424d90f5e,
374    0x3fff927a1e24bb76,
375    0x3fffd1de6182f8c9,
376    0x400008e0f64294ab,
377    0x40002912df5ce72a,
378    0x400049856cd84339,
379    0x40006a39207f0a09,
380    0x40008b2e7d2035cf,
381    0x4000ac6606916501,
382    0x4000cde041b0e9ae,
383    0x4000ef9db467dcf8,
384    0x4001119ee5ac36b6,
385    0x400133e45d82e952,
386    0x4001566ea50201d7,
387    0x4001793e4652cc50,
388    0x40019c53ccb3fc6b,
389    0x4001bfafc47bda73,
390    0x4001e352bb1a74ad,
391    0x4002073d3f1bd518,
392    0x40022b6fe02a3b9c,
393    0x40024feb2f105cb8,
394    0x400274afbdbba4a6,
395    0x400299be1f3e7f1c,
396    0x4002bf16e7d2a38c,
397    0x4002e4baacdb6614,
398    0x40030aaa04e80d05,
399    0x400330e587b62b28,
400    0x4003576dce33fead,
401    0x40037e437282d4ee,
402    0x4003a5670ff972ed,
403    0x4003ccd9432682b4,
404    0x4003f49aa9d30590,
405    0x40041cabe304cb34,
406    0x4004450d8f00edd4,
407    0x40046dc04f4e5338,
408    0x400496c4c6b832da,
409    0x4004c01b9950a111,
410    0x4004e9c56c731f5d,
411    0x400513c2e6c731d7,
412    0x40053e14b042f9ca,
413    0x400568bb722dd593,
414    0x400593b7d72305bb,
415];
416
417pub(crate) trait ExpfBackend {
418    fn fmaf(&self, x: f32, y: f32, z: f32) -> f32;
419    fn fma(&self, x: f64, y: f64, z: f64) -> f64;
420    fn dd_fma(&self, x: f64, y: f64, z: f64) -> f64;
421    fn dyad_fma(&self, x: f64, y: f64, z: f64) -> f64;
422    fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64;
423    fn polyeval5(&self, x: f64, a0: f64, a1: f64, a2: f64, a3: f64, a4: f64) -> f64;
424    fn polyeval6(&self, x: f64, a0: f64, a1: f64, a2: f64, a3: f64, a4: f64, a5: f64) -> f64;
425    fn polyeval7(
426        &self,
427        x: f64,
428        a0: f64,
429        a1: f64,
430        a2: f64,
431        a3: f64,
432        a4: f64,
433        a5: f64,
434        a6: f64,
435    ) -> f64;
436    fn roundf(&self, x: f32) -> f32;
437    fn round(&self, x: f64) -> f64;
438    fn floor(&self, x: f64) -> f64;
439    fn round_ties_even(&self, x: f64) -> f64;
440    fn quick_mult(&self, x: DoubleDouble, y: DoubleDouble) -> DoubleDouble;
441    fn quick_mult_f64(&self, x: DoubleDouble, y: f64) -> DoubleDouble;
442    fn quick_f64_mult(&self, x: f64, y: DoubleDouble) -> DoubleDouble;
443    fn exact_mult(&self, x: f64, y: f64) -> DoubleDouble;
444}
445
446pub(crate) struct GenericExpfBackend {}
447
448impl ExpfBackend for GenericExpfBackend {
449    #[inline(always)]
450    fn fmaf(&self, x: f32, y: f32, z: f32) -> f32 {
451        f_fmlaf(x, y, z)
452    }
453
454    #[inline(always)]
455    fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
456        use crate::common::f_fmla;
457        f_fmla(x, y, z)
458    }
459    #[inline(always)]
460    fn dd_fma(&self, x: f64, y: f64, z: f64) -> f64 {
461        dd_fmla(x, y, z)
462    }
463    #[inline(always)]
464    fn dyad_fma(&self, x: f64, y: f64, z: f64) -> f64 {
465        dyad_fmla(x, y, z)
466    }
467
468    #[inline(always)]
469    fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64 {
470        use crate::polyeval::f_polyeval3;
471        f_polyeval3(x, a0, a1, a2)
472    }
473
474    #[inline(always)]
475    fn polyeval5(&self, x: f64, a0: f64, a1: f64, a2: f64, a3: f64, a4: f64) -> f64 {
476        use crate::polyeval::f_polyeval5;
477        f_polyeval5(x, a0, a1, a2, a3, a4)
478    }
479
480    #[inline(always)]
481    fn polyeval6(&self, x: f64, a0: f64, a1: f64, a2: f64, a3: f64, a4: f64, a5: f64) -> f64 {
482        use crate::polyeval::f_polyeval6;
483        f_polyeval6(x, a0, a1, a2, a3, a4, a5)
484    }
485
486    #[inline(always)]
487    fn polyeval7(
488        &self,
489        x: f64,
490        a0: f64,
491        a1: f64,
492        a2: f64,
493        a3: f64,
494        a4: f64,
495        a5: f64,
496        a6: f64,
497    ) -> f64 {
498        use crate::polyeval::f_polyeval7;
499        f_polyeval7(x, a0, a1, a2, a3, a4, a5, a6)
500    }
501
502    #[inline(always)]
503    fn roundf(&self, x: f32) -> f32 {
504        x.cpu_round()
505    }
506
507    #[inline(always)]
508    fn round(&self, x: f64) -> f64 {
509        x.cpu_round()
510    }
511
512    #[inline(always)]
513    fn floor(&self, x: f64) -> f64 {
514        x.cpu_floor()
515    }
516
517    #[inline(always)]
518    fn round_ties_even(&self, x: f64) -> f64 {
519        use crate::rounding::CpuRoundTiesEven;
520        x.cpu_round_ties_even()
521    }
522
523    #[inline(always)]
524    fn quick_mult(&self, x: DoubleDouble, y: DoubleDouble) -> DoubleDouble {
525        DoubleDouble::quick_mult(x, y)
526    }
527
528    #[inline(always)]
529    fn quick_mult_f64(&self, x: DoubleDouble, y: f64) -> DoubleDouble {
530        DoubleDouble::quick_mult_f64(x, y)
531    }
532
533    #[inline(always)]
534    fn quick_f64_mult(&self, x: f64, y: DoubleDouble) -> DoubleDouble {
535        DoubleDouble::quick_mult_f64(y, x)
536    }
537
538    #[inline(always)]
539    fn exact_mult(&self, x: f64, y: f64) -> DoubleDouble {
540        DoubleDouble::from_exact_mult(x, y)
541    }
542}
543
544#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
545pub(crate) struct FmaBackend {}
546
547#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
548impl ExpfBackend for FmaBackend {
549    #[inline(always)]
550    fn fmaf(&self, x: f32, y: f32, z: f32) -> f32 {
551        f32::mul_add(x, y, z)
552    }
553
554    #[inline(always)]
555    fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
556        f64::mul_add(x, y, z)
557    }
558
559    #[inline(always)]
560    fn dd_fma(&self, x: f64, y: f64, z: f64) -> f64 {
561        f64::mul_add(x, y, z)
562    }
563    #[inline(always)]
564    fn dyad_fma(&self, x: f64, y: f64, z: f64) -> f64 {
565        f64::mul_add(x, y, z)
566    }
567
568    #[inline(always)]
569    fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64 {
570        use crate::polyeval::d_polyeval3;
571        d_polyeval3(x, a0, a1, a2)
572    }
573
574    #[inline(always)]
575    fn polyeval5(&self, x: f64, a0: f64, a1: f64, a2: f64, a3: f64, a4: f64) -> f64 {
576        use crate::polyeval::d_polyeval5;
577        d_polyeval5(x, a0, a1, a2, a3, a4)
578    }
579
580    #[inline(always)]
581    fn polyeval6(&self, x: f64, a0: f64, a1: f64, a2: f64, a3: f64, a4: f64, a5: f64) -> f64 {
582        use crate::polyeval::d_polyeval6;
583        d_polyeval6(x, a0, a1, a2, a3, a4, a5)
584    }
585
586    #[inline(always)]
587    fn polyeval7(
588        &self,
589        x: f64,
590        a0: f64,
591        a1: f64,
592        a2: f64,
593        a3: f64,
594        a4: f64,
595        a5: f64,
596        a6: f64,
597    ) -> f64 {
598        use crate::polyeval::d_polyeval7;
599        d_polyeval7(x, a0, a1, a2, a3, a4, a5, a6)
600    }
601
602    #[inline(always)]
603    fn roundf(&self, x: f32) -> f32 {
604        x.round()
605    }
606
607    #[inline(always)]
608    fn round(&self, x: f64) -> f64 {
609        x.round()
610    }
611
612    #[inline(always)]
613    fn floor(&self, x: f64) -> f64 {
614        x.floor()
615    }
616
617    #[inline(always)]
618    fn round_ties_even(&self, x: f64) -> f64 {
619        x.round_ties_even()
620    }
621
622    #[inline(always)]
623    fn quick_mult(&self, x: DoubleDouble, y: DoubleDouble) -> DoubleDouble {
624        DoubleDouble::quick_mult_fma(x, y)
625    }
626
627    #[inline(always)]
628    fn quick_mult_f64(&self, x: DoubleDouble, y: f64) -> DoubleDouble {
629        DoubleDouble::quick_mult_f64_fma(x, y)
630    }
631
632    #[inline(always)]
633    fn quick_f64_mult(&self, x: f64, y: DoubleDouble) -> DoubleDouble {
634        DoubleDouble::quick_mult_f64_fma(y, x)
635    }
636
637    #[inline(always)]
638    fn exact_mult(&self, x: f64, y: f64) -> DoubleDouble {
639        DoubleDouble::from_exact_mult_fma(x, y)
640    }
641}
642
643#[inline(always)]
644fn expf_gen_impl<B: ExpfBackend>(x: f32, backend: B) -> f32 {
645    let x_u = x.to_bits();
646    let x_abs = x_u & 0x7fff_ffffu32;
647    if x_abs >= 0x42b2_0000u32 || x_abs <= 0x3280_0000u32 {
648        let exp = ((x_u >> 23) & 0xFF) as i32;
649        // |x| < 2^-25
650        if exp <= 101i32 {
651            return 1.0 + x;
652        }
653
654        // When x < log(2^-150) or nan
655        if x_u >= 0xc2cf_f1b5u32 {
656            // exp(-Inf) = 0
657            if x.is_infinite() {
658                return 0.0;
659            }
660            // exp(nan) = nan
661            if x.is_nan() {
662                return x;
663            }
664            return 0.0;
665        }
666        // x >= 89 or nan
667        if x.is_sign_positive() && (x_u >= 0x42b2_0000) {
668            // x is +inf or nan
669            return x + f32::INFINITY;
670        }
671    }
672
673    // For -104 < x < 89, to compute exp(x), we perform the following range
674    // reduction: find hi, mid, lo such that:
675    //   x = hi + mid + lo, in which
676    //     hi is an integer,
677    //     mid * 2^7 is an integer
678    //     -2^(-8) <= lo < 2^-8.
679    // In particular,
680    //   hi + mid = round(x * 2^7) * 2^(-7).
681    // Then,
682    //   exp(x) = exp(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo).
683    // We store exp(hi) and exp(mid) in the lookup tables EXP_M1 and EXP_M2
684    // respectively.  exp(lo) is computed using a degree-4 minimax polynomial
685    // generated by Sollya.
686
687    // x_hi = (hi + mid) * 2^7 = round(x * 2^7).
688    let kf = backend.roundf(x * 128.);
689    // Subtract (hi + mid) from x to get lo.
690    let xd = backend.fmaf(kf, -0.0078125 /* - 1/128 */, x) as f64;
691    let mut x_hi = unsafe { kf.to_int_unchecked::<i32>() }; // it's already not indeterminate.
692    x_hi += 104 << 7;
693    // hi = x_hi >> 7
694    let exp_hi = f64::from_bits(EXP_M1[(x_hi >> 7) as usize]);
695    // mid * 2^7 = x_hi & 0x0000'007fU;
696    let exp_mid = f64::from_bits(EXP_M2[(x_hi & 0x7f) as usize]);
697    // Degree-4 minimax polynomial generated by Sollya with the following
698    // commands:
699    // d = [-2^-8, 2^-8];
700    // f_exp = expm1(x)/x;
701    // Q = fpminimax(f_exp, 3, [|D...|], [-2^-8, 2^-8]);
702    let p = backend.polyeval5(
703        xd,
704        1.,
705        f64::from_bits(0x3feffffffffff777),
706        f64::from_bits(0x3fe000000000071c),
707        f64::from_bits(0x3fc555566668e5e7),
708        f64::from_bits(0x3fa55555555ef243),
709    );
710    (p * exp_hi * exp_mid) as f32
711}
712
713#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
714#[target_feature(enable = "avx", enable = "fma")]
715unsafe fn expf_fma_impl(x: f32) -> f32 {
716    expf_gen_impl(x, FmaBackend {})
717}
718
719/// Computes exp
720///
721/// Max found ULP 0.5
722#[inline]
723pub fn f_expf(x: f32) -> f32 {
724    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
725    {
726        expf_gen_impl(x, GenericExpfBackend {})
727    }
728    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
729    {
730        use std::sync::OnceLock;
731        static EXECUTOR: OnceLock<unsafe fn(f32) -> f32> = OnceLock::new();
732        let q = EXECUTOR.get_or_init(|| {
733            if std::arch::is_x86_feature_detected!("avx")
734                && std::arch::is_x86_feature_detected!("fma")
735            {
736                expf_fma_impl
737            } else {
738                fn def_expf(x: f32) -> f32 {
739                    expf_gen_impl(x, GenericExpfBackend {})
740                }
741                def_expf
742            }
743        });
744        unsafe { q(x) }
745    }
746}
747
748#[inline]
749pub(crate) fn core_expf(x: f32) -> f64 {
750    // x_hi = (hi + mid) * 2^7 = round(x * 2^7).
751    let kf = (x * 128.).cpu_round();
752    // Subtract (hi + mid) from x to get lo.
753    let xd = f_fmlaf(kf, -0.0078125 /* - 1/128 */, x) as f64;
754    let mut x_hi = unsafe { kf.to_int_unchecked::<i32>() }; // it's already not indeterminate.
755    x_hi += 104 << 7;
756    // hi = x_hi >> 7
757    let exp_hi = f64::from_bits(EXP_M1[(x_hi >> 7) as usize]);
758    // mid * 2^7 = x_hi & 0x0000'007fU;
759    let exp_mid = f64::from_bits(EXP_M2[(x_hi & 0x7f) as usize]);
760    // Degree-4 minimax polynomial generated by Sollya with the following
761    // commands:
762    // d = [-2^-8, 2^-8];
763    // f_exp = expm1(x)/x;
764    // Q = fpminimax(f_exp, 3, [|D...|], [-2^-8, 2^-8]);
765    use crate::polyeval::f_polyeval5;
766    let p = f_polyeval5(
767        xd,
768        1.,
769        f64::from_bits(0x3feffffffffff777),
770        f64::from_bits(0x3fe000000000071c),
771        f64::from_bits(0x3fc555566668e5e7),
772        f64::from_bits(0x3fa55555555ef243),
773    );
774    p * exp_hi * exp_mid
775}
776
777#[inline]
778pub(crate) fn core_expdf(x: f64) -> f64 {
779    // x_hi = (hi + mid) * 2^7 = round(x * 2^7).
780    let kf = (x * 128.).cpu_round();
781    // Subtract (hi + mid) from x to get lo.
782    let xd = f_fmla(kf, -0.0078125 /* - 1/128 */, x);
783    let mut x_hi = unsafe { kf.to_int_unchecked::<i32>() }; // it's already not indeterminate.
784    x_hi += 104 << 7;
785    // hi = x_hi >> 7
786    let exp_hi = f64::from_bits(EXP_M1[(x_hi >> 7) as usize]);
787    // mid * 2^7 = x_hi & 0x0000'007fU;
788    let exp_mid = f64::from_bits(EXP_M2[(x_hi & 0x7f) as usize]);
789    // Degree-4 minimax polynomial generated by Sollya with the following
790    // commands:
791    // d = [-2^-8, 2^-8];
792    // f_exp = expm1(x)/x;
793    // Q = fpminimax(f_exp, 3, [|D...|], [-2^-8, 2^-8]);
794    use crate::polyeval::f_polyeval5;
795    let p = f_polyeval5(
796        xd,
797        1.,
798        f64::from_bits(0x3feffffffffff777),
799        f64::from_bits(0x3fe000000000071c),
800        f64::from_bits(0x3fc555566668e5e7),
801        f64::from_bits(0x3fa55555555ef243),
802    );
803    p * exp_hi * exp_mid
804}
805
806#[cfg(test)]
807mod tests {
808    use super::*;
809
810    #[test]
811    fn expf_test() {
812        assert!(
813            (expf(0f32) - 1f32).abs() < 1e-6,
814            "Invalid result {}",
815            expf(0f32)
816        );
817        assert!(
818            (expf(5f32) - 148.4131591025766f32).abs() < 1e-6,
819            "Invalid result {}",
820            expf(5f32)
821        );
822    }
823
824    #[test]
825    fn f_expf_test() {
826        assert_eq!(f_expf(-103.971596), 1e-45);
827        assert!(
828            (f_expf(0f32) - 1f32).abs() < 1e-6,
829            "Invalid result {}",
830            f_expf(0f32)
831        );
832        assert!(
833            (f_expf(5f32) - 148.4131591025766f32).abs() < 1e-6,
834            "Invalid result {}",
835            f_expf(5f32)
836        );
837        assert_eq!(f_expf(f32::INFINITY), f32::INFINITY);
838        assert_eq!(f_expf(f32::NEG_INFINITY), 0.);
839        assert!(f_expf(f32::NAN).is_nan());
840    }
841}