// pxfm/exponents/exp2m1.rs

/*
 * // Copyright (c) Radzivon Bartoshyk 7/2025. All rights reserved.
 * //
 * // Redistribution and use in source and binary forms, with or without modification,
 * // are permitted provided that the following conditions are met:
 * //
 * // 1.  Redistributions of source code must retain the above copyright notice, this
 * // list of conditions and the following disclaimer.
 * //
 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
 * // this list of conditions and the following disclaimer in the documentation
 * // and/or other materials provided with the distribution.
 * //
 * // 3.  Neither the name of the copyright holder nor the names of its
 * // contributors may be used to endorse or promote products derived from
 * // this software without specific prior written permission.
 * //
 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
use crate::double_double::DoubleDouble;
use crate::exponents::expf::{ExpfBackend, GenericExpfBackend};
use crate::exponents::fast_ldexp;

// log(2) split into high and low parts so that LN2H + LN2L approximates
// log(2) to double-double precision (the error analysis below states
// |LN2H + LN2L - log(2)| < 2^-110.4).
const LN2H: f64 = f64::from_bits(0x3fe62e42fefa39ef);
const LN2L: f64 = f64::from_bits(0x3c7abc9e3b39803f);

// Result of the fast exp2m1 path: a double-double approximation of 2^x - 1
// plus an absolute error bound, used by `exp2m1_gen` to decide whether the
// rounding is certain or the accurate path must be taken.
struct Exp2m1 {
    // Double-double approximation of 2^x - 1.
    exp: DoubleDouble,
    // Absolute error bound for `exp` (a relative bound scaled by `exp.hi`).
    err: f64,
}
40
/* For 0 <= i < 64, T1[i] = (h,l) such that h+l is the best double-double
approximation of 2^(i/64). The approximation error is bounded as follows:
|h + l - 2^(i/64)| < 2^-107.
Each entry stores (low bits, high bits) as raw u64 bit patterns, consumed
through DoubleDouble::from_bit_pair. */
pub(crate) static EXP_M1_2_TABLE1: [(u64, u64); 64] = [
    (0x0000000000000000, 0x3ff0000000000000),
    (0xbc719083535b085d, 0x3ff02c9a3e778061),
    (0x3c8d73e2a475b465, 0x3ff059b0d3158574),
    (0x3c6186be4bb284ff, 0x3ff0874518759bc8),
    (0x3c98a62e4adc610b, 0x3ff0b5586cf9890f),
    (0x3c403a1727c57b53, 0x3ff0e3ec32d3d1a2),
    (0xbc96c51039449b3a, 0x3ff11301d0125b51),
    (0xbc932fbf9af1369e, 0x3ff1429aaea92de0),
    (0xbc819041b9d78a76, 0x3ff172b83c7d517b),
    (0x3c8e5b4c7b4968e4, 0x3ff1a35beb6fcb75),
    (0x3c9e016e00a2643c, 0x3ff1d4873168b9aa),
    (0x3c8dc775814a8495, 0x3ff2063b88628cd6),
    (0x3c99b07eb6c70573, 0x3ff2387a6e756238),
    (0x3c82bd339940e9d9, 0x3ff26b4565e27cdd),
    (0x3c8612e8afad1255, 0x3ff29e9df51fdee1),
    (0x3c90024754db41d5, 0x3ff2d285a6e4030b),
    (0x3c86f46ad23182e4, 0x3ff306fe0a31b715),
    (0x3c932721843659a6, 0x3ff33c08b26416ff),
    (0xbc963aeabf42eae2, 0x3ff371a7373aa9cb),
    (0xbc75e436d661f5e3, 0x3ff3a7db34e59ff7),
    (0x3c8ada0911f09ebc, 0x3ff3dea64c123422),
    (0xbc5ef3691c309278, 0x3ff4160a21f72e2a),
    (0x3c489b7a04ef80d0, 0x3ff44e086061892d),
    (0x3c73c1a3b69062f0, 0x3ff486a2b5c13cd0),
    (0x3c7d4397afec42e2, 0x3ff4bfdad5362a27),
    (0xbc94b309d25957e3, 0x3ff4f9b2769d2ca7),
    (0xbc807abe1db13cad, 0x3ff5342b569d4f82),
    (0x3c99bb2c011d93ad, 0x3ff56f4736b527da),
    (0x3c96324c054647ad, 0x3ff5ab07dd485429),
    (0x3c9ba6f93080e65e, 0x3ff5e76f15ad2148),
    (0xbc9383c17e40b497, 0x3ff6247eb03a5585),
    (0xbc9bb60987591c34, 0x3ff6623882552225),
    (0xbc9bdd3413b26456, 0x3ff6a09e667f3bcd),
    (0xbc6bbe3a683c88ab, 0x3ff6dfb23c651a2f),
    (0xbc816e4786887a99, 0x3ff71f75e8ec5f74),
    (0xbc90245957316dd3, 0x3ff75feb564267c9),
    (0xbc841577ee04992f, 0x3ff7a11473eb0187),
    (0x3c705d02ba15797e, 0x3ff7e2f336cf4e62),
    (0xbc9d4c1dd41532d8, 0x3ff82589994cce13),
    (0xbc9fc6f89bd4f6ba, 0x3ff868d99b4492ed),
    (0x3c96e9f156864b27, 0x3ff8ace5422aa0db),
    (0x3c85cc13a2e3976c, 0x3ff8f1ae99157736),
    (0xbc675fc781b57ebc, 0x3ff93737b0cdc5e5),
    (0xbc9d185b7c1b85d1, 0x3ff97d829fde4e50),
    (0x3c7c7c46b071f2be, 0x3ff9c49182a3f090),
    (0xbc9359495d1cd533, 0x3ffa0c667b5de565),
    (0xbc9d2f6edb8d41e1, 0x3ffa5503b23e255d),
    (0x3c90fac90ef7fd31, 0x3ffa9e6b5579fdbf),
    (0x3c97a1cd345dcc81, 0x3ffae89f995ad3ad),
    (0xbc62805e3084d708, 0x3ffb33a2b84f15fb),
    (0xbc75584f7e54ac3b, 0x3ffb7f76f2fb5e47),
    (0x3c823dd07a2d9e84, 0x3ffbcc1e904bc1d2),
    (0x3c811065895048dd, 0x3ffc199bdd85529c),
    (0x3c92884dff483cad, 0x3ffc67f12e57d14b),
    (0x3c7503cbd1e949db, 0x3ffcb720dcef9069),
    (0xbc9cbc3743797a9c, 0x3ffd072d4a07897c),
    (0x3c82ed02d75b3707, 0x3ffd5818dcfba487),
    (0x3c9c2300696db532, 0x3ffda9e603db3285),
    (0xbc91a5cd4f184b5c, 0x3ffdfc97337b9b5f),
    (0x3c839e8980a9cc8f, 0x3ffe502ee78b3ff6),
    (0xbc9e9c23179c2893, 0x3ffea4afa2a490da),
    (0x3c9dc7f486a4b6b0, 0x3ffefa1bee615a27),
    (0x3c99d3e12dd8a18b, 0x3fff50765b6e4540),
    (0x3c874853f3a5931e, 0x3fffa7c1819e90d8),
];
110
/* For 0 <= i < 64, T2[i] = (h,l) such that h+l is the best double-double
approximation of 2^(i/2^12). The approximation error is bounded as follows:
|h + l - 2^(i/2^12)| < 2^-107.
Each entry stores (low bits, high bits) as raw u64 bit patterns, consumed
through DoubleDouble::from_bit_pair. */
pub(crate) static EXP_M1_2_TABLE2: [(u64, u64); 64] = [
    (0x0000000000000000, 0x3ff0000000000000),
    (0x3c9ae8e38c59c72a, 0x3ff000b175effdc7),
    (0xbc57b5d0d58ea8f4, 0x3ff00162f3904052),
    (0x3c94115cb6b16a8e, 0x3ff0021478e11ce6),
    (0xbc8d7c96f201bb2f, 0x3ff002c605e2e8cf),
    (0x3c984711d4c35e9f, 0x3ff003779a95f959),
    (0xbc80484245243777, 0x3ff0042936faa3d8),
    (0xbc94b237da2025f9, 0x3ff004dadb113da0),
    (0xbc75e00e62d6b30d, 0x3ff0058c86da1c0a),
    (0x3c9a1d6cedbb9481, 0x3ff0063e3a559473),
    (0xbc94acf197a00142, 0x3ff006eff583fc3d),
    (0xbc6eaf2ea42391a5, 0x3ff007a1b865a8ca),
    (0x3c7da93f90835f75, 0x3ff0085382faef83),
    (0xbc86a79084ab093c, 0x3ff00905554425d4),
    (0x3c986364f8fbe8f8, 0x3ff009b72f41a12b),
    (0xbc882e8e14e3110e, 0x3ff00a6910f3b6fd),
    (0xbc84f6b2a7609f71, 0x3ff00b1afa5abcbf),
    (0xbc7e1a258ea8f71b, 0x3ff00bcceb7707ec),
    (0x3c74362ca5bc26f1, 0x3ff00c7ee448ee02),
    (0x3c9095a56c919d02, 0x3ff00d30e4d0c483),
    (0xbc6406ac4e81a645, 0x3ff00de2ed0ee0f5),
    (0x3c9b5a6902767e09, 0x3ff00e94fd0398e0),
    (0xbc991b2060859321, 0x3ff00f4714af41d3),
    (0x3c8427068ab22306, 0x3ff00ff93412315c),
    (0x3c9c1d0660524e08, 0x3ff010ab5b2cbd11),
    (0xbc9e7bdfb3204be8, 0x3ff0115d89ff3a8b),
    (0x3c8843aa8b9cbbc6, 0x3ff0120fc089ff63),
    (0xbc734104ee7edae9, 0x3ff012c1fecd613b),
    (0xbc72b6aeb6176892, 0x3ff0137444c9b5b5),
    (0x3c7a8cd33b8a1bb3, 0x3ff01426927f5278),
    (0x3c72edc08e5da99a, 0x3ff014d8e7ee8d2f),
    (0x3c857ba2dc7e0c73, 0x3ff0158b4517bb88),
    (0x3c9b61299ab8cdb7, 0x3ff0163da9fb3335),
    (0xbc990565902c5f44, 0x3ff016f0169949ed),
    (0x3c870fc41c5c2d53, 0x3ff017a28af25567),
    (0x3c94b9a6e145d76c, 0x3ff018550706ab62),
    (0xbc7008eff5142bf9, 0x3ff019078ad6a19f),
    (0xbc977669f033c7de, 0x3ff019ba16628de2),
    (0xbc909bb78eeead0a, 0x3ff01a6ca9aac5f3),
    (0x3c9371231477ece5, 0x3ff01b1f44af9f9e),
    (0x3c75e7626621eb5b, 0x3ff01bd1e77170b4),
    (0xbc9bc72b100828a5, 0x3ff01c8491f08f08),
    (0xbc6ce39cbbab8bbe, 0x3ff01d37442d5070),
    (0x3c816996709da2e2, 0x3ff01de9fe280ac8),
    (0xbc8c11f5239bf535, 0x3ff01e9cbfe113ef),
    (0x3c8e1d4eb5edc6b3, 0x3ff01f4f8958c1c6),
    (0xbc9afb99946ee3f0, 0x3ff020025a8f6a35),
    (0xbc98f06d8a148a32, 0x3ff020b533856324),
    (0xbc82bf310fc54eb6, 0x3ff02168143b0281),
    (0xbc9c95a035eb4175, 0x3ff0221afcb09e3e),
    (0xbc9491793e46834d, 0x3ff022cdece68c4f),
    (0xbc73e8d0d9c49091, 0x3ff02380e4dd22ad),
    (0xbc9314aa16278aa3, 0x3ff02433e494b755),
    (0x3c848daf888e9651, 0x3ff024e6ec0da046),
    (0x3c856dc8046821f4, 0x3ff02599fb483385),
    (0x3c945b42356b9d47, 0x3ff0264d1244c719),
    (0xbc7082ef51b61d7e, 0x3ff027003103b10e),
    (0x3c72106ed0920a34, 0x3ff027b357854772),
    (0xbc9fd4cf26ea5d0f, 0x3ff0286685c9e059),
    (0xbc909f8775e78084, 0x3ff02919bbd1d1d8),
    (0x3c564cbba902ca27, 0x3ff029ccf99d720a),
    (0x3c94383ef231d207, 0x3ff02a803f2d170d),
    (0x3c94a47a505b3a47, 0x3ff02b338c811703),
    (0x3c9e47120223467f, 0x3ff02be6e199c811),
];
180
// Approximation for the fast path of exp(z) for z=zh+zl,
// with |z| < 0.000130273 < 2^-12.88 and |zl| < 2^-42.6
// (assuming x^y does not overflow or underflow)
#[inline(always)]
fn q_1<B: ExpfBackend>(dz: DoubleDouble, backend: &B) -> DoubleDouble {
    // Degree-4 polynomial coefficients (bit patterns). Q_1[0..3] are exactly
    // 1, 1, 1/2; Q_1[3..5] are close to the Taylor coefficients 1/6 and 1/24.
    const Q_1: [u64; 5] = [
        0x3ff0000000000000,
        0x3ff0000000000000,
        0x3fe0000000000000,
        0x3fc5555555995d37,
        0x3fa55555558489dc,
    ];
    let z = dz.to_f64(); // collapse zh+zl to one double; accurate enough for the high-degree terms
    // Horner evaluation of the degree-4..2 terms in plain double precision.
    let mut q = backend.fma(f64::from_bits(Q_1[4]), dz.hi, f64::from_bits(Q_1[3]));
    q = backend.fma(q, z, f64::from_bits(Q_1[2]));

    // Last two Horner steps in double-double to keep the final accuracy.
    let mut p0 = DoubleDouble::from_exact_add(f64::from_bits(Q_1[1]), q * z);
    p0 = backend.quick_mult(dz, p0);
    p0 = DoubleDouble::f64_add(f64::from_bits(Q_1[0]), p0);
    p0
}
202
// Fast-path computation of exp(x.hi + x.lo) as a double-double.
//
// Argument reduction: k = round(x * 2^12/log(2)), so x ≈ k*log(2)/2^12 + y
// with y small; then exp(x) = 2^(k/2^12) * exp(y). The fractional part of
// k/2^12 is split over EXP_M1_2_TABLE1 (bits 6..11 of k, steps of 1/64) and
// EXP_M1_2_TABLE2 (bits 0..5, steps of 1/2^12); exp(y) comes from `q_1`, and
// the integer part k >> 12 is applied as a final binary-exponent scaling.
#[inline(always)]
fn exp1<B: ExpfBackend>(x: DoubleDouble, backend: &B) -> DoubleDouble {
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe); /* |INVLOG2-2^12/log(2)| < 2^-43.4 */
    let k = backend.round_ties_even(x.hi * INVLOG2);

    // LOG2H + LOG2L ≈ log(2)/2^12 in double-double.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);
    const LOG2DD: DoubleDouble = DoubleDouble::new(LOG2L, LOG2H);
    let zk = backend.quick_mult_f64(LOG2DD, k);

    // Residual y = x - k*log(2)/2^12 as a double-double.
    let mut yz = DoubleDouble::from_exact_add(x.hi - zk.hi, x.lo);
    yz.lo -= zk.lo;

    let ik: i64 = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    let im: i64 = (ik >> 12).wrapping_add(0x3ff); // biased exponent of the 2^(k/2^12) integer part
    let i2: i64 = (ik >> 6) & 0x3f; // index into EXP_M1_2_TABLE1
    let i1: i64 = ik & 0x3f; // index into EXP_M1_2_TABLE2

    let t1 = DoubleDouble::from_bit_pair(EXP_M1_2_TABLE1[i2 as usize]);
    let t2 = DoubleDouble::from_bit_pair(EXP_M1_2_TABLE2[i1 as usize]);

    // 2^(frac(k/2^12)) as a double-double product of the two table entries.
    let p0 = backend.quick_mult(t2, t1);

    let mut q = q_1(yz, backend);
    q = backend.quick_mult(p0, q);

    /* Scale by 2^k. Warning: for x near 1024, we can have k=2^22, thus
    M = 2047, which encodes Inf */
    let mut du = (im as u64).wrapping_shl(52);
    if im == 0x7ff {
        // Exponent would encode Inf: pre-double the significand and use 2^(im-1).
        q.hi *= 2.0;
        q.lo *= 2.0;
        du = (im.wrapping_sub(1) as u64).wrapping_shl(52);
    }
    q.hi *= f64::from_bits(du);
    q.lo *= f64::from_bits(du);
    q
}
241
// Fast path for exp2m1: returns a double-double approximation of 2^x - 1
// together with an absolute error bound. `tiny` is true when |x| <= 0.125,
// in which case the dedicated polynomial path is used instead.
#[inline(always)]
fn exp2m1_fast<B: ExpfBackend>(x: f64, tiny: bool, backend: &B) -> Exp2m1 {
    if tiny {
        return exp2m1_fast_tiny(x, backend);
    }
    /* now -54 < x < -0.125 or 0.125 < x < 1024: we approximate exp(x*log(2))
    and subtract 1 */
    // v = x * log(2) as a double-double: exact product with LN2H, then fold
    // in the x*LN2L correction.
    let mut v = backend.exact_mult(LN2H, x);
    v.lo = backend.fma(x, LN2L, v.lo);
    /*
    The a_mul() call is exact, and the error of the fma() is bounded by
     ulp(l).
     We have |t| <= ulp(h) <= ulp(LN2H*1024) = 2^-43,
     |t+x*LN2L| <= 2^-43 * 1024*LN2L < 2^-42.7,
     thus |l| <= |t| + |x*LN2L| + ulp(t+x*LN2L)
              <= 2^-42.7 + 2^-95 <= 2^-42.6, and ulp(l) <= 2^-95.
     Thus:
     |h + l - x*log(2)| <= |h + l - x*(LN2H+LN2L)| + |x|*|LN2H+LN2L-log(2)|
                        <= 2^-95 + 1024*2^-110.4 < 2^-94.9 */

    let mut p = exp1(v, backend);

    // Subtract 1; the operand order matters for the fast_two_sum precondition
    // (the larger-magnitude operand must come first).
    let zf: DoubleDouble = if x >= 0. {
        // implies h >= 1 and the fast_two_sum pre-condition holds
        DoubleDouble::from_exact_add(p.hi, -1.0)
    } else {
        DoubleDouble::from_exact_add(-1.0, p.hi)
    };
    p.lo += zf.lo;
    p.hi = zf.hi;
    /* The error in the above fast_two_sum is bounded by 2^-105*|h|,
    with the new value of h, thus the total absolute error is bounded
    by eps1*|h_in|+2^-105*|h|.
    Relatively to h this yields eps1*|h_in/h| + 2^-105, where the maximum
    of |h_in/h| is obtained for x near -0.125, with |2^x/(2^x-1)| < 11.05.
    We get a relative error bound of 2^-74.138*11.05 + 2^-105 < 2^-70.67. */
    Exp2m1 {
        exp: p,
        err: f64::from_bits(0x3b84200000000000) * p.hi, /* 2^-70.67 < 0x1.42p-71 */
    }
}
283
// Approximation for the accurate path of exp(z) for z=zh+zl,
// with |z| < 0.000130273 < 2^-12.88 and |zl| < 2^-42.6
// (assuming x^y does not overflow or underflow)
#[inline(always)]
fn q_2<B: ExpfBackend>(dz: DoubleDouble, backend: &B) -> DoubleDouble {
    /* Let q[0]..q[7] be the coefficients of degree 0..7 of Q_2.
    The ulp of q[7]*z^7 is at most 2^-155, thus we can compute q[7]*z^7
    in double precision only.
    The ulp of q[6]*z^6 is at most 2^-139, thus we can compute q[6]*z^6
    in double precision only.
    The ulp of q[5]*z^5 is at most 2^-124, thus we can compute q[5]*z^5
    in double precision only. */

    /* The following is a degree-7 polynomial generated by Sollya for exp(z)
    over [-0.000130273,0.000130273] with absolute error < 2^-113.218
    (see file exp_accurate.sollya). Since we use this code only for
    |x| > 0.125 in exp2m1(x), the corresponding relative error for exp2m1
    is about 2^-113.218/|exp2m1(-0.125)| which is about 2^-110. */
    // Note: Q_2[3] and Q_2[4] form a (hi, lo) pair for the degree-3
    // coefficient; the remaining entries are single doubles.
    const Q_2: [u64; 9] = [
        0x3ff0000000000000,
        0x3ff0000000000000,
        0x3fe0000000000000,
        0x3fc5555555555555,
        0x3c655555555c4d26,
        0x3fa5555555555555,
        0x3f81111111111111,
        0x3f56c16c3fbb4213,
        0x3f2a01a023ede0d7,
    ];

    let z = dz.to_f64();
    // High-degree (7..5) Horner steps in double precision, per the ulp
    // analysis above.
    let mut q = backend.dd_fma(f64::from_bits(Q_2[8]), dz.hi, f64::from_bits(Q_2[7]));
    q = backend.dd_fma(q, z, f64::from_bits(Q_2[6]));
    q = backend.dd_fma(q, z, f64::from_bits(Q_2[5]));

    // multiply q by z and add Q_2[3] + Q_2[4]

    let mut p = backend.exact_mult(q, z);
    let r0 = DoubleDouble::from_exact_add(f64::from_bits(Q_2[3]), p.hi);
    p.hi = r0.hi;
    p.lo += r0.lo + f64::from_bits(Q_2[4]);

    // multiply hi+lo by zh+zl and add Q_2[2]

    p = backend.quick_mult(p, dz);
    let r1 = DoubleDouble::from_exact_add(f64::from_bits(Q_2[2]), p.hi);
    p.hi = r1.hi;
    p.lo += r1.lo;

    // multiply hi+lo by zh+zl and add Q_2[1]
    p = backend.quick_mult(p, dz);
    let r1 = DoubleDouble::from_exact_add(f64::from_bits(Q_2[1]), p.hi);
    p.hi = r1.hi;
    p.lo += r1.lo;

    // multiply hi+lo by zh+zl and add Q_2[0]
    p = backend.quick_mult(p, dz);
    let r1 = DoubleDouble::from_exact_add(f64::from_bits(Q_2[0]), p.hi);
    p.hi = r1.hi;
    p.lo += r1.lo;
    p
}
346
// returns a double-double approximation hi+lo of exp(x*log(2)) for |x| < 745
#[inline(always)]
fn exp_2<B: ExpfBackend>(x: f64, backend: &B) -> DoubleDouble {
    // k = round(x * 2^12): same table-driven reduction as exp1, but starting
    // from the base-2 argument directly (0x40b0000000000000 is 2^12).
    let k = backend.round_ties_even(x * f64::from_bits(0x40b0000000000000));
    // since |x| <= 745 we have k <= 3051520

    let yhh = backend.fma(-k, f64::from_bits(0x3f30000000000000), x); // exact, |yh| <= 2^-13

    /* now x = k + yh, thus 2^x = 2^k * 2^yh, and we multiply yh by log(2)
    to use the accurate path of exp() */
    let ky = backend.quick_f64_mult(yhh, DoubleDouble::new(LN2L, LN2H));

    let ik: i64 = unsafe {
        k.to_int_unchecked::<i64>() // k is already integer, this is just a conversion
    };
    let im = (ik >> 12).wrapping_add(0x3ff); // biased exponent of the integer part of k/2^12
    let i2 = (ik >> 6) & 0x3f; // index into EXP_M1_2_TABLE1
    let i1 = ik & 0x3f; // index into EXP_M1_2_TABLE2

    let t1 = DoubleDouble::from_bit_pair(EXP_M1_2_TABLE1[i2 as usize]);
    let t2 = DoubleDouble::from_bit_pair(EXP_M1_2_TABLE2[i1 as usize]);

    // 2^(frac(k/2^12)) from the tables, times the accurate polynomial exp(y).
    let p = backend.quick_mult(t2, t1);

    let mut q = q_2(ky, backend);
    q = backend.quick_mult(p, q);
    let mut ud: u64 = (im as u64).wrapping_shl(52);

    if im == 0x7ff {
        // Exponent field would encode Inf: pre-double the significand and
        // scale by 2^(im-1) instead.
        q.hi *= 2.0;
        q.lo *= 2.0;
        ud = (im.wrapping_sub(1) as u64).wrapping_shl(52);
    }
    q.hi *= f64::from_bits(ud);
    q.lo *= f64::from_bits(ud);
    q
}
384
// Accurate-path evaluation of exp2m1(x) = 2^x - 1 for |x| <= 0.125.
// Horner evaluation of a polynomial in x: the low-degree coefficients are
// stored in Q as (hi, lo) double-double pairs (Q[0]+Q[1] matches the LN2H
// bit pattern, i.e. the degree-1 coefficient ≈ log(2)), the high-degree
// tail uses single doubles; the final multiply by x applies the common
// factor so the result vanishes at x = 0.
#[cold]
#[inline(always)]
pub(crate) fn exp2m1_accurate_tiny<B: ExpfBackend>(x: f64, backend: &B) -> f64 {
    let x2 = x * x;
    let x4 = x2 * x2;
    const Q: [u64; 22] = [
        0x3fe62e42fefa39ef,
        0x3c7abc9e3b398040,
        0x3fcebfbdff82c58f,
        0xbc65e43a53e44dcf,
        0x3fac6b08d704a0c0,
        0xbc4d331627517168,
        0x3f83b2ab6fba4e77,
        0x3c14e65df0779f8c,
        0x3f55d87fe78a6731,
        0x3bd0717fbf4bd050,
        0x3f2430912f86c787,
        0x3bcbd2bdec9bcd42,
        0x3eeffcbfc588b0c7,
        0xbb8e60aa6d5e4aa9,
        0x3eb62c0223a5c824,
        0x3e7b5253d395e7d4,
        0x3e3e4cf5158b9160,
        0x3dfe8cac734c6058,
        0x3dbc3bd64f17199d,
        0x3d78161a17e05651,
        0x3d33150b3d792231,
        0x3cec184260bfad7e,
    ];
    // The highest-degree terms are small enough to be evaluated in plain
    // double precision before joining the double-double Horner ladder.
    let mut c13 = backend.dd_fma(f64::from_bits(Q[20]), x, f64::from_bits(Q[19])); // degree 13
    let c11 = backend.dd_fma(f64::from_bits(Q[18]), x, f64::from_bits(Q[17])); // degree 11
    c13 = backend.dd_fma(f64::from_bits(Q[21]), x2, c13); // degree 13
    // add Q[16]*x+c11*x2+c13*x4 to Q[15] (degree 9)
    let mut p = DoubleDouble::from_exact_add(
        f64::from_bits(Q[15]),
        backend.fma(f64::from_bits(Q[16]), x, backend.fma(c11, x2, c13 * x4)),
    );
    // multiply h+l by x and add Q[14] (degree 8)
    p = backend.quick_f64_mult(x, p);
    let p0 = DoubleDouble::from_exact_add(f64::from_bits(Q[14]), p.hi);
    p.lo += p0.lo;
    p.hi = p0.hi;

    // multiply h+l by x and add Q[12]+Q[13] (degree 7)
    p = backend.quick_f64_mult(x, p);
    let p0 = DoubleDouble::from_exact_add(f64::from_bits(Q[12]), p.hi);
    p.lo += p0.lo + f64::from_bits(Q[13]);
    p.hi = p0.hi;
    // multiply h+l by x and add Q[10]+Q[11] (degree 6)
    p = backend.quick_f64_mult(x, p);
    let p0 = DoubleDouble::from_exact_add(f64::from_bits(Q[10]), p.hi);
    p.lo += p0.lo + f64::from_bits(Q[11]);
    p.hi = p0.hi;
    // multiply h+l by x and add Q[8]+Q[9] (degree 5)
    p = backend.quick_f64_mult(x, p);
    let p0 = DoubleDouble::from_exact_add(f64::from_bits(Q[8]), p.hi);
    p.lo += p0.lo + f64::from_bits(Q[9]);
    p.hi = p0.hi;
    // multiply h+l by x and add Q[6]+Q[7] (degree 4)
    p = backend.quick_f64_mult(x, p);
    let p0 = DoubleDouble::from_exact_add(f64::from_bits(Q[6]), p.hi);
    p.lo += p0.lo + f64::from_bits(Q[7]);
    p.hi = p0.hi;
    // multiply h+l by x and add Q[4]+Q[5] (degree 3)
    p = backend.quick_f64_mult(x, p);
    let p0 = DoubleDouble::from_exact_add(f64::from_bits(Q[4]), p.hi);
    p.lo += p0.lo + f64::from_bits(Q[5]);
    p.hi = p0.hi;
    // multiply h+l by x and add Q[2]+Q[3] (degree 2)
    p = backend.quick_f64_mult(x, p);
    let p0 = DoubleDouble::from_exact_add(f64::from_bits(Q[2]), p.hi);
    p.lo += p0.lo + f64::from_bits(Q[3]);
    p.hi = p0.hi;
    // multiply h+l by x and add Q[0]+Q[1] (degree 2)
    p = backend.quick_f64_mult(x, p);
    let p0 = DoubleDouble::from_exact_add(f64::from_bits(Q[0]), p.hi);
    p.lo += p0.lo + f64::from_bits(Q[1]);
    p.hi = p0.hi;
    // multiply h+l by x
    p = backend.quick_f64_mult(x, p);
    p.to_f64()
}
467
468#[cold]
469#[inline(always)]
470fn exp2m1_accurate<B: ExpfBackend>(x: f64, backend: &B) -> f64 {
471    let t = x.to_bits();
472    let ux = t;
473    let ax = ux & 0x7fffffffffffffffu64;
474
475    if ax <= 0x3fc0000000000000u64 {
476        // |x| <= 0.125
477        return exp2m1_accurate_tiny(x, backend);
478    }
479
480    let mut p = exp_2(x, backend);
481
482    let zf: DoubleDouble = DoubleDouble::from_full_exact_add(p.hi, -1.0);
483    p.lo += zf.lo;
484    p.hi = zf.hi;
485    p.to_f64()
486}
487
/* |x| <= 0.125, put in h + l a double-double approximation of exp2m1(x),
and return the maximal corresponding absolute error.
We also have |x| > 0x1.0527dbd87e24dp-51.
With xmin=RR("0x1.0527dbd87e24dp-51",16), the routine
exp2m1_fast_tiny_all(xmin,0.125,2^-65.73) in exp2m1.sage returns
1.63414352331297e-20 < 2^-65.73, and
exp2m1_fast_tiny_all(-0.125,-xmin,2^-65.62) returns
1.76283772822891e-20 < 2^-65.62, which proves the relative
error is bounded by 2^-65.62. */
#[inline(always)]
fn exp2m1_fast_tiny<B: ExpfBackend>(x: f64, backend: &B) -> Exp2m1 {
    /* The maximal value of |c4*x^4/exp2m1(x)| over [-0.125,0.125]
    is less than 2^-15.109, where c4 is the degree-4 coefficient,
    thus we can compute the coefficients of degree 4 or higher
    using double precision only. */
    // P[0]+P[1] and P[2]+P[3] are (hi, lo) pairs for the degree-1 and
    // degree-2 coefficients; P[4..] are single doubles for degrees 3..11.
    const P: [u64; 12] = [
        0x3fe62e42fefa39ef,
        0x3c7abd1697afcaf8,
        0x3fcebfbdff82c58f,
        0xbc65e5a1d09e1599,
        0x3fac6b08d704a0bf,
        0x3f83b2ab6fba4e78,
        0x3f55d87fe78a84e6,
        0x3f2430912f86a480,
        0x3eeffcbfbc1f2b36,
        0x3eb62c0226c7f6d1,
        0x3e7b539529819e63,
        0x3e3e4d552bed5b9c,
    ];
    let x2 = x * x;
    let x4 = x2 * x2;
    // Collapse degrees 4..11 into a single double-precision value c4.
    let mut c8 = backend.dd_fma(f64::from_bits(P[10]), x, f64::from_bits(P[9])); // degree 8
    let c6 = backend.dd_fma(f64::from_bits(P[8]), x, f64::from_bits(P[7])); // degree 6
    let mut c4 = backend.dd_fma(f64::from_bits(P[6]), x, f64::from_bits(P[5])); // degree 4
    c8 = backend.dd_fma(f64::from_bits(P[11]), x2, c8); // degree 8
    c4 = backend.dd_fma(c6, x2, c4); // degree 4
    c4 = backend.dd_fma(c8, x4, c4); // degree 4

    // Remaining Horner steps in double-double precision.
    let mut p = backend.exact_mult(c4, x);
    let p0 = DoubleDouble::from_exact_add(f64::from_bits(P[4]), p.hi);
    p.lo += p0.lo;
    p.hi = p0.hi;

    p = backend.quick_f64_mult(x, p);

    let p1 = DoubleDouble::from_exact_add(f64::from_bits(P[2]), p.hi);
    p.lo += p1.lo + f64::from_bits(P[3]);
    p.hi = p1.hi;

    p = backend.quick_f64_mult(x, p);
    let p2 = DoubleDouble::from_exact_add(f64::from_bits(P[0]), p.hi);
    p.lo += p2.lo + f64::from_bits(P[1]);
    p.hi = p2.hi;

    p = backend.quick_f64_mult(x, p);

    Exp2m1 {
        exp: p,
        err: f64::from_bits(0x3bd4e00000000000) * p.hi, // 2^-65.62 < 0x1.4ep-66
    }
}
549
// Generic exp2m1 entry point, parameterized over the floating-point backend.
// Handles special cases (NaN/Inf, overflow, saturation to -1, tiny and exact
// integer inputs) by bit-pattern tests, then runs the fast path and falls
// back to the accurate path when rounding cannot be decided.
#[inline(always)]
fn exp2m1_gen<B: ExpfBackend>(d: f64, backend: B) -> f64 {
    let mut x = d;
    let t = x.to_bits();
    let ux = t;
    let ax = ux & 0x7fffffffffffffffu64; // |x| bits

    if ux >= 0xc04b000000000000u64 {
        // x = -NaN or x <= -54
        if (ux >> 52) == 0xfff {
            // -NaN or -Inf
            return if ux > 0xfff0000000000000u64 {
                x + x // quiet the NaN
            } else {
                -1.0 // exp2m1(-Inf) = -1
            };
        }
        // for x <= -54, exp2m1(x) rounds to -1 to nearest
        // (the added 2^-54 makes the result inexact and just below -1+ulp)
        return -1.0 + f64::from_bits(0x3c90000000000000);
    } else if ax >= 0x4090000000000000u64 {
        // x = +NaN or x >= 1024
        if (ux >> 52) == 0x7ff {
            // +NaN
            return x + x;
        }
        /* for x >= 1024, exp2m1(x) rounds to +Inf to nearest,
        but for RNDZ/RNDD, we should have no overflow for x=1024 */
        return backend.fma(
            x,
            f64::from_bits(0x7bffffffffffffff),
            f64::from_bits(0x7fefffffffffffff),
        );
    } else if ax <= 0x3cc0527dbd87e24du64
    // |x| <= 0x1.0527dbd87e24dp-51
    /* then the second term of the Taylor expansion of 2^x-1 at x=0 is
    smaller in absolute value than 1/2 ulp(first term):
    log(2)*x + log(2)^2*x^2/2 + ... */
    {
        /* we use special code when log(2)*|x| is very small, in which case
        the double-double approximation h+l has its lower part l
        "truncated" */
        return if ax <= 0x3970000000000000u64
        // |x| <= 2^-104
        {
            // special case for 0
            if x == 0. {
                return x;
            }
            // scale x by 2^106
            x *= f64::from_bits(0x4690000000000000);
            let z = backend.quick_mult_f64(DoubleDouble::new(LN2L, LN2H), x);
            let mut h2 = z.to_f64(); // round to 53-bit precision
            // scale back, hoping to avoid double rounding
            h2 *= f64::from_bits(0x3950000000000000);
            // now subtract back h2 * 2^106 from h to get the correction term
            let mut h = backend.dd_fma(-h2, f64::from_bits(0x4690000000000000), z.hi);
            // add l
            h += z.lo;
            /* add h2 + h * 2^-106. Warning: when h=0, 2^-106*h2 might be exact,
            thus no underflow will be raised. We have underflow for
            0 < x <= 0x1.71547652b82fep-1022 for RNDZ, and for
            0 < x <= 0x1.71547652b82fdp-1022 for RNDN/RNDU. */
            backend.dyad_fma(h, f64::from_bits(0x3950000000000000), h2)
        } else {
            const C2: f64 = f64::from_bits(0x3fcebfbdff82c58f); // log(2)^2/2
            let mut z = backend.exact_mult(LN2H, x);
            z.lo = backend.dyad_fma(LN2L, x, z.lo);
            /* h+l approximates the first term x*log(2) */
            /* we add C2*x^2 last, so that in case there is a cancellation in
            LN2L*x+l, it will contribute more bits */
            z.lo = backend.fma(C2, x * x, z.lo);
            z.to_f64()
        };
    }

    /* now -54 < x < -0x1.0527dbd87e24dp-51
    or 0x1.0527dbd87e24dp-51 < x < 1024 */

    /* 2^x-1 is exact for x integer, -53 <= x <= 53 */
    // (the shift test rejects any x whose mantissa has bits below 2^-5,
    // so only candidates for small integers reach the floor/compare)
    if ux.wrapping_shl(17) == 0 {
        let i = unsafe { backend.floor(x).to_int_unchecked::<i32>() };
        if x == i as f64 && -53 <= i && i <= 53 {
            return if i >= 0 {
                ((1u64 << i) - 1) as f64 // 2^i - 1 exactly
            } else {
                -1.0 + fast_ldexp(1.0, i) // 2^i - 1 for negative i
            };
        }
    }

    // Fast path; if the rounding interval [exp-err, exp+err] does not round
    // to a single double, fall back to the accurate path.
    let result = exp2m1_fast(x, ax <= 0x3fc0000000000000u64, &backend);
    let left = result.exp.hi + (result.exp.lo - result.err);
    let right = result.exp.hi + (result.exp.lo + result.err);
    if left != right {
        return exp2m1_accurate(x, &backend);
    }
    left
}
648
/// x86/x86_64 variant of exp2m1 compiled with AVX+FMA codegen.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX and FMA
/// (`f_exp2m1` checks this at runtime before dispatching here).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx", enable = "fma")]
unsafe fn exp2m1_fma_impl(x: f64) -> f64 {
    use crate::exponents::expf::FmaBackend;
    exp2m1_gen(x, FmaBackend {})
}
655
656/// Computes 2^x - 1
657///
658/// Max found ULP 0.5
659pub fn f_exp2m1(d: f64) -> f64 {
660    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
661    {
662        exp2m1_gen(d, GenericExpfBackend {})
663    }
664    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
665    {
666        use std::sync::OnceLock;
667        static EXECUTOR: OnceLock<unsafe fn(f64) -> f64> = OnceLock::new();
668        let q = EXECUTOR.get_or_init(|| {
669            if std::arch::is_x86_feature_detected!("avx")
670                && std::arch::is_x86_feature_detected!("fma")
671            {
672                exp2m1_fma_impl
673            } else {
674                fn def_exp2m1(x: f64) -> f64 {
675                    exp2m1_gen(x, GenericExpfBackend {})
676                }
677                def_exp2m1
678            }
679        });
680        unsafe { q(d) }
681    }
682}
683
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_exp2m1() {
        // Subnormal input (exercises the |x| <= 2^-104 scaling path).
        assert_eq!(f_exp2m1(5.4172231599824623E-312), 3.75493295981e-312);
        // Tiny positive input near the subnormal boundary.
        assert_eq!(f_exp2m1( 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000017800593653177087), 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000012338431302992956);
        // Exact integer input: 2^2 - 1 = 3 (exact-power shortcut).
        assert_eq!(3., f_exp2m1(2.0));
        // Generic positive fast-path value.
        assert_eq!(4.656854249492381, f_exp2m1(2.5));
        // Negative fast-path value.
        assert_eq!(-0.30801352040368324, f_exp2m1(-0.5311842449009418));
    }
}
696}