pxfm/exponents/
expm1f.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::exponents::expf::{ExpfBackend, GenericExpfBackend};
30
31#[inline(always)]
32fn expm1f_gen<B: ExpfBackend>(x: f32, backend: B) -> f32 {
33    let x_u: u32 = x.to_bits();
34    let x_abs = x_u & 0x7fff_ffffu32;
35
36    // When |x| > 25*log(2), or nan
37    if x_abs >= 0x418a_a123u32 {
38        // x < log(2^-25)
39        if x.is_sign_negative() {
40            // exp(-Inf) = 0
41            if x.is_infinite() {
42                return -1.0;
43            }
44            // exp(nan) = nan
45            if x.is_nan() {
46                return x;
47            }
48            return -1.0;
49        } else {
50            // x >= 89 or nan
51            if x_u >= 0x42b2_0000 {
52                return x + f32::INFINITY;
53            }
54        }
55    }
56
57    // |x| < 2^-4
58    if x_abs < 0x3d80_0000u32 {
59        // |x| < 2^-25
60        if x_abs < 0x3300_0000u32 {
61            // x = -0.0f
62            if x_u == 0x8000_0000u32 {
63                return x;
64            }
65            // When |x| < 2^-25, the relative error of the approximation e^x - 1 ~ x
66            // is:
67            //   |(e^x - 1) - x| / |e^x - 1| < |x^2| / |x|
68            //                               = |x|
69            //                               < 2^-25
70            //                               < epsilon(1)/2.
71            // To simplify the rounding decision and make it more efficient, we use
72            //   fma(x, x, x) ~ x + x^2 instead.
73            // Note: to use the formula x + x^2 to decide the correct rounding, we
74            // do need fma(x, x, x) to prevent underflow caused by x*x when |x| <
75            // 2^-76. For targets without FMA instructions, we simply use double for
76            // intermediate results as it is more efficient than using an emulated
77            // version of FMA.
78            #[cfg(any(
79                all(
80                    any(target_arch = "x86", target_arch = "x86_64"),
81                    target_feature = "fma"
82                ),
83                target_arch = "aarch64"
84            ))]
85            {
86                return backend.fmaf(x, x, x);
87            }
88            #[cfg(not(any(
89                all(
90                    any(target_arch = "x86", target_arch = "x86_64"),
91                    target_feature = "fma"
92                ),
93                target_arch = "aarch64"
94            )))]
95            {
96                let xd = x as f64;
97                return backend.fma(xd, xd, xd) as f32;
98            }
99        }
100
101        const C: [u64; 7] = [
102            0x3fe0000000000000,
103            0x3fc55555555557dd,
104            0x3fa55555555552fa,
105            0x3f8111110fcd58b7,
106            0x3f56c16c1717660b,
107            0x3f2a0241f0006d62,
108            0x3efa01e3f8d3c060,
109        ];
110
111        // 2^-25 <= |x| < 2^-4
112        let xd = x as f64;
113        let xsq = xd * xd;
114        // Degree-8 minimax polynomial generated by Sollya with:
115        // > display = hexadecimal;
116        // > P = fpminimax((expm1(x) - x)/x^2, 6, [|D...|], [-2^-4, 2^-4]);
117
118        return backend.fma(
119            backend.polyeval7(
120                xd,
121                f64::from_bits(C[0]),
122                f64::from_bits(C[1]),
123                f64::from_bits(C[2]),
124                f64::from_bits(C[3]),
125                f64::from_bits(C[4]),
126                f64::from_bits(C[5]),
127                f64::from_bits(C[6]),
128            ),
129            xsq,
130            xd,
131        ) as f32;
132    }
133
134    // For -104 < x < 89, to compute expm1(x), we perform the following range
135    // reduction: find hi, mid, lo such that:
136    //   x = hi + mid + lo, in which
137    //     hi is an integer,
138    //     mid * 2^7 is an integer
139    //     -2^(-8) <= lo < 2^-8.
140    // In particular,
141    //   hi + mid = round(x * 2^7) * 2^(-7).
142    // Then,
143    //   expm1(x) = expm1(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo) - 1.
144    // We store exp(hi) and exp(mid) in the lookup tables EXP_M1 and EXP_M2
145    // respectively.  exp(lo) is computed using a degree-4 minimax polynomial
146    // generated by Sollya.
147
148    // x_hi = (hi + mid) * 2^7 = round(x * 2^7).
149    let kf = backend.roundf(x * 128.);
150    // Subtract (hi + mid) from x to get lo.
151    let xd = backend.fmaf(kf, -0.0078125 /* - 1/128 */, x) as f64;
152    let mut x_hi = unsafe { kf.to_int_unchecked::<i32>() }; // it's already not indeterminate.
153    x_hi += 104 << 7;
154    // hi = x_hi >> 7
155    let exp_hi = f64::from_bits(crate::exponents::expf::EXP_M1[(x_hi >> 7) as usize]);
156    // mid * 2^7 = x_hi & 0x0000'007fU;
157    let exp_mid = f64::from_bits(crate::exponents::expf::EXP_M2[(x_hi & 0x7f) as usize]);
158    // Degree-4 minimax polynomial generated by Sollya with the following
159    // commands:
160    // d = [-2^-8, 2^-8];
161    // f_exp = expm1(x)/x;
162    // Q = fpminimax(f_exp, 3, [|D...|], [-2^-8, 2^-8]);
163    let p = backend.polyeval5(
164        xd,
165        1.,
166        f64::from_bits(0x3feffffffffff777),
167        f64::from_bits(0x3fe000000000071c),
168        f64::from_bits(0x3fc555566668e5e7),
169        f64::from_bits(0x3fa55555555ef243),
170    );
171    backend.fma(p * exp_hi, exp_mid, -1.) as f32
172}
173
174#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
175#[target_feature(enable = "avx", enable = "fma")]
176unsafe fn expm1f_fma_impl(x: f32) -> f32 {
177    use crate::exponents::expf::FmaBackend;
178    expm1f_gen(x, FmaBackend {})
179}
180
181/// Computes e^x - 1
182///
183/// Max ULP 0.5
184#[inline]
185pub fn f_expm1f(x: f32) -> f32 {
186    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
187    {
188        expm1f_gen(x, GenericExpfBackend {})
189    }
190    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
191    {
192        use std::sync::OnceLock;
193        static EXECUTOR: OnceLock<unsafe fn(f32) -> f32> = OnceLock::new();
194        let q = EXECUTOR.get_or_init(|| {
195            if std::arch::is_x86_feature_detected!("avx")
196                && std::arch::is_x86_feature_detected!("fma")
197            {
198                expm1f_fma_impl
199            } else {
200                fn def_expm1f(x: f32) -> f32 {
201                    expm1f_gen(x, GenericExpfBackend {})
202                }
203                def_expm1f
204            }
205        });
206        unsafe { q(x) }
207    }
208}
209
210#[cfg(test)]
211mod tests {
212    use crate::f_expm1f;
213
214    #[test]
215    fn test_expm1f() {
216        assert_eq!(f_expm1f(-3.0923562e-5), -3.0923085e-5);
217        assert_eq!(f_expm1f(2.213121), 8.144211);
218        assert_eq!(f_expm1f(-3.213121), -0.9597691);
219        assert_eq!(f_expm1f(-2.35099e-38), -2.35099e-38);
220        assert_eq!(
221            f_expm1f(0.00000000000000000000000000000000000004355616),
222            0.00000000000000000000000000000000000004355616
223        );
224        assert_eq!(f_expm1f(25.12315), 81441420000.0);
225        assert_eq!(f_expm1f(12.986543), 436498.6);
226        assert_eq!(f_expm1f(-12.986543), -0.99999774);
227        assert_eq!(f_expm1f(-25.12315), -1.0);
228        assert_eq!(f_expm1f(f32::INFINITY), f32::INFINITY);
229        assert_eq!(f_expm1f(f32::NEG_INFINITY), -1.);
230        assert!(f_expm1f(f32::NAN).is_nan());
231    }
232}