// pxfm/cube_roots/cbrt.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::common::f_fmla;
30use crate::cube_roots::cbrtf::halley_refine_d;
31use crate::double_double::DoubleDouble;
32use crate::exponents::fast_ldexp;
33
/// Abstraction over the floating-point primitives used by the cube-root
/// kernel, so one algorithm ([`cbrt_gen_impl`]) can run either on a portable
/// software path or on hardware FMA instructions.
pub(crate) trait CbrtBackend {
    /// Fused multiply-add: `x * y + z` with a single rounding.
    fn fma(&self, x: f64, y: f64, z: f64) -> f64;
    /// Evaluates a 4-coefficient polynomial in `x` (coefficient order as in
    /// `f_polyeval4` — presumably a0 + x*(a1 + x*(a2 + x*a3)); TODO confirm).
    fn polyeval4(&self, x: f64, a0: f64, a1: f64, a2: f64, a3: f64) -> f64;
    /// One Halley refinement step of the cube-root estimate `x` for input `a`.
    fn halley(&self, x: f64, a: f64) -> f64;
    /// Exact product `x * y` as a double-double (hi + lo) value.
    fn exact_mul(&self, x: f64, y: f64) -> DoubleDouble;
    /// Fast (non-exact) double-double by scalar multiplication.
    fn quick_mult_f64(&self, x: DoubleDouble, y: f64) -> DoubleDouble;
}
41
42pub(crate) struct GenericCbrtBackend {}
43
44impl CbrtBackend for GenericCbrtBackend {
45    #[inline(always)]
46    fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
47        f_fmla(x, y, z)
48    }
49    #[inline(always)]
50    fn polyeval4(&self, x: f64, a0: f64, a1: f64, a2: f64, a3: f64) -> f64 {
51        use crate::polyeval::f_polyeval4;
52        f_polyeval4(x, a0, a1, a2, a3)
53    }
54    #[inline(always)]
55    fn halley(&self, x: f64, a: f64) -> f64 {
56        halley_refine_d(x, a)
57    }
58    #[inline(always)]
59    fn exact_mul(&self, x: f64, y: f64) -> DoubleDouble {
60        DoubleDouble::from_exact_mult(x, y)
61    }
62    #[inline(always)]
63    fn quick_mult_f64(&self, x: DoubleDouble, y: f64) -> DoubleDouble {
64        DoubleDouble::quick_mult_f64(x, y)
65    }
66}
67
68#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
69pub(crate) struct FmaCbrtBackend {}
70
71#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
72impl CbrtBackend for FmaCbrtBackend {
73    #[inline(always)]
74    fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
75        f64::mul_add(x, y, z)
76    }
77    #[inline(always)]
78    fn polyeval4(&self, x: f64, a0: f64, a1: f64, a2: f64, a3: f64) -> f64 {
79        use crate::polyeval::d_polyeval4;
80        d_polyeval4(x, a0, a1, a2, a3)
81    }
82    #[inline(always)]
83    fn halley(&self, x: f64, a: f64) -> f64 {
84        use crate::cube_roots::cbrtf::halley_refine_d_fma;
85        halley_refine_d_fma(x, a)
86    }
87    #[inline(always)]
88    fn exact_mul(&self, x: f64, y: f64) -> DoubleDouble {
89        DoubleDouble::from_exact_mult_fma(x, y)
90    }
91    #[inline(always)]
92    fn quick_mult_f64(&self, x: DoubleDouble, y: f64) -> DoubleDouble {
93        DoubleDouble::quick_mult_f64_fma(x, y)
94    }
95}
96
/// Core cube-root kernel, generic over the FP backend.
///
/// Strategy: decompose |x| = m * 2^e with m in [1, 2), write e = 3q + r,
/// approximate m^(1/3) with a minimax polynomial scaled by 2^(r/3), refine
/// with one Halley step plus one partial double-double Newton step, then
/// reapply 2^q and the original sign.
#[inline(always)]
fn cbrt_gen_impl<B: CbrtBackend>(x: f64, backend: B) -> f64 {
    // Scale factors for the exponent remainder r: 1; 2^{1/3}; 2^{2/3}
    static ESCALE: [f64; 3] = [
        1.0,
        f64::from_bits(0x3ff428a2f98d728b),
        f64::from_bits(0x3ff965fea53d6e3d),
    ];

    let bits = x.to_bits();
    // Biased exponent (11 bits) and raw 52-bit mantissa.
    let mut exp = ((bits >> 52) & 0x7ff) as i32;
    let mut mant = bits & ((1u64 << 52) - 1);

    // +-Inf / NaN (biased exponent all-ones) and +-0.0 map to themselves;
    // `x + x` keeps the sign of zero/infinity and quiets a signaling NaN.
    if exp == 0x7ff || x == 0.0 {
        return x + x;
    }

    // Normalize subnormal: scale into the normal range and compensate the
    // exponent. (The `x != 0.0` guard is always true here — zero already
    // returned above.)
    if exp == 0 && x != 0.0 {
        let norm = x * f64::from_bits(0x4350000000000000); // * 2^54
        let norm_bits = norm.to_bits();
        mant = norm_bits & ((1u64 << 52) - 1);
        exp = ((norm_bits >> 52) & 0x7ff) as i32 - 54;
    }

    // Remove the IEEE-754 exponent bias.
    exp -= 1023;

    // Rebuild the mantissa as an f64 m in [1.0; 2.0).
    mant |= 0x3ff << 52;
    let m = f64::from_bits(mant);

    // Polynomial for x^(1/3) on [1.0; 2.0]
    // Generated by Sollya:
    // d = [1.0, 2.0];
    // f_cbrt = x^(1/3);
    // Q = fpminimax(f_cbrt, 4, [|D...|], d, relative, floating);
    // See ./notes/cbrt.sollya

    let p = backend.polyeval4(
        m,
        f64::from_bits(0x3fe1b0babceeaafa),
        f64::from_bits(0x3fe2c9a3e8e06a3c),
        f64::from_bits(0xbfc4dc30afb71885),
        f64::from_bits(0x3f97a8d3e05458e4),
    );

    // split exponent e = 3*q + r with r in {0,1,2}
    // use div_euclid/rem_euclid to get r >= 0
    let q = exp.div_euclid(3);
    let rem_scale = exp.rem_euclid(3);

    // Fold the remainder into the initial estimate: z ~= (m * 2^r)^(1/3).
    let z = p * ESCALE[rem_scale as usize];

    let mm = fast_ldexp(m, rem_scale); // bring mantissa into [1;8]

    // Reciprocal of the scaled input, reused by the Newton step below.
    let r = 1.0 / mm;

    // One Halley's method step
    // then refine in partial double-double precision with Newton-Raphson iteration
    let y0 = backend.halley(z, mm);
    let d2y = backend.exact_mul(y0, y0); // y0^2, exact double-double
    let d3y = backend.quick_mult_f64(d2y, y0); // y0^3 in double-double
    // Newton-Raphson step
    // h = (x^3 - a) * r
    // y1 = y0 - 1/3 * h * y0
    let h = ((d3y.hi - mm) + d3y.lo) * r;
    // y1 = y0 - 1/3*y0*(h.lo + h.hi) = y0 - 1/3 *y0*h.lo - 1/3 * y0 * h.hi
    // 0x3fd5555555555555 is the nearest double to 1/3.
    let y = backend.fma(-f64::from_bits(0x3fd5555555555555), y0 * h, y0);

    // Reapply the exponent quotient and the sign of the input.
    f64::copysign(fast_ldexp(y, q), x)
}
167
/// Cube root via the hardware-FMA backend.
///
/// # Safety
/// Must only be called on CPUs supporting AVX and FMA; the runtime
/// dispatcher in [`f_cbrt`] verifies this before storing this function.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx", enable = "fma")]
unsafe fn cbrt_fma_impl(x: f64) -> f64 {
    cbrt_gen_impl(x, FmaCbrtBackend {})
}
173
174/// Computes cube root
175///
176/// Max found ULP 0.5
177pub fn f_cbrt(x: f64) -> f64 {
178    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
179    {
180        cbrt_gen_impl(x, GenericCbrtBackend {})
181    }
182    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
183    {
184        use std::sync::OnceLock;
185        static EXECUTOR: OnceLock<unsafe fn(f64) -> f64> = OnceLock::new();
186        let q = EXECUTOR.get_or_init(|| {
187            if std::arch::is_x86_feature_detected!("avx")
188                && std::arch::is_x86_feature_detected!("fma")
189            {
190                cbrt_fma_impl
191            } else {
192                fn def_cbrt(x: f64) -> f64 {
193                    cbrt_gen_impl(x, GenericCbrtBackend {})
194                }
195                def_cbrt
196            }
197        });
198        unsafe { q(x) }
199    }
200}
201
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cbrt() {
        // Subnormal inputs: exercise the 2^54 normalization path.
        assert_eq!(f_cbrt(0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000005432309223745),
                   0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000017579026781511548);
        // Generic non-special value.
        assert_eq!(f_cbrt(1.225158611559834), 1.0700336588124544);
        assert_eq!(f_cbrt(0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000139491540182158), 1.1173329935611586e-103);
        // Perfect cubes must come back exactly.
        assert_eq!(f_cbrt(27.0), 3.0);
        assert_eq!(f_cbrt(64.0), 4.0);
        assert_eq!(f_cbrt(125.0), 5.0);
        assert_eq!(f_cbrt(216.0), 6.0);
        assert_eq!(f_cbrt(343.0), 7.0);
        assert_eq!(f_cbrt(512.0), 8.0);
        assert_eq!(f_cbrt(729.0), 9.0);
        // Negative perfect cubes: sign must be preserved.
        assert_eq!(f_cbrt(-729.0), -9.0);
        assert_eq!(f_cbrt(-512.0), -8.0);
        assert_eq!(f_cbrt(-343.0), -7.0);
        assert_eq!(f_cbrt(-216.0), -6.0);
        assert_eq!(f_cbrt(-125.0), -5.0);
        assert_eq!(f_cbrt(-64.0), -4.0);
        assert_eq!(f_cbrt(-27.0), -3.0);
        // Special values pass through: zero, +-infinity, NaN.
        assert_eq!(f_cbrt(0.0), 0.0);
        assert_eq!(f_cbrt(f64::INFINITY), f64::INFINITY);
        assert_eq!(f_cbrt(f64::NEG_INFINITY), f64::NEG_INFINITY);
        assert!(f_cbrt(f64::NAN).is_nan());
    }
}
231}