pxfm/cube_roots/
rcbrtf.rs1use crate::common::f_fmla;
30
31#[inline(always)]
39fn rapshon_refine_inv_cbrt(x: f64, a: f64) -> f64 {
40 x * f_fmla(-1. / 3. * a, x * x * x, 4. / 3.)
41}
42
43#[inline(always)]
44#[allow(unused)]
45fn rapshon_refine_inv_cbrt_fma(x: f64, a: f64) -> f64 {
46 x * f64::mul_add(-1. / 3. * a, x * x * x, 4. / 3.)
47}
48
49#[inline(always)]
52fn halleys_div_free(x: f64, a: f64) -> f64 {
53 const K3: f64 = 2. / 9.;
54 const K2: f64 = 7. / 9.;
55 const K1: f64 = 14. / 9.;
56 let c = a * x * x * x;
57 let mut y = f_fmla(-K3, c, K2);
58 y = f_fmla(-c, y, K1);
59 y * x
60}
61
62#[inline(always)]
63#[allow(unused)]
64fn halleys_div_free_fma(x: f64, a: f64) -> f64 {
65 const K3: f64 = 2. / 9.;
66 const K2: f64 = 7. / 9.;
67 const K1: f64 = 14. / 9.;
68 let c = a * x * x * x;
69 let mut y = f64::mul_add(-K3, c, K2);
70 y = f64::mul_add(-c, y, K1);
71 y * x
72}
73
74#[inline(always)]
75fn rcbrtf_gen_impl<Halley: Fn(f64, f64) -> f64, NewtonRaphson: Fn(f64, f64) -> f64>(
76 x: f32,
77 halley: Halley,
78 rapshon: NewtonRaphson,
79) -> f32 {
80 let u = x.to_bits();
81 let au = u.wrapping_shl(1);
82 if au < (1u32 << 24) || au >= (0xffu32 << 24) {
83 if x.is_infinite() {
84 return if x.is_sign_negative() { -0.0 } else { 0.0 };
85 }
86 if au >= (0xffu32 << 24) {
87 return x + x; }
89 if x == 0. {
90 return if x.is_sign_positive() {
91 f32::INFINITY
92 } else {
93 f32::NEG_INFINITY
94 }; }
96 }
97
98 let mut ui: u32 = x.to_bits();
99 let mut hx: u32 = ui & 0x7fffffff;
100
101 if hx < 0x00800000 {
102 if hx == 0 {
104 return x; }
106 const TWO_EXP_24: f32 = f32::from_bits(0x4b800000);
107 ui = (x * TWO_EXP_24).to_bits();
108 hx = ui & 0x7fffffff;
109 const B: u32 = 0x54a21d2au32 + (8u32 << 23);
110 hx = B.wrapping_sub(hx / 3);
111 } else {
112 hx = 0x54a21d2au32.wrapping_sub(hx / 3);
113 }
114 ui &= 0x80000000;
115 ui |= hx;
116
117 let t = f32::from_bits(ui) as f64;
118 let dx = x as f64;
119 let mut t = halley(t, dx);
120 t = halley(t, dx);
121 t = rapshon(t, dx);
122 t as f32
123}
124
125#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
126#[target_feature(enable = "avx", enable = "fma")]
127unsafe fn rcbrtf_fma_impl(x: f32) -> f32 {
128 rcbrtf_gen_impl(x, halleys_div_free_fma, rapshon_refine_inv_cbrt_fma)
129}
130
131#[inline]
135pub fn f_rcbrtf(x: f32) -> f32 {
136 #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
137 {
138 rcbrtf_gen_impl(x, halleys_div_free, rapshon_refine_inv_cbrt)
139 }
140 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
141 {
142 use std::sync::OnceLock;
143 static EXECUTOR: OnceLock<unsafe fn(f32) -> f32> = OnceLock::new();
144 let q = EXECUTOR.get_or_init(|| {
145 if std::arch::is_x86_feature_detected!("avx")
146 && std::arch::is_x86_feature_detected!("fma")
147 {
148 rcbrtf_fma_impl
149 } else {
150 fn def_rcbrtf(x: f32) -> f32 {
151 rcbrtf_gen_impl(x, halleys_div_free, rapshon_refine_inv_cbrt)
152 }
153 def_rcbrtf
154 }
155 });
156 unsafe { q(x) }
157 }
158}
159
160#[cfg(test)]
161mod tests {
162 use super::*;
163
164 #[test]
165 fn test_fcbrtf() {
166 assert_eq!(f_rcbrtf(0.0), f32::INFINITY);
167 assert_eq!(f_rcbrtf(-0.0), f32::NEG_INFINITY);
168 assert_eq!(f_rcbrtf(-27.0), -1. / 3.);
169 assert_eq!(f_rcbrtf(27.0), 1. / 3.);
170 assert_eq!(f_rcbrtf(64.0), 0.25);
171 assert_eq!(f_rcbrtf(-64.0), -0.25);
172 assert_eq!(f_rcbrtf(f32::NEG_INFINITY), -0.0);
173 assert_eq!(f_rcbrtf(f32::INFINITY), 0.0);
174 assert!(f_rcbrtf(f32::NAN).is_nan());
175 }
176}