1use crate::bits::min_normal_f32;
30use crate::common::*;
31use crate::polyeval::f_polyeval3;
32use std::hint::black_box;
33
34#[repr(C, align(8))]
35pub(crate) struct LogReductionF32Aligned(pub(crate) [u32; 128]);
36
37pub(crate) static LOG_REDUCTION_F32: LogReductionF32Aligned = LogReductionF32Aligned([
38 0x3f800000, 0x3f7e0000, 0x3f7c0000, 0x3f7a0000, 0x3f780000, 0x3f760000, 0x3f740000, 0x3f720000,
39 0x3f700000, 0x3f6f0000, 0x3f6d0000, 0x3f6b0000, 0x3f6a0000, 0x3f680000, 0x3f660000, 0x3f650000,
40 0x3f630000, 0x3f620000, 0x3f600000, 0x3f5f0000, 0x3f5d0000, 0x3f5c0000, 0x3f5a0000, 0x3f590000,
41 0x3f570000, 0x3f560000, 0x3f540000, 0x3f530000, 0x3f520000, 0x3f500000, 0x3f4f0000, 0x3f4e0000,
42 0x3f4c0000, 0x3f4b0000, 0x3f4a0000, 0x3f490000, 0x3f480000, 0x3f460000, 0x3f450000, 0x3f440000,
43 0x3f430000, 0x3f420000, 0x3f400000, 0x3f3f0000, 0x3f3e0000, 0x3f3d0000, 0x3f3c0000, 0x3f3b0000,
44 0x3f3a0000, 0x3f390000, 0x3f380000, 0x3f370000, 0x3f360000, 0x3f350000, 0x3f340000, 0x3f330000,
45 0x3f320000, 0x3f310000, 0x3f300000, 0x3f2f0000, 0x3f2e0000, 0x3f2d0000, 0x3f2c0000, 0x3f2b0000,
46 0x3f2a0000, 0x3f2a0000, 0x3f290000, 0x3f280000, 0x3f270000, 0x3f260000, 0x3f250000, 0x3f250000,
47 0x3f240000, 0x3f230000, 0x3f220000, 0x3f210000, 0x3f200000, 0x3f200000, 0x3f1f0000, 0x3f1e0000,
48 0x3f1d0000, 0x3f1d0000, 0x3f1c0000, 0x3f1b0000, 0x3f1a0000, 0x3f1a0000, 0x3f190000, 0x3f180000,
49 0x3f180000, 0x3f170000, 0x3f160000, 0x3f160000, 0x3f150000, 0x3f140000, 0x3f140000, 0x3f130000,
50 0x3f120000, 0x3f120000, 0x3f110000, 0x3f100000, 0x3f100000, 0x3f0f0000, 0x3f0e0000, 0x3f0e0000,
51 0x3f0d0000, 0x3f0d0000, 0x3f0c0000, 0x3f0b0000, 0x3f0b0000, 0x3f0a0000, 0x3f0a0000, 0x3f090000,
52 0x3f080000, 0x3f080000, 0x3f070000, 0x3f070000, 0x3f060000, 0x3f060000, 0x3f050000, 0x3f050000,
53 0x3f040000, 0x3f040000, 0x3f030000, 0x3f030000, 0x3f020000, 0x3f020000, 0x3f010000, 0x3f000000,
54]);
55
56static LOG_R: [u64; 128] = [
57 0x0000000000000000,
58 0x3f8010157588de71,
59 0x3f90205658935847,
60 0x3f98492528c8cabf,
61 0x3fa0415d89e74444,
62 0x3fa466aed42de3ea,
63 0x3fa894aa149fb343,
64 0x3faccb73cdddb2cc,
65 0x3fb08598b59e3a07,
66 0x3fb1973bd1465567,
67 0x3fb3bdf5a7d1ee64,
68 0x3fb5e95a4d9791cb,
69 0x3fb700d30aeac0e1,
70 0x3fb9335e5d594989,
71 0x3fbb6ac88dad5b1c,
72 0x3fbc885801bc4b23,
73 0x3fbec739830a1120,
74 0x3fbfe89139dbd566,
75 0x3fc1178e8227e47c,
76 0x3fc1aa2b7e23f72a,
77 0x3fc2d1610c86813a,
78 0x3fc365fcb0159016,
79 0x3fc4913d8333b561,
80 0x3fc527e5e4a1b58d,
81 0x3fc6574ebe8c133a,
82 0x3fc6f0128b756abc,
83 0x3fc823c16551a3c2,
84 0x3fc8beafeb38fe8c,
85 0x3fc95a5adcf7017f,
86 0x3fca93ed3c8ad9e3,
87 0x3fcb31d8575bce3d,
88 0x3fcbd087383bd8ad,
89 0x3fcd1037f2655e7b,
90 0x3fcdb13db0d48940,
91 0x3fce530effe71012,
92 0x3fcef5ade4dcffe6,
93 0x3fcf991c6cb3b379,
94 0x3fd07138604d5862,
95 0x3fd0c42d676162e3,
96 0x3fd1178e8227e47c,
97 0x3fd16b5ccbacfb73,
98 0x3fd1bf99635a6b95,
99 0x3fd269621134db92,
100 0x3fd2bef07cdc9354,
101 0x3fd314f1e1d35ce4,
102 0x3fd36b6776be1117,
103 0x3fd3c25277333184,
104 0x3fd419b423d5e8c7,
105 0x3fd4718dc271c41b,
106 0x3fd4c9e09e172c3c,
107 0x3fd522ae0738a3d8,
108 0x3fd57bf753c8d1fb,
109 0x3fd5d5bddf595f30,
110 0x3fd630030b3aac49,
111 0x3fd68ac83e9c6a14,
112 0x3fd6e60ee6af1972,
113 0x3fd741d876c67bb1,
114 0x3fd79e26687cfb3e,
115 0x3fd7fafa3bd8151c,
116 0x3fd85855776dcbfb,
117 0x3fd8b639a88b2df5,
118 0x3fd914a8635bf68a,
119 0x3fd973a3431356ae,
120 0x3fd9d32bea15ed3b,
121 0x3fda33440224fa79,
122 0x3fda33440224fa79,
123 0x3fda93ed3c8ad9e3,
124 0x3fdaf5295248cdd0,
125 0x3fdb56fa04462909,
126 0x3fdbb9611b80e2fb,
127 0x3fdc1c60693fa39e,
128 0x3fdc1c60693fa39e,
129 0x3fdc7ff9c74554c9,
130 0x3fdce42f18064743,
131 0x3fdd490246defa6b,
132 0x3fddae75484c9616,
133 0x3fde148a1a2726ce,
134 0x3fde148a1a2726ce,
135 0x3fde7b42c3ddad73,
136 0x3fdee2a156b413e5,
137 0x3fdf4aa7ee03192d,
138 0x3fdf4aa7ee03192d,
139 0x3fdfb358af7a4884,
140 0x3fe00e5ae5b207ab,
141 0x3fe04360be7603ad,
142 0x3fe04360be7603ad,
143 0x3fe078bf0533c568,
144 0x3fe0ae76e2d054fa,
145 0x3fe0ae76e2d054fa,
146 0x3fe0e4898611cce1,
147 0x3fe11af823c75aa8,
148 0x3fe11af823c75aa8,
149 0x3fe151c3f6f29612,
150 0x3fe188ee40f23ca6,
151 0x3fe188ee40f23ca6,
152 0x3fe1c07849ae6007,
153 0x3fe1f8635fc61659,
154 0x3fe1f8635fc61659,
155 0x3fe230b0d8bebc98,
156 0x3fe269621134db92,
157 0x3fe269621134db92,
158 0x3fe2a2786d0ec107,
159 0x3fe2dbf557b0df43,
160 0x3fe2dbf557b0df43,
161 0x3fe315da4434068b,
162 0x3fe315da4434068b,
163 0x3fe35028ad9d8c86,
164 0x3fe38ae2171976e7,
165 0x3fe38ae2171976e7,
166 0x3fe3c6080c36bfb5,
167 0x3fe3c6080c36bfb5,
168 0x3fe4019c2125ca93,
169 0x3fe43d9ff2f923c5,
170 0x3fe43d9ff2f923c5,
171 0x3fe47a1527e8a2d3,
172 0x3fe47a1527e8a2d3,
173 0x3fe4b6fd6f970c1f,
174 0x3fe4b6fd6f970c1f,
175 0x3fe4f45a835a4e19,
176 0x3fe4f45a835a4e19,
177 0x3fe5322e26867857,
178 0x3fe5322e26867857,
179 0x3fe5707a26bb8c66,
180 0x3fe5707a26bb8c66,
181 0x3fe5af405c3649e0,
182 0x3fe5af405c3649e0,
183 0x3fe5ee82aa241920,
184 0x0000000000000000,
185];
186
187pub(crate) trait LogfBackend {
188 fn fmaf(&self, x: f32, y: f32, z: f32) -> f32;
189 fn fma(&self, x: f64, y: f64, z: f64) -> f64;
190 fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64;
191 const HAS_FMA: bool;
192}
193
194pub(crate) struct GenLogfBackend {}
195
196impl LogfBackend for GenLogfBackend {
197 #[inline(always)]
198 fn fmaf(&self, x: f32, y: f32, z: f32) -> f32 {
199 f_fmlaf(x, y, z)
200 }
201
202 #[inline(always)]
203 fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
204 f_fmla(x, y, z)
205 }
206
207 #[inline(always)]
208 fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64 {
209 use crate::polyeval::f_polyeval3;
210 f_polyeval3(x, a0, a1, a2)
211 }
212
213 #[cfg(any(
214 all(
215 any(target_arch = "x86", target_arch = "x86_64"),
216 target_feature = "fma"
217 ),
218 target_arch = "aarch64"
219 ))]
220 const HAS_FMA: bool = true;
221 #[cfg(not(any(
222 all(
223 any(target_arch = "x86", target_arch = "x86_64"),
224 target_feature = "fma"
225 ),
226 target_arch = "aarch64"
227 )))]
228 const HAS_FMA: bool = false;
229}
230
231#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
232pub(crate) struct FmaLogfBackend {}
233
234#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
235impl LogfBackend for FmaLogfBackend {
236 #[inline(always)]
237 fn fmaf(&self, x: f32, y: f32, z: f32) -> f32 {
238 f32::mul_add(x, y, z)
239 }
240
241 #[inline(always)]
242 fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
243 f64::mul_add(x, y, z)
244 }
245
246 #[inline(always)]
247 fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64 {
248 use crate::polyeval::d_polyeval3;
249 d_polyeval3(x, a0, a1, a2)
250 }
251
252 const HAS_FMA: bool = true;
253}
254
255#[inline(always)]
256fn logf_gen<B: LogfBackend>(x: f32, backend: B) -> f32 {
257 let mut x_u = x.to_bits();
258
259 const E_BIAS: u32 = (1u32 << (8 - 1u32)) - 1u32;
260
261 let mut m = -(E_BIAS as i32);
262 if x_u < 0x4c5d65a5u32 {
263 if x_u == 0x3f7f4d6fu32 {
264 return black_box(f64::from_bits(0xbf6659ec80000000) as f32) + min_normal_f32(true);
265 } else if x_u == 0x41178febu32 {
266 return black_box(f64::from_bits(0x4001fcbce0000000) as f32) + min_normal_f32(true);
267 } else if x_u == 0x3f800000u32 {
268 return 0.;
269 } else if x_u == 0x1e88452du32 {
270 return black_box(f64::from_bits(0xc046d7b180000000) as f32) + min_normal_f32(true);
271 }
272 if x_u < f32::MIN_POSITIVE.to_bits() {
273 if x == 0.0 {
274 return f32::NEG_INFINITY;
275 }
276 x_u = (x * f64::from_bits(0x4160000000000000) as f32).to_bits();
278 m -= 23;
279 }
280 } else {
281 if x_u == 0x4c5d65a5u32 {
282 return black_box(f32::from_bits(0x418f034b)) + min_normal_f32(true);
283 } else if x_u == 0x65d890d3u32 {
284 return black_box(f32::from_bits(0x4254d1f9)) + min_normal_f32(true);
285 } else if x_u == 0x6f31a8ecu32 {
286 return black_box(f32::from_bits(0x42845a89)) + min_normal_f32(true);
287 } else if x_u == 0x7a17f30au32 {
288 return black_box(f32::from_bits(0x42a28a1b)) + min_normal_f32(true);
289 } else if x_u == 0x500ffb03u32 {
290 return black_box(f32::from_bits(0x41b7ee9a)) + min_normal_f32(true);
291 } else if x_u == 0x5cd69e88u32 {
292 return black_box(f32::from_bits(0x4222e0a3)) + min_normal_f32(true);
293 } else if x_u == 0x5ee8984eu32 {
294 return black_box(f32::from_bits(0x422e4a21)) + min_normal_f32(true);
295 }
296
297 if x_u > f32::MAX.to_bits() {
298 if x_u == 0x80000000u32 {
299 return f32::NEG_INFINITY;
300 }
301 if x.is_sign_negative() && !x.is_nan() {
302 return f32::NAN + x;
303 }
304 if x.is_nan() {
306 return f32::NAN;
307 }
308
309 return x;
310 }
311 }
312
313 let mant = x_u & 0x007F_FFFF;
314 let index = mant.wrapping_shr(16);
316 m = m.wrapping_add(x_u.wrapping_add(1 << 16).wrapping_shr(23) as i32);
319 x_u = set_exponent_f32(x_u, 0x7F);
320
321 let u = f32::from_bits(x_u);
322
323 let v = if B::HAS_FMA {
324 backend.fmaf(u, f32::from_bits(LOG_REDUCTION_F32.0[index as usize]), -1.0) as f64 } else {
326 use crate::logs::log2::LOG_RANGE_REDUCTION;
327 backend.fma(
328 u as f64,
329 f64::from_bits(LOG_RANGE_REDUCTION[index as usize]),
330 -1.0,
331 ) };
333 const COEFFS: [u64; 4] = [
336 0xbfe000000000fe63,
337 0x3fd555556e963c16,
338 0xbfd000028dedf986,
339 0x3fc966681bfda7f7,
340 ];
341 let v2 = v * v; let p2 = backend.fma(v, f64::from_bits(COEFFS[3]), f64::from_bits(COEFFS[2]));
343 let p1 = backend.fma(v, f64::from_bits(COEFFS[1]), f64::from_bits(COEFFS[0]));
344 let p0 = f64::from_bits(LOG_R[index as usize]) + v;
345 const LOG_2: f64 = f64::from_bits(0x3fe62e42fefa39ef);
346 let r = backend.fma(m as f64, LOG_2, backend.polyeval3(v2, p0, p1, p2));
347 r as f32
348}
349
350#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
351#[target_feature(enable = "avx", enable = "fma")]
352unsafe fn logf_fma_impl(x: f32) -> f32 {
353 logf_gen(x, FmaLogfBackend {})
354}
355
356pub fn f_logf(x: f32) -> f32 {
360 #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
361 {
362 logf_gen(x, GenLogfBackend {})
363 }
364 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
365 {
366 use std::sync::OnceLock;
367 static EXECUTOR: OnceLock<unsafe fn(f32) -> f32> = OnceLock::new();
368 let q = EXECUTOR.get_or_init(|| {
369 if std::arch::is_x86_feature_detected!("avx")
370 && std::arch::is_x86_feature_detected!("fma")
371 {
372 logf_fma_impl
373 } else {
374 fn def_logf(x: f32) -> f32 {
375 logf_gen(x, GenLogfBackend {})
376 }
377 def_logf
378 }
379 });
380 unsafe { q(x) }
381 }
382}
383
384#[inline]
385pub(crate) fn fast_logf(x: f32) -> f64 {
386 let mut x_u = x.to_bits();
387 const E_BIAS: u32 = (1u32 << (8 - 1u32)) - 1u32;
388 let mut m = -(E_BIAS as i32);
389 if x_u < f32::MIN_POSITIVE.to_bits() {
390 x_u = (x * f64::from_bits(0x4160000000000000) as f32).to_bits();
392 m -= 23;
393 }
394
395 let mant = x_u & 0x007F_FFFF;
396 let index = mant.wrapping_shr(16);
398 m = m.wrapping_add(x_u.wrapping_add(1 << 16).wrapping_shr(23) as i32);
401 x_u = set_exponent_f32(x_u, 0x7F);
402
403 let v;
404 let u = f32::from_bits(x_u);
405
406 #[cfg(any(
407 all(
408 any(target_arch = "x86", target_arch = "x86_64"),
409 target_feature = "fma"
410 ),
411 target_arch = "aarch64"
412 ))]
413 {
414 v = f_fmlaf(u, f32::from_bits(LOG_REDUCTION_F32.0[index as usize]), -1.0) as f64; }
416 #[cfg(not(any(
417 all(
418 any(target_arch = "x86", target_arch = "x86_64"),
419 target_feature = "fma"
420 ),
421 target_arch = "aarch64"
422 )))]
423 {
424 use crate::logs::log2::LOG_RANGE_REDUCTION;
425 v = f_fmla(
426 u as f64,
427 f64::from_bits(LOG_RANGE_REDUCTION[index as usize]),
428 -1.0,
429 ); }
431 const COEFFS: [u64; 4] = [
434 0xbfe000000000fe63,
435 0x3fd555556e963c16,
436 0xbfd000028dedf986,
437 0x3fc966681bfda7f7,
438 ];
439 let v2 = v * v; let p2 = f_fmla(v, f64::from_bits(COEFFS[3]), f64::from_bits(COEFFS[2]));
441 let p1 = f_fmla(v, f64::from_bits(COEFFS[1]), f64::from_bits(COEFFS[0]));
442 let p0 = f64::from_bits(LOG_R[index as usize]) + v;
443 const LOG_2: f64 = f64::from_bits(0x3fe62e42fefa39ef);
444 f_fmla(m as f64, LOG_2, f_polyeval3(v2, p0, p1, p2))
445}
446
447#[inline]
450pub const fn logf(d: f32) -> f32 {
451 let ux = d.to_bits();
452 #[allow(clippy::collapsible_if)]
453 if ux < (1 << 23) || ux >= 0x7f800000u32 {
454 if ux == 0 || ux >= 0x7f800000u32 {
455 if ux == 0x7f800000u32 {
456 return d;
457 }
458 let ax = ux.wrapping_shl(1);
459 if ax == 0u32 {
460 return f32::NEG_INFINITY;
462 }
463 if ax > 0xff000000u32 {
464 return d + d;
465 } return f32::NAN;
467 }
468 }
469
470 let mut ix = d.to_bits();
471 ix += 0x3f800000 - 0x3f3504f3;
473 let n = (ix >> 23) as i32 - 0x7f;
474 ix = (ix & 0x007fffff) + 0x3f3504f3;
475 let a = f32::from_bits(ix) as f64;
476
477 let x = (a - 1.) / (a + 1.);
478 let x2 = x * x;
479 let mut u = 0.2222220222147750310e+0;
480 u = fmla(u, x2, 0.2857142871244668543e+0);
481 u = fmla(u, x2, 0.3999999999950960318e+0);
482 u = fmla(u, x2, 0.6666666666666734090e+0);
483 u = fmla(u, x2, 2.);
484 fmla(x, u, std::f64::consts::LN_2 * (n as f64)) as f32
485}
486
487#[cfg(test)]
488mod tests {
489 use super::*;
490
491 #[test]
492 fn test_logf() {
493 assert!(
494 (logf(1f32) - 0f32).abs() < 1e-6,
495 "Invalid result {}",
496 logf(1f32)
497 );
498 assert!(
499 (logf(5f32) - 1.60943791243410037460f32).abs() < 1e-6,
500 "Invalid result {}",
501 logf(5f32)
502 );
503 assert_eq!(logf(0.), f32::NEG_INFINITY);
504 assert!(logf(-1.).is_nan());
505 assert!(logf(f32::NAN).is_nan());
506 assert!(logf(f32::NEG_INFINITY).is_nan());
507 assert_eq!(logf(f32::INFINITY), f32::INFINITY);
508 }
509
510 #[test]
511 fn test_flogf() {
512 assert!(
513 (f_logf(1f32) - 0f32).abs() < 1e-6,
514 "Invalid result {}",
515 f_logf(1f32)
516 );
517 assert!(
518 (f_logf(5f32) - 1.60943791243410037460f32).abs() < 1e-6,
519 "Invalid result {}",
520 f_logf(5f32)
521 );
522 assert_eq!(f_logf(0.), f32::NEG_INFINITY);
523 assert!(f_logf(-1.).is_nan());
524 assert!(f_logf(f32::NAN).is_nan());
525 assert!(f_logf(f32::NEG_INFINITY).is_nan());
526 assert_eq!(f_logf(f32::INFINITY), f32::INFINITY);
527 }
528}