pxfm/logs/
logf.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 7/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::bits::min_normal_f32;
30use crate::common::*;
31use crate::polyeval::f_polyeval3;
32use std::hint::black_box;
33
34#[repr(C, align(8))]
35pub(crate) struct LogReductionF32Aligned(pub(crate) [u32; 128]);
36
37pub(crate) static LOG_REDUCTION_F32: LogReductionF32Aligned = LogReductionF32Aligned([
38    0x3f800000, 0x3f7e0000, 0x3f7c0000, 0x3f7a0000, 0x3f780000, 0x3f760000, 0x3f740000, 0x3f720000,
39    0x3f700000, 0x3f6f0000, 0x3f6d0000, 0x3f6b0000, 0x3f6a0000, 0x3f680000, 0x3f660000, 0x3f650000,
40    0x3f630000, 0x3f620000, 0x3f600000, 0x3f5f0000, 0x3f5d0000, 0x3f5c0000, 0x3f5a0000, 0x3f590000,
41    0x3f570000, 0x3f560000, 0x3f540000, 0x3f530000, 0x3f520000, 0x3f500000, 0x3f4f0000, 0x3f4e0000,
42    0x3f4c0000, 0x3f4b0000, 0x3f4a0000, 0x3f490000, 0x3f480000, 0x3f460000, 0x3f450000, 0x3f440000,
43    0x3f430000, 0x3f420000, 0x3f400000, 0x3f3f0000, 0x3f3e0000, 0x3f3d0000, 0x3f3c0000, 0x3f3b0000,
44    0x3f3a0000, 0x3f390000, 0x3f380000, 0x3f370000, 0x3f360000, 0x3f350000, 0x3f340000, 0x3f330000,
45    0x3f320000, 0x3f310000, 0x3f300000, 0x3f2f0000, 0x3f2e0000, 0x3f2d0000, 0x3f2c0000, 0x3f2b0000,
46    0x3f2a0000, 0x3f2a0000, 0x3f290000, 0x3f280000, 0x3f270000, 0x3f260000, 0x3f250000, 0x3f250000,
47    0x3f240000, 0x3f230000, 0x3f220000, 0x3f210000, 0x3f200000, 0x3f200000, 0x3f1f0000, 0x3f1e0000,
48    0x3f1d0000, 0x3f1d0000, 0x3f1c0000, 0x3f1b0000, 0x3f1a0000, 0x3f1a0000, 0x3f190000, 0x3f180000,
49    0x3f180000, 0x3f170000, 0x3f160000, 0x3f160000, 0x3f150000, 0x3f140000, 0x3f140000, 0x3f130000,
50    0x3f120000, 0x3f120000, 0x3f110000, 0x3f100000, 0x3f100000, 0x3f0f0000, 0x3f0e0000, 0x3f0e0000,
51    0x3f0d0000, 0x3f0d0000, 0x3f0c0000, 0x3f0b0000, 0x3f0b0000, 0x3f0a0000, 0x3f0a0000, 0x3f090000,
52    0x3f080000, 0x3f080000, 0x3f070000, 0x3f070000, 0x3f060000, 0x3f060000, 0x3f050000, 0x3f050000,
53    0x3f040000, 0x3f040000, 0x3f030000, 0x3f030000, 0x3f020000, 0x3f020000, 0x3f010000, 0x3f000000,
54]);
55
56static LOG_R: [u64; 128] = [
57    0x0000000000000000,
58    0x3f8010157588de71,
59    0x3f90205658935847,
60    0x3f98492528c8cabf,
61    0x3fa0415d89e74444,
62    0x3fa466aed42de3ea,
63    0x3fa894aa149fb343,
64    0x3faccb73cdddb2cc,
65    0x3fb08598b59e3a07,
66    0x3fb1973bd1465567,
67    0x3fb3bdf5a7d1ee64,
68    0x3fb5e95a4d9791cb,
69    0x3fb700d30aeac0e1,
70    0x3fb9335e5d594989,
71    0x3fbb6ac88dad5b1c,
72    0x3fbc885801bc4b23,
73    0x3fbec739830a1120,
74    0x3fbfe89139dbd566,
75    0x3fc1178e8227e47c,
76    0x3fc1aa2b7e23f72a,
77    0x3fc2d1610c86813a,
78    0x3fc365fcb0159016,
79    0x3fc4913d8333b561,
80    0x3fc527e5e4a1b58d,
81    0x3fc6574ebe8c133a,
82    0x3fc6f0128b756abc,
83    0x3fc823c16551a3c2,
84    0x3fc8beafeb38fe8c,
85    0x3fc95a5adcf7017f,
86    0x3fca93ed3c8ad9e3,
87    0x3fcb31d8575bce3d,
88    0x3fcbd087383bd8ad,
89    0x3fcd1037f2655e7b,
90    0x3fcdb13db0d48940,
91    0x3fce530effe71012,
92    0x3fcef5ade4dcffe6,
93    0x3fcf991c6cb3b379,
94    0x3fd07138604d5862,
95    0x3fd0c42d676162e3,
96    0x3fd1178e8227e47c,
97    0x3fd16b5ccbacfb73,
98    0x3fd1bf99635a6b95,
99    0x3fd269621134db92,
100    0x3fd2bef07cdc9354,
101    0x3fd314f1e1d35ce4,
102    0x3fd36b6776be1117,
103    0x3fd3c25277333184,
104    0x3fd419b423d5e8c7,
105    0x3fd4718dc271c41b,
106    0x3fd4c9e09e172c3c,
107    0x3fd522ae0738a3d8,
108    0x3fd57bf753c8d1fb,
109    0x3fd5d5bddf595f30,
110    0x3fd630030b3aac49,
111    0x3fd68ac83e9c6a14,
112    0x3fd6e60ee6af1972,
113    0x3fd741d876c67bb1,
114    0x3fd79e26687cfb3e,
115    0x3fd7fafa3bd8151c,
116    0x3fd85855776dcbfb,
117    0x3fd8b639a88b2df5,
118    0x3fd914a8635bf68a,
119    0x3fd973a3431356ae,
120    0x3fd9d32bea15ed3b,
121    0x3fda33440224fa79,
122    0x3fda33440224fa79,
123    0x3fda93ed3c8ad9e3,
124    0x3fdaf5295248cdd0,
125    0x3fdb56fa04462909,
126    0x3fdbb9611b80e2fb,
127    0x3fdc1c60693fa39e,
128    0x3fdc1c60693fa39e,
129    0x3fdc7ff9c74554c9,
130    0x3fdce42f18064743,
131    0x3fdd490246defa6b,
132    0x3fddae75484c9616,
133    0x3fde148a1a2726ce,
134    0x3fde148a1a2726ce,
135    0x3fde7b42c3ddad73,
136    0x3fdee2a156b413e5,
137    0x3fdf4aa7ee03192d,
138    0x3fdf4aa7ee03192d,
139    0x3fdfb358af7a4884,
140    0x3fe00e5ae5b207ab,
141    0x3fe04360be7603ad,
142    0x3fe04360be7603ad,
143    0x3fe078bf0533c568,
144    0x3fe0ae76e2d054fa,
145    0x3fe0ae76e2d054fa,
146    0x3fe0e4898611cce1,
147    0x3fe11af823c75aa8,
148    0x3fe11af823c75aa8,
149    0x3fe151c3f6f29612,
150    0x3fe188ee40f23ca6,
151    0x3fe188ee40f23ca6,
152    0x3fe1c07849ae6007,
153    0x3fe1f8635fc61659,
154    0x3fe1f8635fc61659,
155    0x3fe230b0d8bebc98,
156    0x3fe269621134db92,
157    0x3fe269621134db92,
158    0x3fe2a2786d0ec107,
159    0x3fe2dbf557b0df43,
160    0x3fe2dbf557b0df43,
161    0x3fe315da4434068b,
162    0x3fe315da4434068b,
163    0x3fe35028ad9d8c86,
164    0x3fe38ae2171976e7,
165    0x3fe38ae2171976e7,
166    0x3fe3c6080c36bfb5,
167    0x3fe3c6080c36bfb5,
168    0x3fe4019c2125ca93,
169    0x3fe43d9ff2f923c5,
170    0x3fe43d9ff2f923c5,
171    0x3fe47a1527e8a2d3,
172    0x3fe47a1527e8a2d3,
173    0x3fe4b6fd6f970c1f,
174    0x3fe4b6fd6f970c1f,
175    0x3fe4f45a835a4e19,
176    0x3fe4f45a835a4e19,
177    0x3fe5322e26867857,
178    0x3fe5322e26867857,
179    0x3fe5707a26bb8c66,
180    0x3fe5707a26bb8c66,
181    0x3fe5af405c3649e0,
182    0x3fe5af405c3649e0,
183    0x3fe5ee82aa241920,
184    0x0000000000000000,
185];
186
187pub(crate) trait LogfBackend {
188    fn fmaf(&self, x: f32, y: f32, z: f32) -> f32;
189    fn fma(&self, x: f64, y: f64, z: f64) -> f64;
190    fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64;
191    const HAS_FMA: bool;
192}
193
194pub(crate) struct GenLogfBackend {}
195
196impl LogfBackend for GenLogfBackend {
197    #[inline(always)]
198    fn fmaf(&self, x: f32, y: f32, z: f32) -> f32 {
199        f_fmlaf(x, y, z)
200    }
201
202    #[inline(always)]
203    fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
204        f_fmla(x, y, z)
205    }
206
207    #[inline(always)]
208    fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64 {
209        use crate::polyeval::f_polyeval3;
210        f_polyeval3(x, a0, a1, a2)
211    }
212
213    #[cfg(any(
214        all(
215            any(target_arch = "x86", target_arch = "x86_64"),
216            target_feature = "fma"
217        ),
218        target_arch = "aarch64"
219    ))]
220    const HAS_FMA: bool = true;
221    #[cfg(not(any(
222        all(
223            any(target_arch = "x86", target_arch = "x86_64"),
224            target_feature = "fma"
225        ),
226        target_arch = "aarch64"
227    )))]
228    const HAS_FMA: bool = false;
229}
230
231#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
232pub(crate) struct FmaLogfBackend {}
233
234#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
235impl LogfBackend for FmaLogfBackend {
236    #[inline(always)]
237    fn fmaf(&self, x: f32, y: f32, z: f32) -> f32 {
238        f32::mul_add(x, y, z)
239    }
240
241    #[inline(always)]
242    fn fma(&self, x: f64, y: f64, z: f64) -> f64 {
243        f64::mul_add(x, y, z)
244    }
245
246    #[inline(always)]
247    fn polyeval3(&self, x: f64, a0: f64, a1: f64, a2: f64) -> f64 {
248        use crate::polyeval::d_polyeval3;
249        d_polyeval3(x, a0, a1, a2)
250    }
251
252    const HAS_FMA: bool = true;
253}
254
255#[inline(always)]
256fn logf_gen<B: LogfBackend>(x: f32, backend: B) -> f32 {
257    let mut x_u = x.to_bits();
258
259    const E_BIAS: u32 = (1u32 << (8 - 1u32)) - 1u32;
260
261    let mut m = -(E_BIAS as i32);
262    if x_u < 0x4c5d65a5u32 {
263        if x_u == 0x3f7f4d6fu32 {
264            return black_box(f64::from_bits(0xbf6659ec80000000) as f32) + min_normal_f32(true);
265        } else if x_u == 0x41178febu32 {
266            return black_box(f64::from_bits(0x4001fcbce0000000) as f32) + min_normal_f32(true);
267        } else if x_u == 0x3f800000u32 {
268            return 0.;
269        } else if x_u == 0x1e88452du32 {
270            return black_box(f64::from_bits(0xc046d7b180000000) as f32) + min_normal_f32(true);
271        }
272        if x_u < f32::MIN_POSITIVE.to_bits() {
273            if x == 0.0 {
274                return f32::NEG_INFINITY;
275            }
276            // Normalize denormal inputs.
277            x_u = (x * f64::from_bits(0x4160000000000000) as f32).to_bits();
278            m -= 23;
279        }
280    } else {
281        if x_u == 0x4c5d65a5u32 {
282            return black_box(f32::from_bits(0x418f034b)) + min_normal_f32(true);
283        } else if x_u == 0x65d890d3u32 {
284            return black_box(f32::from_bits(0x4254d1f9)) + min_normal_f32(true);
285        } else if x_u == 0x6f31a8ecu32 {
286            return black_box(f32::from_bits(0x42845a89)) + min_normal_f32(true);
287        } else if x_u == 0x7a17f30au32 {
288            return black_box(f32::from_bits(0x42a28a1b)) + min_normal_f32(true);
289        } else if x_u == 0x500ffb03u32 {
290            return black_box(f32::from_bits(0x41b7ee9a)) + min_normal_f32(true);
291        } else if x_u == 0x5cd69e88u32 {
292            return black_box(f32::from_bits(0x4222e0a3)) + min_normal_f32(true);
293        } else if x_u == 0x5ee8984eu32 {
294            return black_box(f32::from_bits(0x422e4a21)) + min_normal_f32(true);
295        }
296
297        if x_u > f32::MAX.to_bits() {
298            if x_u == 0x80000000u32 {
299                return f32::NEG_INFINITY;
300            }
301            if x.is_sign_negative() && !x.is_nan() {
302                return f32::NAN + x;
303            }
304            // x is +inf or nan
305            if x.is_nan() {
306                return f32::NAN;
307            }
308
309            return x;
310        }
311    }
312
313    let mant = x_u & 0x007F_FFFF;
314    // Extract 7 leading fractional bits of the mantissa
315    let index = mant.wrapping_shr(16);
316    // Add unbiased exponent. Add an extra 1 if the 7 leading fractional bits are
317    // all 1's.
318    m = m.wrapping_add(x_u.wrapping_add(1 << 16).wrapping_shr(23) as i32);
319    x_u = set_exponent_f32(x_u, 0x7F);
320
321    let u = f32::from_bits(x_u);
322
323    let v = if B::HAS_FMA {
324        backend.fmaf(u, f32::from_bits(LOG_REDUCTION_F32.0[index as usize]), -1.0) as f64 // Exact.
325    } else {
326        use crate::logs::log2::LOG_RANGE_REDUCTION;
327        backend.fma(
328            u as f64,
329            f64::from_bits(LOG_RANGE_REDUCTION[index as usize]),
330            -1.0,
331        ) // Exact
332    };
333    // Degree-5 polynomial approximation of log generated by Sollya with:
334    // > P = fpminimax(log(1 + x)/x, 4, [|1, D...|], [-2^-8, 2^-7]);
335    const COEFFS: [u64; 4] = [
336        0xbfe000000000fe63,
337        0x3fd555556e963c16,
338        0xbfd000028dedf986,
339        0x3fc966681bfda7f7,
340    ];
341    let v2 = v * v; // Exact
342    let p2 = backend.fma(v, f64::from_bits(COEFFS[3]), f64::from_bits(COEFFS[2]));
343    let p1 = backend.fma(v, f64::from_bits(COEFFS[1]), f64::from_bits(COEFFS[0]));
344    let p0 = f64::from_bits(LOG_R[index as usize]) + v;
345    const LOG_2: f64 = f64::from_bits(0x3fe62e42fefa39ef);
346    let r = backend.fma(m as f64, LOG_2, backend.polyeval3(v2, p0, p1, p2));
347    r as f32
348}
349
350#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
351#[target_feature(enable = "avx", enable = "fma")]
352unsafe fn logf_fma_impl(x: f32) -> f32 {
353    logf_gen(x, FmaLogfBackend {})
354}
355
356/// Natural logarithm
357///
358/// Max found ULP 0.4999988
359pub fn f_logf(x: f32) -> f32 {
360    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
361    {
362        logf_gen(x, GenLogfBackend {})
363    }
364    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
365    {
366        use std::sync::OnceLock;
367        static EXECUTOR: OnceLock<unsafe fn(f32) -> f32> = OnceLock::new();
368        let q = EXECUTOR.get_or_init(|| {
369            if std::arch::is_x86_feature_detected!("avx")
370                && std::arch::is_x86_feature_detected!("fma")
371            {
372                logf_fma_impl
373            } else {
374                fn def_logf(x: f32) -> f32 {
375                    logf_gen(x, GenLogfBackend {})
376                }
377                def_logf
378            }
379        });
380        unsafe { q(x) }
381    }
382}
383
384#[inline]
385pub(crate) fn fast_logf(x: f32) -> f64 {
386    let mut x_u = x.to_bits();
387    const E_BIAS: u32 = (1u32 << (8 - 1u32)) - 1u32;
388    let mut m = -(E_BIAS as i32);
389    if x_u < f32::MIN_POSITIVE.to_bits() {
390        // Normalize denormal inputs.
391        x_u = (x * f64::from_bits(0x4160000000000000) as f32).to_bits();
392        m -= 23;
393    }
394
395    let mant = x_u & 0x007F_FFFF;
396    // Extract 7 leading fractional bits of the mantissa
397    let index = mant.wrapping_shr(16);
398    // Add unbiased exponent. Add an extra 1 if the 7 leading fractional bits are
399    // all 1's.
400    m = m.wrapping_add(x_u.wrapping_add(1 << 16).wrapping_shr(23) as i32);
401    x_u = set_exponent_f32(x_u, 0x7F);
402
403    let v;
404    let u = f32::from_bits(x_u);
405
406    #[cfg(any(
407        all(
408            any(target_arch = "x86", target_arch = "x86_64"),
409            target_feature = "fma"
410        ),
411        target_arch = "aarch64"
412    ))]
413    {
414        v = f_fmlaf(u, f32::from_bits(LOG_REDUCTION_F32.0[index as usize]), -1.0) as f64; // Exact.
415    }
416    #[cfg(not(any(
417        all(
418            any(target_arch = "x86", target_arch = "x86_64"),
419            target_feature = "fma"
420        ),
421        target_arch = "aarch64"
422    )))]
423    {
424        use crate::logs::log2::LOG_RANGE_REDUCTION;
425        v = f_fmla(
426            u as f64,
427            f64::from_bits(LOG_RANGE_REDUCTION[index as usize]),
428            -1.0,
429        ); // Exact
430    }
431    // Degree-5 polynomial approximation of log generated by Sollya with:
432    // > P = fpminimax(log(1 + x)/x, 4, [|1, D...|], [-2^-8, 2^-7]);
433    const COEFFS: [u64; 4] = [
434        0xbfe000000000fe63,
435        0x3fd555556e963c16,
436        0xbfd000028dedf986,
437        0x3fc966681bfda7f7,
438    ];
439    let v2 = v * v; // Exact
440    let p2 = f_fmla(v, f64::from_bits(COEFFS[3]), f64::from_bits(COEFFS[2]));
441    let p1 = f_fmla(v, f64::from_bits(COEFFS[1]), f64::from_bits(COEFFS[0]));
442    let p0 = f64::from_bits(LOG_R[index as usize]) + v;
443    const LOG_2: f64 = f64::from_bits(0x3fe62e42fefa39ef);
444    f_fmla(m as f64, LOG_2, f_polyeval3(v2, p0, p1, p2))
445}
446
447/// Log for given value for const context.
448/// This is simplified version just to make a good approximation on const context.
449#[inline]
450pub const fn logf(d: f32) -> f32 {
451    let ux = d.to_bits();
452    #[allow(clippy::collapsible_if)]
453    if ux < (1 << 23) || ux >= 0x7f800000u32 {
454        if ux == 0 || ux >= 0x7f800000u32 {
455            if ux == 0x7f800000u32 {
456                return d;
457            }
458            let ax = ux.wrapping_shl(1);
459            if ax == 0u32 {
460                // -0.0
461                return f32::NEG_INFINITY;
462            }
463            if ax > 0xff000000u32 {
464                return d + d;
465            } // nan
466            return f32::NAN;
467        }
468    }
469
470    let mut ix = d.to_bits();
471    /* reduce x into [sqrt(2)/2, sqrt(2)] */
472    ix += 0x3f800000 - 0x3f3504f3;
473    let n = (ix >> 23) as i32 - 0x7f;
474    ix = (ix & 0x007fffff) + 0x3f3504f3;
475    let a = f32::from_bits(ix) as f64;
476
477    let x = (a - 1.) / (a + 1.);
478    let x2 = x * x;
479    let mut u = 0.2222220222147750310e+0;
480    u = fmla(u, x2, 0.2857142871244668543e+0);
481    u = fmla(u, x2, 0.3999999999950960318e+0);
482    u = fmla(u, x2, 0.6666666666666734090e+0);
483    u = fmla(u, x2, 2.);
484    fmla(x, u, std::f64::consts::LN_2 * (n as f64)) as f32
485}
486
487#[cfg(test)]
488mod tests {
489    use super::*;
490
491    #[test]
492    fn test_logf() {
493        assert!(
494            (logf(1f32) - 0f32).abs() < 1e-6,
495            "Invalid result {}",
496            logf(1f32)
497        );
498        assert!(
499            (logf(5f32) - 1.60943791243410037460f32).abs() < 1e-6,
500            "Invalid result {}",
501            logf(5f32)
502        );
503        assert_eq!(logf(0.), f32::NEG_INFINITY);
504        assert!(logf(-1.).is_nan());
505        assert!(logf(f32::NAN).is_nan());
506        assert!(logf(f32::NEG_INFINITY).is_nan());
507        assert_eq!(logf(f32::INFINITY), f32::INFINITY);
508    }
509
510    #[test]
511    fn test_flogf() {
512        assert!(
513            (f_logf(1f32) - 0f32).abs() < 1e-6,
514            "Invalid result {}",
515            f_logf(1f32)
516        );
517        assert!(
518            (f_logf(5f32) - 1.60943791243410037460f32).abs() < 1e-6,
519            "Invalid result {}",
520            f_logf(5f32)
521        );
522        assert_eq!(f_logf(0.), f32::NEG_INFINITY);
523        assert!(f_logf(-1.).is_nan());
524        assert!(f_logf(f32::NAN).is_nan());
525        assert!(f_logf(f32::NEG_INFINITY).is_nan());
526        assert_eq!(f_logf(f32::INFINITY), f32::INFINITY);
527    }
528}