ruzstd/fse/
mod.rs

1//! FSE, short for Finite State Entropy, is an encoding technique
2//! that assigns shorter codes to symbols that appear more frequently in data,
3//! and longer codes to less frequent symbols.
4//!
5//! FSE works by mutating a state and using that state to index into a table.
6//!
7//! Zstandard uses two different kinds of entropy encoding: FSE, and Huffman coding.
8//! Huffman is used to compress literals,
9//! while FSE is used for all other symbols (literal length code, match length code, offset code).
10//!
11//! <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#fse>
12//!
13//! <https://arxiv.org/pdf/1311.2540>
14
15mod fse_decoder;
16
17pub use fse_decoder::*;
18
19pub mod fse_encoder;
20
21#[test]
22fn tables_equal() {
23    let probs = &[0, 0, -1, 3, 2, 2, (1 << 6) - 8];
24    let mut dec_table = FSETable::new(255);
25    dec_table.build_from_probabilities(6, probs).unwrap();
26    let enc_table = fse_encoder::build_table_from_probabilities(probs, 6);
27
28    check_tables(&dec_table, &enc_table);
29}
30
31#[cfg(any(test, feature = "fuzz_exports"))]
32fn check_tables(dec_table: &fse_decoder::FSETable, enc_table: &fse_encoder::FSETable) {
33    for (idx, dec_state) in dec_table.decode.iter().enumerate() {
34        let enc_states = &enc_table.states[dec_state.symbol as usize];
35        let enc_state = enc_states
36            .states
37            .iter()
38            .find(|state| state.index == idx)
39            .unwrap();
40        assert_eq!(enc_state.baseline, dec_state.base_line as usize);
41        assert_eq!(enc_state.num_bits, dec_state.num_bits);
42    }
43}
44
45#[test]
46fn roundtrip() {
47    round_trip(&(0..64).collect::<alloc::vec::Vec<_>>());
48    let mut data = alloc::vec![];
49    data.extend(0..32);
50    data.extend(0..32);
51    data.extend(0..32);
52    data.extend(0..32);
53    data.extend(0..32);
54    data.extend(20..32);
55    data.extend(20..32);
56    data.extend(0..32);
57    data.extend(20..32);
58    data.extend(100..255);
59    data.extend(20..32);
60    data.extend(20..32);
61    round_trip(&data);
62
63    #[cfg(feature = "std")]
64    if std::fs::exists("fuzz/artifacts/fse").unwrap_or(false) {
65        for file in std::fs::read_dir("fuzz/artifacts/fse").unwrap() {
66            if file.as_ref().unwrap().file_type().unwrap().is_file() {
67                let data = std::fs::read(file.unwrap().path()).unwrap();
68                round_trip(&data);
69            }
70        }
71    }
72}
73
74/// Only needed for testing.
75///
76/// Encodes the data with a table built from that data
77/// Decodes the result again by first decoding the table and then the data
78/// Asserts that the decoded data equals the input
79#[cfg(any(test, feature = "fuzz_exports"))]
80pub fn round_trip(data: &[u8]) {
81    use crate::bit_io::{BitReaderReversed, BitWriter};
82    use fse_encoder::FSEEncoder;
83
84    if data.len() < 2 {
85        return;
86    }
87    if data.iter().all(|x| *x == data[0]) {
88        return;
89    }
90    if data.len() < 64 {
91        return;
92    }
93
94    let mut writer = BitWriter::new();
95    let mut encoder = FSEEncoder::new(
96        fse_encoder::build_table_from_data(data.iter().copied(), 22, false),
97        &mut writer,
98    );
99    let mut dec_table = FSETable::new(255);
100    encoder.encode(data);
101    let acc_log = encoder.acc_log();
102    let enc_table = encoder.into_table();
103    let encoded = writer.dump();
104
105    let table_bytes = dec_table.build_decoder(&encoded, acc_log).unwrap();
106    let encoded = &encoded[table_bytes..];
107    let mut decoder = FSEDecoder::new(&dec_table);
108
109    check_tables(&dec_table, &enc_table);
110
111    let mut br = BitReaderReversed::new(encoded);
112    let mut skipped_bits = 0;
113    loop {
114        let val = br.get_bits(1);
115        skipped_bits += 1;
116        if val == 1 || skipped_bits > 8 {
117            break;
118        }
119    }
120    if skipped_bits > 8 {
121        //if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
122        panic!("Corrupted end marker");
123    }
124    decoder.init_state(&mut br).unwrap();
125    let mut decoded = alloc::vec::Vec::new();
126
127    for x in data {
128        let w = decoder.decode_symbol();
129        assert_eq!(w, *x);
130        decoded.push(w);
131        if decoded.len() < data.len() {
132            decoder.update_state(&mut br);
133        }
134    }
135
136    assert_eq!(&decoded, data);
137
138    assert_eq!(br.bits_remaining(), 0);
139}