ruzstd/blocks/
sequence_section.rs

1//! Utilities and representations for the second half of a block, the sequence section.
2//! This section copies literals from the literals section into the decompressed output.
3
4pub(crate) const MAX_LITERAL_LENGTH_CODE: u8 = 35;
5pub(crate) const MAX_MATCH_LENGTH_CODE: u8 = 52;
6pub(crate) const MAX_OFFSET_CODE: u8 = 31;
7
8pub struct SequencesHeader {
9    pub num_sequences: u32,
10    pub modes: Option<CompressionModes>,
11}
12
13/// A sequence represents potentially redundant data, and it can be broken up into 2 steps:
14/// - A copy step, where data is copied from the literals section to the decompressed output
15/// - A *match* copy step that copies data from within the previously decompressed output.
16///
17/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequence-execution>
18#[derive(Clone, Copy)]
19pub struct Sequence {
20    /// Literal length, or the number of bytes to be copied from the literals section
21    /// in the copy step.
22    pub ll: u32,
23    /// The length of the match to make during the match copy step.
24    pub ml: u32,
25    /// How far back to go in the decompressed data to read from the match copy step.
26    /// If this value is greater than 3, then the offset is `of -3`. If `of` is from 1-3,
27    /// then it has special handling:
28    ///
29    /// The first 3 values define 3 different repeated offsets, with 1 referring to the most
30    /// recent, 2 the second recent, and so on. When the current sequence has a literal length of 0,
31    /// then the repeated offsets are shifted by 1. So an offset value of 1 refers to 2, 2 refers to 3,
32    /// and 3 refers to the most recent offset minus one. If that value is equal to zero, the data
33    /// is considered corrupted.
34    pub of: u32,
35}
36
37impl core::fmt::Display for Sequence {
38    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
39        write!(f, "LL: {}, ML: {}, OF: {}", self.ll, self.ml, self.of)
40    }
41}
42
43/// This byte defines the compression mode of each symbol type
44#[derive(Copy, Clone)]
45pub struct CompressionModes(u8);
46/// The compression mode used for symbol compression
47pub enum ModeType {
48    /// A predefined FSE distribution table is used, and no distribution table
49    /// will be present.
50    Predefined,
51    /// The table consists of a single byte, which contains the symbol's value.
52    RLE,
53    /// Standard FSE compression, a distribution table will be present. This
54    /// mode should not be used when only one symbol is present.
55    FSECompressed,
56    /// The table used in the previous compressed block with at least one sequence
57    /// will be used again. If this is the first block, the table in the dictionary will
58    /// be used.
59    Repeat,
60}
61
62impl CompressionModes {
63    /// Deserialize a two bit mode value into a [ModeType]
64    pub fn decode_mode(m: u8) -> ModeType {
65        match m {
66            0 => ModeType::Predefined,
67            1 => ModeType::RLE,
68            2 => ModeType::FSECompressed,
69            3 => ModeType::Repeat,
70            _ => panic!("This can never happen"),
71        }
72    }
73    /// Read the compression mode of the literal lengths field.
74    pub fn ll_mode(self) -> ModeType {
75        Self::decode_mode(self.0 >> 6)
76    }
77
78    /// Read the compression mode of the offset value field.
79    pub fn of_mode(self) -> ModeType {
80        Self::decode_mode((self.0 >> 4) & 0x3)
81    }
82
83    /// Read the compression mode of the match lengths field.
84    pub fn ml_mode(self) -> ModeType {
85        Self::decode_mode((self.0 >> 2) & 0x3)
86    }
87}
88
89impl Default for SequencesHeader {
90    fn default() -> Self {
91        Self::new()
92    }
93}
94
95#[derive(Debug)]
96#[non_exhaustive]
97pub enum SequencesHeaderParseError {
98    NotEnoughBytes { need_at_least: u8, got: usize },
99}
100
101#[cfg(feature = "std")]
102impl std::error::Error for SequencesHeaderParseError {}
103
104impl core::fmt::Display for SequencesHeaderParseError {
105    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
106        match self {
107            SequencesHeaderParseError::NotEnoughBytes { need_at_least, got } => {
108                write!(
109                    f,
110                    "source must have at least {} bytes to parse header; got {} bytes",
111                    need_at_least, got,
112                )
113            }
114        }
115    }
116}
117
118impl SequencesHeader {
119    /// Create a new [SequencesHeader].
120    pub fn new() -> SequencesHeader {
121        SequencesHeader {
122            num_sequences: 0,
123            modes: None,
124        }
125    }
126
127    /// Attempt to deserialize the provided buffer into `self`, returning the number of bytes read.
128    pub fn parse_from_header(&mut self, source: &[u8]) -> Result<u8, SequencesHeaderParseError> {
129        let mut bytes_read = 0;
130        if source.is_empty() {
131            return Err(SequencesHeaderParseError::NotEnoughBytes {
132                need_at_least: 1,
133                got: 0,
134            });
135        }
136
137        let source = match source[0] {
138            0 => {
139                self.num_sequences = 0;
140                return Ok(1);
141            }
142            1..=127 => {
143                if source.len() < 2 {
144                    return Err(SequencesHeaderParseError::NotEnoughBytes {
145                        need_at_least: 2,
146                        got: source.len(),
147                    });
148                }
149                self.num_sequences = u32::from(source[0]);
150                bytes_read += 1;
151                &source[1..]
152            }
153            128..=254 => {
154                if source.len() < 3 {
155                    return Err(SequencesHeaderParseError::NotEnoughBytes {
156                        need_at_least: 3,
157                        got: source.len(),
158                    });
159                }
160                self.num_sequences = ((u32::from(source[0]) - 128) << 8) + u32::from(source[1]);
161                bytes_read += 2;
162                &source[2..]
163            }
164            255 => {
165                if source.len() < 4 {
166                    return Err(SequencesHeaderParseError::NotEnoughBytes {
167                        need_at_least: 4,
168                        got: source.len(),
169                    });
170                }
171                self.num_sequences = u32::from(source[1]) + (u32::from(source[2]) << 8) + 0x7F00;
172                bytes_read += 3;
173                &source[3..]
174            }
175        };
176
177        self.modes = Some(CompressionModes(source[0]));
178        bytes_read += 1;
179
180        Ok(bytes_read)
181    }
182}