ruzstd/blocks/
sequence_section.rs

1//! Utilities and representations for the second half of a block, the sequence section.
2//! This section copies literals from the literals section into the decompressed output.
3
4use crate::decoding::errors::SequencesHeaderParseError;
5
6pub(crate) const MAX_LITERAL_LENGTH_CODE: u8 = 35;
7pub(crate) const MAX_MATCH_LENGTH_CODE: u8 = 52;
8pub(crate) const MAX_OFFSET_CODE: u8 = 31;
9
10pub struct SequencesHeader {
11    pub num_sequences: u32,
12    pub modes: Option<CompressionModes>,
13}
14
15/// A sequence represents potentially redundant data, and it can be broken up into 2 steps:
16/// - A copy step, where data is copied from the literals section to the decompressed output
17/// - A *match* copy step that copies data from within the previously decompressed output.
18///
19/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequence-execution>
20#[derive(Clone, Copy)]
21pub struct Sequence {
22    /// Literal length, or the number of bytes to be copied from the literals section
23    /// in the copy step.
24    pub ll: u32,
25    /// The length of the match to make during the match copy step.
26    pub ml: u32,
27    /// How far back to go in the decompressed data to read from the match copy step.
28    /// If this value is greater than 3, then the offset is `of -3`. If `of` is from 1-3,
29    /// then it has special handling:
30    ///
31    /// The first 3 values define 3 different repeated offsets, with 1 referring to the most
32    /// recent, 2 the second recent, and so on. When the current sequence has a literal length of 0,
33    /// then the repeated offsets are shifted by 1. So an offset value of 1 refers to 2, 2 refers to 3,
34    /// and 3 refers to the most recent offset minus one. If that value is equal to zero, the data
35    /// is considered corrupted.
36    pub of: u32,
37}
38
39impl core::fmt::Display for Sequence {
40    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
41        write!(f, "LL: {}, ML: {}, OF: {}", self.ll, self.ml, self.of)
42    }
43}
44
45/// This byte defines the compression mode of each symbol type
46#[derive(Copy, Clone)]
47pub struct CompressionModes(u8);
48/// The compression mode used for symbol compression
49pub enum ModeType {
50    /// A predefined FSE distribution table is used, and no distribution table
51    /// will be present.
52    Predefined,
53    /// The table consists of a single byte, which contains the symbol's value.
54    #[allow(clippy::upper_case_acronyms)]
55    RLE,
56    /// Standard FSE compression, a distribution table will be present. This
57    /// mode should not be used when only one symbol is present.
58    FSECompressed,
59    /// The table used in the previous compressed block with at least one sequence
60    /// will be used again. If this is the first block, the table in the dictionary will
61    /// be used.
62    Repeat,
63}
64
65impl CompressionModes {
66    /// Deserialize a two bit mode value into a [ModeType]
67    pub fn decode_mode(m: u8) -> ModeType {
68        match m {
69            0 => ModeType::Predefined,
70            1 => ModeType::RLE,
71            2 => ModeType::FSECompressed,
72            3 => ModeType::Repeat,
73            _ => panic!("This can never happen"),
74        }
75    }
76    /// Read the compression mode of the literal lengths field.
77    pub fn ll_mode(self) -> ModeType {
78        Self::decode_mode(self.0 >> 6)
79    }
80
81    /// Read the compression mode of the offset value field.
82    pub fn of_mode(self) -> ModeType {
83        Self::decode_mode((self.0 >> 4) & 0x3)
84    }
85
86    /// Read the compression mode of the match lengths field.
87    pub fn ml_mode(self) -> ModeType {
88        Self::decode_mode((self.0 >> 2) & 0x3)
89    }
90}
91
92impl Default for SequencesHeader {
93    fn default() -> Self {
94        Self::new()
95    }
96}
97
98impl SequencesHeader {
99    /// Create a new [SequencesHeader].
100    pub fn new() -> SequencesHeader {
101        SequencesHeader {
102            num_sequences: 0,
103            modes: None,
104        }
105    }
106
107    /// Attempt to deserialize the provided buffer into `self`, returning the number of bytes read.
108    pub fn parse_from_header(&mut self, source: &[u8]) -> Result<u8, SequencesHeaderParseError> {
109        let mut bytes_read = 0;
110        if source.is_empty() {
111            return Err(SequencesHeaderParseError::NotEnoughBytes {
112                need_at_least: 1,
113                got: 0,
114            });
115        }
116
117        match source[0] {
118            0 => {
119                self.num_sequences = 0;
120                bytes_read += 1;
121            }
122            1..=127 => {
123                if source.len() < 2 {
124                    return Err(SequencesHeaderParseError::NotEnoughBytes {
125                        need_at_least: 2,
126                        got: source.len(),
127                    });
128                }
129                self.num_sequences = u32::from(source[0]);
130                self.modes = Some(CompressionModes(source[1]));
131                bytes_read += 2;
132            }
133            128..=254 => {
134                if source.len() < 2 {
135                    return Err(SequencesHeaderParseError::NotEnoughBytes {
136                        need_at_least: 2,
137                        got: source.len(),
138                    });
139                }
140                self.num_sequences = ((u32::from(source[0]) - 128) << 8) + u32::from(source[1]);
141                bytes_read += 2;
142                if self.num_sequences != 0 {
143                    if source.len() < 3 {
144                        return Err(SequencesHeaderParseError::NotEnoughBytes {
145                            need_at_least: 3,
146                            got: source.len(),
147                        });
148                    }
149                    self.modes = Some(CompressionModes(source[2]));
150                    bytes_read += 1;
151                }
152            }
153            255 => {
154                if source.len() < 4 {
155                    return Err(SequencesHeaderParseError::NotEnoughBytes {
156                        need_at_least: 4,
157                        got: source.len(),
158                    });
159                }
160                self.num_sequences = u32::from(source[1]) + (u32::from(source[2]) << 8) + 0x7F00;
161                self.modes = Some(CompressionModes(source[3]));
162                bytes_read += 4;
163            }
164        }
165
166        Ok(bytes_read)
167    }
168}