ruzstd/blocks/sequence_section.rs
1//! Utilities and representations for the second half of a block, the sequence section.
2//! This section copies literals from the literals section into the decompressed output.
3
4pub(crate) const MAX_LITERAL_LENGTH_CODE: u8 = 35;
5pub(crate) const MAX_MATCH_LENGTH_CODE: u8 = 52;
6pub(crate) const MAX_OFFSET_CODE: u8 = 31;
7
8pub struct SequencesHeader {
9 pub num_sequences: u32,
10 pub modes: Option<CompressionModes>,
11}
12
13/// A sequence represents potentially redundant data, and it can be broken up into 2 steps:
14/// - A copy step, where data is copied from the literals section to the decompressed output
15/// - A *match* copy step that copies data from within the previously decompressed output.
16///
17/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequence-execution>
18#[derive(Clone, Copy)]
19pub struct Sequence {
20 /// Literal length, or the number of bytes to be copied from the literals section
21 /// in the copy step.
22 pub ll: u32,
23 /// The length of the match to make during the match copy step.
24 pub ml: u32,
25 /// How far back to go in the decompressed data to read from the match copy step.
26 /// If this value is greater than 3, then the offset is `of -3`. If `of` is from 1-3,
27 /// then it has special handling:
28 ///
29 /// The first 3 values define 3 different repeated offsets, with 1 referring to the most
30 /// recent, 2 the second recent, and so on. When the current sequence has a literal length of 0,
31 /// then the repeated offsets are shifted by 1. So an offset value of 1 refers to 2, 2 refers to 3,
32 /// and 3 refers to the most recent offset minus one. If that value is equal to zero, the data
33 /// is considered corrupted.
34 pub of: u32,
35}
36
37impl core::fmt::Display for Sequence {
38 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
39 write!(f, "LL: {}, ML: {}, OF: {}", self.ll, self.ml, self.of)
40 }
41}
42
43/// This byte defines the compression mode of each symbol type
44#[derive(Copy, Clone)]
45pub struct CompressionModes(u8);
46/// The compression mode used for symbol compression
47pub enum ModeType {
48 /// A predefined FSE distribution table is used, and no distribution table
49 /// will be present.
50 Predefined,
51 /// The table consists of a single byte, which contains the symbol's value.
52 RLE,
53 /// Standard FSE compression, a distribution table will be present. This
54 /// mode should not be used when only one symbol is present.
55 FSECompressed,
56 /// The table used in the previous compressed block with at least one sequence
57 /// will be used again. If this is the first block, the table in the dictionary will
58 /// be used.
59 Repeat,
60}
61
62impl CompressionModes {
63 /// Deserialize a two bit mode value into a [ModeType]
64 pub fn decode_mode(m: u8) -> ModeType {
65 match m {
66 0 => ModeType::Predefined,
67 1 => ModeType::RLE,
68 2 => ModeType::FSECompressed,
69 3 => ModeType::Repeat,
70 _ => panic!("This can never happen"),
71 }
72 }
73 /// Read the compression mode of the literal lengths field.
74 pub fn ll_mode(self) -> ModeType {
75 Self::decode_mode(self.0 >> 6)
76 }
77
78 /// Read the compression mode of the offset value field.
79 pub fn of_mode(self) -> ModeType {
80 Self::decode_mode((self.0 >> 4) & 0x3)
81 }
82
83 /// Read the compression mode of the match lengths field.
84 pub fn ml_mode(self) -> ModeType {
85 Self::decode_mode((self.0 >> 2) & 0x3)
86 }
87}
88
89impl Default for SequencesHeader {
90 fn default() -> Self {
91 Self::new()
92 }
93}
94
95#[derive(Debug)]
96#[non_exhaustive]
97pub enum SequencesHeaderParseError {
98 NotEnoughBytes { need_at_least: u8, got: usize },
99}
100
101#[cfg(feature = "std")]
102impl std::error::Error for SequencesHeaderParseError {}
103
104impl core::fmt::Display for SequencesHeaderParseError {
105 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
106 match self {
107 SequencesHeaderParseError::NotEnoughBytes { need_at_least, got } => {
108 write!(
109 f,
110 "source must have at least {} bytes to parse header; got {} bytes",
111 need_at_least, got,
112 )
113 }
114 }
115 }
116}
117
118impl SequencesHeader {
119 /// Create a new [SequencesHeader].
120 pub fn new() -> SequencesHeader {
121 SequencesHeader {
122 num_sequences: 0,
123 modes: None,
124 }
125 }
126
127 /// Attempt to deserialize the provided buffer into `self`, returning the number of bytes read.
128 pub fn parse_from_header(&mut self, source: &[u8]) -> Result<u8, SequencesHeaderParseError> {
129 let mut bytes_read = 0;
130 if source.is_empty() {
131 return Err(SequencesHeaderParseError::NotEnoughBytes {
132 need_at_least: 1,
133 got: 0,
134 });
135 }
136
137 let source = match source[0] {
138 0 => {
139 self.num_sequences = 0;
140 return Ok(1);
141 }
142 1..=127 => {
143 if source.len() < 2 {
144 return Err(SequencesHeaderParseError::NotEnoughBytes {
145 need_at_least: 2,
146 got: source.len(),
147 });
148 }
149 self.num_sequences = u32::from(source[0]);
150 bytes_read += 1;
151 &source[1..]
152 }
153 128..=254 => {
154 if source.len() < 3 {
155 return Err(SequencesHeaderParseError::NotEnoughBytes {
156 need_at_least: 3,
157 got: source.len(),
158 });
159 }
160 self.num_sequences = ((u32::from(source[0]) - 128) << 8) + u32::from(source[1]);
161 bytes_read += 2;
162 &source[2..]
163 }
164 255 => {
165 if source.len() < 4 {
166 return Err(SequencesHeaderParseError::NotEnoughBytes {
167 need_at_least: 4,
168 got: source.len(),
169 });
170 }
171 self.num_sequences = u32::from(source[1]) + (u32::from(source[2]) << 8) + 0x7F00;
172 bytes_read += 3;
173 &source[3..]
174 }
175 };
176
177 self.modes = Some(CompressionModes(source[0]));
178 bytes_read += 1;
179
180 Ok(bytes_read)
181 }
182}