ruzstd/blocks/sequence_section.rs
1//! Utilities and representations for the second half of a block, the sequence section.
2//! This section copies literals from the literals section into the decompressed output.
3
4use crate::decoding::errors::SequencesHeaderParseError;
5
6pub(crate) const MAX_LITERAL_LENGTH_CODE: u8 = 35;
7pub(crate) const MAX_MATCH_LENGTH_CODE: u8 = 52;
8pub(crate) const MAX_OFFSET_CODE: u8 = 31;
9
10pub struct SequencesHeader {
11 pub num_sequences: u32,
12 pub modes: Option<CompressionModes>,
13}
14
15/// A sequence represents potentially redundant data, and it can be broken up into 2 steps:
16/// - A copy step, where data is copied from the literals section to the decompressed output
17/// - A *match* copy step that copies data from within the previously decompressed output.
18///
19/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequence-execution>
20#[derive(Clone, Copy)]
21pub struct Sequence {
22 /// Literal length, or the number of bytes to be copied from the literals section
23 /// in the copy step.
24 pub ll: u32,
25 /// The length of the match to make during the match copy step.
26 pub ml: u32,
27 /// How far back to go in the decompressed data to read from the match copy step.
28 /// If this value is greater than 3, then the offset is `of -3`. If `of` is from 1-3,
29 /// then it has special handling:
30 ///
31 /// The first 3 values define 3 different repeated offsets, with 1 referring to the most
32 /// recent, 2 the second recent, and so on. When the current sequence has a literal length of 0,
33 /// then the repeated offsets are shifted by 1. So an offset value of 1 refers to 2, 2 refers to 3,
34 /// and 3 refers to the most recent offset minus one. If that value is equal to zero, the data
35 /// is considered corrupted.
36 pub of: u32,
37}
38
39impl core::fmt::Display for Sequence {
40 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
41 write!(f, "LL: {}, ML: {}, OF: {}", self.ll, self.ml, self.of)
42 }
43}
44
45/// This byte defines the compression mode of each symbol type
46#[derive(Copy, Clone)]
47pub struct CompressionModes(u8);
48/// The compression mode used for symbol compression
49pub enum ModeType {
50 /// A predefined FSE distribution table is used, and no distribution table
51 /// will be present.
52 Predefined,
53 /// The table consists of a single byte, which contains the symbol's value.
54 #[allow(clippy::upper_case_acronyms)]
55 RLE,
56 /// Standard FSE compression, a distribution table will be present. This
57 /// mode should not be used when only one symbol is present.
58 FSECompressed,
59 /// The table used in the previous compressed block with at least one sequence
60 /// will be used again. If this is the first block, the table in the dictionary will
61 /// be used.
62 Repeat,
63}
64
65impl CompressionModes {
66 /// Deserialize a two bit mode value into a [ModeType]
67 pub fn decode_mode(m: u8) -> ModeType {
68 match m {
69 0 => ModeType::Predefined,
70 1 => ModeType::RLE,
71 2 => ModeType::FSECompressed,
72 3 => ModeType::Repeat,
73 _ => panic!("This can never happen"),
74 }
75 }
76 /// Read the compression mode of the literal lengths field.
77 pub fn ll_mode(self) -> ModeType {
78 Self::decode_mode(self.0 >> 6)
79 }
80
81 /// Read the compression mode of the offset value field.
82 pub fn of_mode(self) -> ModeType {
83 Self::decode_mode((self.0 >> 4) & 0x3)
84 }
85
86 /// Read the compression mode of the match lengths field.
87 pub fn ml_mode(self) -> ModeType {
88 Self::decode_mode((self.0 >> 2) & 0x3)
89 }
90}
91
92impl Default for SequencesHeader {
93 fn default() -> Self {
94 Self::new()
95 }
96}
97
98impl SequencesHeader {
99 /// Create a new [SequencesHeader].
100 pub fn new() -> SequencesHeader {
101 SequencesHeader {
102 num_sequences: 0,
103 modes: None,
104 }
105 }
106
107 /// Attempt to deserialize the provided buffer into `self`, returning the number of bytes read.
108 pub fn parse_from_header(&mut self, source: &[u8]) -> Result<u8, SequencesHeaderParseError> {
109 let mut bytes_read = 0;
110 if source.is_empty() {
111 return Err(SequencesHeaderParseError::NotEnoughBytes {
112 need_at_least: 1,
113 got: 0,
114 });
115 }
116
117 match source[0] {
118 0 => {
119 self.num_sequences = 0;
120 bytes_read += 1;
121 }
122 1..=127 => {
123 if source.len() < 2 {
124 return Err(SequencesHeaderParseError::NotEnoughBytes {
125 need_at_least: 2,
126 got: source.len(),
127 });
128 }
129 self.num_sequences = u32::from(source[0]);
130 self.modes = Some(CompressionModes(source[1]));
131 bytes_read += 2;
132 }
133 128..=254 => {
134 if source.len() < 2 {
135 return Err(SequencesHeaderParseError::NotEnoughBytes {
136 need_at_least: 2,
137 got: source.len(),
138 });
139 }
140 self.num_sequences = ((u32::from(source[0]) - 128) << 8) + u32::from(source[1]);
141 bytes_read += 2;
142 if self.num_sequences != 0 {
143 if source.len() < 3 {
144 return Err(SequencesHeaderParseError::NotEnoughBytes {
145 need_at_least: 3,
146 got: source.len(),
147 });
148 }
149 self.modes = Some(CompressionModes(source[2]));
150 bytes_read += 1;
151 }
152 }
153 255 => {
154 if source.len() < 4 {
155 return Err(SequencesHeaderParseError::NotEnoughBytes {
156 need_at_least: 4,
157 got: source.len(),
158 });
159 }
160 self.num_sequences = u32::from(source[1]) + (u32::from(source[2]) << 8) + 0x7F00;
161 self.modes = Some(CompressionModes(source[3]));
162 bytes_read += 4;
163 }
164 }
165
166 Ok(bytes_read)
167 }
168}