ruzstd/blocks/
literals_section.rs

//! Utilities and representations for the first half of a block, the literals section.
//! It holds the data that the sequences section later copies from.
3use crate::bit_io::BitReader;
4use crate::decoding::errors::LiteralsSectionParseError;
5
/// A compressed block consists of two sections, a literals section, and a sequences section.
///
/// This is the first of those two sections. A literal is just any arbitrary data, and it is
/// copied by the sequences section.
pub struct LiteralsSection {
    /// How this value is interpreted depends on [LiteralsSection::ls_type]:
    ///
    /// - If this block is of type [LiteralsSectionType::Raw], then the data is `regenerated_size`
    ///   bytes long, and it contains the raw literals data to be used during the second section,
    ///   the sequences section.
    /// - If this block is of type [LiteralsSectionType::RLE],
    ///   then the literal consists of a single byte repeated `regenerated_size` times.
    /// - For types [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless],
    ///   then this is the size of the decompressed data.
    pub regenerated_size: u32,
    /// - For types [LiteralsSectionType::Raw] and [LiteralsSectionType::RLE], this value is not present.
    /// - For types [LiteralsSectionType::Compressed] and [LiteralsSectionType::Treeless], this value will
    ///   be set to the size of the compressed data.
    pub compressed_size: Option<u32>,
    /// This value will be either 1 stream or 4 streams if the literal is of type
    /// [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless], and it
    /// is not used for RLE or uncompressed literals.
    pub num_streams: Option<u8>,
    /// The type of the literal section, which determines how the other fields are to be read.
    pub ls_type: LiteralsSectionType,
}
29
/// The way in which a literals section is encoded.
pub enum LiteralsSectionType {
    /// Literals are stored uncompressed.
    Raw,
    /// Literals consist of a single byte value repeated [LiteralsSection::regenerated_size] times.
    // "RLE" is kept as an all-caps acronym, so the clippy naming lint is silenced
    // for this one variant.
    #[allow(clippy::upper_case_acronyms)]
    RLE,
    /// This is a standard Huffman-compressed block, starting with a Huffman tree description.
    /// In this mode, there are at least *2* different literals represented in the Huffman tree
    /// description.
    Compressed,
    /// This is a Huffman-compressed block,
    /// using the Huffman tree from the previous [LiteralsSectionType::Compressed] block
    /// in the sequence. If this mode is triggered without any previous Huffman-tables in the
    /// frame (or dictionary), it should be treated as data corruption.
    Treeless,
}
47
48impl Default for LiteralsSection {
49    fn default() -> Self {
50        Self::new()
51    }
52}
53
54impl LiteralsSection {
55    /// Create a new [LiteralsSection].
56    pub fn new() -> LiteralsSection {
57        LiteralsSection {
58            regenerated_size: 0,
59            compressed_size: None,
60            num_streams: None,
61            ls_type: LiteralsSectionType::Raw,
62        }
63    }
64
65    /// Given the first byte of a header, determine the size of the whole header, from 1 to 5 bytes.
66    pub fn header_bytes_needed(&self, first_byte: u8) -> Result<u8, LiteralsSectionParseError> {
67        let ls_type: LiteralsSectionType = Self::section_type(first_byte)?;
68        let size_format = (first_byte >> 2) & 0x3;
69        match ls_type {
70            LiteralsSectionType::RLE | LiteralsSectionType::Raw => {
71                match size_format {
72                    0 | 2 => {
73                        // size_format actually only uses one bit
74                        // regenerated_size uses 5 bits
75                        Ok(1)
76                    }
77                    1 => {
78                        // size_format uses 2 bit
79                        // regenerated_size uses 12 bits
80                        Ok(2)
81                    }
82                    3 => {
83                        // size_format uses 2 bit
84                        // regenerated_size uses 20 bits
85                        Ok(3)
86                    }
87                    _ => panic!(
88                        "This is a bug in the program. There should only be values between 0..3"
89                    ),
90                }
91            }
92            LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
93                match size_format {
94                    0 | 1 => {
95                        // Only differ in num_streams
96                        // both regenerated and compressed sizes use 10 bit
97                        Ok(3)
98                    }
99                    2 => {
100                        // both regenerated and compressed sizes use 14 bit
101                        Ok(4)
102                    }
103                    3 => {
104                        // both regenerated and compressed sizes use 18 bit
105                        Ok(5)
106                    }
107
108                    _ => panic!(
109                        "This is a bug in the program. There should only be values between 0..3"
110                    ),
111                }
112            }
113        }
114    }
115
116    /// Parse the header into `self`, and returns the number of bytes read.
117    pub fn parse_from_header(&mut self, raw: &[u8]) -> Result<u8, LiteralsSectionParseError> {
118        let mut br: BitReader<'_> = BitReader::new(raw);
119        let block_type = br.get_bits(2)? as u8;
120        self.ls_type = Self::section_type(block_type)?;
121        let size_format = br.get_bits(2)? as u8;
122
123        let byte_needed = self.header_bytes_needed(raw[0])?;
124        if raw.len() < byte_needed as usize {
125            return Err(LiteralsSectionParseError::NotEnoughBytes {
126                have: raw.len(),
127                need: byte_needed,
128            });
129        }
130
131        match self.ls_type {
132            LiteralsSectionType::RLE | LiteralsSectionType::Raw => {
133                self.compressed_size = None;
134                match size_format {
135                    0 | 2 => {
136                        // size_format actually only uses one bit
137                        // regenerated_size uses 5 bits
138                        self.regenerated_size = u32::from(raw[0]) >> 3;
139                        Ok(1)
140                    }
141                    1 => {
142                        // size_format uses 2 bit
143                        // regenerated_size uses 12 bits
144                        self.regenerated_size = (u32::from(raw[0]) >> 4) + (u32::from(raw[1]) << 4);
145                        Ok(2)
146                    }
147                    3 => {
148                        // size_format uses 2 bit
149                        // regenerated_size uses 20 bits
150                        self.regenerated_size = (u32::from(raw[0]) >> 4)
151                            + (u32::from(raw[1]) << 4)
152                            + (u32::from(raw[2]) << 12);
153                        Ok(3)
154                    }
155                    _ => panic!(
156                        "This is a bug in the program. There should only be values between 0..3"
157                    ),
158                }
159            }
160            LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
161                match size_format {
162                    0 => {
163                        self.num_streams = Some(1);
164                    }
165                    1..=3 => {
166                        self.num_streams = Some(4);
167                    }
168                    _ => panic!(
169                        "This is a bug in the program. There should only be values between 0..3"
170                    ),
171                };
172
173                match size_format {
174                    0 | 1 => {
175                        // Differ in num_streams see above
176                        // both regenerated and compressed sizes use 10 bit
177
178                        // 4 from the first, six from the second byte
179                        self.regenerated_size =
180                            (u32::from(raw[0]) >> 4) + ((u32::from(raw[1]) & 0x3f) << 4);
181
182                        // 2 from the second, full last byte
183                        self.compressed_size =
184                            Some(u32::from(raw[1] >> 6) + (u32::from(raw[2]) << 2));
185                        Ok(3)
186                    }
187                    2 => {
188                        // both regenerated and compressed sizes use 14 bit
189
190                        // 4 from first, full second, 2 from the third byte
191                        self.regenerated_size = (u32::from(raw[0]) >> 4)
192                            + (u32::from(raw[1]) << 4)
193                            + ((u32::from(raw[2]) & 0x3) << 12);
194
195                        // 6 from the third, full last byte
196                        self.compressed_size =
197                            Some((u32::from(raw[2]) >> 2) + (u32::from(raw[3]) << 6));
198                        Ok(4)
199                    }
200                    3 => {
201                        // both regenerated and compressed sizes use 18 bit
202
203                        // 4 from first, full second, six from third byte
204                        self.regenerated_size = (u32::from(raw[0]) >> 4)
205                            + (u32::from(raw[1]) << 4)
206                            + ((u32::from(raw[2]) & 0x3F) << 12);
207
208                        // 2 from third, full fourth, full fifth byte
209                        self.compressed_size = Some(
210                            (u32::from(raw[2]) >> 6)
211                                + (u32::from(raw[3]) << 2)
212                                + (u32::from(raw[4]) << 10),
213                        );
214                        Ok(5)
215                    }
216
217                    _ => panic!(
218                        "This is a bug in the program. There should only be values between 0..3"
219                    ),
220                }
221            }
222        }
223    }
224
225    /// Given the first two bits of a header, determine the type of a header.
226    fn section_type(raw: u8) -> Result<LiteralsSectionType, LiteralsSectionParseError> {
227        let t = raw & 0x3;
228        match t {
229            0 => Ok(LiteralsSectionType::Raw),
230            1 => Ok(LiteralsSectionType::RLE),
231            2 => Ok(LiteralsSectionType::Compressed),
232            3 => Ok(LiteralsSectionType::Treeless),
233            other => Err(LiteralsSectionParseError::IllegalLiteralSectionType { got: other }),
234        }
235    }
236}