ruzstd/encoding/
match_generator.rs

//! Matching algorithm used to find repeated parts in the original data
2//!
//! The Zstd format relies on finding repeated sequences of data and compressing these sequences as instructions to the decoder.
4//! A sequence basically tells the decoder "Go back X bytes and copy Y bytes to the end of your decode buffer".
5//!
6//! The task here is to efficiently find matches in the already encoded data for the current suffix of the not yet encoded data.
7
8use alloc::vec::Vec;
9use core::num::NonZeroUsize;
10
11use super::CompressionLevel;
12use super::Matcher;
13use super::Sequence;
14
15const MIN_MATCH_LEN: usize = 5;
16
/// This is the default implementation of the `Matcher` trait. It allocates and reuses the buffers when possible.
pub struct MatchGeneratorDriver {
    /// Recycled data buffers, handed out again by `get_next_space`
    vec_pool: Vec<Vec<u8>>,
    /// Recycled suffix stores, handed out again by `commit_space`
    suffix_pool: Vec<SuffixStore>,
    /// The match-finding state machine this driver feeds with data
    match_generator: MatchGenerator,
    /// Size of freshly allocated data buffers
    slice_size: usize,
}
24
25impl MatchGeneratorDriver {
26    /// slice_size says how big the slices should be that are allocated to work with
27    /// max_slices_in_window says how many slices should at most be used while looking for matches
28    pub(crate) fn new(slice_size: usize, max_slices_in_window: usize) -> Self {
29        Self {
30            vec_pool: Vec::new(),
31            suffix_pool: Vec::new(),
32            match_generator: MatchGenerator::new(max_slices_in_window * slice_size),
33            slice_size,
34        }
35    }
36}
37
impl Matcher for MatchGeneratorDriver {
    fn reset(&mut self, _level: CompressionLevel) {
        // Split borrows so both pools can be captured by the closure below
        let vec_pool = &mut self.vec_pool;
        let suffix_pool = &mut self.suffix_pool;

        // Drain the generator's window; every released buffer is recycled into the pools
        self.match_generator.reset(|mut data, mut suffixes| {
            // Restore the buffer to its full (zeroed) length before pooling it
            data.resize(data.capacity(), 0);
            vec_pool.push(data);
            // Clear all slots so the store is empty when it is reused
            suffixes.slots.clear();
            suffixes.slots.resize(suffixes.slots.capacity(), None);
            suffix_pool.push(suffixes);
        });
    }

    fn window_size(&self) -> u64 {
        self.match_generator.max_window_size as u64
    }

    fn get_next_space(&mut self) -> Vec<u8> {
        // Prefer a pooled buffer; only allocate when the pool is empty
        self.vec_pool.pop().unwrap_or_else(|| {
            let mut space = alloc::vec![0; self.slice_size];
            // Use any extra capacity the allocator may have given us
            space.resize(space.capacity(), 0);
            space
        })
    }

    fn get_last_space(&mut self) -> &[u8] {
        self.match_generator.window.last().unwrap().data.as_slice()
    }

    fn commit_space(&mut self, space: Vec<u8>) {
        let vec_pool = &mut self.vec_pool;
        // Prefer a pooled suffix store; only allocate when the pool is empty
        let suffixes = self
            .suffix_pool
            .pop()
            .unwrap_or_else(|| SuffixStore::with_capacity(space.len()));
        let suffix_pool = &mut self.suffix_pool;
        // Entries evicted from the window to make room are recycled into the pools
        self.match_generator
            .add_data(space, suffixes, |mut data, mut suffixes| {
                data.resize(data.capacity(), 0);
                vec_pool.push(data);
                suffixes.slots.clear();
                suffixes.slots.resize(suffixes.slots.capacity(), None);
                suffix_pool.push(suffixes);
            });
    }

    fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
        // Keep producing sequences until the committed data is exhausted
        while self.match_generator.next_sequence(&mut handle_sequence) {}
    }
    fn skip_matching(&mut self) {
        self.match_generator.skip_matching();
    }
}
92
/// This stores the index of a suffix of a string by hashing the first few bytes of that suffix
/// This means that collisions just overwrite and that you need to check validity after a get
struct SuffixStore {
    // We use NonZeroUsize to enable niche optimization here.
    // On store we do +1 and on get -1
    // This is ok since usize::MAX is never a valid offset
    slots: Vec<Option<NonZeroUsize>>,
    // floor(log2(slots capacity)); used by `key` to map the 64 bit hash onto a slot index
    len_log: u32,
}
102
103impl SuffixStore {
104    fn with_capacity(capacity: usize) -> Self {
105        Self {
106            slots: alloc::vec![None; capacity],
107            len_log: capacity.ilog2(),
108        }
109    }
110
111    #[inline(always)]
112    fn insert(&mut self, suffix: &[u8], idx: usize) {
113        let key = self.key(suffix);
114        self.slots[key] = Some(NonZeroUsize::new(idx + 1).unwrap());
115    }
116
117    #[inline(always)]
118    fn contains_key(&self, suffix: &[u8]) -> bool {
119        let key = self.key(suffix);
120        self.slots[key].is_some()
121    }
122
123    #[inline(always)]
124    fn get(&self, suffix: &[u8]) -> Option<usize> {
125        let key = self.key(suffix);
126        self.slots[key].map(|x| <NonZeroUsize as Into<usize>>::into(x) - 1)
127    }
128
129    #[inline(always)]
130    fn key(&self, suffix: &[u8]) -> usize {
131        let s0 = suffix[0] as u64;
132        let s1 = suffix[1] as u64;
133        let s2 = suffix[2] as u64;
134        let s3 = suffix[3] as u64;
135        let s4 = suffix[4] as u64;
136
137        const POLY: u64 = 0xCF3BCCDCABu64;
138
139        let s0 = (s0 << 24).wrapping_mul(POLY);
140        let s1 = (s1 << 32).wrapping_mul(POLY);
141        let s2 = (s2 << 40).wrapping_mul(POLY);
142        let s3 = (s3 << 48).wrapping_mul(POLY);
143        let s4 = (s4 << 56).wrapping_mul(POLY);
144
145        let index = s0 ^ s1 ^ s2 ^ s3 ^ s4;
146        let index = index >> (64 - self.len_log);
147        index as usize % self.slots.len()
148    }
149}
150
/// We keep a window of a few of these entries
/// All of these are valid targets for a match to be generated for
struct WindowEntry {
    data: Vec<u8>,
    /// Stores indexes into data
    suffixes: SuffixStore,
    /// Makes offset calculations efficient.
    /// Number of bytes between the start of this entry's data and the start of
    /// the newest entry's data (0 for the newest entry itself); kept up to date by `add_data`
    base_offset: usize,
}
160
pub(crate) struct MatchGenerator {
    /// Upper bound for the summed lengths of all window entries
    max_window_size: usize,
    /// Data window we are operating on to find matches
    /// The data we want to find matches for is in the last slice
    window: Vec<WindowEntry>,
    /// Summed lengths of all entries currently in `window`
    window_size: usize,
    // Debug-only concatenation of all window data, used to double-check computed offsets
    #[cfg(debug_assertions)]
    concat_window: Vec<u8>,
    /// Index in the last slice that we already processed
    suffix_idx: usize,
    /// Gets updated when a new sequence is returned to point right behind that sequence
    last_idx_in_sequence: usize,
}
174
impl MatchGenerator {
    /// max_size defines how many bytes will be used at most in the window used for matching
    fn new(max_size: usize) -> Self {
        Self {
            max_window_size: max_size,
            window: Vec::new(),
            window_size: 0,
            #[cfg(debug_assertions)]
            concat_window: Vec::new(),
            suffix_idx: 0,
            last_idx_in_sequence: 0,
        }
    }

    /// Clear all state. Every buffer held in the window is handed back through
    /// `reuse_space` so the caller can pool it.
    fn reset(&mut self, mut reuse_space: impl FnMut(Vec<u8>, SuffixStore)) {
        self.window_size = 0;
        #[cfg(debug_assertions)]
        self.concat_window.clear();
        self.suffix_idx = 0;
        self.last_idx_in_sequence = 0;
        self.window.drain(..).for_each(|entry| {
            reuse_space(entry.data, entry.suffixes);
        });
    }

    /// Processes bytes in the current window until either a match is found or no more matches can be found
    /// * If a match is found handle_sequence is called with the Triple variant
    /// * If no more matches can be found but there are bytes still left handle_sequence is called with the Literals variant
    /// * If no more matches can be found and no more bytes are left this returns false
    fn next_sequence(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) -> bool {
        loop {
            let last_entry = self.window.last().unwrap();
            let data_slice = &last_entry.data;

            // We already reached the end of the window, check if we need to return a Literals{}
            if self.suffix_idx >= data_slice.len() {
                if self.last_idx_in_sequence != self.suffix_idx {
                    // Unmatched bytes are left over, flush them as literals
                    let literals = &data_slice[self.last_idx_in_sequence..];
                    self.last_idx_in_sequence = self.suffix_idx;
                    handle_sequence(Sequence::Literals { literals });
                    return true;
                } else {
                    return false;
                }
            }

            // If the remaining data is smaller than the minimum match length we can stop and return a Literals{}
            let data_slice = &data_slice[self.suffix_idx..];
            if data_slice.len() < MIN_MATCH_LEN {
                let last_idx_in_sequence = self.last_idx_in_sequence;
                self.last_idx_in_sequence = last_entry.data.len();
                self.suffix_idx = last_entry.data.len();
                handle_sequence(Sequence::Literals {
                    literals: &last_entry.data[last_idx_in_sequence..],
                });
                return true;
            }

            // This is the key we are looking to find a match for
            let key = &data_slice[..MIN_MATCH_LEN];

            // Look in each window entry
            let mut candidate = None;
            for (match_entry_idx, match_entry) in self.window.iter().enumerate() {
                let is_last = match_entry_idx == self.window.len() - 1;
                if let Some(match_index) = match_entry.suffixes.get(key) {
                    // In the newest entry only data before the current suffix is a valid match source
                    let match_slice = if is_last {
                        &match_entry.data[match_index..self.suffix_idx]
                    } else {
                        &match_entry.data[match_index..]
                    };

                    // Check how long the common prefix actually is
                    let match_len = Self::common_prefix_len(match_slice, data_slice);

                    // Collisions in the suffix store might make this check fail
                    if match_len >= MIN_MATCH_LEN {
                        let offset = match_entry.base_offset + self.suffix_idx - match_index;

                        // If we are in debug/tests make sure the match we found is actually at the offset we calculated
                        #[cfg(debug_assertions)]
                        {
                            let unprocessed = last_entry.data.len() - self.suffix_idx;
                            let start = self.concat_window.len() - unprocessed - offset;
                            let end = start + match_len;
                            let check_slice = &self.concat_window[start..end];
                            debug_assert_eq!(check_slice, &match_slice[..match_len]);
                        }

                        // Keep the best candidate: the longer match wins,
                        // ties are broken in favor of the smaller offset
                        if let Some((old_offset, old_match_len)) = candidate {
                            if match_len > old_match_len
                                || (match_len == old_match_len && offset < old_offset)
                            {
                                candidate = Some((offset, match_len));
                            }
                        } else {
                            candidate = Some((offset, match_len));
                        }
                    }
                }
            }

            if let Some((offset, match_len)) = candidate {
                // For each index in the match we found we do not need to look for another match
                // But we still want them registered in the suffix store
                self.add_suffixes_till(self.suffix_idx + match_len);

                // All literals that were not included between this match and the last are now included here
                let last_entry = self.window.last().unwrap();
                let literals = &last_entry.data[self.last_idx_in_sequence..self.suffix_idx];

                // Update the indexes, all indexes up to and including the current index have been included in a sequence now
                self.suffix_idx += match_len;
                self.last_idx_in_sequence = self.suffix_idx;
                handle_sequence(Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                });

                return true;
            }

            // No match for this suffix. Register it in the suffix store and advance one byte
            let last_entry = self.window.last_mut().unwrap();
            let key = &last_entry.data[self.suffix_idx..self.suffix_idx + MIN_MATCH_LEN];
            if !last_entry.suffixes.contains_key(key) {
                last_entry.suffixes.insert(key, self.suffix_idx);
            }
            self.suffix_idx += 1;
        }
    }

    /// Find the common prefix length between two byte slices
    #[inline(always)]
    fn common_prefix_len(a: &[u8], b: &[u8]) -> usize {
        Self::mismatch_chunks::<8>(a, b)
    }

    /// Find the common prefix length between two byte slices with a configurable chunk length
    /// This enables vectorization optimizations
    fn mismatch_chunks<const N: usize>(xs: &[u8], ys: &[u8]) -> usize {
        // Compare N bytes at a time while the chunks are equal...
        let off = core::iter::zip(xs.chunks_exact(N), ys.chunks_exact(N))
            .take_while(|(x, y)| x == y)
            .count()
            * N;
        // ...then locate the exact mismatch position byte by byte
        off + core::iter::zip(&xs[off..], &ys[off..])
            .take_while(|(x, y)| x == y)
            .count()
    }

    /// Process bytes and add the suffixes to the suffix store up to a specific index
    #[inline(always)]
    fn add_suffixes_till(&mut self, idx: usize) {
        let last_entry = self.window.last_mut().unwrap();
        if last_entry.data.len() < MIN_MATCH_LEN {
            return;
        }
        // windows() yields keys starting at suffix_idx..=idx - MIN_MATCH_LEN;
        // keys that would extend past idx are registered by later processing
        let slice = &last_entry.data[self.suffix_idx..idx];
        for (key_index, key) in slice.windows(MIN_MATCH_LEN).enumerate() {
            if !last_entry.suffixes.contains_key(key) {
                last_entry.suffixes.insert(key, self.suffix_idx + key_index);
            }
        }
    }

    /// Skip matching for the whole current window entry
    fn skip_matching(&mut self) {
        // Still register the suffixes so later data can match against this entry
        let len = self.window.last().unwrap().data.len();
        self.add_suffixes_till(len);
        self.suffix_idx = len;
        self.last_idx_in_sequence = len;
    }

    /// Add a new window entry. Will panic if the last window entry hasn't been processed properly.
    /// If any resources are released by pushing the new entry they are returned via the callback
    fn add_data(
        &mut self,
        data: Vec<u8>,
        suffixes: SuffixStore,
        reuse_space: impl FnMut(Vec<u8>, SuffixStore),
    ) {
        // The previous entry must have been fully matched or skipped before new data is accepted
        assert!(
            self.window.is_empty() || self.suffix_idx == self.window.last().unwrap().data.len()
        );
        self.reserve(data.len(), reuse_space);
        #[cfg(debug_assertions)]
        self.concat_window.extend_from_slice(&data);

        // The new entry becomes the reference point: shift all base offsets by the
        // length of the entry that was the newest until now
        if let Some(last_len) = self.window.last().map(|last| last.data.len()) {
            for entry in self.window.iter_mut() {
                entry.base_offset += last_len;
            }
        }

        let len = data.len();
        self.window.push(WindowEntry {
            data,
            suffixes,
            base_offset: 0,
        });
        self.window_size += len;
        self.suffix_idx = 0;
        self.last_idx_in_sequence = 0;
    }

    /// Reserve space for a new window entry
    /// If any resources are released by pushing the new entry they are returned via the callback
    fn reserve(&mut self, amount: usize, mut reuse_space: impl FnMut(Vec<u8>, SuffixStore)) {
        assert!(self.max_window_size >= amount);
        // Evict the oldest entries until the new data fits within the window budget
        while self.window_size + amount > self.max_window_size {
            let removed = self.window.remove(0);
            self.window_size -= removed.data.len();
            #[cfg(debug_assertions)]
            self.concat_window.drain(0..removed.data.len());

            let WindowEntry {
                suffixes,
                data: leaked_vec,
                base_offset: _,
            } = removed;
            reuse_space(leaked_vec, suffixes);
        }
    }
}
399
#[test]
fn matches() {
    let mut matcher = MatchGenerator::new(1000);
    let mut original_data = Vec::new();
    let mut reconstructed = Vec::new();

    // Compare the produced sequence against the expected one and replay it like a
    // decoder would, so the end of the test can verify the sequences reproduce the input
    let assert_seq_equal = |seq1: Sequence<'_>, seq2: Sequence<'_>, reconstructed: &mut Vec<u8>| {
        assert_eq!(seq1, seq2);
        match seq2 {
            Sequence::Literals { literals } => reconstructed.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                reconstructed.extend_from_slice(literals);
                let start = reconstructed.len() - offset;
                let end = start + match_len;
                reconstructed.extend_from_within(start..end);
            }
        }
    };

    // A run of zeroes: the second half matches the first half
    matcher.add_data(
        alloc::vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);

    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[0, 0, 0, 0, 0],
                offset: 5,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });

    assert!(!matcher.next_sequence(|_| {}));

    // A repeating pattern plus zeroes that match across entries
    matcher.add_data(
        alloc::vec![1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[
        1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,
    ]);

    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[1, 2, 3, 4, 5, 6],
                offset: 6,
                match_len: 6,
            },
            &mut reconstructed,
        )
    });
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[],
                offset: 12,
                match_len: 6,
            },
            &mut reconstructed,
        )
    });
    // The trailing zeroes match the zeroes from the very first entry
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[],
                offset: 28,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));

    // Starts with a cross-entry match, ends with fresh literals followed by zeroes
    matcher.add_data(
        alloc::vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0]);

    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[],
                offset: 23,
                match_len: 6,
            },
            &mut reconstructed,
        )
    });
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[7, 8, 9, 10, 11],
                offset: 16,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));

    // An entirely matched entry produces a single sequence with no literals
    matcher.add_data(
        alloc::vec![0, 0, 0, 0, 0],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[0, 0, 0, 0, 0]);

    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[],
                offset: 5,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));

    // Matches the [7, 8, 9, 10, 11] suffix registered two entries earlier
    matcher.add_data(
        alloc::vec![7, 8, 9, 10, 11],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[7, 8, 9, 10, 11]);

    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[],
                offset: 15,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));

    // skip_matching emits no sequences but still registers the suffixes...
    matcher.add_data(
        alloc::vec![1, 3, 5, 7, 9],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    matcher.skip_matching();
    original_data.extend_from_slice(&[1, 3, 5, 7, 9]);
    reconstructed.extend_from_slice(&[1, 3, 5, 7, 9]);
    assert!(!matcher.next_sequence(|_| {}));

    // ...so the same data added again matches against the skipped entry
    matcher.add_data(
        alloc::vec![1, 3, 5, 7, 9],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[1, 3, 5, 7, 9]);

    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[],
                offset: 5,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));

    // A trailing remainder shorter than MIN_MATCH_LEN is flushed as literals
    matcher.add_data(
        alloc::vec![0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23]);

    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[0, 0, 11, 13, 15, 17, 20],
                offset: 5,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Literals {
                literals: &[21, 23],
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));

    // Replaying all emitted sequences must reproduce the original input exactly
    assert_eq!(reconstructed, original_data);
}