deflate/
chained_hash_table.rs

1//use deflate_state::DebugCounter;
2use std::{mem, ptr};
3
4pub const WINDOW_SIZE: usize = 32768;
5pub const WINDOW_MASK: usize = WINDOW_SIZE - 1;
6#[cfg(test)]
7pub const HASH_BYTES: usize = 3;
8const HASH_SHIFT: u16 = 5;
9const HASH_MASK: u16 = WINDOW_MASK as u16;
10
11/// Helper struct to let us allocate both head and prev in the same block.
12struct Tables {
13    /// Starts of hash chains (in prev)
14    pub head: [u16; WINDOW_SIZE],
15    /// Link to previous occurence of this hash value
16    pub prev: [u16; WINDOW_SIZE],
17}
18
19impl Default for Tables {
20    #[inline]
21    fn default() -> Tables {
22        // # Unsafe
23        // This struct is not public and is only used in this module, and
24        // the values are immediately filled in after this struct is
25        // created.
26        unsafe {
27            Tables {
28                head: mem::uninitialized(),
29                prev: mem::uninitialized(),
30            }
31        }
32    }
33}
34
35impl Tables {
36    fn fill_prev(&mut self) {
37        assert_eq!(self.head.len(), self.prev.len());
38        // # Unsafe
39        //
40        // The arrays are created with the same length statically, so this should be safe.
41        // We use this rather than copy_from_slice as prev starts out unitialized.
42        unsafe {
43            ptr::copy_nonoverlapping(self.head.as_ptr(), self.prev.as_mut_ptr(), self.prev.len())
44        }
45    }
46}
47
48/// Create and box the hash chains.
49fn create_tables() -> Box<Tables> {
50    // Using default here is a trick to get around the lack of box syntax on stable rust.
51    //
52    // Box::new([0u16,n]) ends up creating an temporary array on the stack which is not optimised
53    // but using default, which simply calls `box value` internally allows us to get around this.
54    //
55    // We could use vec instead, but using a boxed array helps the compiler optimise
56    // away bounds checks as `n & WINDOW_MASK < WINDOW_SIZE` will always be true.
57    let mut t: Box<Tables> = Box::default();
58
59    for (n, b) in t.head.iter_mut().enumerate() {
60        // # Unsafe
61        //
62        // Using ptr::write here since the values are uninitialised.
63        // u16 is a primitive and doesn't implement drop, so this would be safe either way.
64        unsafe {
65            ptr::write(b, n as u16);
66        }
67    }
68
69    t.fill_prev();
70
71    t
72}
73
74/// Returns a new hash value based on the previous value and the next byte
75#[inline]
76pub fn update_hash(current_hash: u16, to_insert: u8) -> u16 {
77    update_hash_conf(current_hash, to_insert, HASH_SHIFT, HASH_MASK)
78}
79
80#[inline]
81fn update_hash_conf(current_hash: u16, to_insert: u8, shift: u16, mask: u16) -> u16 {
82    ((current_hash << shift) ^ (to_insert as u16)) & mask
83}
84
85#[inline]
86fn reset_array(arr: &mut [u16; WINDOW_SIZE]) {
87    for (n, b) in arr.iter_mut().enumerate() {
88        *b = n as u16;
89    }
90}
91
92pub struct ChainedHashTable {
93    // Current running hash value of the last 3 bytes
94    current_hash: u16,
95    // Hash chains.
96    c: Box<Tables>,
97    // Used for testing
98    // count: DebugCounter,
99}
100
101impl ChainedHashTable {
102    pub fn new() -> ChainedHashTable {
103        ChainedHashTable {
104            current_hash: 0,
105            c: create_tables(),
106            //count: DebugCounter::default(),
107        }
108    }
109
110    #[cfg(test)]
111    pub fn from_starting_values(v1: u8, v2: u8) -> ChainedHashTable {
112        let mut t = ChainedHashTable::new();
113        t.current_hash = update_hash(t.current_hash, v1);
114        t.current_hash = update_hash(t.current_hash, v2);
115        t
116    }
117
118    /// Resets the hash value and hash chains
119    pub fn reset(&mut self) {
120        self.current_hash = 0;
121        reset_array(&mut self.c.head);
122        {
123            let h = self.c.head;
124            let mut c = self.c.prev;
125            c[..].copy_from_slice(&h[..]);
126        }
127        /*if cfg!(debug_assertions) {
128            self.count.reset();
129        }*/
130    }
131
132    pub fn add_initial_hash_values(&mut self, v1: u8, v2: u8) {
133        self.current_hash = update_hash(self.current_hash, v1);
134        self.current_hash = update_hash(self.current_hash, v2);
135    }
136
137    /// Insert a byte into the hash table
138    #[inline]
139    pub fn add_hash_value(&mut self, position: usize, value: u8) {
140        // Check that all bytes are input in order and at the correct positions.
141        // Disabled for now as it breaks when sync flushing.
142        /*debug_assert_eq!(
143            position & WINDOW_MASK,
144            self.count.get() as usize & WINDOW_MASK
145        );*/
146        debug_assert!(
147            position < WINDOW_SIZE * 2,
148            "Position is larger than 2 * window size! {}",
149            position
150        );
151        // Storing the hash in a temporary variable here makes the compiler avoid the
152        // bounds checks in this function.
153        let new_hash = update_hash(self.current_hash, value);
154
155        self.add_with_hash(position, new_hash);
156
157        // Update the stored hash value with the new hash.
158        self.current_hash = new_hash;
159    }
160
161    /// Directly set the current hash value
162    #[inline]
163    pub fn set_hash(&mut self, hash: u16) {
164        self.current_hash = hash;
165    }
166
167    /// Update the tables directly, providing the hash.
168    #[inline]
169    pub fn add_with_hash(&mut self, position: usize, hash: u16) {
170        /*if cfg!(debug_assertions) {
171            self.count.add(1);
172        }*/
173
174        self.c.prev[position & WINDOW_MASK] = self.c.head[hash as usize];
175
176        // Ignoring any bits over 16 here is deliberate, as we only concern ourselves about
177        // where in the buffer (which is 64k bytes) we are referring to.
178        self.c.head[hash as usize] = position as u16;
179    }
180
181    // Get the head of the hash chain for the current hash value
182    #[cfg(test)]
183    #[inline]
184    pub fn current_head(&self) -> u16 {
185        self.c.head[self.current_hash as usize]
186    }
187
188    #[inline]
189    pub fn current_hash(&self) -> u16 {
190        self.current_hash
191    }
192
193    #[inline]
194    pub fn get_prev(&self, bytes: usize) -> u16 {
195        self.c.prev[bytes & WINDOW_MASK]
196    }
197
198    #[cfg(test)]
199    #[inline]
200    pub fn farthest_next(&self, match_pos: usize, match_len: usize) -> usize {
201        let to_check = match_len.saturating_sub(2);
202
203        let mut n = 0;
204        let mut smallest_prev =
205            self.get_prev(match_pos);
206        let mut smallest_pos = 0;
207        while n < to_check {
208            let prev =
209                self.get_prev(match_pos + n);
210            if prev < smallest_prev {
211                smallest_prev = prev;
212                smallest_pos = n;
213            }
214            n += 1;
215        }
216        smallest_pos
217    }
218
219    #[inline]
220    fn slide_value(b: u16, pos: u16, bytes: u16) -> u16 {
221        if b >= bytes {
222            b - bytes
223        } else {
224            pos
225        }
226    }
227
228    #[inline]
229    fn slide_table(table: &mut [u16; WINDOW_SIZE], bytes: u16) {
230        for (n, b) in table.iter_mut().enumerate() {
231            *b = ChainedHashTable::slide_value(*b, n as u16, bytes);
232        }
233    }
234
235    pub fn slide(&mut self, bytes: usize) {
236        /*if cfg!(debug_assertions) && bytes != WINDOW_SIZE {
237            // This should only happen in tests in this file.
238            self.count.reset();
239        }*/
240        ChainedHashTable::slide_table(&mut self.c.head, bytes as u16);
241        ChainedHashTable::slide_table(&mut self.c.prev, bytes as u16);
242    }
243}
244
245#[cfg(test)]
246pub fn filled_hash_table(data: &[u8]) -> ChainedHashTable {
247    assert!(data.len() <= (WINDOW_SIZE * 2) + 2);
248    let mut hash_table = ChainedHashTable::from_starting_values(data[0], data[1]);
249    for (n, b) in data[2..].iter().enumerate() {
250        hash_table.add_hash_value(n, *b);
251    }
252    hash_table
253}
254
255#[cfg(test)]
256mod test {
257    use super::{filled_hash_table, ChainedHashTable};
258
259    #[test]
260    fn chained_hash() {
261        use std::str;
262
263        let test_string = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
264                           eiusmod tempor. rum. incididunt ut labore et dolore magna aliqua. Ut \
265                           enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi \
266                           ut aliquip ex ea commodo consequat. rum. Duis aute irure dolor in \
267                           reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla \
268                           pariatur. Excepteur sint occaecat cupidatat non proident, sunt in \
269                           culpa qui officia deserunt mollit anim id est laborum.";
270
271        let test_data = test_string.as_bytes();
272
273        let current_bytes = &test_data[test_data.len() - super::HASH_BYTES..test_data.len()];
274
275        let num_iters = test_string
276            .matches(str::from_utf8(current_bytes).unwrap())
277            .count();
278
279        let hash_table = filled_hash_table(test_data);
280
281        // Test that the positions in the chain are valid
282        let mut prev_value = hash_table.get_prev(hash_table.current_head() as usize) as usize;
283        let mut count = 0;
284        let mut current = hash_table.current_head() as usize;
285        while current != prev_value {
286            count += 1;
287            current = prev_value;
288            prev_value = hash_table.get_prev(prev_value) as usize;
289        }
290        // There should be at least as many occurences of the hash of the checked bytes as the
291        // numbers of occurences of the checked bytes themselves. As the hashes are not large enough
292        // to store 8 * 3 = 24 bits, there could be more with different input data.
293        assert!(count >= num_iters);
294    }
295
296    #[test]
297    fn table_unique() {
298        let mut test_data = Vec::new();
299        test_data.extend(0u8..255);
300        test_data.extend(255u8..0);
301        let hash_table = filled_hash_table(&test_data);
302        let prev_pos = hash_table.get_prev(hash_table.current_head() as usize);
303        // Since all sequences in the input are unique, there shouldn't be any previous values.
304        assert_eq!(prev_pos, hash_table.current_hash());
305    }
306
307    #[test]
308    fn table_slide() {
309        use std::fs::File;
310        use std::io::Read;
311
312        let window_size = super::WINDOW_SIZE;
313        let window_size16 = super::WINDOW_SIZE as u16;
314
315        let mut input = Vec::new();
316
317        let mut f = File::open("tests/pg11.txt").unwrap();
318
319        f.read_to_end(&mut input).unwrap();
320
321        let mut hash_table = filled_hash_table(&input[..window_size + 2]);
322
323        for (n, b) in input[2..window_size + 2].iter().enumerate() {
324            hash_table.add_hash_value(n + window_size, *b);
325        }
326
327        hash_table.slide(window_size);
328
329        {
330            let max_head = hash_table.c.head.iter().max().unwrap();
331            // After sliding there should be no hashes referring to values
332            // higher than the window size
333            assert!(*max_head < window_size16);
334            assert!(*max_head > 0);
335            let pos = hash_table.get_prev(hash_table.current_head() as usize);
336            // There should be a previous occurence since we inserted the data 3 times
337            assert!(pos < window_size16);
338            assert!(pos > 0);
339        }
340
341        for (n, b) in input[2..(window_size / 2)].iter().enumerate() {
342            hash_table.add_hash_value(n + window_size, *b);
343        }
344
345        // There should hashes referring to values in the upper part of the input window
346        // at this point
347        let max_prev = hash_table.c.prev.iter().max().unwrap();
348        assert!(*max_prev > window_size16);
349
350        let mut pos = hash_table.current_head();
351        // There should be a previous occurence since we inserted the data 3 times
352        assert!(pos > window_size16);
353        let end_byte = input[(window_size / 2) - 1 - 2];
354        let mut iterations = 0;
355        while pos > window_size16 && iterations < 5000 {
356            assert_eq!(input[pos as usize & window_size - 1], end_byte);
357
358            pos = hash_table.get_prev(pos as usize);
359            iterations += 1;
360        }
361    }
362
363    #[test]
364    /// Ensure that the initial hash values are correct.
365    fn initial_chains() {
366        let t = ChainedHashTable::new();
367        for (n, &b) in t.c.head.iter().enumerate() {
368            assert_eq!(n, b as usize);
369        }
370        for (n, &b) in t.c.prev.iter().enumerate() {
371            assert_eq!(n, b as usize);
372        }
373    }
374}