deflate/
lz77.rs

Help
1//! This module contains functionality for doing lz77 compression of data.
2#![macro_use]
3use std::cmp;
4use std::ops::{Range, RangeFrom};
5use std::iter::{self, Iterator};
6use std::slice::Iter;
7use std::fmt;
8
9use input_buffer::InputBuffer;
10use matching::longest_match;
11#[cfg(test)]
12use lzvalue::{LZValue, LZType};
13use huffman_table;
14use chained_hash_table::{ChainedHashTable, update_hash};
15#[cfg(test)]
16use compression_options::{HIGH_MAX_HASH_CHECKS, HIGH_LAZY_IF_LESS_THAN};
17use output_writer::{BufferStatus, DynamicWriter};
18use compress::Flush;
19use rle::process_chunk_greedy_rle;
20
21const MAX_MATCH: usize = huffman_table::MAX_MATCH as usize;
22const MIN_MATCH: usize = huffman_table::MIN_MATCH as usize;
23
24const NO_RLE: u16 = 43212;
25
26/// An enum describing whether we use lazy or greedy matching.
27#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
28pub enum MatchingType {
29    /// Use greedy matching: the matching algorithm simply uses a match right away
30    /// if found.
31    Greedy,
32    /// Use lazy matching: after finding a match, the next input byte is checked, to see
33    /// if there is a better match starting at that byte.
34    ///
35    /// As a special case, if max_hash_checks is set to 0, compression using only run-length
36    /// (i.e maximum match distance of 1) is performed instead.
37    Lazy,
38}
39
40impl fmt::Display for MatchingType {
41    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
42        match *self {
43            MatchingType::Greedy => write!(f, "Greedy matching"),
44            MatchingType::Lazy => write!(f, "Lazy matching"),
45        }
46    }
47}
48
49/// A struct that contains the hash table, and keeps track of where we are in the input data
50pub struct LZ77State {
51    /// Struct containing hash chains that will be used to find matches.
52    hash_table: ChainedHashTable,
53    /// True if this is the first window that is being processed.
54    is_first_window: bool,
55    /// Set to true when the last block has been processed.
56    is_last_block: bool,
57    /// How many bytes the last match in the previous window extended into the current one.
58    overlap: usize,
59    /// How many bytes of input the current block contains.
60    current_block_input_bytes: u64,
61    /// The maximum number of hash entries to search.
62    max_hash_checks: u16,
63    /// Only lazy match if we have a match length less than this.
64    lazy_if_less_than: u16,
65    /// Whether to use greedy or lazy parsing
66    matching_type: MatchingType,
67    /// Keep track of the previous match and byte in case the buffer is full when lazy matching.
68    match_state: ChunkState,
69    /// Keep track of how many bytes in the lookahead that was part of a match, but has not been
70    /// added to the hash chain yet.
71    bytes_to_hash: usize,
72    /// Keep track of if sync flush was used. If this is the case, the two first bytes needs to be
73    /// hashed.
74    was_synced: bool,
75}
76
77impl LZ77State {
78    /// Creates a new LZ77 state
79    pub fn new(
80        max_hash_checks: u16,
81        lazy_if_less_than: u16,
82        matching_type: MatchingType,
83    ) -> LZ77State {
84        LZ77State {
85            hash_table: ChainedHashTable::new(),
86            is_first_window: true,
87            is_last_block: false,
88            overlap: 0,
89            current_block_input_bytes: 0,
90            max_hash_checks: max_hash_checks,
91            lazy_if_less_than: lazy_if_less_than,
92            matching_type: matching_type,
93            match_state: ChunkState::new(),
94            bytes_to_hash: 0,
95            was_synced: false,
96        }
97    }
98
99    /// Resets the state excluding max_hash_checks and lazy_if_less_than
100    pub fn reset(&mut self) {
101        self.hash_table.reset();
102        self.is_first_window = true;
103        self.is_last_block = false;
104        self.overlap = 0;
105        self.current_block_input_bytes = 0;
106        self.match_state = ChunkState::new();
107        self.bytes_to_hash = 0
108    }
109
110    pub fn set_last(&mut self) {
111        self.is_last_block = true;
112    }
113
114    /// Is this the last block we are outputting?
115    pub fn is_last_block(&self) -> bool {
116        self.is_last_block
117    }
118
119    /// How many bytes of input the current block contains.
120    pub fn current_block_input_bytes(&self) -> u64 {
121        self.current_block_input_bytes
122    }
123
124    /// Sets the number of input bytes for the current block to 0.
125    pub fn reset_input_bytes(&mut self) {
126        self.current_block_input_bytes = 0;
127    }
128
129    /// Is there a buffered byte that has not been output yet?
130    pub fn pending_byte(&self) -> bool {
131        self.match_state.add
132    }
133
134    /// Returns 1 if pending_byte is true, 0 otherwise.
135    pub fn pending_byte_as_num(&self) -> usize {
136        // This could be implemented by using `as usize` as the documentation states this would give
137        // the same result, but not sure if that should be relied upon.
138        if self.match_state.add {
139            1
140        } else {
141            0
142        }
143    }
144}
145
146const DEFAULT_WINDOW_SIZE: usize = 32768;
147
148#[derive(Debug)]
149/// Status after calling `process_chunk`.
150pub enum ProcessStatus {
151    /// All the input data was processed.
152    Ok,
153    /// The output buffer was full.
154    ///
155    /// The argument is the position of the next byte to be checked by `process_chunk`.
156    BufferFull(usize),
157}
158
159#[derive(Debug)]
160/// A struct to keep track of status between calls of `process_chunk_lazy`
161///
162/// This is needed as the output buffer might become full before having output all pending data.
163pub struct ChunkState {
164    /// Length of the last match that was found, if any.
165    current_length: u16,
166    /// Distance of the last match that was found, if any.
167    current_distance: u16,
168    /// The previous byte checked in process_chunk.
169    prev_byte: u8,
170    /// The current byte being checked in process_chunk.
171    cur_byte: u8,
172    /// Whether prev_byte still needs to be output.
173    add: bool,
174}
175
176impl ChunkState {
177    pub fn new() -> ChunkState {
178        ChunkState {
179            current_length: 0,
180            current_distance: 0,
181            prev_byte: 0,
182            cur_byte: 0,
183            add: false,
184        }
185    }
186}
187
188pub fn buffer_full(position: usize) -> ProcessStatus {
189    ProcessStatus::BufferFull(position)
190}
191
192fn process_chunk(
193    data: &[u8],
194    iterated_data: &Range<usize>,
195    mut match_state: &mut ChunkState,
196    hash_table: &mut ChainedHashTable,
197    writer: &mut DynamicWriter,
198    max_hash_checks: u16,
199    lazy_if_less_than: usize,
200    matching_type: MatchingType,
201) -> (usize, ProcessStatus) {
202    let avoid_rle = if cfg!(test) {
203        // Avoid RLE if lazy_if_less than is a specific value.
204        // This is used in some tests, ideally we should probably do this in a less clunky way,
205        // but we use a value here that is higher than the maximum sensible one anyhow, and will
206        // be truncated by deflate_state for calls from outside the library.
207        lazy_if_less_than == NO_RLE as usize
208    } else {
209        false
210    };
211    match matching_type {
212        MatchingType::Greedy => {
213            process_chunk_greedy(data, iterated_data, hash_table, writer, max_hash_checks)
214        }
215        MatchingType::Lazy => {
216            if max_hash_checks > 0 || avoid_rle {
217                process_chunk_lazy(
218                    data,
219                    iterated_data,
220                    &mut match_state,
221                    hash_table,
222                    writer,
223                    max_hash_checks,
224                    lazy_if_less_than,
225                )
226            } else {
227                // Use the RLE method if max_hash_checks is set to 0.
228                process_chunk_greedy_rle(data, iterated_data, writer)
229            }
230        }
231    }
232}
233
234/// Add the specified number of bytes to the hash table from the iterators
235/// adding `start` to the position supplied to the hash table.
236fn add_to_hash_table(
237    bytes_to_add: usize,
238    insert_it: &mut iter::Zip<RangeFrom<usize>, Iter<u8>>,
239    hash_it: &mut Iter<u8>,
240    hash_table: &mut ChainedHashTable,
241) {
242    let taker = insert_it.by_ref().take(bytes_to_add);
243    let mut hash_taker = hash_it.by_ref().take(bytes_to_add);
244    // Update the hash manually here to keep it in a register.
245    let mut hash = hash_table.current_hash();
246    // Advance the iterators and add the bytes we jump over to the hash table and
247    // checksum
248    for (ipos, _) in taker {
249        if let Some(&i_hash_byte) = hash_taker.next() {
250            hash = update_hash(hash, i_hash_byte);
251            hash_table.add_with_hash(ipos, hash);
252        }
253    }
254    // Write the hash back once we are done.
255    hash_table.set_hash(hash);
256}
257
258/// Write the specified literal `byte` to the writer `w`, and return
259/// `ProcessStatus::BufferFull($pos)` if the buffer is full after writing.
260///
261/// `pos` should indicate the byte to start at in the next call to `process_chunk`,
262/// `is_hashed` should be set to true of the byte at pos has been added to the hash chain.
263macro_rules! write_literal{
264    ($w:ident, $byte:expr, $pos:expr) => {
265        let b_status = $w.write_literal($byte);
266
267        if let BufferStatus::Full = b_status {
268            return (0, buffer_full($pos));
269        }
270    };
271}
272
273/// If the match is only 3 bytes long and the distance is more than 8 * 1024, it's likely to take
274/// up more space than it would save.
275#[inline]
276fn match_too_far(match_len: usize, match_dist: usize) -> bool {
277    const TOO_FAR: usize = 8 * 1024;
278    match_len == MIN_MATCH && match_dist > TOO_FAR
279}
280
281///Create the iterators used when processing through a chunk of data.
282fn create_iterators<'a>(
283    data: &'a [u8],
284    iterated_data: &Range<usize>,
285) -> (
286    usize,
287    iter::Zip<RangeFrom<usize>, Iter<'a, u8>>,
288    Iter<'a, u8>,
289) {
290    let end = cmp::min(data.len(), iterated_data.end);
291    let start = iterated_data.start;
292    let current_chunk = &data[start..end];
293
294    let insert_it = (start..).zip(current_chunk.iter());
295    let hash_it = {
296        let hash_start = if data.len() - start > 2 {
297            start + 2
298        } else {
299            data.len()
300        };
301        (&data[hash_start..]).iter()
302    };
303    (end, insert_it, hash_it)
304}
305
306fn process_chunk_lazy(
307    data: &[u8],
308    iterated_data: &Range<usize>,
309    state: &mut ChunkState,
310    mut hash_table: &mut ChainedHashTable,
311    writer: &mut DynamicWriter,
312    max_hash_checks: u16,
313    lazy_if_less_than: usize,
314) -> (usize, ProcessStatus) {
315
316    let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data);
317
318    const NO_LENGTH: u16 = 0;
319
320    // The previous match length, if any.
321    let mut prev_length = state.current_length;
322    // The distance of the previous match if any.
323    let mut prev_distance = state.current_distance;
324
325    state.current_length = 0;
326    state.current_distance = 0;
327
328    // The number of bytes past end that was added due to finding a match that extends into
329    // the lookahead window.
330    let mut overlap = 0;
331
332
333    // Set to true if we found a match that is equal to or longer than `lazy_if_less_than`,
334    // indicating that we won't lazy match (check for a better match at the next byte).
335    // If we had a good match, carry this over from the previous call.
336    let mut ignore_next = prev_length as usize >= lazy_if_less_than;
337
338    // This is to output the correct byte in case there is one pending to be output
339    // from the previous call.
340    state.prev_byte = state.cur_byte;
341
342    // Iterate through the slice, adding literals or length/distance pairs
343    while let Some((position, &b)) = insert_it.next() {
344        state.cur_byte = b;
345        if let Some(&hash_byte) = hash_it.next() {
346            hash_table.add_hash_value(position, hash_byte);
347
348            // Only lazy match if we have a match shorter than a set value
349            // TODO: This should be cleaned up a bit
350            if !ignore_next {
351                let (mut match_len, match_dist) = {
352                    // If there already was a decent match at the previous byte
353                    // and we are lazy matching, do less match checks in this step.
354                    let max_hash_checks = if prev_length >= 32 {
355                        max_hash_checks >> 2
356                    } else {
357                        max_hash_checks
358                    };
359
360                    // Check if we can find a better match here than the one we had at
361                    // the previous byte.
362                    longest_match(
363                        data,
364                        hash_table,
365                        position,
366                        prev_length as usize,
367                        max_hash_checks,
368                    )
369                };
370
371                // If the match is only 3 bytes long and very far back, it's probably not worth
372                // outputting.
373                if match_too_far(match_len, match_dist) {
374                    match_len = NO_LENGTH as usize;
375                };
376
377                if match_len >= lazy_if_less_than {
378                    // We found a decent match, so we won't check for a better one at the next byte.
379                    ignore_next = true;
380                }
381                state.current_length = match_len as u16;
382                state.current_distance = match_dist as u16;
383            } else {
384                // We already had a decent match, so we don't bother checking for another one.
385                state.current_length = NO_LENGTH;
386                state.current_distance = 0;
387                // Make sure we check again next time.
388                ignore_next = false;
389            };
390
391            if prev_length >= state.current_length && prev_length >= MIN_MATCH as u16 {
392                // The previous match was better so we add it.
393                // Casting note: length and distance is already bounded by the longest match
394                // function. Usize is just used for convenience.
395                let b_status =
396                    writer.write_length_distance(prev_length as u16, prev_distance as u16);
397
398                // We add the bytes to the hash table and checksum.
399                // Since we've already added two of them, we need to add two less than
400                // the length.
401                let bytes_to_add = prev_length - 2;
402
403                add_to_hash_table(
404                    bytes_to_add as usize,
405                    &mut insert_it,
406                    &mut hash_it,
407                    &mut hash_table,
408                );
409
410                // If the match is longer than the current window, we have note how many
411                // bytes we overlap, since we don't need to do any matching on these bytes
412                // in the next call of this function.
413                // We don't have to worry about setting overlap to 0 if this is false, as the
414                // function will stop after this condition is true, and overlap is not altered
415                // elsewhere.
416                if position + prev_length as usize > end {
417                    // We need to subtract 1 since the byte at pos is also included.
418                    overlap = position + prev_length as usize - end - 1;
419                };
420
421                state.add = false;
422
423                // Note that there is no current match.
424                state.current_length = 0;
425                state.current_distance = 0;
426
427                if let BufferStatus::Full = b_status {
428                    // MATCH(lazy)
429                    return (overlap, buffer_full(position + prev_length as usize - 1));
430                }
431
432                ignore_next = false;
433
434            } else if state.add {
435                // We found a better match (or there was no previous match)
436                // so output the previous byte.
437                // BETTER OR NO MATCH
438                write_literal!(writer, state.prev_byte, position + 1);
439            } else {
440                state.add = true
441            }
442
443            prev_length = state.current_length;
444            prev_distance = state.current_distance;
445            state.prev_byte = b;
446        } else {
447            // If there is a match at this point, it will not have been added, so we need to add it.
448            if prev_length >= MIN_MATCH as u16 {
449                let b_status =
450                    writer.write_length_distance(prev_length as u16, prev_distance as u16);
451
452                state.current_length = 0;
453                state.current_distance = 0;
454                state.add = false;
455
456                // As this will be a 3-length match at the end of the input data, there can't be any
457                // overlap.
458                // TODO: Not sure if we need to signal that the buffer is full here.
459                // It's only needed in the case of syncing.
460                if let BufferStatus::Full = b_status {
461                    // TODO: These bytes should be hashed when doing a sync flush.
462                    // This can't be done here as the new input data does not exist yet.
463                    return (0, buffer_full(end));
464                } else {
465                    return (0, ProcessStatus::Ok);
466                }
467            };
468
469            if state.add {
470                // We may still have a leftover byte at this point, so we add it here if needed.
471                state.add = false;
472
473                // ADD
474                write_literal!(writer, state.prev_byte, position + 1);
475
476            };
477
478            // We are at the last two bytes we want to add, so there is no point
479            // searching for matches here.
480
481            // AFTER ADD
482            write_literal!(writer, b, position + 1);
483        }
484    }
485    (overlap, ProcessStatus::Ok)
486}
487
488fn process_chunk_greedy(
489    data: &[u8],
490    iterated_data: &Range<usize>,
491    mut hash_table: &mut ChainedHashTable,
492    writer: &mut DynamicWriter,
493    max_hash_checks: u16,
494) -> (usize, ProcessStatus) {
495
496    let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data);
497
498    const NO_LENGTH: usize = 0;
499
500    // The number of bytes past end that was added due to finding a match that extends into
501    // the lookahead window.
502    let mut overlap = 0;
503
504    // Iterate through the slice, adding literals or length/distance pairs.
505    while let Some((position, &b)) = insert_it.next() {
506        if let Some(&hash_byte) = hash_it.next() {
507            hash_table.add_hash_value(position, hash_byte);
508
509            // TODO: This should be cleaned up a bit.
510            let (match_len, match_dist) =
511                { longest_match(data, hash_table, position, NO_LENGTH, max_hash_checks) };
512
513            if match_len >= MIN_MATCH as usize && !match_too_far(match_len, match_dist) {
514                // Casting note: length and distance is already bounded by the longest match
515                // function. Usize is just used for convenience.
516                let b_status = writer.write_length_distance(match_len as u16, match_dist as u16);
517
518                // We add the bytes to the hash table and checksum.
519                // Since we've already added one of them, we need to add one less than
520                // the length.
521                let bytes_to_add = match_len - 1;
522                add_to_hash_table(
523                    bytes_to_add,
524                    &mut insert_it,
525                    &mut hash_it,
526                    &mut hash_table,
527                );
528
529                // If the match is longer than the current window, we have note how many
530                // bytes we overlap, since we don't need to do any matching on these bytes
531                // in the next call of this function.
532                if position + match_len > end {
533                    // We need to subtract 1 since the byte at pos is also included.
534                    overlap = position + match_len - end;
535                };
536
537                if let BufferStatus::Full = b_status {
538                    // MATCH
539                    return (overlap, buffer_full(position + match_len));
540                }
541
542            } else {
543                // NO MATCH
544                write_literal!(writer, b, position + 1);
545            }
546        } else {
547            // We are at the last two bytes we want to add, so there is no point
548            // searching for matches here.
549            // END
550            write_literal!(writer, b, position + 1);
551        }
552    }
553    (overlap, ProcessStatus::Ok)
554}
555
556
557#[derive(Eq, PartialEq, Clone, Copy, Debug)]
558pub enum LZ77Status {
559    /// Waiting for more input before doing any processing
560    NeedInput,
561    /// The output buffer is full, so the current block needs to be ended so the
562    /// buffer can be flushed.
563    EndBlock,
564    /// All pending data has been processed.
565    Finished,
566}
567
568#[cfg(test)]
569pub fn lz77_compress_block_finish(
570    data: &[u8],
571    state: &mut LZ77State,
572    buffer: &mut InputBuffer,
573    mut writer: &mut DynamicWriter,
574) -> (usize, LZ77Status) {
575    let (consumed, status, _) =
576        lz77_compress_block(data, state, buffer, &mut writer, Flush::Finish);
577    (consumed, status)
578}
579
580/// Compress a slice with lz77 compression.
581///
582/// This function processes one window at a time, and returns when there is no input left,
583/// or it determines it's time to end a block.
584///
585/// Returns the number of bytes of the input that were consumed, a status describing
586/// whether there is no input, it's time to finish, or it's time to end the block, and the position
587/// of the first byte in the input buffer that has not been output (but may have been checked for
588/// matches).
589pub fn lz77_compress_block(
590    data: &[u8],
591    state: &mut LZ77State,
592    buffer: &mut InputBuffer,
593    mut writer: &mut DynamicWriter,
594    flush: Flush,
595) -> (usize, LZ77Status, usize) {
596    // Currently we only support the maximum window size
597    let window_size = DEFAULT_WINDOW_SIZE;
598
599    // Indicates whether we should try to process all the data including the lookahead, or if we
600    // should wait until we have at least one window size of data before doing anything.
601    let finish = flush == Flush::Finish || flush == Flush::Sync;
602    let sync = flush == Flush::Sync;
603
604    let mut current_position = 0;
605
606    // The current status of the encoding.
607    let mut status = LZ77Status::EndBlock;
608
609    // Whether warm up the hash chain with the two first values.
610    let mut add_initial = true;
611
612    // If we have synced, add the two first bytes to the hash as they couldn't be added before.
613    if state.was_synced {
614        if buffer.current_end() > 2 {
615            let pos_add = buffer.current_end() - 2;
616            for (n, &b) in data.iter().take(2).enumerate() {
617                state.hash_table.add_hash_value(n + pos_add, b);
618            }
619            add_initial = false;
620        }
621        state.was_synced = false;
622    }
623
624    // Add data to the input buffer and keep a reference to the slice of data not added yet.
625    let mut remaining_data = buffer.add_data(data);
626
627    loop {
628        // Note if there is a pending byte from the previous call to process_chunk,
629        // so we get the block input size right.
630        let pending_previous = state.pending_byte_as_num();
631
632        assert!(writer.buffer_length() <= (window_size * 2));
633        // The process is a bit different for the first 32k bytes.
634        // TODO: There is a lot of duplicate code between the two branches here, we should be able
635        // to simplify this.
636        if state.is_first_window {
637            // Don't do anything until we are either flushing, or we have at least one window of
638            // data.
639            if buffer.current_end() >= (window_size * 2) + MAX_MATCH || finish {
640
641                if buffer.get_buffer().len() >= 2 && add_initial &&
642                    state.current_block_input_bytes == 0
643                {
644                    let b = buffer.get_buffer();
645                    // Warm up the hash with the two first values, so we can find  matches at
646                    // index 0.
647                    state.hash_table.add_initial_hash_values(b[0], b[1]);
648                    add_initial = false;
649                }
650
651                let first_chunk_end = cmp::min(window_size, buffer.current_end());
652
653                let start = state.overlap;
654
655                let (overlap, p_status) = process_chunk(
656                    buffer.get_buffer(),
657                    &(start..first_chunk_end),
658                    &mut state.match_state,
659                    &mut state.hash_table,
660                    &mut writer,
661                    state.max_hash_checks,
662                    state.lazy_if_less_than as usize,
663                    state.matching_type,
664                );
665
666                state.overlap = overlap;
667                state.bytes_to_hash = overlap;
668
669                // If the buffer is full, we want to end the block.
670                if let ProcessStatus::BufferFull(written) = p_status {
671                    state.overlap = if overlap > 0 { overlap } else { written };
672                    status = LZ77Status::EndBlock;
673                    current_position = written - state.pending_byte_as_num();
674                    state.current_block_input_bytes +=
675                        (written - start + overlap + pending_previous -
676                             state.pending_byte_as_num()) as u64;
677                    break;
678                }
679
680
681                // Update the length of how many input bytes the current block is representing,
682                // taking into account pending bytes.
683                state.current_block_input_bytes +=
684                    (first_chunk_end - start + overlap + pending_previous -
685                         state.pending_byte_as_num()) as u64;
686
687                // We are at the first window so we don't need to slide the hash table yet.
688                // If finishing or syncing, we stop here.
689                if first_chunk_end >= buffer.current_end() && finish {
690                    current_position = first_chunk_end - state.pending_byte_as_num();
691                    if !sync {
692                        state.set_last();
693                        state.is_first_window = false;
694                    } else {
695                        state.overlap = first_chunk_end;
696                        state.was_synced = true;
697                    }
698                    debug_assert!(
699                        !state.pending_byte(),
700                        "Bug! Ended compression wit a pending byte!"
701                    );
702                    status = LZ77Status::Finished;
703                    break;
704                }
705                // Otherwise, continue.
706                state.is_first_window = false;
707            } else {
708                status = LZ77Status::NeedInput;
709                break;
710            }
711        } else if buffer.current_end() >= (window_size * 2) + MAX_MATCH || finish {
712            if buffer.current_end() >= window_size + 2 {
713                for (n, &h) in buffer.get_buffer()[window_size + 2..]
714                    .iter()
715                    .enumerate()
716                    .take(state.bytes_to_hash)
717                {
718                    state.hash_table.add_hash_value(window_size + n, h);
719                }
720                state.bytes_to_hash = 0;
721            }
722            // This isn't the first chunk, so we start reading at one window in in the
723            // buffer plus any additional overlap from earlier.
724            let start = window_size + state.overlap;
725
726            // Determine where we have to stop iterating to slide the buffer and hash,
727            // or stop because we are at the end of the input data.
728            let end = cmp::min(window_size * 2, buffer.current_end());
729
730            let (overlap, p_status) = process_chunk(
731                buffer.get_buffer(),
732                &(start..end),
733                &mut state.match_state,
734                &mut state.hash_table,
735                &mut writer,
736                state.max_hash_checks,
737                state.lazy_if_less_than as usize,
738                state.matching_type,
739            );
740
741            state.bytes_to_hash = overlap;
742
743
744            if let ProcessStatus::BufferFull(written) = p_status {
745                state.current_block_input_bytes +=
746                    (written - start + pending_previous - state.pending_byte_as_num()) as u64;
747
748                // If the buffer is full, return and end the block.
749                // If overlap is non-zero, the buffer was full after outputting the last byte,
750                // otherwise we have to skip to the point in the buffer where we stopped in the
751                // next call.
752                state.overlap = if overlap > 0 {
753                    // If we are at the end of the window, make sure we slide the buffer and the
754                    // hash table.
755                    if state.max_hash_checks > 0 {
756                        state.hash_table.slide(window_size);
757                    }
758                    remaining_data = buffer.slide(remaining_data.unwrap_or(&[]));
759                    overlap
760                } else {
761                    written - window_size
762                };
763
764                current_position = written - state.pending_byte_as_num();
765
766
767                // Status is already EndBlock at this point.
768                // status = LZ77Status::EndBlock;
769                break;
770            }
771
772            state.current_block_input_bytes +=
773                (end - start + overlap + pending_previous - state.pending_byte_as_num()) as u64;
774
775            // The buffer is not full, but we still need to note if there is any overlap into the
776            // next window.
777            state.overlap = overlap;
778
779            if remaining_data.is_none() && finish && end == buffer.current_end() {
780                current_position = buffer.current_end();
781                debug_assert!(
782                    !state.pending_byte(),
783                    "Bug! Ended compression wit a pending byte!"
784                );
785
786                // We stopped before or at the window size, so we are at the end.
787                if !sync {
788                    // If we are finishing and not syncing, we simply indicate that we are done.
789                    state.set_last();
790                } else {
791                    // For sync flushing we need to slide the buffer and the hash chains so that the
792                    // next call to this function starts at the right place.
793
794                    // There won't be any overlap, since when syncing, we process to the end of the.
795                    // pending data.
796                    state.overlap = buffer.current_end() - window_size;
797                    state.was_synced = true;
798                }
799                status = LZ77Status::Finished;
800                break;
801            } else {
802                // We are not at the end, so slide and continue.
803                // We slide the hash table back to make space for new hash values
804                // We only need to remember 2^15 bytes back (the maximum distance allowed by the
805                // deflate spec).
806                if state.max_hash_checks > 0 {
807                    state.hash_table.slide(window_size);
808                }
809
810                // Also slide the buffer, discarding data we no longer need and adding new data.
811                remaining_data = buffer.slide(remaining_data.unwrap_or(&[]));
812            }
813        } else {
814            // The caller has not indicated that they want to finish or flush, and there is less
815            // than a window + lookahead of new data, so we wait for more.
816            status = LZ77Status::NeedInput;
817            break;
818        }
819
820    }
821
822    (
823        data.len() - remaining_data.unwrap_or(&[]).len(),
824        status,
825        current_position,
826    )
827}
828
829#[cfg(test)]
830pub fn decompress_lz77(input: &[LZValue]) -> Vec<u8> {
831    decompress_lz77_with_backbuffer(input, &[])
832}
833
834#[cfg(test)]
835pub fn decompress_lz77_with_backbuffer(input: &[LZValue], back_buffer: &[u8]) -> Vec<u8> {
836    let mut output = Vec::new();
837    for p in input {
838        match p.value() {
839            LZType::Literal(l) => output.push(l),
840            LZType::StoredLengthDistance(l, d) => {
841                // We found a match, so we have to get the data that the match refers to.
842                let d = d as usize;
843                let prev_output_len = output.len();
844                // Get match data from the back buffer if the match extends that far.
845                let consumed = if d > output.len() {
846                    let into_back_buffer = d - output.len();
847
848                    assert!(
849                        into_back_buffer <= back_buffer.len(),
850                        "ERROR: Attempted to refer to a match in non-existing data!\
851                         into_back_buffer: {}, back_buffer len {}, d {}, l {:?}",
852                        into_back_buffer,
853                        back_buffer.len(),
854                        d,
855                        l
856                    );
857                    let start = back_buffer.len() - into_back_buffer;
858                    let end = cmp::min(back_buffer.len(), start + l.actual_length() as usize);
859                    output.extend_from_slice(&back_buffer[start..end]);
860
861                    end - start
862                } else {
863                    0
864                };
865
866                // Get match data from the currently decompressed data.
867                let start = prev_output_len.saturating_sub(d);
868                let mut n = 0;
869                while n < (l.actual_length() as usize).saturating_sub(consumed) {
870                    let b = output[start + n];
871                    output.push(b);
872                    n += 1;
873                }
874            }
875        }
876    }
877    output
878}
879
880#[cfg(test)]
881pub struct TestStruct {
882    state: LZ77State,
883    buffer: InputBuffer,
884    writer: DynamicWriter,
885}
886
887#[cfg(test)]
888impl TestStruct {
889    fn new() -> TestStruct {
890        TestStruct::with_config(
891            HIGH_MAX_HASH_CHECKS,
892            HIGH_LAZY_IF_LESS_THAN,
893            MatchingType::Lazy,
894        )
895    }
896
897    fn with_config(
898        max_hash_checks: u16,
899        lazy_if_less_than: u16,
900        matching_type: MatchingType,
901    ) -> TestStruct {
902        TestStruct {
903            state: LZ77State::new(max_hash_checks, lazy_if_less_than, matching_type),
904            buffer: InputBuffer::empty(),
905            writer: DynamicWriter::new(),
906        }
907    }
908
909    fn compress_block(&mut self, data: &[u8], flush: bool) -> (usize, LZ77Status, usize) {
910        lz77_compress_block(
911            data,
912            &mut self.state,
913            &mut self.buffer,
914            &mut self.writer,
915            if flush { Flush::Finish } else { Flush::None },
916        )
917    }
918}
919
920#[cfg(test)]
921pub fn lz77_compress(data: &[u8]) -> Option<Vec<LZValue>> {
922    lz77_compress_conf(
923        data,
924        HIGH_MAX_HASH_CHECKS,
925        HIGH_LAZY_IF_LESS_THAN,
926        MatchingType::Lazy,
927    )
928}
929
930/// Compress a slice, not storing frequency information
931///
932/// This is a convenience function for compression with fixed huffman values
933/// Only used in tests for now
934#[cfg(test)]
935pub fn lz77_compress_conf(
936    data: &[u8],
937    max_hash_checks: u16,
938    lazy_if_less_than: u16,
939    matching_type: MatchingType,
940) -> Option<Vec<LZValue>> {
941    let mut test_boxed = Box::new(TestStruct::with_config(
942        max_hash_checks,
943        lazy_if_less_than,
944        matching_type,
945    ));
946    let mut out = Vec::<LZValue>::with_capacity(data.len() / 3);
947    {
948        let test = test_boxed.as_mut();
949        let mut slice = data;
950
951        while !test.state.is_last_block {
952            let bytes_written = lz77_compress_block_finish(
953                slice,
954                &mut test.state,
955                &mut test.buffer,
956                &mut test.writer,
957            ).0;
958            slice = &slice[bytes_written..];
959            out.extend(test.writer.get_buffer());
960            test.writer.clear();
961        }
962
963    }
964
965    Some(out)
966}
967
968#[cfg(test)]
969mod test {
970    use super::*;
971
972    use lzvalue::{LZValue, LZType, lit, ld};
973    use chained_hash_table::WINDOW_SIZE;
974    use compression_options::DEFAULT_LAZY_IF_LESS_THAN;
975    use test_utils::get_test_data;
976    use output_writer::MAX_BUFFER_LENGTH;
977
978    /// Helper function to print the output from the lz77 compression function
979    fn print_output(input: &[LZValue]) {
980        let mut output = vec![];
981        for l in input {
982            match l.value() {
983                LZType::Literal(l) => output.push(l),
984                LZType::StoredLengthDistance(l, d) => {
985                    output.extend(format!("<L {}>", l.actual_length()).into_bytes());
986                    output.extend(format!("<D {}>", d).into_bytes())
987                }
988            }
989        }
990
991        println!("\"{}\"", String::from_utf8(output).unwrap());
992    }
993
994    /// Test that a short string from an example on SO compresses correctly
995    #[test]
996    fn compress_short() {
997        let test_bytes = String::from("Deflate late").into_bytes();
998        let res = lz77_compress(&test_bytes).unwrap();
999
1000        let decompressed = decompress_lz77(&res);
1001
1002        assert_eq!(test_bytes, decompressed);
1003        assert_eq!(*res.last().unwrap(), LZValue::length_distance(4, 5));
1004    }
1005
1006    /// Test that compression is working for a longer file
1007    #[test]
1008    fn compress_long() {
1009        let input = get_test_data();
1010        let compressed = lz77_compress(&input).unwrap();
1011        assert!(compressed.len() < input.len());
1012
1013        let decompressed = decompress_lz77(&compressed);
1014        println!("compress_long length: {}", input.len());
1015
1016        // This is to check where the compression fails, if it were to
1017        for (n, (&a, &b)) in input.iter().zip(decompressed.iter()).enumerate() {
1018            if a != b {
1019                println!("First difference at {}, input: {}, output: {}", n, a, b);
1020                break;
1021            }
1022        }
1023        assert_eq!(input.len(), decompressed.len());
1024        assert!(&decompressed == &input);
1025    }
1026
1027    /// Check that lazy matching is working as intended
1028    #[test]
1029    fn lazy() {
1030        // We want to match on `badger` rather than `nba` as it is longer
1031        // let data = b" nba nbadg badger nbadger";
1032        let data = b"nba badger nbadger";
1033        let compressed = lz77_compress(data).unwrap();
1034        let test = compressed[compressed.len() - 1];
1035        if let LZType::StoredLengthDistance(l, _) = test.value() {
1036            assert_eq!(l.actual_length(), 6);
1037        } else {
1038            print_output(&compressed);
1039            panic!();
1040        }
1041    }
1042
1043    fn roundtrip(data: &[u8]) {
1044        let compressed = super::lz77_compress(&data).unwrap();
1045        let decompressed = decompress_lz77(&compressed);
1046        assert!(decompressed == data);
1047    }
1048
1049    // Check that data with the exact window size is working properly
1050    #[test]
1051    #[allow(unused)]
1052    fn exact_window_size() {
1053        use std::io::Write;
1054        let mut data = vec![0; WINDOW_SIZE];
1055        roundtrip(&data);
1056        {
1057            data.write(&[22; WINDOW_SIZE]);
1058        }
1059        roundtrip(&data);
1060        {
1061            data.write(&[55; WINDOW_SIZE]);
1062        }
1063        roundtrip(&data);
1064    }
1065
1066    /// Test that matches at the window border are working correctly
1067    #[test]
1068    fn border() {
1069        use chained_hash_table::WINDOW_SIZE;
1070        let mut data = vec![35; WINDOW_SIZE];
1071        data.extend(b"Test");
1072        let compressed = super::lz77_compress(&data).unwrap();
1073        assert!(compressed.len() < data.len());
1074        let decompressed = decompress_lz77(&compressed);
1075
1076        assert_eq!(decompressed.len(), data.len());
1077        assert!(decompressed == data);
1078    }
1079
1080    #[test]
1081    fn border_multiple_blocks() {
1082        use chained_hash_table::WINDOW_SIZE;
1083        let mut data = vec![0; (WINDOW_SIZE * 2) + 50];
1084        data.push(1);
1085        let compressed = super::lz77_compress(&data).unwrap();
1086        assert!(compressed.len() < data.len());
1087        let decompressed = decompress_lz77(&compressed);
1088        assert_eq!(decompressed.len(), data.len());
1089        assert!(decompressed == data);
1090    }
1091
1092    #[test]
1093    fn compress_block_status() {
1094        use input_buffer::InputBuffer;
1095
1096        let data = b"Test data data";
1097        let mut writer = DynamicWriter::new();
1098
1099        let mut buffer = InputBuffer::empty();
1100        let mut state = LZ77State::new(4096, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy);
1101        let status = lz77_compress_block_finish(data, &mut state, &mut buffer, &mut writer);
1102        assert_eq!(status.1, LZ77Status::Finished);
1103        assert!(&buffer.get_buffer()[..data.len()] == data);
1104        assert_eq!(buffer.current_end(), data.len());
1105    }
1106
1107    #[test]
1108    fn compress_block_multiple_windows() {
1109        use input_buffer::InputBuffer;
1110        use output_writer::DynamicWriter;
1111
1112        let data = get_test_data();
1113        assert!(data.len() > (WINDOW_SIZE * 2) + super::MAX_MATCH);
1114        let mut writer = DynamicWriter::new();
1115
1116        let mut buffer = InputBuffer::empty();
1117        let mut state = LZ77State::new(0, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy);
1118        let (bytes_consumed, status) =
1119            lz77_compress_block_finish(&data, &mut state, &mut buffer, &mut writer);
1120        assert_eq!(
1121            buffer.get_buffer().len(),
1122            (WINDOW_SIZE * 2) + super::MAX_MATCH
1123        );
1124        assert_eq!(status, LZ77Status::EndBlock);
1125        let buf_len = buffer.get_buffer().len();
1126        assert!(buffer.get_buffer()[..] == data[..buf_len]);
1127
1128        writer.clear();
1129        let (_, status) = lz77_compress_block_finish(
1130            &data[bytes_consumed..],
1131            &mut state,
1132            &mut buffer,
1133            &mut writer,
1134        );
1135        assert_eq!(status, LZ77Status::EndBlock);
1136
1137    }
1138
1139    #[test]
1140    fn multiple_inputs() {
1141        let data = b"Badger badger bababa test data 25 asfgestghresjkgh";
1142        let comp1 = lz77_compress(data).unwrap();
1143        let comp2 = {
1144            const SPLIT: usize = 25;
1145            let first_part = &data[..SPLIT];
1146            let second_part = &data[SPLIT..];
1147            let mut state = TestStruct::new();
1148            let (bytes_written, status, _) = state.compress_block(first_part, false);
1149            assert_eq!(bytes_written, first_part.len());
1150            assert_eq!(status, LZ77Status::NeedInput);
1151            let (bytes_written, status, _) = state.compress_block(second_part, true);
1152            assert_eq!(bytes_written, second_part.len());
1153            assert_eq!(status, LZ77Status::Finished);
1154            Vec::from(state.writer.get_buffer())
1155        };
1156        assert!(comp1 == comp2);
1157    }
1158
1159
1160    #[test]
1161    /// Test that the exit from process_chunk when buffer is full is working correctly.
1162    fn buffer_fill() {
1163        let data = get_test_data();
1164        // The comments above these calls refers the positions with the
1165        // corersponding comments in process_chunk_{greedy/lazy}.
1166        // POS BETTER OR NO MATCH
1167        buffer_test_literals(&data);
1168        // POS END
1169        // POS NO MATCH
1170        buffer_test_last_bytes(MatchingType::Greedy, &data);
1171        // POS ADD
1172        // POS AFTER ADD
1173        buffer_test_last_bytes(MatchingType::Lazy, &data);
1174
1175        // POS MATCH
1176        buffer_test_match(MatchingType::Greedy);
1177        // POS MATCH(lazy)
1178        buffer_test_match(MatchingType::Lazy);
1179
1180        // POS END
1181        buffer_test_add_end(&data);
1182    }
1183
1184    /// Test buffer fill when a byte is added due to no match being found.
1185    fn buffer_test_literals(data: &[u8]) {
1186        let mut state = TestStruct::with_config(0, NO_RLE, MatchingType::Lazy);
1187        let (bytes_consumed, status, position) = state.compress_block(&data, false);
1188
1189        // There should be enough data for the block to have ended.
1190        assert_eq!(status, LZ77Status::EndBlock);
1191        assert!(bytes_consumed <= (WINDOW_SIZE * 2) + MAX_MATCH);
1192
1193        // The buffer should be full.
1194        assert_eq!(state.writer.get_buffer().len(), MAX_BUFFER_LENGTH);
1195        assert_eq!(position, state.writer.get_buffer().len());
1196        // Since all literals have been input, the block should have the exact number of litlens
1197        // as there were input bytes.
1198        assert_eq!(
1199            state.state.current_block_input_bytes() as usize,
1200            MAX_BUFFER_LENGTH
1201        );
1202        state.state.reset_input_bytes();
1203
1204        let mut out = decompress_lz77(state.writer.get_buffer());
1205
1206        state.writer.clear();
1207        // The buffer should now be cleared.
1208        assert_eq!(state.writer.get_buffer().len(), 0);
1209
1210        assert!(data[..out.len()] == out[..]);
1211
1212        let _ = state.compress_block(&data[bytes_consumed..], false);
1213        // We should have some new data in the buffer at this point.
1214        assert!(state.writer.get_buffer().len() > 0);
1215        assert_eq!(
1216            state.state.current_block_input_bytes() as usize,
1217            MAX_BUFFER_LENGTH
1218        );
1219
1220        out.extend_from_slice(&decompress_lz77(state.writer.get_buffer()));
1221        assert!(data[..out.len()] == out[..]);
1222    }
1223
1224    /// Test buffer fill at the last two bytes that are not hashed.
1225    fn buffer_test_last_bytes(matching_type: MatchingType, data: &[u8]) {
1226        const BYTES_USED: usize = MAX_BUFFER_LENGTH;
1227        assert!(
1228            &data[..BYTES_USED] ==
1229                &decompress_lz77(&lz77_compress_conf(
1230                    &data[..BYTES_USED],
1231                    0,
1232                    NO_RLE,
1233                    matching_type,
1234                ).unwrap())
1235                    [..]
1236        );
1237        assert!(
1238            &data[..BYTES_USED + 1] ==
1239                &decompress_lz77(&lz77_compress_conf(
1240                    &data[..BYTES_USED + 1],
1241                    0,
1242                    NO_RLE,
1243                    matching_type,
1244                ).unwrap())
1245                    [..]
1246        );
1247    }
1248
1249    /// Test buffer fill when buffer is full at a match.
1250    fn buffer_test_match(matching_type: MatchingType) {
1251        // TODO: Also test this for the second block to make sure
1252        // buffer is slid.
1253        let mut state = TestStruct::with_config(1, 0, matching_type);
1254        for _ in 0..MAX_BUFFER_LENGTH - 4 {
1255            assert!(state.writer.write_literal(0) == BufferStatus::NotFull);
1256        }
1257        state.compress_block(&[1, 2, 3, 1, 2, 3, 4], true);
1258        assert!(*state.writer.get_buffer().last().unwrap() == LZValue::length_distance(3, 3));
1259
1260    }
1261
1262    /// Test buffer fill for the lazy match algorithm when adding a pending byte at the end.
1263    fn buffer_test_add_end(_data: &[u8]) {
1264        // This is disabled while the buffer size has not been stabilized.
1265        /*
1266        let mut state = TestStruct::with_config(DEFAULT_MAX_HASH_CHECKS,
1267                                                DEFAULT_LAZY_IF_LESS_THAN,
1268                                                MatchingType::Lazy);
1269        // For the test file, this is how much data needs to be added to get the buffer
1270        // full at the right spot to test that this buffer full exit is workong correctly.
1271        for i in 0..31743 {
1272            assert!(state.writer.write_literal(0) == BufferStatus::NotFull, "Buffer pos: {}", i);
1273        }
1274
1275        let dec = decompress_lz77(&state.writer.get_buffer()[pos..]);
1276        assert!(dec.len() > 0);
1277        assert!(dec[..] == data[..dec.len()]);
1278         */
1279    }
1280
1281    /// Check that decompressing lz77-data that refers to the back-buffer works.
1282    #[test]
1283    fn test_decompress_with_backbuffer() {
1284        let bb = vec![5; WINDOW_SIZE];
1285        let lz_compressed = [lit(2), lit(4), ld(4, 20), lit(1), lit(1), ld(5, 10)];
1286        let dec = decompress_lz77_with_backbuffer(&lz_compressed, &bb);
1287
1288        // ------------l2 l4  <-ld4,20-> l1 l1  <---ld5,10-->
1289        assert!(dec == [2, 4, 5, 5, 5, 5, 1, 1, 5, 5, 2, 4, 5]);
1290    }
1291
1292}
1293
1294#[cfg(all(test, feature = "benchmarks"))]
1295mod bench {
1296    use test_std::Bencher;
1297    use test_utils::get_test_data;
1298    use super::lz77_compress;
1299    #[bench]
1300    fn test_file_zlib_lz77_only(b: &mut Bencher) {
1301        let test_data = get_test_data();
1302        b.iter(|| lz77_compress(&test_data));
1303    }
1304}
deflate/lz77.rs

deflate/
lz77.rs