deflate/
huffman_lengths.rs

1use length_encode::{EncodedLength, encode_lengths_m, huffman_lengths_from_frequency_m,
2                    COPY_PREVIOUS, REPEAT_ZERO_3_BITS, REPEAT_ZERO_7_BITS};
3use huffman_table::{HuffmanTable, create_codes_in_place, num_extra_bits_for_length_code,
4                    num_extra_bits_for_distance_code, NUM_LITERALS_AND_LENGTHS,
5                    NUM_DISTANCE_CODES, MAX_CODE_LENGTH, FIXED_CODE_LENGTHS, LENGTH_BITS_START};
6use bitstream::LsbWriter;
7use output_writer::FrequencyType;
8use stored_block::MAX_STORED_BLOCK_LENGTH;
9use deflate_state::LengthBuffers;
10
11use std::cmp;
12
13// The minimum number of literal/length values
14pub const MIN_NUM_LITERALS_AND_LENGTHS: usize = 257;
15// The minimum number of distances
16pub const MIN_NUM_DISTANCES: usize = 1;
17
18const NUM_HUFFMAN_LENGTHS: usize = 19;
19
20// The output ordering of the lengths for the huffman codes used to encode the lengths
21// used to build the full huffman tree for length/literal codes.
22// http://www.gzip.org/zlib/rfc-deflate.html#dyn
23const HUFFMAN_LENGTH_ORDER: [u8; NUM_HUFFMAN_LENGTHS] = [
24    16,
25    17,
26    18,
27    0,
28    8,
29    7,
30    9,
31    6,
32    10,
33    5,
34    11,
35    4,
36    12,
37    3,
38    13,
39    2,
40    14,
41    1,
42    15,
43];
44
45// Number of bits used for the values specifying the number of codes
46const HLIT_BITS: u8 = 5;
47const HDIST_BITS: u8 = 5;
48const HCLEN_BITS: u8 = 4;
49
50// The longest a huffman code describing another huffman length can be
51const MAX_HUFFMAN_CODE_LENGTH: usize = 7;
52
53// How many bytes (not including padding and the 3-bit block type) the stored block header takes up.
54const STORED_BLOCK_HEADER_LENGTH: u64 = 4;
55const BLOCK_MARKER_LENGTH: u8 = 3;
56
57/// Creates a new slice from the input slice that stops at the final non-zero value
58pub fn remove_trailing_zeroes<T: From<u8> + PartialEq>(input: &[T], min_length: usize) -> &[T] {
59    let num_zeroes = input.iter().rev().take_while(|&a| *a == T::from(0)).count();
60    &input[0..cmp::max(input.len() - num_zeroes, min_length)]
61}
62
63/// How many extra bits the huffman length code uses to represent a value.
64fn extra_bits_for_huffman_length_code(code: u8) -> u8 {
65    match code {
66        16...17 => 3,
67        18 => 7,
68        _ => 0,
69    }
70}
71
72/// Calculate how many bits the huffman-encoded huffman lengths will use.
73fn calculate_huffman_length(frequencies: &[FrequencyType], code_lengths: &[u8]) -> u64 {
74    frequencies.iter().zip(code_lengths).enumerate().fold(
75        0,
76        |acc, (n, (&f, &l))| {
77            acc +
78                (u64::from(f) *
79                     (u64::from(l) + u64::from(extra_bits_for_huffman_length_code(n as u8))))
80        },
81    )
82}
83
84/// Calculate how many bits data with the given frequencies will use when compressed with dynamic
85/// code lengths (first return value) and static code lengths (second return value).
86///
87/// Parameters:
88/// Frequencies, length of dynamic codes, and a function to get how many extra bits in addition
89/// to the length of the huffman code the symbol will use.
90fn calculate_block_length<F>(
91    frequencies: &[FrequencyType],
92    dyn_code_lengths: &[u8],
93    get_num_extra_bits: &F,
94) -> (u64, u64)
95where
96    F: Fn(usize) -> u64,
97{
98    // Length of data represented by dynamic codes.
99    let mut d_ll_length = 0u64;
100    // length of data represented by static codes.
101    let mut s_ll_length = 0u64;
102
103    let iter = frequencies
104        .iter()
105        .zip(dyn_code_lengths.iter().zip(FIXED_CODE_LENGTHS.iter()))
106        .enumerate();
107
108    // This could maybe be optimised a bit by splitting the iteration of codes using extra bits and
109    // codes not using extra bits, but the extra complexity may not be worth it.
110    for (c, (&f, (&l, &fl))) in iter {
111        // Frequency
112        let f = u64::from(f);
113        // How many extra bits the current code number needs.
114        let extra_bits_for_code = get_num_extra_bits(c);
115
116        d_ll_length += f * (u64::from(l) + extra_bits_for_code);
117        s_ll_length += f * (u64::from(fl) + extra_bits_for_code);
118
119    }
120
121    (d_ll_length, s_ll_length)
122}
123
124/// Get how extra padding bits after a block start header a stored block would use.
125///
126/// # Panics
127/// Panics if `pending_bits > 8`
128fn stored_padding(pending_bits: u8) -> u64 {
129    assert!(pending_bits <= 8);
130    let free_space = 8 - pending_bits;
131    if free_space >= BLOCK_MARKER_LENGTH {
132        // There is space in the current byte for the header.
133        free_space - BLOCK_MARKER_LENGTH
134    } else {
135        // The header will require an extra byte.
136        8 - (BLOCK_MARKER_LENGTH - free_space)
137    }.into()
138}
139
140/// Calculate the number of bits storing the data in stored blocks will take up, excluding the
141/// first block start code and potential padding bits. As stored blocks have a maximum length,
142/// (as opposed to fixed and dynamic ones), multiple blocks may have to be utilised.
143///
144/// # Panics
145/// Panics if `input_bytes` is 0.
146fn stored_length(input_bytes: u64) -> u64 {
147    // Check how many stored blocks these bytes would take up.
148    // (Integer divison rounding up.)
149    let num_blocks = (input_bytes
150                          .checked_sub(1)
151                          .expect("Underflow calculating stored block length!") /
152                          MAX_STORED_BLOCK_LENGTH as u64) + 1;
153    // The length will be the input length and the headers for each block. (Excluding the start
154    // of block code for the first one)
155    (input_bytes + (STORED_BLOCK_HEADER_LENGTH as u64 * num_blocks) + (num_blocks - 1)) * 8
156}
157
158pub enum BlockType {
159    Stored,
160    Fixed,
161    Dynamic(DynamicBlockHeader),
162}
163
164/// A struct containing the different data needed to write the header for a dynamic block.
165///
166/// The code lengths are stored directly in the `HuffmanTable` struct.
167/// TODO: Do the same for other things here.
168pub struct DynamicBlockHeader {
169    /// Length of the run-length encoding symbols.
170    pub huffman_table_lengths: Vec<u8>,
171    /// Number of lengths for values describing the huffman table that encodes the length values
172    /// of the main huffman tables.
173    pub used_hclens: usize,
174}
175
176/// Generate the lengths of the huffman codes we will be using, using the
177/// frequency of the different symbols/lengths/distances, and determine what block type will give
178/// the shortest representation.
179/// TODO: This needs a test
180pub fn gen_huffman_lengths(
181    l_freqs: &[FrequencyType],
182    d_freqs: &[FrequencyType],
183    num_input_bytes: u64,
184    pending_bits: u8,
185    l_lengths: &mut [u8; 288],
186    d_lengths: &mut [u8; 32],
187    length_buffers: &mut LengthBuffers,
188) -> BlockType {
189    // Avoid corner cases and issues if this is called for an empty block.
190    // For blocks this short, a fixed block will be the shortest.
191    // TODO: Find the minimum value it's worth doing calculations for.
192    if num_input_bytes <= 4 {
193        return BlockType::Fixed;
194    };
195
196    let l_freqs = remove_trailing_zeroes(l_freqs, MIN_NUM_LITERALS_AND_LENGTHS);
197    let d_freqs = remove_trailing_zeroes(d_freqs, MIN_NUM_DISTANCES);
198
199    // The huffman spec allows us to exclude zeroes at the end of the
200    // table of huffman lengths.
201    // Since a frequency of 0 will give an huffman
202    // length of 0. We strip off the trailing zeroes before even
203    // generating the lengths to save some work.
204    // There is however a minimum number of values we have to keep
205    // according to the deflate spec.
206    // TODO: We could probably compute some of this in parallel.
207    huffman_lengths_from_frequency_m(
208        l_freqs,
209        MAX_CODE_LENGTH,
210        &mut length_buffers.leaf_buf,
211        l_lengths,
212    );
213    huffman_lengths_from_frequency_m(
214        d_freqs,
215        MAX_CODE_LENGTH,
216        &mut length_buffers.leaf_buf,
217        d_lengths,
218    );
219
220
221    let used_lengths = l_freqs.len();
222    let used_distances = d_freqs.len();
223
224    // Encode length values
225    let mut freqs = [0u16; 19];
226    encode_lengths_m(
227        l_lengths[..used_lengths]
228            .iter()
229            .chain(&d_lengths[..used_distances]),
230        &mut length_buffers.length_buf,
231        &mut freqs,
232    );
233
234    // Create huffman lengths for the length/distance code lengths
235    let mut huffman_table_lengths = vec![0; freqs.len()];
236    huffman_lengths_from_frequency_m(
237        &freqs,
238        MAX_HUFFMAN_CODE_LENGTH,
239        &mut length_buffers.leaf_buf,
240        huffman_table_lengths.as_mut_slice(),
241    );
242
243    // Count how many of these lengths we use.
244    let used_hclens = HUFFMAN_LENGTH_ORDER.len() -
245        HUFFMAN_LENGTH_ORDER
246            .iter()
247            .rev()
248            .take_while(|&&n| huffman_table_lengths[n as usize] == 0)
249            .count();
250
251    // There has to be at least 4 hclens, so if there isn't, something went wrong.
252    debug_assert!(used_hclens >= 4);
253
254    // Calculate how many bytes of space this block will take up with the different block types
255    // (excluding the 3-bit block header since it's used in all block types).
256
257    // Total length of the compressed literals/lengths.
258    let (d_ll_length, s_ll_length) = calculate_block_length(l_freqs, l_lengths, &|c| {
259        num_extra_bits_for_length_code(c.saturating_sub(LENGTH_BITS_START as usize) as u8).into()
260    });
261
262    // Total length of the compressed distances.
263    let (d_dist_length, s_dist_length) = calculate_block_length(d_freqs, d_lengths, &|c| {
264        num_extra_bits_for_distance_code(c as u8).into()
265    });
266
267    // Total length of the compressed huffman code lengths.
268    let huff_table_length = calculate_huffman_length(&freqs, &huffman_table_lengths);
269
270    // For dynamic blocks the huffman tables takes up some extra space.
271    let dynamic_length = d_ll_length + d_dist_length + huff_table_length +
272        (used_hclens as u64 * 3) + u64::from(HLIT_BITS) +
273        u64::from(HDIST_BITS) + u64::from(HCLEN_BITS);
274
275    // Static blocks don't have any extra header data.
276    let static_length = s_ll_length + s_dist_length;
277
278    // Calculate how many bits it will take to store the data in uncompressed (stored) block(s).
279    let stored_length = stored_length(num_input_bytes) + stored_padding(pending_bits % 8);
280
281    let used_length = cmp::min(cmp::min(dynamic_length, static_length), stored_length);
282
283    // Check if the block is actually compressed. If using a dynamic block
284    // increases the length of the block (for instance if the input data is mostly random or
285    // already compressed), we want to output a stored(uncompressed) block instead to avoid wasting
286    // space.
287    if used_length == static_length {
288        BlockType::Fixed
289    } else if used_length == stored_length {
290        BlockType::Stored
291    } else {
292        BlockType::Dynamic(DynamicBlockHeader {
293            huffman_table_lengths: huffman_table_lengths,
294            used_hclens: used_hclens,
295        })
296    }
297}
298
299/// Write the specified huffman lengths to the bit writer
300pub fn write_huffman_lengths(
301    header: &DynamicBlockHeader,
302    huffman_table: &HuffmanTable,
303    encoded_lengths: &[EncodedLength],
304    writer: &mut LsbWriter,
305) {
306    // Ignore trailing zero lengths as allowed by the deflate spec.
307    let (literal_len_lengths, distance_lengths) = huffman_table.get_lengths();
308    let literal_len_lengths =
309        remove_trailing_zeroes(literal_len_lengths, MIN_NUM_LITERALS_AND_LENGTHS);
310    let distance_lengths = remove_trailing_zeroes(distance_lengths, MIN_NUM_DISTANCES);
311    let huffman_table_lengths = &header.huffman_table_lengths;
312    let used_hclens = header.used_hclens;
313
314    assert!(literal_len_lengths.len() <= NUM_LITERALS_AND_LENGTHS);
315    assert!(literal_len_lengths.len() >= MIN_NUM_LITERALS_AND_LENGTHS);
316    assert!(distance_lengths.len() <= NUM_DISTANCE_CODES);
317    assert!(distance_lengths.len() >= MIN_NUM_DISTANCES);
318
319    // Number of length codes - 257.
320    let hlit = (literal_len_lengths.len() - MIN_NUM_LITERALS_AND_LENGTHS) as u16;
321    writer.write_bits(hlit, HLIT_BITS);
322    // Number of distance codes - 1.
323    let hdist = (distance_lengths.len() - MIN_NUM_DISTANCES) as u16;
324    writer.write_bits(hdist, HDIST_BITS);
325
326    // Number of huffman table lengths - 4.
327    let hclen = used_hclens.saturating_sub(4);
328
329    // Write HCLEN.
330    // Casting to u16 is safe since the length can never be more than the length of
331    // `HUFFMAN_LENGTH_ORDER` anyhow.
332    writer.write_bits(hclen as u16, HCLEN_BITS);
333
334    // Write the lengths for the huffman table describing the huffman table
335    // Each length is 3 bits
336    for n in &HUFFMAN_LENGTH_ORDER[..used_hclens] {
337        writer.write_bits(huffman_table_lengths[usize::from(*n)] as u16, 3);
338    }
339
340    // Generate codes for the main huffman table using the lengths we just wrote
341    let mut codes = [0u16; NUM_HUFFMAN_LENGTHS];
342    create_codes_in_place(&mut codes[..], huffman_table_lengths);
343
344    // Write the actual huffman lengths
345    for v in encoded_lengths {
346        match *v {
347            EncodedLength::Length(n) => {
348                let (c, l) = (codes[usize::from(n)], huffman_table_lengths[usize::from(n)]);
349                writer.write_bits(c, l);
350            }
351            EncodedLength::CopyPrevious(n) => {
352                let (c, l) = (codes[COPY_PREVIOUS], huffman_table_lengths[COPY_PREVIOUS]);
353                writer.write_bits(c, l);
354                debug_assert!(n >= 3);
355                debug_assert!(n <= 6);
356                writer.write_bits((n - 3).into(), 2);
357            }
358            EncodedLength::RepeatZero3Bits(n) => {
359                let (c, l) = (
360                    codes[REPEAT_ZERO_3_BITS],
361                    huffman_table_lengths[REPEAT_ZERO_3_BITS],
362                );
363                writer.write_bits(c, l);
364                debug_assert!(n >= 3);
365                writer.write_bits((n - 3).into(), 3);
366            }
367            EncodedLength::RepeatZero7Bits(n) => {
368                let (c, l) = (
369                    codes[REPEAT_ZERO_7_BITS],
370                    huffman_table_lengths[REPEAT_ZERO_7_BITS],
371                );
372                writer.write_bits(c, l);
373                debug_assert!(n >= 11);
374                debug_assert!(n <= 138);
375                writer.write_bits((n - 11).into(), 7);
376            }
377        }
378    }
379}
380
381
382#[cfg(test)]
383mod test {
384    use super::stored_padding;
385    #[test]
386    fn padding() {
387        assert_eq!(stored_padding(0), 5);
388        assert_eq!(stored_padding(1), 4);
389        assert_eq!(stored_padding(2), 3);
390        assert_eq!(stored_padding(3), 2);
391        assert_eq!(stored_padding(4), 1);
392        assert_eq!(stored_padding(5), 0);
393        assert_eq!(stored_padding(6), 7);
394        assert_eq!(stored_padding(7), 6);
395    }
396}