deflate/output_writer.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
use std::u16;
use lzvalue::LZValue;
use huffman_table::{NUM_LITERALS_AND_LENGTHS, NUM_DISTANCE_CODES, END_OF_BLOCK_POSITION,
get_distance_code, get_length_code};
/// The type used for representing how many times a literal, length or distance code has been ouput
/// to the current buffer.
/// As we are limiting the blocks to be at most 2^16 bytes long, we can represent frequencies using
/// 16-bit values.
pub type FrequencyType = u16;
/// The maximum number of literals/lengths in the buffer, which in practice also means the maximum
/// number of literals/lengths output before a new block is started.
/// This should not be larger than the maximum value `FrequencyType` can represent to prevent
/// overflowing (which would degrade, or in the worst case break compression).
pub const MAX_BUFFER_LENGTH: usize = 1024 * 31;
#[derive(Debug, PartialEq)]
pub enum BufferStatus {
NotFull,
Full,
}
/// Struct that buffers lz77 data and keeps track of the usage of different codes
pub struct DynamicWriter {
buffer: Vec<LZValue>,
// The two last length codes are not actually used, but only participates in code construction
// Therefore, we ignore them to get the correct number of lengths
frequencies: [FrequencyType; NUM_LITERALS_AND_LENGTHS],
distance_frequencies: [FrequencyType; NUM_DISTANCE_CODES],
}
impl DynamicWriter {
#[inline]
pub fn check_buffer_length(&self) -> BufferStatus {
if self.buffer.len() >= MAX_BUFFER_LENGTH {
BufferStatus::Full
} else {
BufferStatus::NotFull
}
}
#[inline]
pub fn write_literal(&mut self, literal: u8) -> BufferStatus {
debug_assert!(self.buffer.len() < MAX_BUFFER_LENGTH);
self.buffer.push(LZValue::literal(literal));
self.frequencies[usize::from(literal)] += 1;
self.check_buffer_length()
}
#[inline]
pub fn write_length_distance(&mut self, length: u16, distance: u16) -> BufferStatus {
self.buffer.push(LZValue::length_distance(length, distance));
let l_code_num = get_length_code(length);
// As we limit the buffer to 2^16 values, this should be safe from overflowing.
if cfg!(debug_assertions) {
self.frequencies[l_code_num] += 1;
} else {
// #Safety
// None of the values in the table of length code numbers will give a value
// that is out of bounds.
// There is a test to ensure that these functions can not produce too large values.
unsafe {
*self.frequencies.get_unchecked_mut(l_code_num) += 1;
}
}
let d_code_num = get_distance_code(distance);
// The compiler seems to be able to evade the bounds check here somehow.
self.distance_frequencies[usize::from(d_code_num)] += 1;
self.check_buffer_length()
}
pub fn buffer_length(&self) -> usize {
self.buffer.len()
}
pub fn get_buffer(&self) -> &[LZValue] {
&self.buffer
}
pub fn new() -> DynamicWriter {
let mut w = DynamicWriter {
buffer: Vec::with_capacity(MAX_BUFFER_LENGTH),
frequencies: [0; NUM_LITERALS_AND_LENGTHS],
distance_frequencies: [0; NUM_DISTANCE_CODES],
};
// This will always be 1,
// since there will always only be one end of block marker in each block
w.frequencies[END_OF_BLOCK_POSITION] = 1;
w
}
/// Special output function used with RLE compression
/// that avoids bothering to lookup a distance code.
#[inline]
pub fn write_length_rle(&mut self, length: u16) -> BufferStatus {
self.buffer.push(LZValue::length_distance(length, 1));
let l_code_num = get_length_code(length);
// As we limit the buffer to 2^16 values, this should be safe from overflowing.
if cfg!(debug_assertions) {
self.frequencies[l_code_num] += 1;
} else {
// #Safety
// None of the values in the table of length code numbers will give a value
// that is out of bounds.
// There is a test to ensure that these functions won't produce too large values.
unsafe {
*self.frequencies.get_unchecked_mut(l_code_num) += 1;
}
}
self.distance_frequencies[0] += 1;
self.check_buffer_length()
}
pub fn get_frequencies(&self) -> (&[u16], &[u16]) {
(&self.frequencies, &self.distance_frequencies)
}
pub fn clear_frequencies(&mut self) {
self.frequencies = [0; NUM_LITERALS_AND_LENGTHS];
self.distance_frequencies = [0; NUM_DISTANCE_CODES];
self.frequencies[END_OF_BLOCK_POSITION] = 1;
}
pub fn clear_data(&mut self) {
self.buffer.clear()
}
pub fn clear(&mut self) {
self.clear_frequencies();
self.clear_data();
}
}
#[cfg(test)]
mod test {
use super::*;
use huffman_table::{get_length_code, get_distance_code};
#[test]
/// Ensure that these function won't produce values that would overflow the output_writer
/// tables since we use some unsafe indexing.
fn array_bounds() {
let w = DynamicWriter::new();
for i in 0..u16::max_value() {
get_length_code(i) < w.frequencies.len();
}
for i in 0..u16::max_value() {
get_distance_code(i) < w.distance_frequencies.len() as u8;
}
}
}