zstd/
dict.rs

//! Train a dictionary from various sources.
//!
//! A dictionary can help improve the compression of small files.
//! The dictionary must be present during decompression,
//! but can be shared across multiple "similar" files.
//!
//! Creating a dictionary using the `zstd` C library,
//! using the `zstd` command-line interface, using this library,
//! or using the `train` binary provided, should give the same result,
//! and the resulting dictionaries are therefore completely compatible.
//!
//! To use, see [`Encoder::with_dictionary`] or [`Decoder::with_dictionary`].
//!
//! [`Encoder::with_dictionary`]: ../struct.Encoder.html#method.with_dictionary
//! [`Decoder::with_dictionary`]: ../struct.Decoder.html#method.with_dictionary
16
17#[cfg(feature = "zdict_builder")]
18use std::io::{self, Read};
19
20pub use zstd_safe::{CDict, DDict};
21
22/// Prepared dictionary for compression
23///
24/// A dictionary can include its own copy of the data (if it is `'static`), or it can merely point
25/// to a separate buffer (if it has another lifetime).
26pub struct EncoderDictionary<'a> {
27    cdict: CDict<'a>,
28}
29
30impl EncoderDictionary<'static> {
31    /// Creates a prepared dictionary for compression.
32    ///
33    /// This will copy the dictionary internally.
34    pub fn copy(dictionary: &[u8], level: i32) -> Self {
35        Self {
36            cdict: zstd_safe::create_cdict(dictionary, level),
37        }
38    }
39}
40
41impl<'a> EncoderDictionary<'a> {
42    #[cfg(feature = "experimental")]
43    #[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "experimental")))]
44    /// Create prepared dictionary for compression
45    ///
46    /// A level of `0` uses zstd's default (currently `3`).
47    ///
48    /// Only available with the `experimental` feature. Use `EncoderDictionary::copy` otherwise.
49    pub fn new(dictionary: &'a [u8], level: i32) -> Self {
50        Self {
51            cdict: zstd_safe::create_cdict_by_reference(dictionary, level),
52        }
53    }
54
55    /// Returns reference to `CDict` inner object
56    pub fn as_cdict(&self) -> &CDict<'a> {
57        &self.cdict
58    }
59}
60
61/// Prepared dictionary for decompression
62pub struct DecoderDictionary<'a> {
63    ddict: DDict<'a>,
64}
65
66impl DecoderDictionary<'static> {
67    /// Create a prepared dictionary for decompression.
68    ///
69    /// This will copy the dictionary internally.
70    pub fn copy(dictionary: &[u8]) -> Self {
71        Self {
72            ddict: zstd_safe::DDict::create(dictionary),
73        }
74    }
75}
76
77impl<'a> DecoderDictionary<'a> {
78    #[cfg(feature = "experimental")]
79    #[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "experimental")))]
80    /// Create prepared dictionary for decompression
81    ///
82    /// Only available with the `experimental` feature. Use `DecoderDictionary::copy` otherwise.
83    pub fn new(dict: &'a [u8]) -> Self {
84        Self {
85            ddict: zstd_safe::create_ddict_by_reference(dict),
86        }
87    }
88
89    /// Returns reference to `DDict` inner object
90    pub fn as_ddict(&self) -> &DDict<'a> {
91        &self.ddict
92    }
93}
94
/// Train a dictionary from a big continuous chunk of data.
///
/// This is the most efficient way to train a dictionary,
/// since this is directly fed into `zstd`.
///
/// * `sample_data` holds every sample, stored back to back.
/// * `sample_sizes` lists the length of each sample, in order. The sizes
///   must add up to exactly `sample_data.len()`.
/// * `max_size` is the capacity of the buffer handed to `zstd` to receive
///   the trained dictionary.
///
/// # Errors
///
/// Returns an error of kind `io::ErrorKind::Other` if the sample sizes do
/// not add up to the length of `sample_data` (this includes the case where
/// their sum overflows `usize`). Errors reported by
/// `zstd_safe::train_from_buffer` are converted with `map_error_code`.
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_continuous(
    sample_data: &[u8],
    sample_sizes: &[usize],
    max_size: usize,
) -> io::Result<Vec<u8>> {
    use crate::map_error_code;

    // Add the sizes with overflow checking: a wrapping sum could land on
    // `sample_data.len()` by accident and let invalid sizes through to the
    // trainer. An overflowing sum can never describe a real buffer, so it is
    // reported the same way as any other mismatch.
    let total = sample_sizes
        .iter()
        .try_fold(0usize, |acc, &size| acc.checked_add(size));

    // Complain if the lengths don't add up to the entire data.
    if total != Some(sample_data.len()) {
        return Err(io::Error::new(
            io::ErrorKind::Other,
            "sample sizes don't add up",
        ));
    }

    // The trained dictionary is written into the reserved capacity.
    let mut result = Vec::with_capacity(max_size);
    zstd_safe::train_from_buffer(&mut result, sample_data, sample_sizes)
        .map_err(map_error_code)?;
    Ok(result)
}
121
/// Train a dictionary from multiple samples.
///
/// The samples will internally be copied to a single continuous buffer,
/// so make sure you have enough memory available.
///
/// If you need to stretch your system's limits,
/// [`from_continuous`] directly uses the given slice.
///
/// `max_size` is forwarded unchanged to [`from_continuous`], as are any
/// errors it returns.
///
/// [`from_continuous`]: ./fn.from_continuous.html
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_samples<S: AsRef<[u8]>>(
    samples: &[S],
    max_size: usize,
) -> io::Result<Vec<u8>> {
    // Copy every sample to a big chunk of memory, recording its length as we
    // go. A single pass with slice-wise copies guarantees that `data` and
    // `sizes` are derived from the same `as_ref()` result for each sample.
    let mut sizes = Vec::with_capacity(samples.len());
    let mut data = Vec::new();
    for sample in samples {
        let bytes = sample.as_ref();
        sizes.push(bytes.len());
        data.extend_from_slice(bytes);
    }

    from_continuous(&data, &sizes, max_size)
}
144
/// Train a dictionary from the contents of a list of files.
///
/// Each file is read in full and appended to one shared buffer; the number
/// of bytes read from each file is used as that sample's size. The buffer,
/// the sizes and `max_size` are then passed to `from_continuous`.
///
/// Fails with the first I/O error hit while opening or reading a file, or
/// with whatever error `from_continuous` returns.
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_files<I, P>(filenames: I, max_size: usize) -> io::Result<Vec<u8>>
where
    P: AsRef<std::path::Path>,
    I: IntoIterator<Item = P>,
{
    use std::fs::File;

    let mut contents = Vec::new();

    // Collecting into `io::Result` stops at the first file that fails.
    let lengths = filenames
        .into_iter()
        .map(|path| -> io::Result<usize> {
            File::open(path)?.read_to_end(&mut contents)
        })
        .collect::<io::Result<Vec<usize>>>()?;

    from_continuous(&contents, &lengths, max_size)
}
166
#[cfg(test)]
#[cfg(feature = "zdict_builder")]
mod tests {
    use std::fs;
    use std::io::{self, Read};

    use walkdir;

    /// Trains a dictionary on every `.rs` path found under `src`, then checks
    /// that each of those files survives a compress/decompress round trip
    /// using that dictionary.
    #[test]
    fn test_dict_training() {
        // Gather the training set.
        let paths: Vec<_> = walkdir::WalkDir::new("src")
            .into_iter()
            .map(|entry| entry.unwrap().into_path())
            .filter(|path| path.to_str().unwrap().ends_with(".rs"))
            .collect();

        // Train a dictionary
        let dict = super::from_files(&paths, 4000).unwrap();

        for path in paths {
            let mut content = Vec::new();
            fs::File::open(path)
                .unwrap()
                .read_to_end(&mut content)
                .unwrap();

            // Compress with the dictionary. The encoder is scoped so that it
            // is dropped (and therefore finished) before `compressed` is read.
            let mut compressed = Vec::new();
            {
                let mut encoder = crate::stream::Encoder::with_dictionary(
                    &mut compressed,
                    1,
                    &dict,
                )
                .unwrap()
                .auto_finish();
                io::copy(&mut &content[..], &mut encoder).unwrap();
            }

            // Decompress with the same dictionary.
            let mut decoder = crate::stream::Decoder::with_dictionary(
                &compressed[..],
                &dict[..],
            )
            .unwrap();
            let mut result = Vec::new();
            io::copy(&mut decoder, &mut result).unwrap();

            assert_eq!(&content, &result);
        }
    }
}