aho_corasick/
classes.rs

1use std::fmt;
2
3/// A representation of byte oriented equivalence classes.
4///
5/// This is used in an FSM to reduce the size of the transition table. This can
6/// have a particularly large impact not only on the total size of an FSM, but
7/// also on compile times.
8#[derive(Clone, Copy)]
9pub struct ByteClasses([u8; 256]);
10
11impl ByteClasses {
12    /// Creates a new set of equivalence classes where all bytes are mapped to
13    /// the same class.
14    pub fn empty() -> ByteClasses {
15        ByteClasses([0; 256])
16    }
17
18    /// Creates a new set of equivalence classes where each byte belongs to
19    /// its own equivalence class.
20    pub fn singletons() -> ByteClasses {
21        let mut classes = ByteClasses::empty();
22        for i in 0..256 {
23            classes.set(i as u8, i as u8);
24        }
25        classes
26    }
27
28    /// Set the equivalence class for the given byte.
29    #[inline]
30    pub fn set(&mut self, byte: u8, class: u8) {
31        self.0[byte as usize] = class;
32    }
33
34    /// Get the equivalence class for the given byte.
35    #[inline]
36    pub fn get(&self, byte: u8) -> u8 {
37        // SAFETY: This is safe because all dense transitions have
38        // exactly 256 elements, so all u8 values are valid indices.
39        self.0[byte as usize]
40    }
41
42    /// Return the total number of elements in the alphabet represented by
43    /// these equivalence classes. Equivalently, this returns the total number
44    /// of equivalence classes.
45    #[inline]
46    pub fn alphabet_len(&self) -> usize {
47        self.0[255] as usize + 1
48    }
49
50    /// Returns true if and only if every byte in this class maps to its own
51    /// equivalence class. Equivalently, there are 256 equivalence classes
52    /// and each class contains exactly one byte.
53    #[inline]
54    pub fn is_singleton(&self) -> bool {
55        self.alphabet_len() == 256
56    }
57
58    /// Returns an iterator over a sequence of representative bytes from each
59    /// equivalence class. Namely, this yields exactly N items, where N is
60    /// equivalent to the number of equivalence classes. Each item is an
61    /// arbitrary byte drawn from each equivalence class.
62    ///
63    /// This is useful when one is determinizing an NFA and the NFA's alphabet
64    /// hasn't been converted to equivalence classes yet. Picking an arbitrary
65    /// byte from each equivalence class then permits a full exploration of
66    /// the NFA instead of using every possible byte value.
67    pub fn representatives(&self) -> ByteClassRepresentatives<'_> {
68        ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
69    }
70
71    /// Returns all of the bytes in the given equivalence class.
72    ///
73    /// The second element in the tuple indicates the number of elements in
74    /// the array.
75    fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
76        let (mut array, mut len) = ([0; 256], 0);
77        for b in 0..256 {
78            if self.get(b as u8) == equiv {
79                array[len] = b as u8;
80                len += 1;
81            }
82        }
83        (array, len)
84    }
85}
86
87impl fmt::Debug for ByteClasses {
88    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
89        if self.is_singleton() {
90            write!(f, "ByteClasses({{singletons}})")
91        } else {
92            write!(f, "ByteClasses(")?;
93            for equiv in 0..self.alphabet_len() {
94                let (members, len) = self.elements(equiv as u8);
95                write!(f, " {} => {:?}", equiv, &members[..len])?;
96            }
97            write!(f, ")")
98        }
99    }
100}
101
102/// An iterator over representative bytes from each equivalence class.
103#[derive(Debug)]
104pub struct ByteClassRepresentatives<'a> {
105    classes: &'a ByteClasses,
106    byte: usize,
107    last_class: Option<u8>,
108}
109
110impl<'a> Iterator for ByteClassRepresentatives<'a> {
111    type Item = u8;
112
113    fn next(&mut self) -> Option<u8> {
114        while self.byte < 256 {
115            let byte = self.byte as u8;
116            let class = self.classes.get(byte);
117            self.byte += 1;
118
119            if self.last_class != Some(class) {
120                self.last_class = Some(class);
121                return Some(byte);
122            }
123        }
124        None
125    }
126}
127
128/// A byte class builder keeps track of an *approximation* of equivalence
129/// classes of bytes during NFA construction. That is, every byte in an
130/// equivalence class cannot discriminate between a match and a non-match.
131///
132/// For example, in the literals `abc` and `xyz`, the bytes [\x00-`], [d-w]
133/// and [{-\xFF] never discriminate between a match and a non-match, precisely
134/// because they never occur in the literals anywhere.
135///
136/// Note though that this does not necessarily compute the minimal set of
137/// equivalence classes. For example, in the literals above, the byte ranges
138/// [\x00-`], [d-w] and [{-\xFF] are all treated as distinct equivalence
139/// classes even though they could be treated a single class. The reason for
140/// this is implementation complexity. In the future, we should endeavor to
141/// compute the minimal equivalence classes since they can have a rather large
142/// impact on the size of the DFA.
143///
144/// The representation here is 256 booleans, all initially set to false. Each
145/// boolean maps to its corresponding byte based on position. A `true` value
146/// indicates the end of an equivalence class, where its corresponding byte
147/// and all of the bytes corresponding to all previous contiguous `false`
148/// values are in the same equivalence class.
149///
150/// This particular representation only permits contiguous ranges of bytes to
151/// be in the same equivalence class, which means that we can never discover
152/// the true minimal set of equivalence classes.
153#[derive(Debug)]
154pub struct ByteClassBuilder(Vec<bool>);
155
156impl ByteClassBuilder {
157    /// Create a new builder of byte classes where all bytes are part of the
158    /// same equivalence class.
159    pub fn new() -> ByteClassBuilder {
160        ByteClassBuilder(vec![false; 256])
161    }
162
163    /// Indicate the the range of byte given (inclusive) can discriminate a
164    /// match between it and all other bytes outside of the range.
165    pub fn set_range(&mut self, start: u8, end: u8) {
166        debug_assert!(start <= end);
167        if start > 0 {
168            self.0[start as usize - 1] = true;
169        }
170        self.0[end as usize] = true;
171    }
172
173    /// Build byte classes that map all byte values to their corresponding
174    /// equivalence class. The last mapping indicates the largest equivalence
175    /// class identifier (which is never bigger than 255).
176    pub fn build(&self) -> ByteClasses {
177        let mut classes = ByteClasses::empty();
178        let mut class = 0u8;
179        let mut i = 0;
180        loop {
181            classes.set(i as u8, class as u8);
182            if i >= 255 {
183                break;
184            }
185            if self.0[i] {
186                class = class.checked_add(1).unwrap();
187            }
188            i += 1;
189        }
190        classes
191    }
192}
193
194#[cfg(test)]
195mod tests {
196    use super::*;
197
198    #[test]
199    fn byte_classes() {
200        let mut set = ByteClassBuilder::new();
201        set.set_range(b'a', b'z');
202
203        let classes = set.build();
204        assert_eq!(classes.get(0), 0);
205        assert_eq!(classes.get(1), 0);
206        assert_eq!(classes.get(2), 0);
207        assert_eq!(classes.get(b'a' - 1), 0);
208        assert_eq!(classes.get(b'a'), 1);
209        assert_eq!(classes.get(b'm'), 1);
210        assert_eq!(classes.get(b'z'), 1);
211        assert_eq!(classes.get(b'z' + 1), 2);
212        assert_eq!(classes.get(254), 2);
213        assert_eq!(classes.get(255), 2);
214
215        let mut set = ByteClassBuilder::new();
216        set.set_range(0, 2);
217        set.set_range(4, 6);
218        let classes = set.build();
219        assert_eq!(classes.get(0), 0);
220        assert_eq!(classes.get(1), 0);
221        assert_eq!(classes.get(2), 0);
222        assert_eq!(classes.get(3), 1);
223        assert_eq!(classes.get(4), 2);
224        assert_eq!(classes.get(5), 2);
225        assert_eq!(classes.get(6), 2);
226        assert_eq!(classes.get(7), 3);
227        assert_eq!(classes.get(255), 3);
228    }
229
230    #[test]
231    fn full_byte_classes() {
232        let mut set = ByteClassBuilder::new();
233        for i in 0..256u16 {
234            set.set_range(i as u8, i as u8);
235        }
236        assert_eq!(set.build().alphabet_len(), 256);
237    }
238}