regex_automata/classes.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271
use core::fmt;
/// A representation of byte oriented equivalence classes.
///
/// This is used in a DFA to reduce the size of the transition table. This can
/// have a particularly large impact not only on the total size of a dense DFA,
/// but also on compile times.
#[derive(Clone, Copy)]
pub struct ByteClasses([u8; 256]);
impl ByteClasses {
/// Creates a new set of equivalence classes where all bytes are mapped to
/// the same class.
pub fn empty() -> ByteClasses {
ByteClasses([0; 256])
}
/// Creates a new set of equivalence classes where each byte belongs to
/// its own equivalence class.
pub fn singletons() -> ByteClasses {
let mut classes = ByteClasses::empty();
for i in 0..256 {
classes.set(i as u8, i as u8);
}
classes
}
/// Copies the byte classes given. The given slice must have length 0 or
/// length 256. Slices of length 0 are treated as singletons (every byte
/// is its own class).
pub fn from_slice(slice: &[u8]) -> ByteClasses {
assert!(slice.is_empty() || slice.len() == 256);
if slice.is_empty() {
ByteClasses::singletons()
} else {
let mut classes = ByteClasses::empty();
for (b, &class) in slice.iter().enumerate() {
classes.set(b as u8, class);
}
classes
}
}
/// Set the equivalence class for the given byte.
#[inline]
pub fn set(&mut self, byte: u8, class: u8) {
self.0[byte as usize] = class;
}
/// Get the equivalence class for the given byte.
#[inline]
pub fn get(&self, byte: u8) -> u8 {
self.0[byte as usize]
}
/// Get the equivalence class for the given byte while forcefully
/// eliding bounds checks.
#[inline]
pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
*self.0.get_unchecked(byte as usize)
}
/// Return the total number of elements in the alphabet represented by
/// these equivalence classes. Equivalently, this returns the total number
/// of equivalence classes.
#[inline]
pub fn alphabet_len(&self) -> usize {
self.0[255] as usize + 1
}
/// Returns true if and only if every byte in this class maps to its own
/// equivalence class. Equivalently, there are 256 equivalence classes
/// and each class contains exactly one byte.
#[inline]
pub fn is_singleton(&self) -> bool {
self.alphabet_len() == 256
}
/// Returns an iterator over a sequence of representative bytes from each
/// equivalence class. Namely, this yields exactly N items, where N is
/// equivalent to the number of equivalence classes. Each item is an
/// arbitrary byte drawn from each equivalence class.
///
/// This is useful when one is determinizing an NFA and the NFA's alphabet
/// hasn't been converted to equivalence classes yet. Picking an arbitrary
/// byte from each equivalence class then permits a full exploration of
/// the NFA instead of using every possible byte value.
#[cfg(feature = "std")]
pub fn representatives(&self) -> ByteClassRepresentatives {
ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
}
/// Returns all of the bytes in the given equivalence class.
///
/// The second element in the tuple indicates the number of elements in
/// the array.
fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
let (mut array, mut len) = ([0; 256], 0);
for b in 0..256 {
if self.get(b as u8) == equiv {
array[len] = b as u8;
len += 1;
}
}
(array, len)
}
}
impl fmt::Debug for ByteClasses {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if self.is_singleton() {
write!(f, "ByteClasses({{singletons}})")
} else {
write!(f, "ByteClasses(")?;
for equiv in 0..self.alphabet_len() {
let (members, len) = self.elements(equiv as u8);
write!(f, "{} => {:?}", equiv, &members[..len])?;
}
write!(f, ")")
}
}
}
/// An iterator over representative bytes from each equivalence class.
#[cfg(feature = "std")]
#[derive(Debug)]
pub struct ByteClassRepresentatives<'a> {
classes: &'a ByteClasses,
byte: usize,
last_class: Option<u8>,
}
#[cfg(feature = "std")]
impl<'a> Iterator for ByteClassRepresentatives<'a> {
type Item = u8;
fn next(&mut self) -> Option<u8> {
while self.byte < 256 {
let byte = self.byte as u8;
let class = self.classes.get(byte);
self.byte += 1;
if self.last_class != Some(class) {
self.last_class = Some(class);
return Some(byte);
}
}
None
}
}
/// A byte class set keeps track of an *approximation* of equivalence classes
/// of bytes during NFA construction. That is, every byte in an equivalence
/// class cannot discriminate between a match and a non-match.
///
/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
/// same equivalence class because it never matters whether an `a` or a `b` is
/// seen, and no combination of `a`s and `b`s in the text can discriminate
/// a match.
///
/// Note though that this does not compute the minimal set of equivalence
/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
/// same equivalence class for the same reason that `a` and `b` are in the
/// same equivalence class in the aforementioned regex. However, in this
/// implementation, `a` and `c` are put into distinct equivalence classes.
/// The reason for this is implementation complexity. In the future, we should
/// endeavor to compute the minimal equivalence classes since they can have a
/// rather large impact on the size of the DFA.
///
/// The representation here is 256 booleans, all initially set to false. Each
/// boolean maps to its corresponding byte based on position. A `true` value
/// indicates the end of an equivalence class, where its corresponding byte
/// and all of the bytes corresponding to all previous contiguous `false`
/// values are in the same equivalence class.
///
/// This particular representation only permits contiguous ranges of bytes to
/// be in the same equivalence class, which means that we can never discover
/// the true minimal set of equivalence classes.
#[cfg(feature = "std")]
#[derive(Debug)]
pub struct ByteClassSet(Vec<bool>);
#[cfg(feature = "std")]
impl ByteClassSet {
/// Create a new set of byte classes where all bytes are part of the same
/// equivalence class.
pub fn new() -> Self {
ByteClassSet(vec![false; 256])
}
/// Indicate the the range of byte given (inclusive) can discriminate a
/// match between it and all other bytes outside of the range.
pub fn set_range(&mut self, start: u8, end: u8) {
debug_assert!(start <= end);
if start > 0 {
self.0[start as usize - 1] = true;
}
self.0[end as usize] = true;
}
/// Convert this boolean set to a map that maps all byte values to their
/// corresponding equivalence class. The last mapping indicates the largest
/// equivalence class identifier (which is never bigger than 255).
pub fn byte_classes(&self) -> ByteClasses {
let mut classes = ByteClasses::empty();
let mut class = 0u8;
let mut i = 0;
loop {
classes.set(i as u8, class as u8);
if i >= 255 {
break;
}
if self.0[i] {
class = class.checked_add(1).unwrap();
}
i += 1;
}
classes
}
}
#[cfg(test)]
mod tests {
#[cfg(feature = "std")]
#[test]
fn byte_classes() {
use super::ByteClassSet;
let mut set = ByteClassSet::new();
set.set_range(b'a', b'z');
let classes = set.byte_classes();
assert_eq!(classes.get(0), 0);
assert_eq!(classes.get(1), 0);
assert_eq!(classes.get(2), 0);
assert_eq!(classes.get(b'a' - 1), 0);
assert_eq!(classes.get(b'a'), 1);
assert_eq!(classes.get(b'm'), 1);
assert_eq!(classes.get(b'z'), 1);
assert_eq!(classes.get(b'z' + 1), 2);
assert_eq!(classes.get(254), 2);
assert_eq!(classes.get(255), 2);
let mut set = ByteClassSet::new();
set.set_range(0, 2);
set.set_range(4, 6);
let classes = set.byte_classes();
assert_eq!(classes.get(0), 0);
assert_eq!(classes.get(1), 0);
assert_eq!(classes.get(2), 0);
assert_eq!(classes.get(3), 1);
assert_eq!(classes.get(4), 2);
assert_eq!(classes.get(5), 2);
assert_eq!(classes.get(6), 2);
assert_eq!(classes.get(7), 3);
assert_eq!(classes.get(255), 3);
}
#[cfg(feature = "std")]
#[test]
fn full_byte_classes() {
use super::ByteClassSet;
let mut set = ByteClassSet::new();
for i in 0..256u16 {
set.set_range(i as u8, i as u8);
}
assert_eq!(set.byte_classes().alphabet_len(), 256);
}
}