regex_automata/nfa/mod.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
use std::fmt;
use classes::ByteClasses;
pub use nfa::compiler::Builder;
mod compiler;
mod map;
mod range_trie;
/// The representation for an NFA state identifier.
pub type StateID = usize;
/// A final compiled NFA.
///
/// The states of the NFA are indexed by state IDs, which are how transitions
/// are expressed.
#[derive(Clone)]
pub struct NFA {
/// Whether this NFA can only match at the beginning of input or not.
///
/// When true, a match should only be reported if it begins at the 0th
/// index of the haystack.
anchored: bool,
/// The starting state of this NFA.
start: StateID,
/// The state list. This list is guaranteed to be indexable by the starting
/// state ID, and it is also guaranteed to contain exactly one `Match`
/// state.
states: Vec<State>,
/// A mapping from any byte value to its corresponding equivalence class
/// identifier. Two bytes in the same equivalence class cannot discriminate
/// between a match or a non-match. This map can be used to shrink the
/// total size of a DFA's transition table with a small match-time cost.
///
/// Note that the NFA's transitions are *not* defined in terms of these
/// equivalence classes. The NFA's transitions are defined on the original
/// byte values. For the most part, this is because they wouldn't really
/// help the NFA much since the NFA already uses a sparse representation
/// to represent transitions. Byte classes are most effective in a dense
/// representation.
byte_classes: ByteClasses,
}
impl NFA {
/// Returns an NFA that always matches at every position.
pub fn always_match() -> NFA {
NFA {
anchored: false,
start: 0,
states: vec![State::Match],
byte_classes: ByteClasses::empty(),
}
}
/// Returns an NFA that never matches at any position.
pub fn never_match() -> NFA {
NFA {
anchored: false,
start: 0,
states: vec![State::Fail],
byte_classes: ByteClasses::empty(),
}
}
/// Returns true if and only if this NFA is anchored.
pub fn is_anchored(&self) -> bool {
self.anchored
}
/// Return the number of states in this NFA.
pub fn len(&self) -> usize {
self.states.len()
}
/// Return the ID of the initial state of this NFA.
pub fn start(&self) -> StateID {
self.start
}
/// Return the NFA state corresponding to the given ID.
pub fn state(&self, id: StateID) -> &State {
&self.states[id]
}
/// Return the set of equivalence classes for this NFA. The slice returned
/// always has length 256 and maps each possible byte value to its
/// corresponding equivalence class ID (which is never more than 255).
pub fn byte_classes(&self) -> &ByteClasses {
&self.byte_classes
}
}
impl fmt::Debug for NFA {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for (i, state) in self.states.iter().enumerate() {
let status = if i == self.start { '>' } else { ' ' };
writeln!(f, "{}{:06}: {:?}", status, i, state)?;
}
Ok(())
}
}
/// A state in a final compiled NFA.
#[derive(Clone, Eq, PartialEq)]
pub enum State {
/// A state that transitions to `next` if and only if the current input
/// byte is in the range `[start, end]` (inclusive).
///
/// This is a special case of Sparse in that it encodes only one transition
/// (and therefore avoids the allocation).
Range { range: Transition },
/// A state with possibly many transitions, represented in a sparse
/// fashion. Transitions are ordered lexicographically by input range.
/// As such, this may only be used when every transition has equal
/// priority. (In practice, this is only used for encoding large UTF-8
/// automata.)
Sparse { ranges: Box<[Transition]> },
/// An alternation such that there exists an epsilon transition to all
/// states in `alternates`, where matches found via earlier transitions
/// are preferred over later transitions.
Union { alternates: Box<[StateID]> },
/// A fail state. When encountered, the automaton is guaranteed to never
/// reach a match state.
Fail,
/// A match state. There is exactly one such occurrence of this state in
/// an NFA.
Match,
}
/// A transition to another state, only if the given byte falls in the
/// inclusive range specified.
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub struct Transition {
pub start: u8,
pub end: u8,
pub next: StateID,
}
impl State {
/// Returns true if and only if this state contains one or more epsilon
/// transitions.
pub fn is_epsilon(&self) -> bool {
match *self {
State::Range { .. }
| State::Sparse { .. }
| State::Fail
| State::Match => false,
State::Union { .. } => true,
}
}
/// Remap the transitions in this state using the given map. Namely, the
/// given map should be indexed according to the transitions currently
/// in this state.
///
/// This is used during the final phase of the NFA compiler, which turns
/// its intermediate NFA into the final NFA.
fn remap(&mut self, remap: &[StateID]) {
match *self {
State::Range { ref mut range } => range.next = remap[range.next],
State::Sparse { ref mut ranges } => {
for r in ranges.iter_mut() {
r.next = remap[r.next];
}
}
State::Union { ref mut alternates } => {
for alt in alternates.iter_mut() {
*alt = remap[*alt];
}
}
State::Fail => {}
State::Match => {}
}
}
}
impl fmt::Debug for State {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
State::Range { ref range } => range.fmt(f),
State::Sparse { ref ranges } => {
let rs = ranges
.iter()
.map(|t| format!("{:?}", t))
.collect::<Vec<String>>()
.join(", ");
write!(f, "sparse({})", rs)
}
State::Union { ref alternates } => {
let alts = alternates
.iter()
.map(|id| format!("{}", id))
.collect::<Vec<String>>()
.join(", ");
write!(f, "alt({})", alts)
}
State::Fail => write!(f, "FAIL"),
State::Match => write!(f, "MATCH"),
}
}
}
impl fmt::Debug for Transition {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let Transition { start, end, next } = *self;
if self.start == self.end {
write!(f, "{} => {}", escape(start), next)
} else {
write!(f, "{}-{} => {}", escape(start), escape(end), next)
}
}
}
/// Return the given byte as its escaped string form.
fn escape(b: u8) -> String {
use std::ascii;
String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
}
#[cfg(test)]
mod tests {
use super::*;
use dense;
use dfa::DFA;
#[test]
fn always_match() {
let nfa = NFA::always_match();
let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
assert_eq!(Some(0), dfa.find_at(b"", 0));
assert_eq!(Some(0), dfa.find_at(b"a", 0));
assert_eq!(Some(1), dfa.find_at(b"a", 1));
assert_eq!(Some(0), dfa.find_at(b"ab", 0));
assert_eq!(Some(1), dfa.find_at(b"ab", 1));
assert_eq!(Some(2), dfa.find_at(b"ab", 2));
}
#[test]
fn never_match() {
let nfa = NFA::never_match();
let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
assert_eq!(None, dfa.find_at(b"", 0));
assert_eq!(None, dfa.find_at(b"a", 0));
assert_eq!(None, dfa.find_at(b"a", 1));
assert_eq!(None, dfa.find_at(b"ab", 0));
assert_eq!(None, dfa.find_at(b"ab", 1));
assert_eq!(None, dfa.find_at(b"ab", 2));
}
}