regex_automata/determinize.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285
use std::collections::HashMap;
use std::mem;
use std::rc::Rc;
use dense;
use error::Result;
use nfa::{self, NFA};
use sparse_set::SparseSet;
use state_id::{dead_id, StateID};
type DFARepr<S> = dense::Repr<Vec<S>, S>;
/// A determinizer converts an NFA to a DFA.
///
/// This determinizer follows the typical powerset construction, where each
/// DFA state is comprised of one or more NFA states. In the worst case, there
/// is one DFA state for every possible combination of NFA states. In practice,
/// this only happens in certain conditions, typically when there are bounded
/// repetitions.
///
/// The type variable `S` refers to the chosen state identifier representation
/// used for the DFA.
///
/// The lifetime variable `'a` refers to the lifetime of the NFA being
/// converted to a DFA.
#[derive(Debug)]
pub(crate) struct Determinizer<'a, S: StateID> {
/// The NFA we're converting into a DFA.
nfa: &'a NFA,
/// The DFA we're building.
dfa: DFARepr<S>,
/// Each DFA state being built is defined as an *ordered* set of NFA
/// states, along with a flag indicating whether the state is a match
/// state or not.
///
/// This is never empty. The first state is always a dummy state such that
/// a state id == 0 corresponds to a dead state.
builder_states: Vec<Rc<State>>,
/// A cache of DFA states that already exist and can be easily looked up
/// via ordered sets of NFA states.
cache: HashMap<Rc<State>, S>,
/// Scratch space for a stack of NFA states to visit, for depth first
/// visiting without recursion.
stack: Vec<nfa::StateID>,
/// Scratch space for storing an ordered sequence of NFA states, for
/// amortizing allocation.
scratch_nfa_states: Vec<nfa::StateID>,
/// Whether to build a DFA that finds the longest possible match.
longest_match: bool,
}
/// An intermediate representation for a DFA state during determinization.
#[derive(Debug, Eq, Hash, PartialEq)]
struct State {
/// Whether this state is a match state or not.
is_match: bool,
/// An ordered sequence of NFA states that make up this DFA state.
nfa_states: Vec<nfa::StateID>,
}
impl<'a, S: StateID> Determinizer<'a, S> {
/// Create a new determinizer for converting the given NFA to a DFA.
pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
let dead = Rc::new(State::dead());
let mut cache = HashMap::default();
cache.insert(dead.clone(), dead_id());
Determinizer {
nfa,
dfa: DFARepr::empty().anchored(nfa.is_anchored()),
builder_states: vec![dead],
cache,
stack: vec![],
scratch_nfa_states: vec![],
longest_match: false,
}
}
/// Instruct the determinizer to use equivalence classes as the transition
/// alphabet instead of all possible byte values.
pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
let byte_classes = self.nfa.byte_classes().clone();
self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
.anchored(self.nfa.is_anchored());
self
}
/// Instruct the determinizer to build a DFA that recognizes the longest
/// possible match instead of the leftmost first match. This is useful when
/// constructing reverse DFAs for finding the start of a match.
pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
self.longest_match = yes;
self
}
/// Build the DFA. If there was a problem constructing the DFA (e.g., if
/// the chosen state identifier representation is too small), then an error
/// is returned.
pub fn build(mut self) -> Result<DFARepr<S>> {
let representative_bytes: Vec<u8> =
self.dfa.byte_classes().representatives().collect();
let mut sparse = self.new_sparse_set();
let mut uncompiled = vec![self.add_start(&mut sparse)?];
while let Some(dfa_id) = uncompiled.pop() {
for &b in &representative_bytes {
let (next_dfa_id, is_new) =
self.cached_state(dfa_id, b, &mut sparse)?;
self.dfa.add_transition(dfa_id, b, next_dfa_id);
if is_new {
uncompiled.push(next_dfa_id);
}
}
}
// At this point, we shuffle the matching states in the final DFA to
// the beginning. This permits a DFA's match loop to detect a match
// condition by merely inspecting the current state's identifier, and
// avoids the need for any additional auxiliary storage.
let is_match: Vec<bool> =
self.builder_states.iter().map(|s| s.is_match).collect();
self.dfa.shuffle_match_states(&is_match);
Ok(self.dfa)
}
/// Return the identifier for the next DFA state given an existing DFA
/// state and an input byte. If the next DFA state already exists, then
/// return its identifier from the cache. Otherwise, build the state, cache
/// it and return its identifier.
///
/// The given sparse set is used for scratch space. It must have a capacity
/// equivalent to the total number of NFA states, but its contents are
/// otherwise unspecified.
///
/// This routine returns a boolean indicating whether a new state was
/// built. If a new state is built, then the caller needs to add it to its
/// frontier of uncompiled DFA states to compute transitions for.
fn cached_state(
&mut self,
dfa_id: S,
b: u8,
sparse: &mut SparseSet,
) -> Result<(S, bool)> {
sparse.clear();
// Compute the set of all reachable NFA states, including epsilons.
self.next(dfa_id, b, sparse);
// Build a candidate state and check if it has already been built.
let state = self.new_state(sparse);
if let Some(&cached_id) = self.cache.get(&state) {
// Since we have a cached state, put the constructed state's
// memory back into our scratch space, so that it can be reused.
mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
return Ok((cached_id, false));
}
// Nothing was in the cache, so add this state to the cache.
self.add_state(state).map(|s| (s, true))
}
/// Compute the set of all eachable NFA states, including the full epsilon
/// closure, from a DFA state for a single byte of input.
fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
next_nfa_states.clear();
for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
match *self.nfa.state(nfa_id) {
nfa::State::Union { .. }
| nfa::State::Fail
| nfa::State::Match => {}
nfa::State::Range { range: ref r } => {
if r.start <= b && b <= r.end {
self.epsilon_closure(r.next, next_nfa_states);
}
}
nfa::State::Sparse { ref ranges } => {
for r in ranges.iter() {
if r.start > b {
break;
} else if r.start <= b && b <= r.end {
self.epsilon_closure(r.next, next_nfa_states);
break;
}
}
}
}
}
}
/// Compute the epsilon closure for the given NFA state.
fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
if !self.nfa.state(start).is_epsilon() {
set.insert(start);
return;
}
self.stack.push(start);
while let Some(mut id) = self.stack.pop() {
loop {
if set.contains(id) {
break;
}
set.insert(id);
match *self.nfa.state(id) {
nfa::State::Range { .. }
| nfa::State::Sparse { .. }
| nfa::State::Fail
| nfa::State::Match => break,
nfa::State::Union { ref alternates } => {
id = match alternates.get(0) {
None => break,
Some(&id) => id,
};
self.stack.extend(alternates[1..].iter().rev());
}
}
}
}
}
/// Compute the initial DFA state and return its identifier.
///
/// The sparse set given is used for scratch space, and must have capacity
/// equal to the total number of NFA states. Its contents are unspecified.
fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
sparse.clear();
self.epsilon_closure(self.nfa.start(), sparse);
let state = self.new_state(&sparse);
let id = self.add_state(state)?;
self.dfa.set_start_state(id);
Ok(id)
}
/// Add the given state to the DFA and make it available in the cache.
///
/// The state initially has no transitions. That is, it transitions to the
/// dead state for all possible inputs.
fn add_state(&mut self, state: State) -> Result<S> {
let id = self.dfa.add_empty_state()?;
let rstate = Rc::new(state);
self.builder_states.push(rstate.clone());
self.cache.insert(rstate, id);
Ok(id)
}
/// Convert the given set of ordered NFA states to a DFA state.
fn new_state(&mut self, set: &SparseSet) -> State {
let mut state = State {
is_match: false,
nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
};
state.nfa_states.clear();
for &id in set {
match *self.nfa.state(id) {
nfa::State::Range { .. } => {
state.nfa_states.push(id);
}
nfa::State::Sparse { .. } => {
state.nfa_states.push(id);
}
nfa::State::Fail => {
break;
}
nfa::State::Match => {
state.is_match = true;
if !self.longest_match {
break;
}
}
nfa::State::Union { .. } => {}
}
}
state
}
/// Create a new sparse set with enough capacity to hold all NFA states.
fn new_sparse_set(&self) -> SparseSet {
SparseSet::new(self.nfa.len())
}
}
impl State {
/// Create a new empty dead state.
fn dead() -> State {
State { nfa_states: vec![], is_match: false }
}
}