// regex_automata/determinize.rs

use std::collections::HashMap;
use std::mem;
use std::rc::Rc;
use dense;
use error::Result;
use nfa::{self, NFA};
use sparse_set::SparseSet;
use state_id::{dead_id, StateID};
/// Shorthand for the DFA representation this module builds: a `dense::Repr`
/// whose first type argument is `Vec<S>` and whose state identifier type is
/// `S`.
// NOTE(review): the first type argument is presumably the owned storage for
// the transition table -- confirm against `dense::Repr`.
type DFARepr<S> = dense::Repr<Vec<S>, S>;
/// A determinizer converts an NFA to a DFA.
///
/// This determinizer follows the typical powerset construction, where each
/// DFA state is comprised of one or more NFA states. In the worst case, there
/// is one DFA state for every possible combination of NFA states. In practice,
/// this only happens in certain conditions, typically when there are bounded
/// repetitions.
///
/// A determinizer is configured builder-style (`with_byte_classes`,
/// `longest_match`) and then consumed by `build`, which returns the DFA.
///
/// The type variable `S` refers to the chosen state identifier representation
/// used for the DFA.
///
/// The lifetime variable `'a` refers to the lifetime of the NFA being
/// converted to a DFA.
#[derive(Debug)]
pub(crate) struct Determinizer<'a, S: StateID> {
/// The NFA we're converting into a DFA.
nfa: &'a NFA,
/// The DFA we're building.
dfa: DFARepr<S>,
/// Each DFA state being built is defined as an *ordered* set of NFA
/// states, along with a flag indicating whether the state is a match
/// state or not.
///
/// This is never empty. The first state is always a dummy state such that
/// a state id == 0 corresponds to a dead state.
///
/// The vector is indexed by `dfa_id.to_usize()`, and every state is shared
/// (via `Rc`) with the corresponding key in `cache`.
builder_states: Vec<Rc<State>>,
/// A cache of DFA states that already exist and can be easily looked up
/// via ordered sets of NFA states.
///
/// It is seeded in `new` with the dead state mapped to `dead_id()`.
cache: HashMap<Rc<State>, S>,
/// Scratch space for a stack of NFA states to visit, for depth first
/// visiting without recursion.
stack: Vec<nfa::StateID>,
/// Scratch space for storing an ordered sequence of NFA states, for
/// amortizing allocation.
///
/// `new_state` takes this buffer for each candidate state; `cached_state`
/// hands it back when the candidate turns out to already be cached.
scratch_nfa_states: Vec<nfa::StateID>,
/// Whether to build a DFA that finds the longest possible match.
///
/// Initialized to `false` by `new`.
longest_match: bool,
}
/// An intermediate representation for a DFA state during determinization.
///
/// The derived `Eq` and `Hash` implementations cover both fields, which is
/// what allows a `State` to be used as the key of the determinizer's cache.
#[derive(Debug, Eq, Hash, PartialEq)]
struct State {
/// Whether this state is a match state or not.
is_match: bool,
/// An ordered sequence of NFA states that make up this DFA state.
///
/// As built by `Determinizer::new_state`, only `Range` and `Sparse` NFA
/// states are recorded here.
nfa_states: Vec<nfa::StateID>,
}
impl<'a, S: StateID> Determinizer<'a, S> {
/// Construct a determinizer that will turn `nfa` into a DFA.
///
/// The returned determinizer starts with exactly one builder state: the
/// dummy dead state, which is also registered in the cache under
/// `dead_id()`. The DFA under construction is created with
/// `DFARepr::empty()` and inherits the NFA's anchored flag. Longest-match
/// semantics are off until `longest_match(true)` is called.
pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
    // The dead state is shared between the builder list and the cache.
    let dead_state = Rc::new(State::dead());
    let mut known_states = HashMap::default();
    known_states.insert(Rc::clone(&dead_state), dead_id());

    let empty_dfa = DFARepr::empty().anchored(nfa.is_anchored());
    Determinizer {
        nfa: nfa,
        dfa: empty_dfa,
        builder_states: vec![dead_state],
        cache: known_states,
        stack: Vec::new(),
        scratch_nfa_states: Vec::new(),
        longest_match: false,
    }
}
/// Switch the transition alphabet from all possible byte values to the
/// NFA's byte equivalence classes.
///
/// This replaces the DFA under construction with a fresh empty one built
/// from a clone of the NFA's byte classes, carrying over the NFA's
/// anchored flag.
pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
    let classes = self.nfa.byte_classes().clone();
    let fresh_dfa = DFARepr::empty_with_byte_classes(classes);
    self.dfa = fresh_dfa.anchored(self.nfa.is_anchored());
    self
}
/// Instruct the determinizer to build a DFA that recognizes the longest
/// possible match instead of the leftmost first match. This is useful when
/// constructing reverse DFAs for finding the start of a match.
pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
self.longest_match = yes;
self
}
/// Run determinization to completion and return the finished DFA.
///
/// Starting from the DFA start state, every not-yet-compiled DFA state is
/// popped from a work stack and its transition is computed for one
/// representative byte of every byte class. States discovered for the
/// first time are pushed onto the work stack.
///
/// An error is returned if the DFA could not be constructed (e.g., if the
/// chosen state identifier representation is too small).
pub fn build(mut self) -> Result<DFARepr<S>> {
    let class_reps: Vec<u8> =
        self.dfa.byte_classes().representatives().collect();
    let mut scratch = self.new_sparse_set();

    let start_id = self.add_start(&mut scratch)?;
    let mut pending = vec![start_id];
    while let Some(from) = pending.pop() {
        for &byte in class_reps.iter() {
            let (to, freshly_built) =
                self.cached_state(from, byte, &mut scratch)?;
            self.dfa.add_transition(from, byte, to);
            if freshly_built {
                pending.push(to);
            }
        }
    }

    // Finally, the matching states of the DFA are shuffled to the
    // beginning. This lets a match loop detect a match by looking only at
    // the current state's identifier, with no auxiliary storage.
    let match_flags: Vec<bool> = self
        .builder_states
        .iter()
        .map(|state| state.is_match)
        .collect();
    self.dfa.shuffle_match_states(&match_flags);
    Ok(self.dfa)
}
/// Return the identifier for the next DFA state given an existing DFA
/// state and an input byte. If the next DFA state already exists, then
/// return its identifier from the cache. Otherwise, build the state, cache
/// it and return its identifier.
///
/// The given sparse set is used for scratch space. It must have a capacity
/// equivalent to the total number of NFA states, but its contents are
/// otherwise unspecified.
///
/// This routine returns a boolean indicating whether a new state was
/// built. If a new state is built, then the caller needs to add it to its
/// frontier of uncompiled DFA states to compute transitions for.
///
/// An error is returned if a new state is needed but cannot be added to
/// the DFA.
fn cached_state(
    &mut self,
    dfa_id: S,
    b: u8,
    sparse: &mut SparseSet,
) -> Result<(S, bool)> {
    sparse.clear();
    // Compute the set of all reachable NFA states, including epsilons.
    self.next(dfa_id, b, sparse);
    // Build a candidate state and check if it has already been built.
    let state = self.new_state(sparse);
    if let Some(&cached_id) = self.cache.get(&state) {
        // The candidate is a duplicate, so hand its buffer back to the
        // scratch space for reuse by the next candidate. `new_state` left
        // an empty vector there, so a plain assignment loses nothing and
        // avoids discarding the return value of `mem::replace`.
        self.scratch_nfa_states = state.nfa_states;
        return Ok((cached_id, false));
    }
    // Nothing was in the cache, so add this state to the cache.
    self.add_state(state).map(|id| (id, true))
}
/// Compute the set of all reachable NFA states, including the full epsilon
/// closure, from a DFA state for a single byte of input.
///
/// `next_nfa_states` is cleared first and then filled with the result.
fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
    next_nfa_states.clear();

    // Walk the NFA states by position rather than by iterator, since
    // `epsilon_closure` needs `&mut self` inside the loop.
    let state_index = dfa_id.to_usize();
    let count = self.builder_states[state_index].nfa_states.len();
    for pos in 0..count {
        let nfa_id = self.builder_states[state_index].nfa_states[pos];
        match *self.nfa.state(nfa_id) {
            nfa::State::Range { range: ref r } => {
                if r.start <= b && b <= r.end {
                    self.epsilon_closure(r.next, next_nfa_states);
                }
            }
            nfa::State::Sparse { ref ranges } => {
                // Stop at the first range that either starts past `b` or
                // ends at/after `b`; only the latter kind can contain `b`.
                let stop = ranges
                    .iter()
                    .find(|r| r.start > b || b <= r.end);
                if let Some(r) = stop {
                    if r.start <= b {
                        self.epsilon_closure(r.next, next_nfa_states);
                    }
                }
            }
            // These states have no byte transitions to follow.
            nfa::State::Union { .. }
            | nfa::State::Fail
            | nfa::State::Match => {}
        }
    }
}
/// Add the epsilon closure of the NFA state `start` to `set`.
///
/// If `start` is not an epsilon state, it is simply inserted. Otherwise
/// the closure is explored depth first using `self.stack` as an explicit
/// work stack: the first alternate of a union is followed immediately and
/// the remaining alternates are pushed in reverse, so they are later
/// popped in their original order.
fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
    if !self.nfa.state(start).is_epsilon() {
        set.insert(start);
        return;
    }

    self.stack.push(start);
    while let Some(popped) = self.stack.pop() {
        let mut current = popped;
        // Follow a chain of first alternates until reaching a state that
        // was already visited or that has no epsilon transitions.
        while !set.contains(current) {
            set.insert(current);
            match *self.nfa.state(current) {
                nfa::State::Union { ref alternates } => {
                    match alternates.split_first() {
                        None => break,
                        Some((&first, rest)) => {
                            current = first;
                            self.stack.extend(rest.iter().rev());
                        }
                    }
                }
                nfa::State::Range { .. }
                | nfa::State::Sparse { .. }
                | nfa::State::Fail
                | nfa::State::Match => break,
            }
        }
    }
}
/// Build the initial DFA state from the epsilon closure of the NFA's start
/// state, register it as the DFA's start state and return its identifier.
///
/// The sparse set given is used for scratch space, and must have capacity
/// equal to the total number of NFA states. Its contents are unspecified.
fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
    sparse.clear();
    let nfa_start = self.nfa.start();
    self.epsilon_closure(nfa_start, sparse);

    let start_state = self.new_state(sparse);
    let start_id = self.add_state(start_state)?;
    self.dfa.set_start_state(start_id);
    Ok(start_id)
}
/// Allocate a new DFA state for `state`, record it in the builder list
/// and in the cache, and return its identifier.
///
/// The state initially has no transitions. That is, it transitions to the
/// dead state for all possible inputs.
///
/// An error is returned if the DFA cannot allocate another state.
fn add_state(&mut self, state: State) -> Result<S> {
    let id = self.dfa.add_empty_state()?;
    // One allocation, shared between the cache key and the builder list.
    let shared = Rc::new(state);
    self.cache.insert(Rc::clone(&shared), id);
    self.builder_states.push(shared);
    Ok(id)
}
/// Build a candidate DFA state from the given ordered set of NFA states.
///
/// Only `Range` and `Sparse` NFA states are recorded in the result. A
/// `Match` state marks the result as matching and, unless longest-match
/// mode is enabled, ends the scan. A `Fail` state always ends the scan.
/// `Union` states are skipped.
///
/// The returned state's buffer is taken from `scratch_nfa_states` to
/// amortize allocation.
fn new_state(&mut self, set: &SparseSet) -> State {
    // Take the scratch buffer, leaving an empty vector behind.
    let mut nfa_states =
        mem::replace(&mut self.scratch_nfa_states, vec![]);
    nfa_states.clear();

    let mut is_match = false;
    for &id in set {
        match *self.nfa.state(id) {
            nfa::State::Range { .. } | nfa::State::Sparse { .. } => {
                nfa_states.push(id);
            }
            nfa::State::Union { .. } => {}
            nfa::State::Fail => break,
            nfa::State::Match => {
                is_match = true;
                if !self.longest_match {
                    break;
                }
            }
        }
    }
    State { is_match, nfa_states }
}
/// Allocate a sparse set whose capacity equals the number of states in
/// the NFA, so that it can hold every NFA state.
fn new_sparse_set(&self) -> SparseSet {
    let nfa_state_count = self.nfa.len();
    SparseSet::new(nfa_state_count)
}
}
impl State {
    /// Build the dead state: it contains no NFA states and is not a match
    /// state.
    fn dead() -> State {
        State { is_match: false, nfa_states: Vec::new() }
    }
}