regex_automata/
determinize.rs

use std::collections::HashMap;
use std::mem;
use std::rc::Rc;

use dense;
use error::Result;
use nfa::{self, NFA};
use sparse_set::SparseSet;
use state_id::{dead_id, StateID};

type DFARepr<S> = dense::Repr<Vec<S>, S>;

/// A determinizer converts an NFA to a DFA.
///
/// This determinizer follows the typical powerset construction, where each
/// DFA state is comprised of one or more NFA states. In the worst case, there
/// is one DFA state for every possible combination of NFA states. In practice,
/// this only happens in certain conditions, typically when there are bounded
/// repetitions.
///
/// The type variable `S` refers to the chosen state identifier representation
/// used for the DFA.
///
/// The lifetime variable `'a` refers to the lifetime of the NFA being
/// converted to a DFA.
#[derive(Debug)]
pub(crate) struct Determinizer<'a, S: StateID> {
    /// The NFA we're converting into a DFA.
    nfa: &'a NFA,
    /// The DFA we're building.
    dfa: DFARepr<S>,
    /// Each DFA state being built is defined as an *ordered* set of NFA
    /// states, along with a flag indicating whether the state is a match
    /// state or not.
    ///
    /// This is never empty. The first state is always a dummy state such that
    /// a state id == 0 corresponds to a dead state.
    builder_states: Vec<Rc<State>>,
    /// A cache of DFA states that already exist and can be easily looked up
    /// via ordered sets of NFA states.
    cache: HashMap<Rc<State>, S>,
    /// Scratch space for a stack of NFA states to visit, for depth first
    /// visiting without recursion.
    stack: Vec<nfa::StateID>,
    /// Scratch space for storing an ordered sequence of NFA states, for
    /// amortizing allocation.
    scratch_nfa_states: Vec<nfa::StateID>,
    /// Whether to build a DFA that finds the longest possible match.
    longest_match: bool,
}

/// An intermediate representation for a DFA state during determinization.
#[derive(Debug, Eq, Hash, PartialEq)]
struct State {
    /// Whether this state is a match state or not.
    is_match: bool,
    /// An ordered sequence of NFA states that make up this DFA state.
    nfa_states: Vec<nfa::StateID>,
}

impl<'a, S: StateID> Determinizer<'a, S> {
    /// Create a new determinizer for converting the given NFA to a DFA.
    pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
        let dead = Rc::new(State::dead());
        let mut cache = HashMap::default();
        cache.insert(dead.clone(), dead_id());

        Determinizer {
            nfa,
            dfa: DFARepr::empty().anchored(nfa.is_anchored()),
            builder_states: vec![dead],
            cache,
            stack: vec![],
            scratch_nfa_states: vec![],
            longest_match: false,
        }
    }

    /// Instruct the determinizer to use equivalence classes as the transition
    /// alphabet instead of all possible byte values.
    pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
        let byte_classes = self.nfa.byte_classes().clone();
        self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
            .anchored(self.nfa.is_anchored());
        self
    }

    /// Instruct the determinizer to build a DFA that recognizes the longest
    /// possible match instead of the leftmost first match. This is useful when
    /// constructing reverse DFAs for finding the start of a match.
    pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
        self.longest_match = yes;
        self
    }

    /// Build the DFA. If there was a problem constructing the DFA (e.g., if
    /// the chosen state identifier representation is too small), then an error
    /// is returned.
    pub fn build(mut self) -> Result<DFARepr<S>> {
        let representative_bytes: Vec<u8> =
            self.dfa.byte_classes().representatives().collect();
        let mut sparse = self.new_sparse_set();
        let mut uncompiled = vec![self.add_start(&mut sparse)?];
        while let Some(dfa_id) = uncompiled.pop() {
            for &b in &representative_bytes {
                let (next_dfa_id, is_new) =
                    self.cached_state(dfa_id, b, &mut sparse)?;
                self.dfa.add_transition(dfa_id, b, next_dfa_id);
                if is_new {
                    uncompiled.push(next_dfa_id);
                }
            }
        }

        // At this point, we shuffle the matching states in the final DFA to
        // the beginning. This permits a DFA's match loop to detect a match
        // condition by merely inspecting the current state's identifier, and
        // avoids the need for any additional auxiliary storage.
        let is_match: Vec<bool> =
            self.builder_states.iter().map(|s| s.is_match).collect();
        self.dfa.shuffle_match_states(&is_match);
        Ok(self.dfa)
    }

    /// Return the identifier for the next DFA state given an existing DFA
    /// state and an input byte. If the next DFA state already exists, then
    /// return its identifier from the cache. Otherwise, build the state, cache
    /// it and return its identifier.
    ///
    /// The given sparse set is used for scratch space. It must have a capacity
    /// equivalent to the total number of NFA states, but its contents are
    /// otherwise unspecified.
    ///
    /// This routine returns a boolean indicating whether a new state was
    /// built. If a new state is built, then the caller needs to add it to its
    /// frontier of uncompiled DFA states to compute transitions for.
    fn cached_state(
        &mut self,
        dfa_id: S,
        b: u8,
        sparse: &mut SparseSet,
    ) -> Result<(S, bool)> {
        sparse.clear();
        // Compute the set of all reachable NFA states, including epsilons.
        self.next(dfa_id, b, sparse);
        // Build a candidate state and check if it has already been built.
        let state = self.new_state(sparse);
        if let Some(&cached_id) = self.cache.get(&state) {
            // Since we have a cached state, put the constructed state's
            // memory back into our scratch space, so that it can be reused.
            mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
            return Ok((cached_id, false));
        }
        // Nothing was in the cache, so add this state to the cache.
        self.add_state(state).map(|s| (s, true))
    }

    /// Compute the set of all eachable NFA states, including the full epsilon
    /// closure, from a DFA state for a single byte of input.
    fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
        next_nfa_states.clear();
        for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
            let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
            match *self.nfa.state(nfa_id) {
                nfa::State::Union { .. }
                | nfa::State::Fail
                | nfa::State::Match => {}
                nfa::State::Range { range: ref r } => {
                    if r.start <= b && b <= r.end {
                        self.epsilon_closure(r.next, next_nfa_states);
                    }
                }
                nfa::State::Sparse { ref ranges } => {
                    for r in ranges.iter() {
                        if r.start > b {
                            break;
                        } else if r.start <= b && b <= r.end {
                            self.epsilon_closure(r.next, next_nfa_states);
                            break;
                        }
                    }
                }
            }
        }
    }

    /// Compute the epsilon closure for the given NFA state.
    fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
        if !self.nfa.state(start).is_epsilon() {
            set.insert(start);
            return;
        }

        self.stack.push(start);
        while let Some(mut id) = self.stack.pop() {
            loop {
                if set.contains(id) {
                    break;
                }
                set.insert(id);
                match *self.nfa.state(id) {
                    nfa::State::Range { .. }
                    | nfa::State::Sparse { .. }
                    | nfa::State::Fail
                    | nfa::State::Match => break,
                    nfa::State::Union { ref alternates } => {
                        id = match alternates.get(0) {
                            None => break,
                            Some(&id) => id,
                        };
                        self.stack.extend(alternates[1..].iter().rev());
                    }
                }
            }
        }
    }

    /// Compute the initial DFA state and return its identifier.
    ///
    /// The sparse set given is used for scratch space, and must have capacity
    /// equal to the total number of NFA states. Its contents are unspecified.
    fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
        sparse.clear();
        self.epsilon_closure(self.nfa.start(), sparse);
        let state = self.new_state(&sparse);
        let id = self.add_state(state)?;
        self.dfa.set_start_state(id);
        Ok(id)
    }

    /// Add the given state to the DFA and make it available in the cache.
    ///
    /// The state initially has no transitions. That is, it transitions to the
    /// dead state for all possible inputs.
    fn add_state(&mut self, state: State) -> Result<S> {
        let id = self.dfa.add_empty_state()?;
        let rstate = Rc::new(state);
        self.builder_states.push(rstate.clone());
        self.cache.insert(rstate, id);
        Ok(id)
    }

    /// Convert the given set of ordered NFA states to a DFA state.
    fn new_state(&mut self, set: &SparseSet) -> State {
        let mut state = State {
            is_match: false,
            nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
        };
        state.nfa_states.clear();

        for &id in set {
            match *self.nfa.state(id) {
                nfa::State::Range { .. } => {
                    state.nfa_states.push(id);
                }
                nfa::State::Sparse { .. } => {
                    state.nfa_states.push(id);
                }
                nfa::State::Fail => {
                    break;
                }
                nfa::State::Match => {
                    state.is_match = true;
                    if !self.longest_match {
                        break;
                    }
                }
                nfa::State::Union { .. } => {}
            }
        }
        state
    }

    /// Create a new sparse set with enough capacity to hold all NFA states.
    fn new_sparse_set(&self) -> SparseSet {
        SparseSet::new(self.nfa.len())
    }
}

impl State {
    /// Create a new empty dead state.
    fn dead() -> State {
        State { nfa_states: vec![], is_match: false }
    }
}
regex_automata/determinize.rs

regex_automata/
determinize.rs