// regex/dfa.rs

/*!
The DFA matching engine.

A DFA provides faster matching because the engine is in exactly one state at
any point in time. In the NFA, there may be multiple active states, and
considerable CPU cycles are spent shuffling them around. In finite automata
speak, the DFA follows epsilon transitions in the regex far less than the NFA.

A DFA is a classic trade off between time and space. The NFA is slower, but
its memory requirements are typically small and predictable. The DFA is faster,
but given the right regex and the right input, the number of states in the
DFA can grow exponentially. To mitigate this space problem, we do two things:

1. We implement an *online* DFA. That is, the DFA is constructed from the NFA
   during a search. When a new state is computed, it is stored in a cache so
   that it may be reused. An important consequence of this implementation
   is that states that are never reached for a particular input are never
   computed. (This is impossible in an "offline" DFA which needs to compute
   all possible states up front.)
2. If the cache gets too big, we wipe it and continue matching.

In pathological cases, a new state can be created for every byte of input.
(e.g., The regex `(a|b)*a(a|b){20}` on a long sequence of a's and b's.)
In this case, performance regresses to slightly slower than the full NFA
simulation, in large part because the cache becomes useless. If the cache
is wiped too frequently, the DFA quits and control falls back to one of the
NFA simulations.

Because of the "lazy" nature of this DFA, the inner matching loop is
considerably more complex than one might expect out of a DFA. A number of
tricks are employed to make it fast. Tread carefully.

N.B. While this implementation is heavily commented, Russ Cox's series of
articles on regexes is strongly recommended: <https://swtch.com/~rsc/regexp/>
(As is the DFA implementation in RE2, which heavily influenced this
implementation.)
*/
38
39use std::collections::HashMap;
40use std::fmt;
41use std::iter::repeat;
42use std::mem;
43use std::sync::Arc;
44
45use crate::exec::ProgramCache;
46use crate::prog::{Inst, Program};
47use crate::sparse::SparseSet;
48
49/// Return true if and only if the given program can be executed by a DFA.
50///
51/// Generally, a DFA is always possible. A pathological case where it is not
52/// possible is if the number of NFA states exceeds `u32::MAX`, in which case,
53/// this function will return false.
54///
55/// This function will also return false if the given program has any Unicode
56/// instructions (Char or Ranges) since the DFA operates on bytes only.
57pub fn can_exec(insts: &Program) -> bool {
58    use crate::prog::Inst::*;
59    // If for some reason we manage to allocate a regex program with more
60    // than i32::MAX instructions, then we can't execute the DFA because we
61    // use 32 bit instruction pointer deltas for memory savings.
62    // If i32::MAX is the largest positive delta,
63    // then -i32::MAX == i32::MIN + 1 is the largest negative delta,
64    // and we are OK to use 32 bits.
65    if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize {
66        return false;
67    }
68    for inst in insts {
69        match *inst {
70            Char(_) | Ranges(_) => return false,
71            EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {}
72        }
73    }
74    true
75}
76
/// A reusable cache of DFA states.
///
/// This cache is reused between multiple invocations of the same regex
/// program. (It is not shared simultaneously between threads. If there is
/// contention, then new caches are created.)
#[derive(Debug)]
pub struct Cache {
    /// Group persistent DFA related cache state together. The sparse sets
    /// listed below are used as scratch space while computing uncached states.
    inner: CacheInner,
    /// qcur and qnext are ordered sets with constant time
    /// addition/membership/clearing-whole-set and linear time iteration. They
    /// are used to manage the sets of NFA states in DFA states when computing
    /// cached DFA states. In particular, the order of the NFA states matters
    /// for leftmost-first style matching. Namely, when computing a cached
    /// state, the set of NFA states stops growing as soon as the first Match
    /// instruction is observed.
    // Scratch set holding the "current" NFA states.
    qcur: SparseSet,
    // Scratch set holding the "next" NFA states (after a byte transition).
    qnext: SparseSet,
}
97
/// `CacheInner` is logically just a part of Cache, but groups together fields
/// that aren't passed as function parameters throughout search. (This split
/// is mostly an artifact of the borrow checker. It is happily paid.)
#[derive(Debug)]
struct CacheInner {
    /// A cache of pre-compiled DFA states, keyed by the set of NFA states
    /// and the set of empty-width flags set at the byte in the input when the
    /// state was observed.
    ///
    /// A StatePtr is effectively a `*State`, but to avoid various inconvenient
    /// things, we just pass indexes around manually. The performance impact of
    /// this is probably an instruction or two in the inner loop. However, on
    /// 64 bit, each StatePtr is half the size of a *State.
    compiled: StateMap,
    /// The transition table.
    ///
    /// The transition table is laid out in row-major order, where states are
    /// rows and the transitions for each state are columns. At a high level,
    /// given state `s` and byte `b`, the next state can be found at index
    /// `s * 256 + b`.
    ///
    /// This is, of course, a lie. A StatePtr is actually a pointer to the
    /// *start* of a row in this table. When indexing in the DFA's inner loop,
    /// this removes the need to multiply the StatePtr by the stride. Yes, it
    /// matters. This reduces the number of states we can store, but: the
    /// stride is rarely 256 since we define transitions in terms of
    /// *equivalence classes* of bytes. Each class corresponds to a set of
    /// bytes that never discriminate a distinct path through the DFA from each
    /// other.
    trans: Transitions,
    /// A set of cached start states, which are limited to the number of
    /// permutations of flags set just before the initial byte of input. (The
    /// index into this vec is a `EmptyFlags`.)
    ///
    /// N.B. A start state can be "dead" (i.e., no possible match), so we
    /// represent it with a StatePtr.
    start_states: Vec<StatePtr>,
    /// Stack scratch space used to follow epsilon transitions in the NFA.
    /// (This permits us to avoid recursion.)
    ///
    /// The maximum stack size is the number of NFA states.
    stack: Vec<InstPtr>,
    /// The total number of times this cache has been flushed by the DFA
    /// because of space constraints.
    flush_count: u64,
    /// The total heap size of the DFA's cache. We use this to determine when
    /// we should flush the cache.
    size: usize,
    /// Scratch space used when building instruction pointer lists for new
    /// states. This helps amortize allocation.
    // (The list is delta-encoded bytes; see `push_inst_ptr`.)
    insts_scratch_space: Vec<u8>,
}
150
/// The transition table.
///
/// It is laid out in row-major order, with states as rows and byte class
/// transitions as columns.
///
/// The transition table is responsible for producing valid `StatePtrs`. A
/// `StatePtr` points to the start of a particular row in this table. When
/// indexing to find the next state this allows us to avoid a multiplication
/// when computing an index into the table.
#[derive(Clone)]
struct Transitions {
    /// The table.
    table: Vec<StatePtr>,
    /// The stride, i.e., the width of each row. (Per `Cache::new`, this is
    /// the number of byte classes plus one for the special EOF sentinel.)
    num_byte_classes: usize,
}
167
/// Fsm encapsulates the actual execution of the DFA.
#[derive(Debug)]
pub struct Fsm<'a> {
    /// prog contains the NFA instruction opcodes. DFA execution uses either
    /// the `dfa` instructions or the `dfa_reverse` instructions from
    /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have
    /// Unicode opcodes that cannot be executed by the DFA.)
    prog: &'a Program,
    /// The start state. We record it here because the pointer may change
    /// when the cache is wiped.
    start: StatePtr,
    /// The current position in the input.
    at: usize,
    /// Should we quit after seeing the first match? e.g., When the caller
    /// uses `is_match` or `shortest_match`.
    quit_after_match: bool,
    /// The last state that matched.
    ///
    /// When no match has occurred, this is set to STATE_UNKNOWN.
    ///
    /// This is only useful when matching regex sets. The last match state
    /// is useful because it contains all of the match instructions seen,
    /// thereby allowing us to enumerate which regexes in the set matched.
    last_match_si: StatePtr,
    /// The input position of the last cache flush. We use this to determine
    /// if we're thrashing in the cache too often. If so, the DFA quits so
    /// that we can fall back to the NFA algorithm.
    last_cache_flush: usize,
    /// All cached DFA information that is persisted between searches.
    cache: &'a mut CacheInner,
}
199
/// The result of running the DFA.
///
/// Generally, the result is either a match or not a match, but sometimes the
/// DFA runs too slowly because the cache size is too small. In that case, it
/// gives up with the intent of falling back to the NFA algorithm.
///
/// The DFA can also give up if it runs out of room to create new states, or if
/// it sees non-ASCII bytes in the presence of a Unicode word boundary.
#[derive(Clone, Debug)]
pub enum Result<T> {
    /// A match was found; carries the match payload (typically a position).
    Match(T),
    /// No match; carries the input position at which the search ended.
    NoMatch(usize),
    /// The DFA gave up; the caller should fall back to an NFA engine.
    Quit,
}
214
215impl<T> Result<T> {
216    /// Returns true if this result corresponds to a match.
217    pub fn is_match(&self) -> bool {
218        match *self {
219            Result::Match(_) => true,
220            Result::NoMatch(_) | Result::Quit => false,
221        }
222    }
223
224    /// Maps the given function onto T and returns the result.
225    ///
226    /// If this isn't a match, then this is a no-op.
227    #[cfg(feature = "perf-literal")]
228    pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> {
229        match self {
230            Result::Match(t) => Result::Match(f(t)),
231            Result::NoMatch(x) => Result::NoMatch(x),
232            Result::Quit => Result::Quit,
233        }
234    }
235
236    /// Sets the non-match position.
237    ///
238    /// If this isn't a non-match, then this is a no-op.
239    fn set_non_match(self, at: usize) -> Result<T> {
240        match self {
241            Result::NoMatch(_) => Result::NoMatch(at),
242            r => r,
243        }
244    }
245}
246
/// `State` is a DFA state. It contains an ordered set of NFA states (not
/// necessarily complete) and a smattering of flags.
///
/// The flags are packed into the first byte of data.
///
/// States don't carry their transitions. Instead, transitions are stored in
/// a single row-major table.
///
/// Delta encoding is used to store the instruction pointers.
/// The first instruction pointer is stored directly starting
/// at data[1], and each following pointer is stored as an offset
/// to the previous one. If a delta is in the range -127..127,
/// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8)
/// is coded as a flag, followed by 4 bytes encoding the delta.
#[derive(Clone, Eq, Hash, PartialEq)]
struct State {
    // Byte 0 holds the `StateFlags`; the remaining bytes are the
    // delta-encoded instruction pointers described above. `Arc<[u8]>`
    // makes clones O(1) (refcount bump, no copy).
    data: Arc<[u8]>,
}
265
/// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes
/// an NFA state).
///
/// Throughout this library, this is usually set to `usize`, but we force a
/// `u32` here for the DFA to save on space. (`can_exec` guarantees that the
/// program is small enough for 32 bit deltas to suffice.)
type InstPtr = u32;
272
273/// Adds ip to data using delta encoding with respect to prev.
274///
275/// After completion, `data` will contain `ip` and `prev` will be set to `ip`.
276fn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) {
277    let delta = (ip as i32) - (*prev as i32);
278    write_vari32(data, delta);
279    *prev = ip;
280}
281
/// An iterator over the instruction pointers stored in a `State`, decoding
/// the delta encoding produced by `push_inst_ptr` on the fly.
struct InstPtrs<'a> {
    /// The absolute value of the most recently decoded instruction pointer
    /// (0 before any pointer has been decoded).
    base: usize,
    /// The remaining, still-encoded bytes.
    data: &'a [u8],
}
286
287impl<'a> Iterator for InstPtrs<'a> {
288    type Item = usize;
289
290    fn next(&mut self) -> Option<usize> {
291        if self.data.is_empty() {
292            return None;
293        }
294        let (delta, nread) = read_vari32(self.data);
295        let base = self.base as i32 + delta;
296        debug_assert!(base >= 0);
297        debug_assert!(nread > 0);
298        self.data = &self.data[nread..];
299        self.base = base as usize;
300        Some(self.base)
301    }
302}
303
impl State {
    /// Returns the flags packed into the first byte of this state's data.
    fn flags(&self) -> StateFlags {
        StateFlags(self.data[0])
    }

    /// Returns an iterator over the (delta-encoded) NFA instruction pointers
    /// stored after the flags byte.
    fn inst_ptrs(&self) -> InstPtrs<'_> {
        InstPtrs { base: 0, data: &self.data[1..] }
    }
}
313
/// `StatePtr` is a 32 bit pointer to the start of a row in the transition
/// table.
///
/// It has many special values. There are two types of special values:
/// sentinels and flags.
///
/// Sentinels corresponds to special states that carry some kind of
/// significance. There are three such states: unknown, dead and quit states.
///
/// Unknown states are states that haven't been computed yet. They indicate
/// that a transition should be filled in that points to either an existing
/// cached state or a new state altogether. In general, an unknown state means
/// "follow the NFA's epsilon transitions."
///
/// Dead states are states that can never lead to a match, no matter what
/// subsequent input is observed. This means that the DFA should quit
/// immediately and return the longest match it has found thus far.
///
/// Quit states are states that imply the DFA is not capable of matching the
/// regex correctly. Currently, this is only used when a Unicode word boundary
/// exists in the regex *and* a non-ASCII byte is observed.
///
/// The other type of state pointer is a state pointer with special flag bits.
/// There are two flags: a start flag and a match flag. The lower bits of both
/// kinds always contain a "valid" `StatePtr` (indicated by the `STATE_MAX`
/// mask).
///
/// The start flag means that the state is a start state, and therefore may be
/// subject to special prefix scanning optimizations.
///
/// The match flag means that the state is a match state, and therefore the
/// current position in the input (while searching) should be recorded.
///
/// The above exists mostly in the service of making the inner loop fast.
/// In particular, the inner *inner* loop looks something like this:
///
/// ```ignore
/// while state <= STATE_MAX and i < len(text):
///     state = state.next[i]
/// ```
///
/// This is nice because it lets us execute a lazy DFA as if it were an
/// entirely offline DFA (i.e., with very few instructions). The loop will
/// quit only when we need to examine a case that needs special attention.
type StatePtr = u32;

// The three sentinels below are consecutive values at the very top of the
// `StatePtr` range, so a single `>= STATE_UNKNOWN` comparison detects all of
// them at once.

/// An unknown state means that the state has not been computed yet, and that
/// the only way to progress is to compute it.
const STATE_UNKNOWN: StatePtr = 1 << 31;

/// A dead state means that the state has been computed and it is known that
/// once it is entered, no future match can ever occur.
const STATE_DEAD: StatePtr = STATE_UNKNOWN + 1;

/// A quit state means that the DFA came across some input that it doesn't
/// know how to process correctly. The DFA should quit and another matching
/// engine should be run in its place.
const STATE_QUIT: StatePtr = STATE_DEAD + 1;

/// A start state is a state that the DFA can start in.
///
/// Note that start states have their lower bits set to a state pointer.
const STATE_START: StatePtr = 1 << 30;

/// A match state means that the regex has successfully matched.
///
/// Note that match states have their lower bits set to a state pointer.
const STATE_MATCH: StatePtr = 1 << 29;

/// The maximum state pointer. This is useful to mask out the "valid" state
/// pointer from a state with the "start" or "match" bits set.
/// (It is one less than the smallest flag bit, `STATE_MATCH`.)
///
/// It doesn't make sense to use this with unknown, dead or quit state
/// pointers, since those pointers are sentinels and never have their lower
/// bits set to anything meaningful.
const STATE_MAX: StatePtr = STATE_MATCH - 1;
390
/// Byte is a u8 in spirit, but a u16 in practice so that we can represent the
/// special EOF sentinel value.
///
/// Values in `0..=255` are ordinary input bytes; the out-of-band EOF value is
/// built via `Byte::eof()` (used for the final empty-width transition).
#[derive(Copy, Clone, Debug)]
struct Byte(u16);
395
/// A set of flags for zero-width assertions.
///
/// These record which empty-width assertions hold at a particular input
/// position, and (packed into an index) key the `start_states` cache.
#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)]
struct EmptyFlags {
    start: bool,
    end: bool,
    start_line: bool,
    end_line: bool,
    word_boundary: bool,
    not_word_boundary: bool,
}
406
/// A set of flags describing various configurations of a DFA state. This is
/// represented by a `u8` so that it is compact.
///
/// (This is the byte stored at `State.data[0]`; see `State::flags`.)
#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)]
struct StateFlags(u8);
411
412impl Cache {
413    /// Create new empty cache for the DFA engine.
414    pub fn new(prog: &Program) -> Self {
415        // We add 1 to account for the special EOF byte.
416        let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1;
417        let starts = vec![STATE_UNKNOWN; 256];
418        let mut cache = Cache {
419            inner: CacheInner {
420                compiled: StateMap::new(num_byte_classes),
421                trans: Transitions::new(num_byte_classes),
422                start_states: starts,
423                stack: vec![],
424                flush_count: 0,
425                size: 0,
426                insts_scratch_space: vec![],
427            },
428            qcur: SparseSet::new(prog.insts.len()),
429            qnext: SparseSet::new(prog.insts.len()),
430        };
431        cache.inner.reset_size();
432        cache
433    }
434}
435
436impl CacheInner {
437    /// Resets the cache size to account for fixed costs, such as the program
438    /// and stack sizes.
439    fn reset_size(&mut self) {
440        self.size = (self.start_states.len() * mem::size_of::<StatePtr>())
441            + (self.stack.len() * mem::size_of::<InstPtr>());
442    }
443}
444
445impl<'a> Fsm<'a> {
446    #[cfg_attr(feature = "perf-inline", inline(always))]
447    pub fn forward(
448        prog: &'a Program,
449        cache: &ProgramCache,
450        quit_after_match: bool,
451        text: &[u8],
452        at: usize,
453    ) -> Result<usize> {
454        let mut cache = cache.borrow_mut();
455        let cache = &mut cache.dfa;
456        let mut dfa = Fsm {
457            prog,
458            start: 0, // filled in below
459            at,
460            quit_after_match,
461            last_match_si: STATE_UNKNOWN,
462            last_cache_flush: at,
463            cache: &mut cache.inner,
464        };
465        let (empty_flags, state_flags) = dfa.start_flags(text, at);
466        dfa.start =
467            match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
468                None => return Result::Quit,
469                Some(STATE_DEAD) => return Result::NoMatch(at),
470                Some(si) => si,
471            };
472        debug_assert!(dfa.start != STATE_UNKNOWN);
473        dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text)
474    }
475
476    #[cfg_attr(feature = "perf-inline", inline(always))]
477    pub fn reverse(
478        prog: &'a Program,
479        cache: &ProgramCache,
480        quit_after_match: bool,
481        text: &[u8],
482        at: usize,
483    ) -> Result<usize> {
484        let mut cache = cache.borrow_mut();
485        let cache = &mut cache.dfa_reverse;
486        let mut dfa = Fsm {
487            prog,
488            start: 0, // filled in below
489            at,
490            quit_after_match,
491            last_match_si: STATE_UNKNOWN,
492            last_cache_flush: at,
493            cache: &mut cache.inner,
494        };
495        let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at);
496        dfa.start =
497            match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
498                None => return Result::Quit,
499                Some(STATE_DEAD) => return Result::NoMatch(at),
500                Some(si) => si,
501            };
502        debug_assert!(dfa.start != STATE_UNKNOWN);
503        dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text)
504    }
505
506    #[cfg_attr(feature = "perf-inline", inline(always))]
507    pub fn forward_many(
508        prog: &'a Program,
509        cache: &ProgramCache,
510        matches: &mut [bool],
511        text: &[u8],
512        at: usize,
513    ) -> Result<usize> {
514        debug_assert!(matches.len() == prog.matches.len());
515        let mut cache = cache.borrow_mut();
516        let cache = &mut cache.dfa;
517        let mut dfa = Fsm {
518            prog,
519            start: 0, // filled in below
520            at,
521            quit_after_match: false,
522            last_match_si: STATE_UNKNOWN,
523            last_cache_flush: at,
524            cache: &mut cache.inner,
525        };
526        let (empty_flags, state_flags) = dfa.start_flags(text, at);
527        dfa.start =
528            match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
529                None => return Result::Quit,
530                Some(STATE_DEAD) => return Result::NoMatch(at),
531                Some(si) => si,
532            };
533        debug_assert!(dfa.start != STATE_UNKNOWN);
534        let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text);
535        if result.is_match() {
536            if matches.len() == 1 {
537                matches[0] = true;
538            } else {
539                debug_assert!(dfa.last_match_si != STATE_UNKNOWN);
540                debug_assert!(dfa.last_match_si != STATE_DEAD);
541                for ip in dfa.state(dfa.last_match_si).inst_ptrs() {
542                    if let Inst::Match(slot) = dfa.prog[ip] {
543                        matches[slot] = true;
544                    }
545                }
546            }
547        }
548        result
549    }
550
    /// Executes the DFA on a forward NFA.
    ///
    /// {qcur,qnext} are scratch ordered sets which may be non-empty.
    ///
    /// Returns the ending position of the leftmost-first match, a non-match
    /// position, or `Quit` if the DFA cannot proceed.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn exec_at(
        &mut self,
        qcur: &mut SparseSet,
        qnext: &mut SparseSet,
        text: &[u8],
    ) -> Result<usize> {
        // For the most part, the DFA is basically:
        //
        //   last_match = null
        //   while current_byte != EOF:
        //     si = current_state.next[current_byte]
        //     if si is match
        //       last_match = si
        //   return last_match
        //
        // However, we need to deal with a few things:
        //
        //   1. This is an *online* DFA, so the current state's next list
        //      may not point to anywhere yet, so we must go out and compute
        //      them. (They are then cached into the current state's next list
        //      to avoid re-computation.)
        //   2. If we come across a state that is known to be dead (i.e., never
        //      leads to a match), then we can quit early.
        //   3. If the caller just wants to know if a match occurs, then we
        //      can quit as soon as we know we have a match. (Full leftmost
        //      first semantics require continuing on.)
        //   4. If we're in the start state, then we can use a pre-computed set
        //      of prefix literals to skip quickly along the input.
        //   5. After the input is exhausted, we run the DFA on one symbol
        //      that stands for EOF. This is useful for handling empty width
        //      assertions.
        //   6. We can't actually do state.next[byte]. Instead, we have to do
        //      state.next[byte_classes[byte]], which permits us to keep the
        //      'next' list very small.
        //
        // Since there's a bunch of extra stuff we need to consider, we do some
        // pretty hairy tricks to get the inner loop to run as fast as
        // possible.
        debug_assert!(!self.prog.is_reverse);

        // The last match is the currently known ending match position. It is
        // reported as an index to the most recent byte that resulted in a
        // transition to a match state and is always stored in capture slot `1`
        // when searching forwards. Its maximum value is `text.len()`.
        let mut result = Result::NoMatch(self.at);
        let (mut prev_si, mut next_si) = (self.start, self.start);
        let mut at = self.at;
        while at < text.len() {
            // This is the real inner loop. We take advantage of special bits
            // set in the state pointer to determine whether a state is in the
            // "common" case or not. Specifically, the common case is a
            // non-match non-start non-dead state that has already been
            // computed. So long as we remain in the common case, this inner
            // loop will chew through the input.
            //
            // We also unroll the loop 4 times to amortize the cost of checking
            // whether we've consumed the entire input. We are also careful
            // to make sure that `prev_si` always represents the previous state
            // and `next_si` always represents the next state after the loop
            // exits, even if it isn't always true inside the loop.
            while next_si <= STATE_MAX && at < text.len() {
                // Argument for safety is in the definition of next_si.
                prev_si = unsafe { self.next_si(next_si, text, at) };
                at += 1;
                if prev_si > STATE_MAX || at + 2 >= text.len() {
                    mem::swap(&mut prev_si, &mut next_si);
                    break;
                }
                next_si = unsafe { self.next_si(prev_si, text, at) };
                at += 1;
                if next_si > STATE_MAX {
                    break;
                }
                prev_si = unsafe { self.next_si(next_si, text, at) };
                at += 1;
                if prev_si > STATE_MAX {
                    mem::swap(&mut prev_si, &mut next_si);
                    break;
                }
                next_si = unsafe { self.next_si(prev_si, text, at) };
                at += 1;
            }
            if next_si & STATE_MATCH > 0 {
                // A match state is outside of the common case because it needs
                // special case analysis. In particular, we need to record the
                // last position as having matched and possibly quit the DFA if
                // we don't need to keep matching.
                next_si &= !STATE_MATCH;
                result = Result::Match(at - 1);
                if self.quit_after_match {
                    return result;
                }
                self.last_match_si = next_si;
                prev_si = next_si;

                // This permits short-circuiting when matching a regex set.
                // In particular, if this DFA state contains only match states,
                // then it's impossible to extend the set of matches since
                // match states are final. Therefore, we can quit.
                if self.prog.matches.len() > 1 {
                    let state = self.state(next_si);
                    let just_matches =
                        state.inst_ptrs().all(|ip| self.prog[ip].is_match());
                    if just_matches {
                        return result;
                    }
                }

                // Another inner loop! If the DFA stays in this particular
                // match state, then we can rip through all of the input
                // very quickly, and only recording the match location once
                // we've left this particular state.
                let cur = at;
                while (next_si & !STATE_MATCH) == prev_si
                    && at + 2 < text.len()
                {
                    // Argument for safety is in the definition of next_si.
                    next_si = unsafe {
                        self.next_si(next_si & !STATE_MATCH, text, at)
                    };
                    at += 1;
                }
                if at > cur {
                    // We advanced through the skim loop, so update the match
                    // position to reflect where we left the match state.
                    result = Result::Match(at - 2);
                }
            } else if next_si & STATE_START > 0 {
                // A start state isn't in the common case because we may
                // want to do quick prefix scanning. If the program doesn't
                // have a detected prefix, then start states are actually
                // considered common and this case is never reached.
                debug_assert!(self.has_prefix());
                next_si &= !STATE_START;
                prev_si = next_si;
                at = match self.prefix_at(text, at) {
                    None => return Result::NoMatch(text.len()),
                    Some(i) => i,
                };
            } else if next_si >= STATE_UNKNOWN {
                if next_si == STATE_QUIT {
                    return Result::Quit;
                }
                // Finally, this corresponds to the case where the transition
                // entered a state that can never lead to a match or a state
                // that hasn't been computed yet. The latter being the "slow"
                // path.
                let byte = Byte::byte(text[at - 1]);
                // We no longer care about the special bits in the state
                // pointer.
                prev_si &= STATE_MAX;
                // Record where we are. This is used to track progress for
                // determining whether we should quit if we've flushed the
                // cache too much.
                self.at = at;
                next_si = match self.next_state(qcur, qnext, prev_si, byte) {
                    None => return Result::Quit,
                    Some(STATE_DEAD) => return result.set_non_match(at),
                    Some(si) => si,
                };
                debug_assert!(next_si != STATE_UNKNOWN);
                if next_si & STATE_MATCH > 0 {
                    next_si &= !STATE_MATCH;
                    result = Result::Match(at - 1);
                    if self.quit_after_match {
                        return result;
                    }
                    self.last_match_si = next_si;
                }
                prev_si = next_si;
            } else {
                prev_si = next_si;
            }
        }

        // Run the DFA once more on the special EOF sentinel value.
        // We don't care about the special bits in the state pointer any more,
        // so get rid of them.
        prev_si &= STATE_MAX;
        prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
            None => return Result::Quit,
            Some(STATE_DEAD) => return result.set_non_match(text.len()),
            Some(si) => si & !STATE_START,
        };
        debug_assert!(prev_si != STATE_UNKNOWN);
        if prev_si & STATE_MATCH > 0 {
            // An empty-width assertion matched at EOF; the match extends to
            // the end of the input.
            prev_si &= !STATE_MATCH;
            self.last_match_si = prev_si;
            result = Result::Match(text.len());
        }
        result
    }
745
    /// Executes the DFA on a reverse NFA.
    ///
    /// This walks `text` backwards from `self.at` toward 0 and, unlike the
    /// forward search, reports the *longest* possible match (the earliest
    /// start position), which is what a reverse scan needs to find where a
    /// match begins.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn exec_at_reverse(
        &mut self,
        qcur: &mut SparseSet,
        qnext: &mut SparseSet,
        text: &[u8],
    ) -> Result<usize> {
        // The comments in `exec_at` above mostly apply here too. The main
        // difference is that we move backwards over the input and we look for
        // the longest possible match instead of the leftmost-first match.
        //
        // N.B. The code duplication here is regrettable. Efforts to improve
        // it without sacrificing performance are welcome. ---AG
        debug_assert!(self.prog.is_reverse);
        let mut result = Result::NoMatch(self.at);
        let (mut prev_si, mut next_si) = (self.start, self.start);
        let mut at = self.at;
        while at > 0 {
            // Hot path: follow up to four transitions per iteration, as long
            // as each one lands in a valid cached state (<= STATE_MAX). The
            // manual unrolling keeps loop bookkeeping out of the common case.
            while next_si <= STATE_MAX && at > 0 {
                // Argument for safety is in the definition of next_si.
                at -= 1;
                prev_si = unsafe { self.next_si(next_si, text, at) };
                // The `at <= 4` guard ensures the three additional unchecked
                // decrements below cannot underflow `at`.
                if prev_si > STATE_MAX || at <= 4 {
                    mem::swap(&mut prev_si, &mut next_si);
                    break;
                }
                at -= 1;
                next_si = unsafe { self.next_si(prev_si, text, at) };
                if next_si > STATE_MAX {
                    break;
                }
                at -= 1;
                prev_si = unsafe { self.next_si(next_si, text, at) };
                if prev_si > STATE_MAX {
                    mem::swap(&mut prev_si, &mut next_si);
                    break;
                }
                at -= 1;
                next_si = unsafe { self.next_si(prev_si, text, at) };
            }
            if next_si & STATE_MATCH > 0 {
                // A match was delayed by one byte, so the match position is
                // one past the current position (remember: moving backwards).
                next_si &= !STATE_MATCH;
                result = Result::Match(at + 1);
                if self.quit_after_match {
                    return result;
                }
                self.last_match_si = next_si;
                prev_si = next_si;
                let cur = at;
                // Longest-match extension: keep stepping backwards while the
                // DFA self-loops in this same matching state, pushing the
                // match start as far left as possible.
                while (next_si & !STATE_MATCH) == prev_si && at >= 2 {
                    // Argument for safety is in the definition of next_si.
                    at -= 1;
                    next_si = unsafe {
                        self.next_si(next_si & !STATE_MATCH, text, at)
                    };
                }
                // If we advanced at all, the last position still inside the
                // matching state is `at + 1`, so the match starts at `at + 2`
                // (matches remain delayed by one byte).
                if at < cur {
                    result = Result::Match(at + 2);
                }
            } else if next_si >= STATE_UNKNOWN {
                // Slow path: the transition is either uncomputed or special.
                if next_si == STATE_QUIT {
                    return Result::Quit;
                }
                let byte = Byte::byte(text[at]);
                // Drop the special bits before using the pointer as an index.
                prev_si &= STATE_MAX;
                // Record progress so cache-flush heuristics can decide
                // whether the DFA is moving too slowly.
                self.at = at;
                next_si = match self.next_state(qcur, qnext, prev_si, byte) {
                    None => return Result::Quit,
                    Some(STATE_DEAD) => return result.set_non_match(at),
                    Some(si) => si,
                };
                debug_assert!(next_si != STATE_UNKNOWN);
                if next_si & STATE_MATCH > 0 {
                    next_si &= !STATE_MATCH;
                    result = Result::Match(at + 1);
                    if self.quit_after_match {
                        return result;
                    }
                    self.last_match_si = next_si;
                }
                prev_si = next_si;
            } else {
                prev_si = next_si;
            }
        }

        // Run the DFA once more on the special EOF sentinel value.
        // (At the start of the text, since this is a reverse scan.)
        prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
            None => return Result::Quit,
            Some(STATE_DEAD) => return result.set_non_match(0),
            Some(si) => si,
        };
        debug_assert!(prev_si != STATE_UNKNOWN);
        if prev_si & STATE_MATCH > 0 {
            prev_si &= !STATE_MATCH;
            self.last_match_si = prev_si;
            result = Result::Match(0);
        }
        result
    }
847
    /// next_si transitions to the next state, where the transition input
    /// corresponds to text[i].
    ///
    /// This elides bounds checks, and is therefore not safe.
    ///
    /// # Safety
    ///
    /// Callers must guarantee `i < text.len()` and that `si` is a state
    /// pointer previously obtained from the transition table.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr {
        // What is the argument for safety here?
        // We have three unchecked accesses that could possibly violate safety:
        //
        //   1. The given byte of input (`text[i]`).
        //   2. The class of the byte of input (`classes[text[i]]`).
        //   3. The transition for the class (`trans[si + cls]`).
        //
        // (1) is only safe when calling next_si is guarded by
        // `i < text.len()`.
        //
        // (2) is the easiest case to guarantee since `text[i]` is always a
        // `u8` and `self.prog.byte_classes` always has one entry for every
        // possible `u8` value.
        // (See `ByteClassSet.byte_classes` in `compile.rs`.)
        //
        // (3) is only safe if (1)+(2) are safe. Namely, the transitions
        // of every state are defined to have length equal to the number of
        // byte classes in the program. Therefore, a valid class leads to a
        // valid transition. (All possible transitions are valid lookups, even
        // if it points to a state that hasn't been computed yet.) (3) also
        // relies on `si` being correct, but StatePtrs should only ever be
        // retrieved from the transition table, which ensures they are correct.
        debug_assert!(i < text.len());
        let b = *text.get_unchecked(i);
        debug_assert!((b as usize) < self.prog.byte_classes.len());
        let cls = *self.prog.byte_classes.get_unchecked(b as usize);
        self.cache.trans.next_unchecked(si, cls as usize)
    }
881
    /// Computes the next state given the current state and the current input
    /// byte (which may be EOF).
    ///
    /// If STATE_DEAD is returned, then there is no valid state transition.
    /// This implies that no permutation of future input can lead to a match
    /// state.
    ///
    /// STATE_UNKNOWN can never be returned.
    ///
    /// None is returned when a new state could not be cached, i.e., the DFA
    /// has given up (see `cached_state`).
    fn exec_byte(
        &mut self,
        qcur: &mut SparseSet,
        qnext: &mut SparseSet,
        mut si: StatePtr,
        b: Byte,
    ) -> Option<StatePtr> {
        use crate::prog::Inst::*;

        // Initialize a queue with the current DFA state's NFA states.
        qcur.clear();
        for ip in self.state(si).inst_ptrs() {
            qcur.insert(ip);
        }

        // Before inspecting the current byte, we may need to also inspect
        // whether the position immediately preceding the current byte
        // satisfies the empty assertions found in the current state.
        //
        // We only need to do this step if there are any empty assertions in
        // the current state.
        //
        // `is_word_last` is whether the byte that led to this state was an
        // ASCII word byte (recorded as a state flag below); `is_word` is
        // whether the current byte is. Together they decide `\b`/`\B`.
        let is_word_last = self.state(si).flags().is_word();
        let is_word = b.is_ascii_word();
        if self.state(si).flags().has_empty() {
            // Compute the flags immediately preceding the current byte.
            // This means we only care about the "end" or "end line" flags.
            // (The "start" flags are computed immediately following the
            // current byte and are handled below.)
            let mut flags = EmptyFlags::default();
            if b.is_eof() {
                flags.end = true;
                flags.end_line = true;
            } else if b.as_byte().map_or(false, |b| b == b'\n') {
                flags.end_line = true;
            }
            if is_word_last == is_word {
                flags.not_word_boundary = true;
            } else {
                flags.word_boundary = true;
            }
            // Now follow epsilon transitions from every NFA state, but make
            // sure we only follow transitions that satisfy our flags.
            qnext.clear();
            for &ip in &*qcur {
                self.follow_epsilons(usize_to_u32(ip), qnext, flags);
            }
            mem::swap(qcur, qnext);
        }

        // Now we set flags for immediately after the current byte. Since start
        // states are processed separately, and are the only states that can
        // have the StartText flag set, we therefore only need to worry about
        // the StartLine flag here.
        //
        // We do also keep track of whether this DFA state contains a NFA state
        // that is a matching state. This is precisely how we delay the DFA
        // matching by one byte in order to process the special EOF sentinel
        // byte. Namely, if this DFA state containing a matching NFA state,
        // then it is the *next* DFA state that is marked as a match.
        let mut empty_flags = EmptyFlags::default();
        let mut state_flags = StateFlags::default();
        empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n');
        if b.is_ascii_word() {
            state_flags.set_word();
        }
        // Now follow all epsilon transitions again, but only after consuming
        // the current byte.
        qnext.clear();
        for &ip in &*qcur {
            match self.prog[ip as usize] {
                // These states never happen in a byte-based program.
                Char(_) | Ranges(_) => unreachable!(),
                // These states are handled when following epsilon transitions.
                Save(_) | Split(_) | EmptyLook(_) => {}
                Match(_) => {
                    state_flags.set_match();
                    // Stop expanding the remaining (lower priority) NFA
                    // states unless we must keep going to find all matches.
                    if !self.continue_past_first_match() {
                        break;
                    } else if self.prog.matches.len() > 1
                        && !qnext.contains(ip as usize)
                    {
                        // If we are continuing on to find other matches,
                        // then keep a record of the match states we've seen.
                        qnext.insert(ip);
                    }
                }
                Bytes(ref inst) => {
                    if b.as_byte().map_or(false, |b| inst.matches(b)) {
                        self.follow_epsilons(
                            inst.goto as InstPtr,
                            qnext,
                            empty_flags,
                        );
                    }
                }
            }
        }

        let cache = if b.is_eof() && self.prog.matches.len() > 1 {
            // If we're processing the last byte of the input and we're
            // matching a regex set, then make the next state contain the
            // previous states transitions. We do this so that the main
            // matching loop can extract all of the match instructions.
            mem::swap(qcur, qnext);
            // And don't cache this state because it's totally bunk.
            false
        } else {
            true
        };

        // We've now built up the set of NFA states that ought to comprise the
        // next DFA state, so try to find it in the cache, and if it doesn't
        // exist, cache it.
        //
        // N.B. We pass `&mut si` here because the cache may clear itself if
        // it has gotten too full. When that happens, the location of the
        // current state may change.
        let mut next =
            match self.cached_state(qnext, state_flags, Some(&mut si)) {
                None => return None,
                Some(next) => next,
            };
        if (self.start & !STATE_START) == next {
            // Start states can never be match states since all matches are
            // delayed by one byte.
            debug_assert!(!self.state(next).flags().is_match());
            next = self.start_ptr(next);
        }
        // Tag the pointer so the inner matching loops can detect the
        // (delayed) match with a single bit test, no state lookup needed.
        if next <= STATE_MAX && self.state(next).flags().is_match() {
            next |= STATE_MATCH;
        }
        debug_assert!(next != STATE_UNKNOWN);
        // And now store our state in the current state's next list.
        if cache {
            let cls = self.byte_class(b);
            self.cache.trans.set_next(si, cls, next);
        }
        Some(next)
    }
1029
    /// Follows the epsilon transitions starting at (and including) `ip`. The
    /// resulting states are inserted into the ordered set `q`.
    ///
    /// Conditional epsilon transitions (i.e., empty width assertions) are only
    /// followed if they are satisfied by the given flags, which should
    /// represent the flags set at the current location in the input.
    ///
    /// If the current location corresponds to the empty string, then only the
    /// end line and/or end text flags may be set. If the current location
    /// corresponds to a real byte in the input, then only the start line
    /// and/or start text flags may be set.
    ///
    /// As an exception to the above, when finding the initial state, any of
    /// the above flags may be set:
    ///
    /// If matching starts at the beginning of the input, then start text and
    /// start line should be set. If the input is empty, then end text and end
    /// line should also be set.
    ///
    /// If matching starts after the beginning of the input, then only start
    /// line should be set if the preceding byte is `\n`. End line should never
    /// be set in this case. (Even if the following byte is a `\n`, it will
    /// be handled in a subsequent DFA state.)
    fn follow_epsilons(
        &mut self,
        ip: InstPtr,
        q: &mut SparseSet,
        flags: EmptyFlags,
    ) {
        use crate::prog::EmptyLook::*;
        use crate::prog::Inst::*;

        // We need to traverse the NFA to follow epsilon transitions, so avoid
        // recursion with an explicit stack.
        self.cache.stack.push(ip);
        while let Some(mut ip) = self.cache.stack.pop() {
            // Try to munch through as many states as possible without
            // pushes/pops to the stack.
            loop {
                // Don't visit states we've already added.
                if q.contains(ip as usize) {
                    break;
                }
                q.insert(ip as usize);
                match self.prog[ip as usize] {
                    Char(_) | Ranges(_) => unreachable!(),
                    // Non-epsilon instructions: they are recorded in `q`
                    // (above) and epsilon traversal stops here.
                    Match(_) | Bytes(_) => {
                        break;
                    }
                    EmptyLook(ref inst) => {
                        // Only follow empty assertion states if our flags
                        // satisfy the assertion.
                        match inst.look {
                            StartLine if flags.start_line => {
                                ip = inst.goto as InstPtr;
                            }
                            EndLine if flags.end_line => {
                                ip = inst.goto as InstPtr;
                            }
                            StartText if flags.start => {
                                ip = inst.goto as InstPtr;
                            }
                            EndText if flags.end => {
                                ip = inst.goto as InstPtr;
                            }
                            WordBoundaryAscii if flags.word_boundary => {
                                ip = inst.goto as InstPtr;
                            }
                            NotWordBoundaryAscii
                                if flags.not_word_boundary =>
                            {
                                ip = inst.goto as InstPtr;
                            }
                            WordBoundary if flags.word_boundary => {
                                ip = inst.goto as InstPtr;
                            }
                            NotWordBoundary if flags.not_word_boundary => {
                                ip = inst.goto as InstPtr;
                            }
                            // The assertion is not satisfied at this
                            // position, so this path cannot be followed.
                            StartLine | EndLine | StartText | EndText
                            | WordBoundaryAscii | NotWordBoundaryAscii
                            | WordBoundary | NotWordBoundary => {
                                break;
                            }
                        }
                    }
                    Save(ref inst) => {
                        // Capture groups are irrelevant to the DFA; skip
                        // straight through.
                        ip = inst.goto as InstPtr;
                    }
                    Split(ref inst) => {
                        // Defer one branch to the stack and continue with
                        // the other inline.
                        self.cache.stack.push(inst.goto2 as InstPtr);
                        ip = inst.goto1 as InstPtr;
                    }
                }
            }
        }
    }
1127
1128    /// Find a previously computed state matching the given set of instructions
1129    /// and is_match bool.
1130    ///
1131    /// The given set of instructions should represent a single state in the
1132    /// NFA along with all states reachable without consuming any input.
1133    ///
1134    /// The is_match bool should be true if and only if the preceding DFA state
1135    /// contains an NFA matching state. The cached state produced here will
1136    /// then signify a match. (This enables us to delay a match by one byte,
1137    /// in order to account for the EOF sentinel byte.)
1138    ///
1139    /// If the cache is full, then it is wiped before caching a new state.
1140    ///
1141    /// The current state should be specified if it exists, since it will need
1142    /// to be preserved if the cache clears itself. (Start states are
1143    /// always saved, so they should not be passed here.) It takes a mutable
1144    /// pointer to the index because if the cache is cleared, the state's
1145    /// location may change.
1146    fn cached_state(
1147        &mut self,
1148        q: &SparseSet,
1149        mut state_flags: StateFlags,
1150        current_state: Option<&mut StatePtr>,
1151    ) -> Option<StatePtr> {
1152        // If we couldn't come up with a non-empty key to represent this state,
1153        // then it is dead and can never lead to a match.
1154        //
1155        // Note that inst_flags represent the set of empty width assertions
1156        // in q. We use this as an optimization in exec_byte to determine when
1157        // we should follow epsilon transitions at the empty string preceding
1158        // the current byte.
1159        let key = match self.cached_state_key(q, &mut state_flags) {
1160            None => return Some(STATE_DEAD),
1161            Some(v) => v,
1162        };
1163        // In the cache? Cool. Done.
1164        if let Some(si) = self.cache.compiled.get_ptr(&key) {
1165            return Some(si);
1166        }
1167        // If the cache has gotten too big, wipe it.
1168        if self.approximate_size() > self.prog.dfa_size_limit
1169            && !self.clear_cache_and_save(current_state)
1170        {
1171            // Ooops. DFA is giving up.
1172            return None;
1173        }
1174        // Allocate room for our state and add it.
1175        self.add_state(key)
1176    }
1177
1178    /// Produces a key suitable for describing a state in the DFA cache.
1179    ///
1180    /// The key invariant here is that equivalent keys are produced for any two
1181    /// sets of ordered NFA states (and toggling of whether the previous NFA
1182    /// states contain a match state) that do not discriminate a match for any
1183    /// input.
1184    ///
1185    /// Specifically, q should be an ordered set of NFA states and is_match
1186    /// should be true if and only if the previous NFA states contained a match
1187    /// state.
1188    fn cached_state_key(
1189        &mut self,
1190        q: &SparseSet,
1191        state_flags: &mut StateFlags,
1192    ) -> Option<State> {
1193        use crate::prog::Inst::*;
1194
1195        // We need to build up enough information to recognize pre-built states
1196        // in the DFA. Generally speaking, this includes every instruction
1197        // except for those which are purely epsilon transitions, e.g., the
1198        // Save and Split instructions.
1199        //
1200        // Empty width assertions are also epsilon transitions, but since they
1201        // are conditional, we need to make them part of a state's key in the
1202        // cache.
1203
1204        let mut insts =
1205            mem::replace(&mut self.cache.insts_scratch_space, vec![]);
1206        insts.clear();
1207        // Reserve 1 byte for flags.
1208        insts.push(0);
1209
1210        let mut prev = 0;
1211        for &ip in q {
1212            let ip = usize_to_u32(ip);
1213            match self.prog[ip as usize] {
1214                Char(_) | Ranges(_) => unreachable!(),
1215                Save(_) | Split(_) => {}
1216                Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip),
1217                EmptyLook(_) => {
1218                    state_flags.set_empty();
1219                    push_inst_ptr(&mut insts, &mut prev, ip)
1220                }
1221                Match(_) => {
1222                    push_inst_ptr(&mut insts, &mut prev, ip);
1223                    if !self.continue_past_first_match() {
1224                        break;
1225                    }
1226                }
1227            }
1228        }
1229        // If we couldn't transition to any other instructions and we didn't
1230        // see a match when expanding NFA states previously, then this is a
1231        // dead state and no amount of additional input can transition out
1232        // of this state.
1233        let opt_state = if insts.len() == 1 && !state_flags.is_match() {
1234            None
1235        } else {
1236            let StateFlags(f) = *state_flags;
1237            insts[0] = f;
1238            Some(State { data: Arc::from(&*insts) })
1239        };
1240        self.cache.insts_scratch_space = insts;
1241        opt_state
1242    }
1243
1244    /// Clears the cache, but saves and restores current_state if it is not
1245    /// none.
1246    ///
1247    /// The current state must be provided here in case its location in the
1248    /// cache changes.
1249    ///
1250    /// This returns false if the cache is not cleared and the DFA should
1251    /// give up.
1252    fn clear_cache_and_save(
1253        &mut self,
1254        current_state: Option<&mut StatePtr>,
1255    ) -> bool {
1256        if self.cache.compiled.is_empty() {
1257            // Nothing to clear...
1258            return true;
1259        }
1260        match current_state {
1261            None => self.clear_cache(),
1262            Some(si) => {
1263                let cur = self.state(*si).clone();
1264                if !self.clear_cache() {
1265                    return false;
1266                }
1267                // The unwrap is OK because we just cleared the cache and
1268                // therefore know that the next state pointer won't exceed
1269                // STATE_MAX.
1270                *si = self.restore_state(cur).unwrap();
1271                true
1272            }
1273        }
1274    }
1275
1276    /// Wipes the state cache, but saves and restores the current start state.
1277    ///
1278    /// This returns false if the cache is not cleared and the DFA should
1279    /// give up.
1280    fn clear_cache(&mut self) -> bool {
1281        // Bail out of the DFA if we're moving too "slowly."
1282        // A heuristic from RE2: assume the DFA is too slow if it is processing
1283        // 10 or fewer bytes per state.
1284        // Additionally, we permit the cache to be flushed a few times before
1285        // caling it quits.
1286        let nstates = self.cache.compiled.len();
1287        if self.cache.flush_count >= 3
1288            && self.at >= self.last_cache_flush
1289            && (self.at - self.last_cache_flush) <= 10 * nstates
1290        {
1291            return false;
1292        }
1293        // Update statistics tracking cache flushes.
1294        self.last_cache_flush = self.at;
1295        self.cache.flush_count += 1;
1296
1297        // OK, actually flush the cache.
1298        let start = self.state(self.start & !STATE_START).clone();
1299        let last_match = if self.last_match_si <= STATE_MAX {
1300            Some(self.state(self.last_match_si).clone())
1301        } else {
1302            None
1303        };
1304        self.cache.reset_size();
1305        self.cache.trans.clear();
1306        self.cache.compiled.clear();
1307        for s in &mut self.cache.start_states {
1308            *s = STATE_UNKNOWN;
1309        }
1310        // The unwraps are OK because we just cleared the cache and therefore
1311        // know that the next state pointer won't exceed STATE_MAX.
1312        let start_ptr = self.restore_state(start).unwrap();
1313        self.start = self.start_ptr(start_ptr);
1314        if let Some(last_match) = last_match {
1315            self.last_match_si = self.restore_state(last_match).unwrap();
1316        }
1317        true
1318    }
1319
1320    /// Restores the given state back into the cache, and returns a pointer
1321    /// to it.
1322    fn restore_state(&mut self, state: State) -> Option<StatePtr> {
1323        // If we've already stored this state, just return a pointer to it.
1324        // None will be the wiser.
1325        if let Some(si) = self.cache.compiled.get_ptr(&state) {
1326            return Some(si);
1327        }
1328        self.add_state(state)
1329    }
1330
1331    /// Returns the next state given the current state si and current byte
1332    /// b. {qcur,qnext} are used as scratch space for storing ordered NFA
1333    /// states.
1334    ///
1335    /// This tries to fetch the next state from the cache, but if that fails,
1336    /// it computes the next state, caches it and returns a pointer to it.
1337    ///
1338    /// The pointer can be to a real state, or it can be STATE_DEAD.
1339    /// STATE_UNKNOWN cannot be returned.
1340    ///
1341    /// None is returned if a new state could not be allocated (i.e., the DFA
1342    /// ran out of space and thinks it's running too slowly).
1343    fn next_state(
1344        &mut self,
1345        qcur: &mut SparseSet,
1346        qnext: &mut SparseSet,
1347        si: StatePtr,
1348        b: Byte,
1349    ) -> Option<StatePtr> {
1350        if si == STATE_DEAD {
1351            return Some(STATE_DEAD);
1352        }
1353        match self.cache.trans.next(si, self.byte_class(b)) {
1354            STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b),
1355            STATE_QUIT => None,
1356            nsi => Some(nsi),
1357        }
1358    }
1359
1360    /// Computes and returns the start state, where searching begins at
1361    /// position `at` in `text`. If the state has already been computed,
1362    /// then it is pulled from the cache. If the state hasn't been cached,
1363    /// then it is computed, cached and a pointer to it is returned.
1364    ///
1365    /// This may return STATE_DEAD but never STATE_UNKNOWN.
1366    #[cfg_attr(feature = "perf-inline", inline(always))]
1367    fn start_state(
1368        &mut self,
1369        q: &mut SparseSet,
1370        empty_flags: EmptyFlags,
1371        state_flags: StateFlags,
1372    ) -> Option<StatePtr> {
1373        // Compute an index into our cache of start states based on the set
1374        // of empty/state flags set at the current position in the input. We
1375        // don't use every flag since not all flags matter. For example, since
1376        // matches are delayed by one byte, start states can never be match
1377        // states.
1378        let flagi = {
1379            (((empty_flags.start as u8) << 0)
1380                | ((empty_flags.end as u8) << 1)
1381                | ((empty_flags.start_line as u8) << 2)
1382                | ((empty_flags.end_line as u8) << 3)
1383                | ((empty_flags.word_boundary as u8) << 4)
1384                | ((empty_flags.not_word_boundary as u8) << 5)
1385                | ((state_flags.is_word() as u8) << 6)) as usize
1386        };
1387        match self.cache.start_states[flagi] {
1388            STATE_UNKNOWN => {}
1389            si => return Some(si),
1390        }
1391        q.clear();
1392        let start = usize_to_u32(self.prog.start);
1393        self.follow_epsilons(start, q, empty_flags);
1394        // Start states can never be match states because we delay every match
1395        // by one byte. Given an empty string and an empty match, the match
1396        // won't actually occur until the DFA processes the special EOF
1397        // sentinel byte.
1398        let sp = match self.cached_state(q, state_flags, None) {
1399            None => return None,
1400            Some(sp) => self.start_ptr(sp),
1401        };
1402        self.cache.start_states[flagi] = sp;
1403        Some(sp)
1404    }
1405
1406    /// Computes the set of starting flags for the given position in text.
1407    ///
1408    /// This should only be used when executing the DFA forwards over the
1409    /// input.
1410    fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) {
1411        let mut empty_flags = EmptyFlags::default();
1412        let mut state_flags = StateFlags::default();
1413        empty_flags.start = at == 0;
1414        empty_flags.end = text.is_empty();
1415        empty_flags.start_line = at == 0 || text[at - 1] == b'\n';
1416        empty_flags.end_line = text.is_empty();
1417
1418        let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
1419        let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word();
1420        if is_word_last {
1421            state_flags.set_word();
1422        }
1423        if is_word == is_word_last {
1424            empty_flags.not_word_boundary = true;
1425        } else {
1426            empty_flags.word_boundary = true;
1427        }
1428        (empty_flags, state_flags)
1429    }
1430
1431    /// Computes the set of starting flags for the given position in text.
1432    ///
1433    /// This should only be used when executing the DFA in reverse over the
1434    /// input.
1435    fn start_flags_reverse(
1436        &self,
1437        text: &[u8],
1438        at: usize,
1439    ) -> (EmptyFlags, StateFlags) {
1440        let mut empty_flags = EmptyFlags::default();
1441        let mut state_flags = StateFlags::default();
1442        empty_flags.start = at == text.len();
1443        empty_flags.end = text.is_empty();
1444        empty_flags.start_line = at == text.len() || text[at] == b'\n';
1445        empty_flags.end_line = text.is_empty();
1446
1447        let is_word_last =
1448            at < text.len() && Byte::byte(text[at]).is_ascii_word();
1449        let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
1450        if is_word_last {
1451            state_flags.set_word();
1452        }
1453        if is_word == is_word_last {
1454            empty_flags.not_word_boundary = true;
1455        } else {
1456            empty_flags.word_boundary = true;
1457        }
1458        (empty_flags, state_flags)
1459    }
1460
    /// Returns a reference to a State given a pointer to it.
    ///
    /// Panics if `si` does not refer to a state previously added to the
    /// cache (`get_state` returns `None` in that case).
    fn state(&self, si: StatePtr) -> &State {
        self.cache.compiled.get_state(si).unwrap()
    }
1465
1466    /// Adds the given state to the DFA.
1467    ///
1468    /// This allocates room for transitions out of this state in
1469    /// self.cache.trans. The transitions can be set with the returned
1470    /// StatePtr.
1471    ///
1472    /// If None is returned, then the state limit was reached and the DFA
1473    /// should quit.
1474    fn add_state(&mut self, state: State) -> Option<StatePtr> {
1475        // This will fail if the next state pointer exceeds STATE_PTR. In
1476        // practice, the cache limit will prevent us from ever getting here,
1477        // but maybe callers will set the cache size to something ridiculous...
1478        let si = match self.cache.trans.add() {
1479            None => return None,
1480            Some(si) => si,
1481        };
1482        // If the program has a Unicode word boundary, then set any transitions
1483        // for non-ASCII bytes to STATE_QUIT. If the DFA stumbles over such a
1484        // transition, then it will quit and an alternative matching engine
1485        // will take over.
1486        if self.prog.has_unicode_word_boundary {
1487            for b in 128..256 {
1488                let cls = self.byte_class(Byte::byte(b as u8));
1489                self.cache.trans.set_next(si, cls, STATE_QUIT);
1490            }
1491        }
1492        // Finally, put our actual state on to our heap of states and index it
1493        // so we can find it later.
1494        self.cache.size += self.cache.trans.state_heap_size()
1495            + state.data.len()
1496            + (2 * mem::size_of::<State>())
1497            + mem::size_of::<StatePtr>();
1498        self.cache.compiled.insert(state, si);
1499        // Transition table and set of states and map should all be in sync.
1500        debug_assert!(
1501            self.cache.compiled.len() == self.cache.trans.num_states()
1502        );
1503        Some(si)
1504    }
1505
1506    /// Quickly finds the next occurrence of any literal prefixes in the regex.
1507    /// If there are no literal prefixes, then the current position is
1508    /// returned. If there are literal prefixes and one could not be found,
1509    /// then None is returned.
1510    ///
1511    /// This should only be called when the DFA is in a start state.
1512    fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> {
1513        self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s)
1514    }
1515
1516    /// Returns the number of byte classes required to discriminate transitions
1517    /// in each state.
1518    ///
1519    /// invariant: num_byte_classes() == len(State.next)
1520    fn num_byte_classes(&self) -> usize {
1521        // We add 1 to account for the special EOF byte.
1522        (self.prog.byte_classes[255] as usize + 1) + 1
1523    }
1524
1525    /// Given an input byte or the special EOF sentinel, return its
1526    /// corresponding byte class.
1527    #[cfg_attr(feature = "perf-inline", inline(always))]
1528    fn byte_class(&self, b: Byte) -> usize {
1529        match b.as_byte() {
1530            None => self.num_byte_classes() - 1,
1531            Some(b) => self.u8_class(b),
1532        }
1533    }
1534
1535    /// Like byte_class, but explicitly for u8s.
1536    #[cfg_attr(feature = "perf-inline", inline(always))]
1537    fn u8_class(&self, b: u8) -> usize {
1538        self.prog.byte_classes[b as usize] as usize
1539    }
1540
1541    /// Returns true if the DFA should continue searching past the first match.
1542    ///
1543    /// Leftmost first semantics in the DFA are preserved by not following NFA
1544    /// transitions after the first match is seen.
1545    ///
1546    /// On occasion, we want to avoid leftmost first semantics to find either
1547    /// the longest match (for reverse search) or all possible matches (for
1548    /// regex sets).
1549    fn continue_past_first_match(&self) -> bool {
1550        self.prog.is_reverse || self.prog.matches.len() > 1
1551    }
1552
1553    /// Returns true if there is a prefix we can quickly search for.
1554    fn has_prefix(&self) -> bool {
1555        !self.prog.is_reverse
1556            && !self.prog.prefixes.is_empty()
1557            && !self.prog.is_anchored_start
1558    }
1559
1560    /// Sets the STATE_START bit in the given state pointer if and only if
1561    /// we have a prefix to scan for.
1562    ///
1563    /// If there's no prefix, then it's a waste to treat the start state
1564    /// specially.
1565    fn start_ptr(&self, si: StatePtr) -> StatePtr {
1566        if self.has_prefix() {
1567            si | STATE_START
1568        } else {
1569            si
1570        }
1571    }
1572
1573    /// Approximate size returns the approximate heap space currently used by
1574    /// the DFA. It is used to determine whether the DFA's state cache needs to
1575    /// be wiped. Namely, it is possible that for certain regexes on certain
1576    /// inputs, a new state could be created for every byte of input. (This is
1577    /// bad for memory use, so we bound it with a cache.)
1578    fn approximate_size(&self) -> usize {
1579        self.cache.size + self.prog.approximate_size()
1580    }
1581}
1582
/// An abstraction for representing a map of states. The map supports two
/// different ways of state lookup. One is fast constant time access via a
/// state pointer. The other is a hashmap lookup based on the DFA's
/// constituent NFA states.
///
/// A DFA state internally uses an Arc such that we only need to store the
/// set of NFA states on the heap once, even though we support looking up
/// states by two different means. A more natural way to express this might
/// use raw pointers, but an Arc is safe and effectively achieves the same
/// thing.
#[derive(Debug)]
struct StateMap {
    /// The keys are not actually static but rely on always pointing to a
    /// buffer in `states` which will never be moved except when clearing
    /// the map or on drop, in which case the keys of this map will be
    /// removed before the `states` buffer is cleared or dropped (see
    /// `StateMap::clear`, which clears `map` first).
    map: HashMap<State, StatePtr>,
    /// Our set of states. Note that `StatePtr / num_byte_classes` indexes
    /// this Vec rather than just a `StatePtr`.
    states: Vec<State>,
    /// The number of byte classes in the DFA. Used to index `states`.
    num_byte_classes: usize,
}
1606
1607impl StateMap {
1608    fn new(num_byte_classes: usize) -> StateMap {
1609        StateMap { map: HashMap::new(), states: vec![], num_byte_classes }
1610    }
1611
1612    fn len(&self) -> usize {
1613        self.states.len()
1614    }
1615
1616    fn is_empty(&self) -> bool {
1617        self.states.is_empty()
1618    }
1619
1620    fn get_ptr(&self, state: &State) -> Option<StatePtr> {
1621        self.map.get(state).cloned()
1622    }
1623
1624    fn get_state(&self, si: StatePtr) -> Option<&State> {
1625        self.states.get(si as usize / self.num_byte_classes)
1626    }
1627
1628    fn insert(&mut self, state: State, si: StatePtr) {
1629        self.map.insert(state.clone(), si);
1630        self.states.push(state);
1631    }
1632
1633    fn clear(&mut self) {
1634        self.map.clear();
1635        self.states.clear();
1636    }
1637}
1638
1639impl Transitions {
1640    /// Create a new transition table.
1641    ///
1642    /// The number of byte classes corresponds to the stride. Every state will
1643    /// have `num_byte_classes` slots for transitions.
1644    fn new(num_byte_classes: usize) -> Transitions {
1645        Transitions { table: vec![], num_byte_classes }
1646    }
1647
1648    /// Returns the total number of states currently in this table.
1649    fn num_states(&self) -> usize {
1650        self.table.len() / self.num_byte_classes
1651    }
1652
1653    /// Allocates room for one additional state and returns a pointer to it.
1654    ///
1655    /// If there's no more room, None is returned.
1656    fn add(&mut self) -> Option<StatePtr> {
1657        let si = self.table.len();
1658        if si > STATE_MAX as usize {
1659            return None;
1660        }
1661        self.table.extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes));
1662        Some(usize_to_u32(si))
1663    }
1664
1665    /// Clears the table of all states.
1666    fn clear(&mut self) {
1667        self.table.clear();
1668    }
1669
1670    /// Sets the transition from (si, cls) to next.
1671    fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) {
1672        self.table[si as usize + cls] = next;
1673    }
1674
1675    /// Returns the transition corresponding to (si, cls).
1676    fn next(&self, si: StatePtr, cls: usize) -> StatePtr {
1677        self.table[si as usize + cls]
1678    }
1679
1680    /// The heap size, in bytes, of a single state in the transition table.
1681    fn state_heap_size(&self) -> usize {
1682        self.num_byte_classes * mem::size_of::<StatePtr>()
1683    }
1684
1685    /// Like `next`, but uses unchecked access and is therefore not safe.
1686    unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
1687        debug_assert!((si as usize) < self.table.len());
1688        debug_assert!(cls < self.num_byte_classes);
1689        *self.table.get_unchecked(si as usize + cls)
1690    }
1691}
1692
1693impl StateFlags {
1694    fn is_match(&self) -> bool {
1695        self.0 & 0b0000_0001 > 0
1696    }
1697
1698    fn set_match(&mut self) {
1699        self.0 |= 0b0000_0001;
1700    }
1701
1702    fn is_word(&self) -> bool {
1703        self.0 & 0b0000_0010 > 0
1704    }
1705
1706    fn set_word(&mut self) {
1707        self.0 |= 0b0000_0010;
1708    }
1709
1710    fn has_empty(&self) -> bool {
1711        self.0 & 0b0000_0100 > 0
1712    }
1713
1714    fn set_empty(&mut self) {
1715        self.0 |= 0b0000_0100;
1716    }
1717}
1718
1719impl Byte {
1720    fn byte(b: u8) -> Self {
1721        Byte(b as u16)
1722    }
1723    fn eof() -> Self {
1724        Byte(256)
1725    }
1726    fn is_eof(&self) -> bool {
1727        self.0 == 256
1728    }
1729
1730    fn is_ascii_word(&self) -> bool {
1731        let b = match self.as_byte() {
1732            None => return false,
1733            Some(b) => b,
1734        };
1735        match b {
1736            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' => true,
1737            _ => false,
1738        }
1739    }
1740
1741    fn as_byte(&self) -> Option<u8> {
1742        if self.is_eof() {
1743            None
1744        } else {
1745            Some(self.0 as u8)
1746        }
1747    }
1748}
1749
1750impl fmt::Debug for State {
1751    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1752        let ips: Vec<usize> = self.inst_ptrs().collect();
1753        f.debug_struct("State")
1754            .field("flags", &self.flags())
1755            .field("insts", &ips)
1756            .finish()
1757    }
1758}
1759
1760impl fmt::Debug for Transitions {
1761    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1762        let mut fmtd = f.debug_map();
1763        for si in 0..self.num_states() {
1764            let s = si * self.num_byte_classes;
1765            let e = s + self.num_byte_classes;
1766            fmtd.entry(&si.to_string(), &TransitionsRow(&self.table[s..e]));
1767        }
1768        fmtd.finish()
1769    }
1770}
1771
1772struct TransitionsRow<'a>(&'a [StatePtr]);
1773
1774impl<'a> fmt::Debug for TransitionsRow<'a> {
1775    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1776        let mut fmtd = f.debug_map();
1777        for (b, si) in self.0.iter().enumerate() {
1778            match *si {
1779                STATE_UNKNOWN => {}
1780                STATE_DEAD => {
1781                    fmtd.entry(&vb(b as usize), &"DEAD");
1782                }
1783                si => {
1784                    fmtd.entry(&vb(b as usize), &si.to_string());
1785                }
1786            }
1787        }
1788        fmtd.finish()
1789    }
1790}
1791
1792impl fmt::Debug for StateFlags {
1793    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1794        f.debug_struct("StateFlags")
1795            .field("is_match", &self.is_match())
1796            .field("is_word", &self.is_word())
1797            .field("has_empty", &self.has_empty())
1798            .finish()
1799    }
1800}
1801
/// Helper function for formatting a byte as a nice-to-read escaped string.
///
/// Values above `u8::MAX` denote the special EOF sentinel.
fn vb(b: usize) -> String {
    use std::ascii::escape_default;

    if b > ::std::u8::MAX as usize {
        return "EOF".to_owned();
    }
    // escape_default yields only ASCII bytes, so mapping each to a char is
    // lossless.
    escape_default(b as u8).map(char::from).collect()
}
1813
/// Converts a usize to a u32, panicking when the value does not fit.
///
/// The comparison goes through u64 so it is lossless regardless of the
/// platform's pointer width.
fn usize_to_u32(n: usize) -> u32 {
    let wide = n as u64;
    if wide > u64::from(::std::u32::MAX) {
        panic!("BUG: {} is too big to fit into u32", n)
    }
    wide as u32
}
1820
1821#[allow(dead_code)] // useful for debugging
1822fn show_state_ptr(si: StatePtr) -> String {
1823    let mut s = format!("{:?}", si & STATE_MAX);
1824    if si == STATE_UNKNOWN {
1825        s = format!("{} (unknown)", s);
1826    }
1827    if si == STATE_DEAD {
1828        s = format!("{} (dead)", s);
1829    }
1830    if si == STATE_QUIT {
1831        s = format!("{} (quit)", s);
1832    }
1833    if si & STATE_START > 0 {
1834        s = format!("{} (start)", s);
1835    }
1836    if si & STATE_MATCH > 0 {
1837        s = format!("{} (match)", s);
1838    }
1839    s
1840}
1841
1842/// https://developers.google.com/protocol-buffers/docs/encoding#varints
1843fn write_vari32(data: &mut Vec<u8>, n: i32) {
1844    let mut un = (n as u32) << 1;
1845    if n < 0 {
1846        un = !un;
1847    }
1848    write_varu32(data, un)
1849}
1850
1851/// https://developers.google.com/protocol-buffers/docs/encoding#varints
1852fn read_vari32(data: &[u8]) -> (i32, usize) {
1853    let (un, i) = read_varu32(data);
1854    let mut n = (un >> 1) as i32;
1855    if un & 1 != 0 {
1856        n = !n;
1857    }
1858    (n, i)
1859}
1860
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
///
/// Emits 7 bits at a time, least significant group first; the high bit of
/// each byte signals that more bytes follow.
fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
    loop {
        if n < 0b1000_0000 {
            data.push(n as u8);
            return;
        }
        data.push((n as u8) | 0b1000_0000);
        n >>= 7;
    }
}
1869
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
///
/// Returns the decoded value and the number of bytes consumed. A truncated
/// varint (every byte has its continuation bit set, or `data` is empty)
/// yields `(0, 0)`.
fn read_varu32(data: &[u8]) -> (u32, usize) {
    let mut acc: u32 = 0;
    let mut shift: u32 = 0;
    for (i, &b) in data.iter().enumerate() {
        // A clear continuation bit marks the final byte.
        if b < 0b1000_0000 {
            return (acc | ((b as u32) << shift), i + 1);
        }
        acc |= ((b as u32) & 0b0111_1111) << shift;
        shift += 7;
    }
    (0, 0)
}
1883
#[cfg(test)]
mod tests {

    use super::{
        push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32,
        State, StateFlags,
    };
    use quickcheck::{quickcheck, Gen, QuickCheck};
    use std::sync::Arc;

    // Property test: encoding a flags byte plus a list of instruction
    // pointers into a State's data buffer and reading them back is
    // lossless.
    #[test]
    fn prop_state_encode_decode() {
        fn p(mut ips: Vec<u32>, flags: u8) -> bool {
            // It looks like our encoding scheme can't handle instruction
            // pointers at or above 2**31. We should fix that, but it seems
            // unlikely to occur in real code due to the amount of memory
            // required for such a state machine. So for now, we just clamp
            // our test data.
            for ip in &mut ips {
                if *ip >= 1 << 31 {
                    *ip = (1 << 31) - 1;
                }
            }
            // A state's serialized form starts with the flags byte; the
            // instruction pointers follow. push_inst_ptr threads `prev`
            // through each call (presumably delta-encoding — see its
            // definition).
            let mut data = vec![flags];
            let mut prev = 0;
            for &ip in ips.iter() {
                push_inst_ptr(&mut data, &mut prev, ip);
            }
            let state = State { data: Arc::from(&data[..]) };

            let expected: Vec<usize> =
                ips.into_iter().map(|ip| ip as usize).collect();
            let got: Vec<usize> = state.inst_ptrs().collect();
            expected == got && state.flags() == StateFlags(flags)
        }
        QuickCheck::new()
            .gen(Gen::new(10_000))
            .quickcheck(p as fn(Vec<u32>, u8) -> bool);
    }

    // Property test: write_varu32/read_varu32 round-trip every u32 and
    // report the exact number of bytes written.
    #[test]
    fn prop_read_write_u32() {
        fn p(n: u32) -> bool {
            let mut buf = vec![];
            write_varu32(&mut buf, n);
            let (got, nread) = read_varu32(&buf);
            nread == buf.len() && got == n
        }
        quickcheck(p as fn(u32) -> bool);
    }

    // Property test: write_vari32/read_vari32 round-trip every i32
    // through the zig-zag transform.
    #[test]
    fn prop_read_write_i32() {
        fn p(n: i32) -> bool {
            let mut buf = vec![];
            write_vari32(&mut buf, n);
            let (got, nread) = read_vari32(&buf);
            nread == buf.len() && got == n
        }
        quickcheck(p as fn(i32) -> bool);
    }
}