aho_corasick/
automaton.rs

1use crate::ahocorasick::MatchKind;
2use crate::prefilter::{self, Candidate, Prefilter, PrefilterState};
3use crate::state_id::{dead_id, fail_id, StateID};
4use crate::Match;
5
6// NOTE: This trait essentially started as a copy of the same trait from from
7// regex-automata, with some wording changed since we use this trait for
8// NFAs in addition to DFAs in this crate. Additionally, we do not export
9// this trait. It's only used internally to reduce code duplication. The
10// regex-automata crate needs to expose it because its Regex type is generic
11// over implementations of this trait. In this crate, we encapsulate everything
12// behind the AhoCorasick type.
13//
14// This trait is a bit of a mess, but it's not quite clear how to fix it.
15// Basically, there are several competing concerns:
16//
17// * We need performance, so everything effectively needs to get monomorphized.
18// * There are several variations on searching Aho-Corasick automatons:
19//   overlapping, standard and leftmost. Overlapping and standard are somewhat
20//   combined together below, but there is no real way to combine standard with
21//   leftmost. Namely, leftmost requires continuing a search even after a match
22//   is found, in order to correctly disambiguate a match.
23// * On top of that, *sometimes* callers want to know which state the automaton
24//   is in after searching. This is principally useful for overlapping and
25//   stream searches. However, when callers don't care about this, we really
26//   do not want to be forced to compute it, since it sometimes requires extra
27//   work. Thus, there are effectively two copies of leftmost searching: one
28//   for tracking the state ID and one that doesn't. We should ideally do the
29//   same for standard searching, but my sanity stopped me.
30
31// SAFETY RATIONALE: Previously, the code below went to some length to remove
32// all bounds checks. This generally produced tighter assembly and lead to
33// 20-50% improvements in micro-benchmarks on corpora made up of random
34// characters. This somewhat makes sense, since the branch predictor is going
35// to be at its worse on random text.
36//
37// However, using the aho-corasick-debug tool and manually benchmarking
38// different inputs, the code *with* bounds checks actually wound up being
39// slightly faster:
40//
41//     $ cat input
42//     Sherlock Holmes
43//     John Watson
44//     Professor Moriarty
45//     Irene Adler
46//     Mary Watson
47//
48//     $ aho-corasick-debug-safe \
49//         input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
50//     pattern read time: 32.824µs
51//     automaton build time: 444.687µs
52//     automaton heap usage: 72392 bytes
53//     match count: 639
54//     count time: 1.809961702s
55//
56//     $ aho-corasick-debug-master \
57//         input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
58//     pattern read time: 31.425µs
59//     automaton build time: 317.434µs
60//     automaton heap usage: 72392 bytes
61//     match count: 639
62//     count time: 2.059157705s
63//
64// I was able to reproduce this result on two different machines (an i5 and
65// an i7). Therefore, we go the route of safe code for now.
66
67/// A trait describing the interface of an Aho-Corasick finite state machine.
68///
69/// Every automaton has exactly one fail state, one dead state and exactly one
70/// start state. Generally, these correspond to the first, second and third
71/// states, respectively. The failure state is always treated as a sentinel.
72/// That is, no correct Aho-Corasick automaton will ever transition into the
73/// fail state. The dead state, however, can be transitioned into, but only
74/// when leftmost-first or leftmost-longest match semantics are enabled and
75/// only when at least one match has been observed.
76///
77/// Every automaton also has one or more match states, such that
78/// `Automaton::is_match_state(id)` returns `true` if and only if `id`
79/// corresponds to a match state.
80pub trait Automaton {
81    /// The representation used for state identifiers in this automaton.
82    ///
83    /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
84    type ID: StateID;
85
86    /// The type of matching that should be done.
87    fn match_kind(&self) -> &MatchKind;
88
89    /// Returns true if and only if this automaton uses anchored searches.
90    fn anchored(&self) -> bool;
91
92    /// An optional prefilter for quickly skipping to the next candidate match.
93    /// A prefilter must report at least every match, although it may report
94    /// positions that do not correspond to a match. That is, it must not allow
95    /// false negatives, but can allow false positives.
96    ///
97    /// Currently, a prefilter only runs when the automaton is in the start
98    /// state. That is, the position reported by a prefilter should always
99    /// correspond to the start of a potential match.
100    fn prefilter(&self) -> Option<&dyn Prefilter>;
101
102    /// Return the identifier of this automaton's start state.
103    fn start_state(&self) -> Self::ID;
104
105    /// Returns true if and only if the given state identifier refers to a
106    /// valid state.
107    fn is_valid(&self, id: Self::ID) -> bool;
108
109    /// Returns true if and only if the given identifier corresponds to a match
110    /// state.
111    ///
112    /// The state ID given must be valid, or else implementors may panic.
113    fn is_match_state(&self, id: Self::ID) -> bool;
114
115    /// Returns true if and only if the given identifier corresponds to a state
116    /// that is either the dead state or a match state.
117    ///
118    /// Depending on the implementation of the automaton, this routine can
119    /// be used to save a branch in the core matching loop. Nevertheless,
120    /// `is_match_state(id) || id == dead_id()` is always a valid
121    /// implementation. Indeed, this is the default implementation.
122    ///
123    /// The state ID given must be valid, or else implementors may panic.
124    fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
125        id == dead_id() || self.is_match_state(id)
126    }
127
128    /// If the given state is a match state, return the match corresponding
129    /// to the given match index. `end` must be the ending position of the
130    /// detected match. If no match exists or if `match_index` exceeds the
131    /// number of matches in this state, then `None` is returned.
132    ///
133    /// The state ID given must be valid, or else implementors may panic.
134    ///
135    /// If the given state ID is correct and if the `match_index` is less than
136    /// the number of matches for that state, then this is guaranteed to return
137    /// a match.
138    fn get_match(
139        &self,
140        id: Self::ID,
141        match_index: usize,
142        end: usize,
143    ) -> Option<Match>;
144
145    /// Returns the number of matches for the given state. If the given state
146    /// is not a match state, then this returns 0.
147    ///
148    /// The state ID given must be valid, or else implementors must panic.
149    fn match_count(&self, id: Self::ID) -> usize;
150
151    /// Given the current state that this automaton is in and the next input
152    /// byte, this method returns the identifier of the next state. The
153    /// identifier returned must always be valid and may never correspond to
154    /// the fail state. The returned identifier may, however, point to the
155    /// dead state.
156    ///
157    /// This is not safe so that implementors may look up the next state
158    /// without memory safety checks such as bounds checks. As such, callers
159    /// must ensure that the given identifier corresponds to a valid automaton
160    /// state. Implementors must, in turn, ensure that this routine is safe for
161    /// all valid state identifiers and for all possible `u8` values.
162    fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
163
164    /// Like next_state, but debug_asserts that the underlying
165    /// implementation never returns a `fail_id()` for the next state.
166    fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID {
167        let next = self.next_state(current, input);
168        // We should never see a transition to the failure state.
169        debug_assert!(
170            next != fail_id(),
171            "automaton should never return fail_id for next state"
172        );
173        next
174    }
175
176    /// Execute a search using standard match semantics.
177    ///
178    /// This can be used even when the automaton was constructed with leftmost
179    /// match semantics when you want to find the earliest possible match. This
180    /// can also be used as part of an overlapping search implementation.
181    ///
182    /// N.B. This does not report a match if `state_id` is given as a matching
183    /// state. As such, this should not be used directly.
184    #[inline(always)]
185    fn standard_find_at(
186        &self,
187        prestate: &mut PrefilterState,
188        haystack: &[u8],
189        at: usize,
190        state_id: &mut Self::ID,
191    ) -> Option<Match> {
192        if let Some(pre) = self.prefilter() {
193            self.standard_find_at_imp(
194                prestate,
195                Some(pre),
196                haystack,
197                at,
198                state_id,
199            )
200        } else {
201            self.standard_find_at_imp(prestate, None, haystack, at, state_id)
202        }
203    }
204
205    // It's important for this to always be inlined. Namely, its only caller
206    // is standard_find_at, and the inlining should remove the case analysis
207    // for prefilter scanning when there is no prefilter available.
208    #[inline(always)]
209    fn standard_find_at_imp(
210        &self,
211        prestate: &mut PrefilterState,
212        prefilter: Option<&dyn Prefilter>,
213        haystack: &[u8],
214        mut at: usize,
215        state_id: &mut Self::ID,
216    ) -> Option<Match> {
217        while at < haystack.len() {
218            if let Some(pre) = prefilter {
219                if prestate.is_effective(at) && *state_id == self.start_state()
220                {
221                    let c = prefilter::next(prestate, pre, haystack, at)
222                        .into_option();
223                    match c {
224                        None => return None,
225                        Some(i) => {
226                            at = i;
227                        }
228                    }
229                }
230            }
231            // CORRECTNESS: next_state is correct for all possible u8 values,
232            // so the only thing we're concerned about is the validity of
233            // `state_id`. `state_id` either comes from the caller (in which
234            // case, we assume it is correct), or it comes from the return
235            // value of next_state, which is guaranteed to be correct.
236            *state_id = self.next_state_no_fail(*state_id, haystack[at]);
237            at += 1;
238            // This routine always quits immediately after seeing a
239            // match, and since dead states can only come after seeing
240            // a match, seeing a dead state here is impossible. (Unless
241            // we have an anchored automaton, in which case, dead states
242            // are used to stop a search.)
243            debug_assert!(
244                *state_id != dead_id() || self.anchored(),
245                "standard find should never see a dead state"
246            );
247
248            if self.is_match_or_dead_state(*state_id) {
249                return if *state_id == dead_id() {
250                    None
251                } else {
252                    self.get_match(*state_id, 0, at)
253                };
254            }
255        }
256        None
257    }
258
259    /// Execute a search using leftmost (either first or longest) match
260    /// semantics.
261    ///
262    /// The principle difference between searching with standard semantics and
263    /// searching with leftmost semantics is that leftmost searching will
264    /// continue searching even after a match has been found. Once a match
265    /// is found, the search does not stop until either the haystack has been
266    /// exhausted or a dead state is observed in the automaton. (Dead states
267    /// only exist in automatons constructed with leftmost semantics.) That is,
268    /// we rely on the construction of the automaton to tell us when to quit.
269    #[inline(never)]
270    fn leftmost_find_at(
271        &self,
272        prestate: &mut PrefilterState,
273        haystack: &[u8],
274        at: usize,
275        state_id: &mut Self::ID,
276    ) -> Option<Match> {
277        if let Some(pre) = self.prefilter() {
278            self.leftmost_find_at_imp(
279                prestate,
280                Some(pre),
281                haystack,
282                at,
283                state_id,
284            )
285        } else {
286            self.leftmost_find_at_imp(prestate, None, haystack, at, state_id)
287        }
288    }
289
290    // It's important for this to always be inlined. Namely, its only caller
291    // is leftmost_find_at, and the inlining should remove the case analysis
292    // for prefilter scanning when there is no prefilter available.
293    #[inline(always)]
294    fn leftmost_find_at_imp(
295        &self,
296        prestate: &mut PrefilterState,
297        prefilter: Option<&dyn Prefilter>,
298        haystack: &[u8],
299        mut at: usize,
300        state_id: &mut Self::ID,
301    ) -> Option<Match> {
302        debug_assert!(self.match_kind().is_leftmost());
303        if self.anchored() && at > 0 && *state_id == self.start_state() {
304            return None;
305        }
306        let mut last_match = self.get_match(*state_id, 0, at);
307        while at < haystack.len() {
308            if let Some(pre) = prefilter {
309                if prestate.is_effective(at) && *state_id == self.start_state()
310                {
311                    let c = prefilter::next(prestate, pre, haystack, at)
312                        .into_option();
313                    match c {
314                        None => return None,
315                        Some(i) => {
316                            at = i;
317                        }
318                    }
319                }
320            }
321            // CORRECTNESS: next_state is correct for all possible u8 values,
322            // so the only thing we're concerned about is the validity of
323            // `state_id`. `state_id` either comes from the caller (in which
324            // case, we assume it is correct), or it comes from the return
325            // value of next_state, which is guaranteed to be correct.
326            *state_id = self.next_state_no_fail(*state_id, haystack[at]);
327            at += 1;
328            if self.is_match_or_dead_state(*state_id) {
329                if *state_id == dead_id() {
330                    // The only way to enter into a dead state is if a match
331                    // has been found, so we assert as much. This is different
332                    // from normal automata, where you might enter a dead state
333                    // if you know a subsequent match will never be found
334                    // (regardless of whether a match has already been found).
335                    // For Aho-Corasick, it is built so that we can match at
336                    // any position, so the possibility of a match always
337                    // exists.
338                    //
339                    // (Unless we have an anchored automaton, in which case,
340                    // dead states are used to stop a search.)
341                    debug_assert!(
342                        last_match.is_some() || self.anchored(),
343                        "failure state should only be seen after match"
344                    );
345                    return last_match;
346                }
347                last_match = self.get_match(*state_id, 0, at);
348            }
349        }
350        last_match
351    }
352
353    /// This is like leftmost_find_at, but does not need to track a caller
354    /// provided state id. In other words, the only output of this routine is a
355    /// match, if one exists.
356    ///
357    /// It is regrettable that we need to effectively copy a chunk of
358    /// implementation twice, but when we don't need to track the state ID, we
359    /// can allow the prefilter to report matches immediately without having
360    /// to re-confirm them with the automaton. The re-confirmation step is
361    /// necessary in leftmost_find_at because tracing through the automaton is
362    /// the only way to correctly set the state ID. (Perhaps an alternative
363    /// would be to keep a map from pattern ID to matching state ID, but that
364    /// complicates the code and still doesn't permit us to defer to the
365    /// prefilter entirely when possible.)
366    ///
367    /// I did try a few things to avoid the code duplication here, but nothing
368    /// optimized as well as this approach. (In microbenchmarks, there was
369    /// about a 25% difference.)
370    #[inline(never)]
371    fn leftmost_find_at_no_state(
372        &self,
373        prestate: &mut PrefilterState,
374        haystack: &[u8],
375        at: usize,
376    ) -> Option<Match> {
377        if let Some(pre) = self.prefilter() {
378            self.leftmost_find_at_no_state_imp(
379                prestate,
380                Some(pre),
381                haystack,
382                at,
383            )
384        } else {
385            self.leftmost_find_at_no_state_imp(prestate, None, haystack, at)
386        }
387    }
388
389    // It's important for this to always be inlined. Namely, its only caller
390    // is leftmost_find_at_no_state, and the inlining should remove the case
391    // analysis for prefilter scanning when there is no prefilter available.
392    #[inline(always)]
393    fn leftmost_find_at_no_state_imp(
394        &self,
395        prestate: &mut PrefilterState,
396        prefilter: Option<&dyn Prefilter>,
397        haystack: &[u8],
398        mut at: usize,
399    ) -> Option<Match> {
400        debug_assert!(self.match_kind().is_leftmost());
401        if self.anchored() && at > 0 {
402            return None;
403        }
404        // If our prefilter handles confirmation of matches 100% of the
405        // time, and since we don't need to track state IDs, we can avoid
406        // Aho-Corasick completely.
407        if let Some(pre) = prefilter {
408            // We should never have a prefilter during an anchored search.
409            debug_assert!(!self.anchored());
410            if !pre.reports_false_positives() {
411                return match pre.next_candidate(prestate, haystack, at) {
412                    Candidate::None => None,
413                    Candidate::Match(m) => Some(m),
414                    Candidate::PossibleStartOfMatch(_) => unreachable!(),
415                };
416            }
417        }
418
419        let mut state_id = self.start_state();
420        let mut last_match = self.get_match(state_id, 0, at);
421        while at < haystack.len() {
422            if let Some(pre) = prefilter {
423                if prestate.is_effective(at) && state_id == self.start_state()
424                {
425                    match prefilter::next(prestate, pre, haystack, at) {
426                        Candidate::None => return None,
427                        // Since we aren't tracking a state ID, we can
428                        // quit early once we know we have a match.
429                        Candidate::Match(m) => return Some(m),
430                        Candidate::PossibleStartOfMatch(i) => {
431                            at = i;
432                        }
433                    }
434                }
435            }
436            // CORRECTNESS: next_state is correct for all possible u8 values,
437            // so the only thing we're concerned about is the validity of
438            // `state_id`. `state_id` either comes from the caller (in which
439            // case, we assume it is correct), or it comes from the return
440            // value of next_state, which is guaranteed to be correct.
441            state_id = self.next_state_no_fail(state_id, haystack[at]);
442            at += 1;
443            if self.is_match_or_dead_state(state_id) {
444                if state_id == dead_id() {
445                    // The only way to enter into a dead state is if a
446                    // match has been found, so we assert as much. This
447                    // is different from normal automata, where you might
448                    // enter a dead state if you know a subsequent match
449                    // will never be found (regardless of whether a match
450                    // has already been found). For Aho-Corasick, it is
451                    // built so that we can match at any position, so the
452                    // possibility of a match always exists.
453                    //
454                    // (Unless we have an anchored automaton, in which
455                    // case, dead states are used to stop a search.)
456                    debug_assert!(
457                        last_match.is_some() || self.anchored(),
458                        "failure state should only be seen after match"
459                    );
460                    return last_match;
461                }
462                last_match = self.get_match(state_id, 0, at);
463            }
464        }
465        last_match
466    }
467
468    /// Execute an overlapping search.
469    ///
470    /// When executing an overlapping match, the previous state ID in addition
471    /// to the previous match index should be given. If there are more matches
472    /// at the given state, then the match is reported and the given index is
473    /// incremented.
474    #[inline(always)]
475    fn overlapping_find_at(
476        &self,
477        prestate: &mut PrefilterState,
478        haystack: &[u8],
479        at: usize,
480        state_id: &mut Self::ID,
481        match_index: &mut usize,
482    ) -> Option<Match> {
483        if self.anchored() && at > 0 && *state_id == self.start_state() {
484            return None;
485        }
486
487        let match_count = self.match_count(*state_id);
488        if *match_index < match_count {
489            // This is guaranteed to return a match since
490            // match_index < match_count.
491            let result = self.get_match(*state_id, *match_index, at);
492            debug_assert!(result.is_some(), "must be a match");
493            *match_index += 1;
494            return result;
495        }
496
497        *match_index = 0;
498        match self.standard_find_at(prestate, haystack, at, state_id) {
499            None => None,
500            Some(m) => {
501                *match_index = 1;
502                Some(m)
503            }
504        }
505    }
506
507    /// Return the earliest match found. This returns as soon as we know that
508    /// we have a match. As such, this does not necessarily correspond to the
509    /// leftmost starting match, but rather, the leftmost position at which a
510    /// match ends.
511    #[inline(always)]
512    fn earliest_find_at(
513        &self,
514        prestate: &mut PrefilterState,
515        haystack: &[u8],
516        at: usize,
517        state_id: &mut Self::ID,
518    ) -> Option<Match> {
519        if *state_id == self.start_state() {
520            if self.anchored() && at > 0 {
521                return None;
522            }
523            if let Some(m) = self.get_match(*state_id, 0, at) {
524                return Some(m);
525            }
526        }
527        self.standard_find_at(prestate, haystack, at, state_id)
528    }
529
530    /// A convenience function for finding the next match according to the
531    /// match semantics of this automaton. For standard match semantics, this
532    /// finds the earliest match. Otherwise, the leftmost match is found.
533    #[inline(always)]
534    fn find_at(
535        &self,
536        prestate: &mut PrefilterState,
537        haystack: &[u8],
538        at: usize,
539        state_id: &mut Self::ID,
540    ) -> Option<Match> {
541        match *self.match_kind() {
542            MatchKind::Standard => {
543                self.earliest_find_at(prestate, haystack, at, state_id)
544            }
545            MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
546                self.leftmost_find_at(prestate, haystack, at, state_id)
547            }
548            MatchKind::__Nonexhaustive => unreachable!(),
549        }
550    }
551
552    /// Like find_at, but does not track state identifiers. This permits some
553    /// optimizations when a prefilter that confirms its own matches is
554    /// present.
555    #[inline(always)]
556    fn find_at_no_state(
557        &self,
558        prestate: &mut PrefilterState,
559        haystack: &[u8],
560        at: usize,
561    ) -> Option<Match> {
562        match *self.match_kind() {
563            MatchKind::Standard => {
564                let mut state = self.start_state();
565                self.earliest_find_at(prestate, haystack, at, &mut state)
566            }
567            MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
568                self.leftmost_find_at_no_state(prestate, haystack, at)
569            }
570            MatchKind::__Nonexhaustive => unreachable!(),
571        }
572    }
573}