aho_corasick/nfa.rs
1use std::cmp;
2use std::collections::{BTreeSet, VecDeque};
3use std::fmt;
4use std::mem::size_of;
5use std::ops::{Index, IndexMut};
6
7use crate::ahocorasick::MatchKind;
8use crate::automaton::Automaton;
9use crate::classes::{ByteClassBuilder, ByteClasses};
10use crate::error::Result;
11use crate::prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
12use crate::state_id::{dead_id, fail_id, usize_to_state_id, StateID};
13use crate::Match;
14
/// The identifier for a pattern, which is simply the position of the pattern
/// in the sequence of patterns given by the caller. Identifiers are assigned
/// in order, starting at 0.
pub type PatternID = usize;

/// The length of a pattern, in bytes.
pub type PatternLength = usize;
21
22/// An Aho-Corasick automaton, represented as an NFA.
23///
24/// This is the classical formulation of Aho-Corasick, which involves building
25/// up a prefix trie of a given set of patterns, and then wiring up failure
26/// transitions between states in order to guarantee linear time matching. The
27/// standard formulation is, technically, an NFA because of these failure
28/// transitions. That is, one can see them as enabling the automaton to be in
29/// multiple states at once. Indeed, during search, it is possible to check
30/// the transitions on multiple states for a single input byte.
31///
32/// This particular implementation not only supports the standard style of
33/// matching, but also provides a mode for choosing leftmost-first or
34/// leftmost-longest match semantics. When a leftmost mode is chosen, some
35/// failure transitions that would otherwise be added are elided. See
36/// the documentation of `MatchKind` for more details and examples on how the
37/// match semantics may differ.
38///
39/// If one wants a DFA, then it is necessary to first build an NFA and convert
40/// it into a DFA. Note, however, that because we've constrained ourselves to
41/// matching literal patterns, this does not need to use subset construction
42/// for determinization. Instead, the DFA has at most a number of states
43/// equivalent to the number of NFA states. The only real difference between
44/// them is that all failure transitions are followed and pre-computed. This
45/// uses much more memory, but also executes searches more quickly.
#[derive(Clone)]
pub struct NFA<S> {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// The start state id as an index into `states`.
    start_id: S,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The total number of patterns added to this automaton, including
    /// patterns that may never be matched.
    pattern_count: usize,
    /// The number of bytes of heap used by this NFA's transition table.
    heap_bytes: usize,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<PrefilterObj>,
    /// Whether this automaton anchors all matches to the start of input.
    anchored: bool,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State<S>>,
}
89
90impl<S: StateID> NFA<S> {
91 /// Returns the equivalence classes of bytes found while constructing
92 /// this NFA.
93 ///
94 /// Note that the NFA doesn't actually make use of these equivalence
95 /// classes. Instead, these are useful for building the DFA when desired.
96 pub fn byte_classes(&self) -> &ByteClasses {
97 &self.byte_classes
98 }
99
100 /// Returns a prefilter, if one exists.
101 pub fn prefilter_obj(&self) -> Option<&PrefilterObj> {
102 self.prefilter.as_ref()
103 }
104
105 /// Returns the total number of heap bytes used by this NFA's transition
106 /// table.
107 pub fn heap_bytes(&self) -> usize {
108 self.heap_bytes
109 + self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes())
110 }
111
112 /// Return the length of the longest pattern in this automaton.
113 pub fn max_pattern_len(&self) -> usize {
114 self.max_pattern_len
115 }
116
117 /// Return the total number of patterns added to this automaton.
118 pub fn pattern_count(&self) -> usize {
119 self.pattern_count
120 }
121
122 /// Returns the total number of states in this NFA.
123 pub fn state_len(&self) -> usize {
124 self.states.len()
125 }
126
127 /// Returns the matches for the given state.
128 pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] {
129 &self.states[id.to_usize()].matches
130 }
131
132 /// Returns an iterator over all transitions in the given state according
133 /// to the given equivalence classes, including transitions to `fail_id()`.
134 /// The number of transitions returned is always equivalent to the number
135 /// of equivalence classes.
136 pub fn iter_all_transitions<F: FnMut(u8, S)>(
137 &self,
138 byte_classes: &ByteClasses,
139 id: S,
140 f: F,
141 ) {
142 self.states[id.to_usize()].trans.iter_all(byte_classes, f);
143 }
144
145 /// Returns the failure transition for the given state.
146 pub fn failure_transition(&self, id: S) -> S {
147 self.states[id.to_usize()].fail
148 }
149
150 /// Returns the next state for the given state and input byte.
151 ///
152 /// Note that this does not follow failure transitions. As such, the id
153 /// returned may be `fail_id`.
154 pub fn next_state(&self, current: S, input: u8) -> S {
155 self.states[current.to_usize()].next_state(input)
156 }
157
158 fn state(&self, id: S) -> &State<S> {
159 &self.states[id.to_usize()]
160 }
161
162 fn state_mut(&mut self, id: S) -> &mut State<S> {
163 &mut self.states[id.to_usize()]
164 }
165
166 fn start(&self) -> &State<S> {
167 self.state(self.start_id)
168 }
169
170 fn start_mut(&mut self) -> &mut State<S> {
171 let id = self.start_id;
172 self.state_mut(id)
173 }
174
175 fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S> {
176 IterTransitionsMut::new(self, id)
177 }
178
179 fn copy_matches(&mut self, src: S, dst: S) {
180 let (src, dst) =
181 get_two_mut(&mut self.states, src.to_usize(), dst.to_usize());
182 dst.matches.extend_from_slice(&src.matches);
183 }
184
185 fn copy_empty_matches(&mut self, dst: S) {
186 let start_id = self.start_id;
187 self.copy_matches(start_id, dst);
188 }
189
190 fn add_dense_state(&mut self, depth: usize) -> Result<S> {
191 let trans = Transitions::Dense(Dense::new());
192 let id = usize_to_state_id(self.states.len())?;
193 self.states.push(State {
194 trans,
195 // Anchored automatons do not have any failure transitions.
196 fail: if self.anchored { dead_id() } else { self.start_id },
197 depth,
198 matches: vec![],
199 });
200 Ok(id)
201 }
202
203 fn add_sparse_state(&mut self, depth: usize) -> Result<S> {
204 let trans = Transitions::Sparse(vec![]);
205 let id = usize_to_state_id(self.states.len())?;
206 self.states.push(State {
207 trans,
208 // Anchored automatons do not have any failure transitions.
209 fail: if self.anchored { dead_id() } else { self.start_id },
210 depth,
211 matches: vec![],
212 });
213 Ok(id)
214 }
215}
216
217impl<S: StateID> Automaton for NFA<S> {
218 type ID = S;
219
220 fn match_kind(&self) -> &MatchKind {
221 &self.match_kind
222 }
223
224 fn anchored(&self) -> bool {
225 self.anchored
226 }
227
228 fn prefilter(&self) -> Option<&dyn Prefilter> {
229 self.prefilter.as_ref().map(|p| p.as_ref())
230 }
231
232 fn start_state(&self) -> S {
233 self.start_id
234 }
235
236 fn is_valid(&self, id: S) -> bool {
237 id.to_usize() < self.states.len()
238 }
239
240 fn is_match_state(&self, id: S) -> bool {
241 self.states[id.to_usize()].is_match()
242 }
243
244 fn get_match(
245 &self,
246 id: S,
247 match_index: usize,
248 end: usize,
249 ) -> Option<Match> {
250 let state = match self.states.get(id.to_usize()) {
251 None => return None,
252 Some(state) => state,
253 };
254 state.matches.get(match_index).map(|&(id, len)| Match {
255 pattern: id,
256 len,
257 end,
258 })
259 }
260
261 fn match_count(&self, id: S) -> usize {
262 self.states[id.to_usize()].matches.len()
263 }
264
265 fn next_state(&self, mut current: S, input: u8) -> S {
266 // This terminates since:
267 //
268 // 1. `State.fail` never points to fail_id().
269 // 2. All `State.fail` values point to a state closer to `start`.
270 // 3. The start state has no transitions to fail_id().
271 loop {
272 let state = &self.states[current.to_usize()];
273 let next = state.next_state(input);
274 if next != fail_id() {
275 return next;
276 }
277 current = state.fail;
278 }
279 }
280}
281
/// A representation of an NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte,
/// the matches implied by visiting this state (if any) and the depth of this
/// state. The depth of a state is simply the distance from it to the start
/// state in the automaton, where the depth of the start state is 0.
#[derive(Clone, Debug)]
pub struct State<S> {
    /// The transitions out of this state.
    trans: Transitions<S>,
    /// The failure transition, followed when no transition is defined for
    /// the current input byte.
    fail: S,
    /// The patterns (id and length) matched upon entering this state.
    matches: Vec<(PatternID, PatternLength)>,
    // TODO: Strictly speaking, this isn't needed for searching. It's only
    // used when building an NFA that supports leftmost match semantics. We
    // could drop this from the state and dynamically build a map only when
    // computing failure transitions, but it's not clear which is better.
    // Benchmark this.
    /// The distance of this state from the start state.
    depth: usize,
}
301
302impl<S: StateID> State<S> {
303 fn heap_bytes(&self) -> usize {
304 self.trans.heap_bytes()
305 + (self.matches.len() * size_of::<(PatternID, PatternLength)>())
306 }
307
308 fn add_match(&mut self, i: PatternID, len: PatternLength) {
309 self.matches.push((i, len));
310 }
311
312 fn is_match(&self) -> bool {
313 !self.matches.is_empty()
314 }
315
316 fn get_longest_match_len(&self) -> Option<usize> {
317 // Why is this true? Because the first match in any matching state
318 // will always correspond to the match added to it during trie
319 // construction (since when we copy matches due to failure transitions,
320 // we always append them). Therefore, it follows that the first match
321 // must always be longest since any subsequent match must be from a
322 // failure transition, and a failure transition by construction points
323 // to a proper suffix. A proper suffix is, by definition, smaller.
324 self.matches.get(0).map(|&(_, len)| len)
325 }
326
327 fn next_state(&self, input: u8) -> S {
328 self.trans.next_state(input)
329 }
330
331 fn set_next_state(&mut self, input: u8, next: S) {
332 self.trans.set_next_state(input, next);
333 }
334}
335
/// Represents the transitions for a single dense state.
///
/// The primary purpose here is to encapsulate index access. Namely, since a
/// dense representation always contains 256 elements, all values of `u8` are
/// valid indices.
#[derive(Clone, Debug)]
struct Dense<S>(Vec<S>); // invariant: the inner vector always has length 256
343
344impl<S> Dense<S>
345where
346 S: StateID,
347{
348 fn new() -> Self {
349 Dense(vec![fail_id(); 256])
350 }
351
352 #[inline]
353 fn len(&self) -> usize {
354 self.0.len()
355 }
356}
357
impl<S> Index<u8> for Dense<S> {
    type Output = S;

    #[inline]
    fn index(&self, i: u8) -> &S {
        // This checked indexing can never panic: a dense transition table
        // always contains exactly 256 elements (see `Dense::new`), so every
        // u8 value is a valid index.
        &self.0[i as usize]
    }
}
368
impl<S> IndexMut<u8> for Dense<S> {
    #[inline]
    fn index_mut(&mut self, i: u8) -> &mut S {
        // This checked indexing can never panic: a dense transition table
        // always contains exactly 256 elements (see `Dense::new`), so every
        // u8 value is a valid index.
        &mut self.0[i as usize]
    }
}
377
/// A representation of transitions in an NFA.
///
/// Transitions have either a sparse representation, which is slower for
/// lookups but uses less memory, or a dense representation, which is faster
/// for lookups but uses more memory. In the sparse representation, the absence
/// of a state implies a transition to `fail_id()`. Transitions to `dead_id()`
/// are still explicitly represented.
///
/// For the NFA, by default, we use a dense representation for transitions for
/// states close to the start state because it's likely these are the states
/// that will be most frequently visited.
#[derive(Clone, Debug)]
enum Transitions<S> {
    /// A sparse list of (input byte, next state) pairs, kept sorted by byte
    /// (see `set_next_state`).
    Sparse(Vec<(u8, S)>),
    /// A dense table of 256 next states, indexed directly by input byte.
    Dense(Dense<S>),
}
394
impl<S: StateID> Transitions<S> {
    /// Returns the number of heap bytes used by this transition
    /// representation.
    fn heap_bytes(&self) -> usize {
        match *self {
            Transitions::Sparse(ref sparse) => {
                sparse.len() * size_of::<(u8, S)>()
            }
            Transitions::Dense(ref dense) => dense.len() * size_of::<S>(),
        }
    }

    /// Returns the next state for the given input byte, or `fail_id()` when
    /// no transition for that byte exists.
    fn next_state(&self, input: u8) -> S {
        match *self {
            Transitions::Sparse(ref sparse) => {
                // A linear scan over the (byte, state) pairs.
                for &(b, id) in sparse {
                    if b == input {
                        return id;
                    }
                }
                fail_id()
            }
            Transitions::Dense(ref dense) => dense[input],
        }
    }

    /// Sets the transition for the given input byte to the given next state,
    /// overwriting any existing transition for that byte.
    fn set_next_state(&mut self, input: u8, next: S) {
        match *self {
            Transitions::Sparse(ref mut sparse) => {
                // The binary search (and sorted insert on failure) keeps the
                // sparse list sorted by byte value.
                match sparse.binary_search_by_key(&input, |&(b, _)| b) {
                    Ok(i) => sparse[i] = (input, next),
                    Err(i) => sparse.insert(i, (input, next)),
                }
            }
            Transitions::Dense(ref mut dense) => {
                dense[input] = next;
            }
        }
    }

    /// Iterate over transitions in this state while skipping over transitions
    /// to `fail_id()`.
    fn iter<F: FnMut(u8, S)>(&self, mut f: F) {
        match *self {
            Transitions::Sparse(ref sparse) => {
                // Sparse lists only store transitions that exist (absence
                // implies fail), so nothing needs to be filtered out here.
                for &(b, id) in sparse {
                    f(b, id);
                }
            }
            Transitions::Dense(ref dense) => {
                for b in AllBytesIter::new() {
                    let id = dense[b];
                    if id != fail_id() {
                        f(b, id);
                    }
                }
            }
        }
    }

    /// Iterate over all transitions in this state according to the given
    /// equivalence classes, including transitions to `fail_id()`.
    fn iter_all<F: FnMut(u8, S)>(&self, classes: &ByteClasses, mut f: F) {
        if classes.is_singleton() {
            // Every byte is its own class, so every byte value is yielded.
            match *self {
                Transitions::Sparse(ref sparse) => {
                    sparse_iter(sparse, f);
                }
                Transitions::Dense(ref dense) => {
                    for b in AllBytesIter::new() {
                        f(b, dense[b]);
                    }
                }
            }
        } else {
            // In this case, we only want to yield a single byte for each
            // equivalence class.
            match *self {
                Transitions::Sparse(ref sparse) => {
                    // Bytes in the same class are contiguous, so tracking
                    // the previously seen class suffices to deduplicate.
                    let mut last_class = None;
                    sparse_iter(sparse, |b, next| {
                        let class = classes.get(b);
                        if last_class != Some(class) {
                            last_class = Some(class);
                            f(b, next);
                        }
                    })
                }
                Transitions::Dense(ref dense) => {
                    for b in classes.representatives() {
                        f(b, dense[b]);
                    }
                }
            }
        }
    }
}
490
/// Iterator over transitions in a state, skipping transitions to `fail_id()`.
///
/// This abstracts over the representation of NFA transitions, which may be
/// either in a sparse or dense representation.
///
/// This somewhat idiosyncratically borrows the NFA mutably, so that when one
/// is iterating over transitions, the caller can still mutate the NFA. This
/// is useful when creating failure transitions.
#[derive(Debug)]
struct IterTransitionsMut<'a, S: StateID> {
    /// The NFA being iterated over (and possibly mutated by the caller).
    nfa: &'a mut NFA<S>,
    /// The id of the state whose transitions are being iterated.
    state_id: S,
    /// The iteration position: an index into a sparse transition list, or
    /// the current byte value for a dense transition table.
    cur: usize,
}
505
impl<'a, S: StateID> IterTransitionsMut<'a, S> {
    /// Creates a new transition iterator over the state with the given id.
    fn new(nfa: &'a mut NFA<S>, state_id: S) -> IterTransitionsMut<'a, S> {
        IterTransitionsMut { nfa, state_id, cur: 0 }
    }

    /// Returns a mutable borrow of the underlying NFA, permitting mutation
    /// in between calls to `next`.
    fn nfa(&mut self) -> &mut NFA<S> {
        self.nfa
    }
}
515
impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> {
    type Item = (u8, S);

    fn next(&mut self) -> Option<(u8, S)> {
        match self.nfa.states[self.state_id.to_usize()].trans {
            Transitions::Sparse(ref sparse) => {
                // Sparse lists only store transitions that exist (absence
                // implies fail), so every entry can be yielded directly.
                if self.cur >= sparse.len() {
                    return None;
                }
                let i = self.cur;
                self.cur += 1;
                Some(sparse[i])
            }
            Transitions::Dense(ref dense) => {
                while self.cur < dense.len() {
                    // There are always exactly 256 transitions in the dense
                    // representation, so `self.cur` always fits in a u8 here.
                    debug_assert!(self.cur < 256);

                    let b = self.cur as u8;
                    let id = dense[b];
                    self.cur += 1;
                    if id != fail_id() {
                        return Some((b, id));
                    }
                }
                None
            }
        }
    }
}
546
/// A simple builder for configuring the NFA construction of Aho-Corasick.
#[derive(Clone, Debug)]
pub struct Builder {
    /// The depth up to which states use a dense transition representation.
    dense_depth: usize,
    /// The match semantics to build into the automaton.
    match_kind: MatchKind,
    /// Whether to attempt building a prefilter.
    prefilter: bool,
    /// Whether to anchor all matches to the start of input.
    anchored: bool,
    /// Whether to match patterns ASCII case insensitively.
    ascii_case_insensitive: bool,
}
556
557impl Default for Builder {
558 fn default() -> Builder {
559 Builder {
560 dense_depth: 2,
561 match_kind: MatchKind::default(),
562 prefilter: true,
563 anchored: false,
564 ascii_case_insensitive: false,
565 }
566 }
567}
568
impl Builder {
    /// Creates a new builder with a default configuration.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Builds an NFA from the given iterator of patterns using this
    /// builder's configuration.
    pub fn build<I, P, S: StateID>(&self, patterns: I) -> Result<NFA<S>>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        Compiler::new(self)?.compile(patterns)
    }

    /// Sets the match semantics to build into the automaton.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.match_kind = kind;
        self
    }

    /// Sets the depth up to which states use a dense transition
    /// representation.
    pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
        self.dense_depth = depth;
        self
    }

    /// Enables or disables the use of a prefilter.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.prefilter = yes;
        self
    }

    /// Enables or disables anchoring all matches to the start of input.
    pub fn anchored(&mut self, yes: bool) -> &mut Builder {
        self.anchored = yes;
        self
    }

    /// Enables or disables ASCII case insensitive matching.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.ascii_case_insensitive = yes;
        self
    }
}
607
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks.
#[derive(Debug)]
struct Compiler<'a, S: StateID> {
    /// The configuration to compile with.
    builder: &'a Builder,
    /// A builder for the prefilter, fed each pattern as it is added.
    prefilter: prefilter::Builder,
    /// The NFA under construction.
    nfa: NFA<S>,
    /// Accumulates byte equivalence classes while visiting the patterns.
    byte_classes: ByteClassBuilder,
}
618
619impl<'a, S: StateID> Compiler<'a, S> {
    /// Creates a new compiler for the given configuration, with an empty
    /// NFA ready to have states added to it.
    fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> {
        Ok(Compiler {
            builder,
            prefilter: prefilter::Builder::new(builder.match_kind)
                .ascii_case_insensitive(builder.ascii_case_insensitive),
            nfa: NFA {
                match_kind: builder.match_kind,
                // State id 2 is reserved for the start state; ids 0 and 1
                // are the fail and dead sentinel states added by `compile`.
                start_id: usize_to_state_id(2)?,
                max_pattern_len: 0,
                pattern_count: 0,
                heap_bytes: 0,
                prefilter: None,
                anchored: builder.anchored,
                byte_classes: ByteClasses::singletons(),
                states: vec![],
            },
            byte_classes: ByteClassBuilder::new(),
        })
    }
639
    /// Compiles the given patterns into an NFA.
    ///
    /// This drives the overall construction: build the trie, wire up the
    /// start and dead state loops, fill in failure transitions (skipped for
    /// anchored automatons, which have none), and finally compute the byte
    /// classes, prefilter and heap usage.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA<S>>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        self.add_state(0)?; // the fail state, which is never entered
        self.add_state(0)?; // the dead state, only used for leftmost
        self.add_state(0)?; // the start state
        self.build_trie(patterns)?;
        self.add_start_state_loop();
        self.add_dead_state_loop();
        if !self.builder.anchored {
            if self.match_kind().is_leftmost() {
                self.fill_failure_transitions_leftmost();
            } else {
                self.fill_failure_transitions_standard();
            }
        }
        self.close_start_state_loop();
        self.nfa.byte_classes = self.byte_classes.build();
        if !self.builder.anchored {
            self.nfa.prefilter = self.prefilter.build();
        }
        self.calculate_size();
        Ok(self.nfa)
    }
666
667 /// This sets up the initial prefix trie that makes up the Aho-Corasick
668 /// automaton. Effectively, it creates the basic structure of the
669 /// automaton, where every pattern given has a path from the start state to
670 /// the end of the pattern.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<()>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() {
            let pat = pat.as_ref();
            self.nfa.max_pattern_len =
                cmp::max(self.nfa.max_pattern_len, pat.len());
            self.nfa.pattern_count += 1;

            // Follow (or create) a path through the trie for this pattern,
            // one byte at a time, starting from the start state.
            let mut prev = self.nfa.start_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.state(prev).is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. We don't use these
                // for NFA construction. These are instead used only if we're
                // building a DFA. They would technically be useful for the
                // NFA, but it would require a second pass over the patterns.
                self.byte_classes.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byte_classes.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.state(prev).next_state(b);
                if next != fail_id() {
                    prev = next;
                } else {
                    let next = self.add_state(depth + 1)?;
                    self.nfa.state_mut(prev).set_next_state(b, next);
                    if self.builder.ascii_case_insensitive {
                        let b = opposite_ascii_case(b);
                        self.nfa.state_mut(prev).set_next_state(b, next);
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.state_mut(prev).add_match(pati, pat.len());
            // ... and hand it to the prefilter builder, if applicable.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }
        }
        Ok(())
    }
740
741 /// This routine creates failure transitions according to the standard
742 /// textbook formulation of the Aho-Corasick algorithm.
743 ///
744 /// Building failure transitions is the most interesting part of building
745 /// the Aho-Corasick automaton, because they are what allow searches to
746 /// be performed in linear time. Specifically, a failure transition is
747 /// a single transition associated with each state that points back to
748 /// the longest proper suffix of the pattern being searched. The failure
749 /// transition is followed whenever there exists no transition on the
750 /// current state for the current input byte. If there is no other proper
751 /// suffix, then the failure transition points back to the starting state.
752 ///
753 /// For example, let's say we built an Aho-Corasick automaton with the
754 /// following patterns: 'abcd' and 'cef'. The trie looks like this:
755 ///
756 /// ```ignore
757 /// a - S1 - b - S2 - c - S3 - d - S4*
758 /// /
759 /// S0 - c - S5 - e - S6 - f - S7*
760 /// ```
761 ///
762 /// At this point, it should be fairly straight-forward to see how this
763 /// trie can be used in a simplistic way. At any given position in the
764 /// text we're searching (called the "subject" string), all we need to do
765 /// is follow the transitions in the trie by consuming one transition for
766 /// each byte in the subject string. If we reach a match state, then we can
767 /// report that location as a match.
768 ///
769 /// The trick comes when searching a subject string like 'abcef'. We'll
770 /// initially follow the transition from S0 to S1 and wind up in S3 after
    /// observing the 'c' byte. At this point, the next byte is 'e' but state
772 /// S3 has no transition for 'e', so the search fails. We then would need
773 /// to restart the search at the next position in 'abcef', which
774 /// corresponds to 'b'. The match would fail, but the next search starting
775 /// at 'c' would finally succeed. The problem with this approach is that
776 /// we wind up searching the subject string potentially many times. In
777 /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
778 /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
779 /// like to achieve a `O(n + m)` worst case complexity.
780 ///
781 /// This is where failure transitions come in. Instead of dying at S3 in
782 /// the first search, the automaton can instruct the search to move to
783 /// another part of the automaton that corresponds to a suffix of what
784 /// we've seen so far. Recall that we've seen 'abc' in the subject string,
785 /// and the automaton does indeed have a non-empty suffix, 'c', that could
786 /// potentially lead to another match. Thus, the actual Aho-Corasick
787 /// automaton for our patterns in this case looks like this:
788 ///
789 /// ```ignore
790 /// a - S1 - b - S2 - c - S3 - d - S4*
791 /// / /
792 /// / ----------------
793 /// / /
794 /// S0 - c - S5 - e - S6 - f - S7*
795 /// ```
796 ///
797 /// That is, we have a failure transition from S3 to S5, which is followed
798 /// exactly in cases when we are in state S3 but see any byte other than
799 /// 'd' (that is, we've "failed" to find a match in this portion of our
800 /// trie). We know we can transition back to S5 because we've already seen
801 /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
802 /// with the search starting at S5 and complete our match.
803 ///
804 /// Adding failure transitions to a trie is fairly simple, but subtle. The
805 /// key issue is that you might have multiple failure transition that you
806 /// need to follow. For example, look at the trie for the patterns
807 /// 'abcd', 'b', 'bcd' and 'cd':
808 ///
809 /// ```ignore
810 /// - a - S1 - b - S2 - c - S3 - d - S4*
811 /// /
812 /// S0 - b - S5* - c - S6 - d - S7*
813 /// \
814 /// - c - S8 - d - S9*
815 /// ```
816 ///
817 /// The failure transitions for this trie are defined from S2 to S5,
818 /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
819 /// corresponds to a match, since its failure transition to S5 is itself
820 /// a match state.
821 ///
    /// Perhaps the simplest way to think about adding these failure transitions
823 /// is recursively. That is, if you know the failure transitions for every
824 /// possible previous state that could be visited (e.g., when computing the
825 /// failure transition for S3, you already know the failure transitions
826 /// for S0, S1 and S2), then you can simply follow the failure transition
827 /// of the previous state and check whether the incoming transition is
828 /// defined after following the failure transition.
829 ///
830 /// For example, when determining the failure state for S3, by our
831 /// assumptions, we already know that there is a failure transition from
832 /// S2 (the previous state) to S5. So we follow that transition and check
833 /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
834 /// as there is a transition from S5 to S6 for the byte 'c'. If no such
835 /// transition existed, we could keep following the failure transitions
836 /// until we reach the start state, which is the failure transition for
837 /// every state that has no corresponding proper suffix.
838 ///
839 /// We don't actually use recursion to implement this, but instead, use a
840 /// breadth first search of the automaton. Our base case is the start
841 /// state, whose failure transition is just a transition to itself.
    fn fill_failure_transitions_standard(&mut self) {
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        for b in AllBytesIter::new() {
            let next = self.nfa.start().next_state(b);
            if next != self.nfa.start_id {
                if !seen.contains(next) {
                    queue.push_back(next);
                    seen.insert(next);
                }
            }
        }
        while let Some(id) = queue.pop_front() {
            let mut it = self.nfa.iter_transitions_mut(id);
            while let Some((b, next)) = it.next() {
                if seen.contains(next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(next);
                seen.insert(next);

                // Find the failure state for `next`: walk the failure chain
                // of the current state until a state with a transition on
                // `b` is found. This walk terminates because the start state
                // has no transitions to fail_id() (see the termination notes
                // on `next_state`).
                let mut fail = it.nfa().state(id).fail;
                while it.nfa().state(fail).next_state(b) == fail_id() {
                    fail = it.nfa().state(fail).fail;
                }
                fail = it.nfa().state(fail).next_state(b);
                it.nfa().state_mut(next).fail = fail;
                it.nfa().copy_matches(fail, next);
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            it.nfa().copy_empty_matches(id);
        }
    }
893
    /// This routine is just like fill_failure_transitions_standard, except
    /// it adds failure transitions in a way that preserves leftmost match
    /// semantics (for both leftmost-first and leftmost-longest).
    ///
    /// The algorithms are so similar that it would be possible to write it
    /// generically. But doing so without overhead would require a bit of
    /// ceremony, so we just copy it and add in the extra leftmost logic.
    /// Moreover, the standard algorithm above is so simple that it feels like
    /// a crime to disturb it.
    ///
    /// In effect, this proceeds just like the standard approach, but we
    /// specifically add only a subset of all failure transitions. Namely, we
    /// only add failure transitions that either do not occur after a match
    /// or failure transitions that do occur after a match but preserve the
    /// match. The comments in the implementation below should help.
    ///
    /// N.B. The only differences in the automaton between leftmost-first and
    /// leftmost-longest are in trie construction. Otherwise, both have exactly
    /// the same set of failure transitions. leftmost-longest adds everything
    /// to the trie, where as leftmost-first skips any patterns for which there
    /// exists a prefix of it that was added earlier.
    ///
    /// N.B. I came up with this algorithm on my own, and after scouring all of
    /// the other AC implementations I know of (Perl, Snort, many on GitHub).
    /// I couldn't find any that implement leftmost semantics like this.
    /// Perl of course needs leftmost-first semantics, but they implement it
    /// with a seeming hack at *search* time instead of encoding it into the
    /// automaton. There are also a couple Java libraries that support leftmost
    /// longest semantics, but they do it by building a queue of matches at
    /// search time, which is even worse than what Perl is doing. ---AG
    fn fill_failure_transitions_leftmost(&mut self) {
        /// Represents an item in our queue of states to process.
        ///
        /// Fundamentally, this queue serves the same purpose as the queue
        /// for filling failure transitions using the standard formulation.
        /// In the leftmost case, though, we need to track a bit more
        /// information. See comments below.
        #[derive(Clone, Copy, Debug)]
        struct QueuedState<S> {
            /// The id of the state to visit.
            id: S,
            /// The depth at which the first match was observed in the path
            /// to this state. Note that this corresponds to the depth at
            /// which the beginning of the match was detected. If no match
            /// has been seen, then this is None.
            match_at_depth: Option<usize>,
        }

        impl<S: StateID> QueuedState<S> {
            /// Create a queued state corresponding to the given NFA's start
            /// state.
            fn start(nfa: &NFA<S>) -> QueuedState<S> {
                let match_at_depth =
                    if nfa.start().is_match() { Some(0) } else { None };
                QueuedState { id: nfa.start_id, match_at_depth }
            }

            /// Return the next state to queue up. The given id must be a state
            /// corresponding to a single transition from this queued state.
            fn next_queued_state(
                &self,
                nfa: &NFA<S>,
                id: S,
            ) -> QueuedState<S> {
                let match_at_depth = self.next_match_at_depth(nfa, id);
                QueuedState { id, match_at_depth }
            }

            /// Return the earliest depth at which a match has occurred for
            /// the given state. The given state must correspond to a single
            /// transition from this queued state.
            fn next_match_at_depth(
                &self,
                nfa: &NFA<S>,
                next: S,
            ) -> Option<usize> {
                // This is a little tricky. If the previous state has already
                // seen a match or if `next` isn't a match state, then nothing
                // needs to change since a later state cannot find an earlier
                // match.
                match self.match_at_depth {
                    Some(x) => return Some(x),
                    None if nfa.state(next).is_match() => {}
                    None => return None,
                }
                // `next` is the first match state on this path. The match
                // itself began `len - 1` positions before `next`, where `len`
                // is the longest pattern ending at `next`.
                let depth = nfa.state(next).depth
                    - nfa.state(next).get_longest_match_len().unwrap()
                    + 1;
                Some(depth)
            }
        }

        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue: VecDeque<QueuedState<S>> = VecDeque::new();
        let mut seen = self.queued_set();
        let start = QueuedState::start(&self.nfa);
        for b in AllBytesIter::new() {
            let next_id = self.nfa.start().next_state(b);
            if next_id != start.id {
                let next = start.next_queued_state(&self.nfa, next_id);
                if !seen.contains(next.id) {
                    queue.push_back(next);
                    seen.insert(next.id);
                }
                // If a state immediately following the start state is a match
                // state, then we never want to follow its failure transition
                // since the failure transition necessarily leads back to the
                // start state, which we never want to do for leftmost matching
                // after a match has been found.
                //
                // N.B. This is a special case of the more general handling
                // found below.
                if self.nfa.state(next_id).is_match() {
                    self.nfa.state_mut(next_id).fail = dead_id();
                }
            }
        }
        // Process states in breadth first order, just like in the standard
        // formulation above.
        while let Some(item) = queue.pop_front() {
            // Tracks whether this state has any outgoing transitions at all.
            // Match states with none are handled specially after the loop.
            let mut any_trans = false;
            let mut it = self.nfa.iter_transitions_mut(item.id);
            while let Some((b, next_id)) = it.next() {
                any_trans = true;

                // Queue up the next state.
                let next = item.next_queued_state(it.nfa(), next_id);
                if seen.contains(next.id) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(next);
                seen.insert(next.id);

                // Find the failure state for next. Same as standard.
                let mut fail = it.nfa().state(item.id).fail;
                while it.nfa().state(fail).next_state(b) == fail_id() {
                    fail = it.nfa().state(fail).fail;
                }
                fail = it.nfa().state(fail).next_state(b);

                // This is the key difference from the standard formulation.
                // Namely, if we've seen a match, then we only want a failure
                // transition if the failure transition preserves the match
                // we've seen. In general, this is not true of all failure
                // transitions since they can point back to any suffix of what
                // we've seen so far. Instead, we only want to point back to
                // suffixes that contain any match we've seen.
                //
                // We achieve this by comparing the depth of the failure
                // transition with the number of states between this state
                // and the beginning of the earliest match detected. If the
                // depth of the failure state is smaller than this difference,
                // then it cannot contain the match. If it's bigger or equal
                // to the difference, then it necessarily includes the match
                // we've seen since all failure transitions correspond to a
                // suffix.
                //
                // If we've determined that we don't want the failure
                // transition, then we set this state's failure transition to
                // the dead state. In other words, when a search hits this
                // state, it will not continue and correctly stop. (N.B. A
                // dead state is different than a fail state. A dead state
                // MUST be preceded by a match and acts as a sentinel to search
                // routines to terminate.)
                //
                // Understanding this is tricky, and it took me several days
                // to think through this and get it right. If you want to grok
                // it, then I'd recommend: 1) switch the implementation to
                // always use the standard algorithm for filling in failure
                // transitions, 2) run the test suite and 3) examine the test
                // failures. Write out the automatons for them and try to work
                // backwards by figuring out which failure transitions should
                // be removed. You should arrive at the same rule used below.
                if let Some(match_depth) = next.match_at_depth {
                    let fail_depth = it.nfa().state(fail).depth;
                    let next_depth = it.nfa().state(next.id).depth;
                    if next_depth - match_depth + 1 > fail_depth {
                        it.nfa().state_mut(next.id).fail = dead_id();
                        continue;
                    }
                    assert_ne!(
                        start.id,
                        it.nfa().state(next.id).fail,
                        "states that are match states or follow match \
                         states should never have a failure transition \
                         back to the start state in leftmost searching",
                    );
                }
                it.nfa().state_mut(next.id).fail = fail;
                it.nfa().copy_matches(fail, next.id);
            }
            // If there are no transitions for this state and if it's a match
            // state, then we must set its failure transition to the dead
            // state since we never want it to restart the search.
            if !any_trans && it.nfa().state(item.id).is_match() {
                it.nfa().state_mut(item.id).fail = dead_id();
            }
            // We don't need to copy empty matches from the start state here
            // because that's only necessary for overlapping matches and
            // leftmost match kinds don't support overlapping matches.
        }
    }
1103
1104 /// Returns a set that tracked queued states.
1105 ///
1106 /// This is only necessary when ASCII case insensitivity is enabled, since
1107 /// it is the only way to visit the same state twice. Otherwise, this
1108 /// returns an inert set that nevers adds anything and always reports
1109 /// `false` for every member test.
1110 fn queued_set(&self) -> QueuedSet<S> {
1111 if self.builder.ascii_case_insensitive {
1112 QueuedSet::active()
1113 } else {
1114 QueuedSet::inert()
1115 }
1116 }
1117
1118 /// Set the failure transitions on the start state to loop back to the
1119 /// start state. This effectively permits the Aho-Corasick automaton to
1120 /// match at any position. This is also required for finding the next
1121 /// state to terminate, namely, finding the next state should never return
1122 /// a fail_id.
1123 ///
1124 /// This must be done after building the initial trie, since trie
1125 /// construction depends on transitions to `fail_id` to determine whether a
1126 /// state already exists or not.
1127 fn add_start_state_loop(&mut self) {
1128 let start_id = self.nfa.start_id;
1129 let start = self.nfa.start_mut();
1130 for b in AllBytesIter::new() {
1131 if start.next_state(b) == fail_id() {
1132 start.set_next_state(b, start_id);
1133 }
1134 }
1135 }
1136
1137 /// Remove the start state loop by rewriting any transitions on the start
1138 /// state back to the start state with transitions to the dead state.
1139 ///
1140 /// The loop is only closed when two conditions are met: the start state
1141 /// is a match state and the match kind is leftmost-first or
1142 /// leftmost-longest. (Alternatively, if this is an anchored automaton,
1143 /// then the start state is always closed, regardless of aforementioned
1144 /// conditions.)
1145 ///
1146 /// The reason for this is that under leftmost semantics, a start state
1147 /// that is also a match implies that we should never restart the search
1148 /// process. We allow normal transitions out of the start state, but if
1149 /// none exist, we transition to the dead state, which signals that
1150 /// searching should stop.
1151 fn close_start_state_loop(&mut self) {
1152 if self.builder.anchored
1153 || (self.match_kind().is_leftmost() && self.nfa.start().is_match())
1154 {
1155 let start_id = self.nfa.start_id;
1156 let start = self.nfa.start_mut();
1157 for b in AllBytesIter::new() {
1158 if start.next_state(b) == start_id {
1159 start.set_next_state(b, dead_id());
1160 }
1161 }
1162 }
1163 }
1164
1165 /// Sets all transitions on the dead state to point back to the dead state.
1166 /// Normally, missing transitions map back to the failure state, but the
1167 /// point of the dead state is to act as a sink that can never be escaped.
1168 fn add_dead_state_loop(&mut self) {
1169 let dead = self.nfa.state_mut(dead_id());
1170 for b in AllBytesIter::new() {
1171 dead.set_next_state(b, dead_id());
1172 }
1173 }
1174
1175 /// Computes the total amount of heap used by this NFA in bytes.
1176 fn calculate_size(&mut self) {
1177 let mut size = 0;
1178 for state in &self.nfa.states {
1179 size += state.heap_bytes();
1180 }
1181 self.nfa.heap_bytes = size;
1182 }
1183
1184 /// Add a new state to the underlying NFA with the given depth. The depth
1185 /// is used to determine how to represent the transitions.
1186 ///
1187 /// If adding the new state would overflow the chosen state ID
1188 /// representation, then this returns an error.
1189 fn add_state(&mut self, depth: usize) -> Result<S> {
1190 if depth < self.builder.dense_depth {
1191 self.nfa.add_dense_state(depth)
1192 } else {
1193 self.nfa.add_sparse_state(depth)
1194 }
1195 }
1196
    /// Returns the match kind configured on the underlying builder.
    ///
    /// Returned by value; `MatchKind` must be `Copy` for this to compile
    /// from a shared borrow of `self`.
    fn match_kind(&self) -> MatchKind {
        self.builder.match_kind
    }
1201}
1202
/// A set of state identifiers used to avoid revisiting the same state multiple
/// times when filling in failure transitions.
///
/// This set has an "inert" and an "active" mode. When inert, the set never
/// stores anything and always returns `false` for every member test. This is
/// useful to avoid the performance and memory overhead of maintaining this
/// set when it is not needed.
#[derive(Debug)]
struct QueuedSet<S> {
    /// `None` means the set is inert: inserts are dropped and membership
    /// tests always report `false`. `Some` means membership is tracked in
    /// the contained `BTreeSet`.
    set: Option<BTreeSet<S>>,
}
1214
1215impl<S: StateID> QueuedSet<S> {
1216 /// Return an inert set that returns `false` for every state ID membership
1217 /// test.
1218 fn inert() -> QueuedSet<S> {
1219 QueuedSet { set: None }
1220 }
1221
1222 /// Return an active set that tracks state ID membership.
1223 fn active() -> QueuedSet<S> {
1224 QueuedSet { set: Some(BTreeSet::new()) }
1225 }
1226
1227 /// Inserts the given state ID into this set. (If the set is inert, then
1228 /// this is a no-op.)
1229 fn insert(&mut self, state_id: S) {
1230 if let Some(ref mut set) = self.set {
1231 set.insert(state_id);
1232 }
1233 }
1234
1235 /// Returns true if and only if the given state ID is in this set. If the
1236 /// set is inert, this always returns false.
1237 fn contains(&self, state_id: S) -> bool {
1238 match self.set {
1239 None => false,
1240 Some(ref set) => set.contains(&state_id),
1241 }
1242 }
1243}
1244
/// An iterator over every byte value.
///
/// We use this instead of (0..256).map(|b| b as u8) because this optimizes
/// better in debug builds.
///
/// We also use this instead of 0..=255 because we're targeting Rust 1.24 and
/// inclusive range syntax was stabilized in Rust 1.26. We can get rid of this
/// once our MSRV is Rust 1.26 or newer.
#[derive(Debug)]
struct AllBytesIter(u16);

impl AllBytesIter {
    /// Create an iterator that yields every byte value, `0` through `255`.
    fn new() -> AllBytesIter {
        AllBytesIter(0)
    }
}

impl Iterator for AllBytesIter {
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        if self.0 >= 256 {
            None
        } else {
            let b = self.0 as u8;
            self.0 += 1;
            Some(b)
        }
    }

    // Report the exact number of items remaining. The default hint is
    // (0, None), which forces adaptors like `collect` to grow repeatedly;
    // since the count is always known exactly, make it available so
    // consumers can preallocate and `count` is cheap.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // `self.0` never exceeds 256, but saturate for safety.
        let remaining = 256usize.saturating_sub(self.0 as usize);
        (remaining, Some(remaining))
    }
}
1275
impl<S: StateID> fmt::Debug for NFA<S> {
    // Renders the whole automaton: header (match kind, prefilter), then one
    // entry per state showing its transitions, matched pattern IDs, failure
    // transition and depth.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "NFA(")?;
        writeln!(f, "match_kind: {:?}", self.match_kind)?;
        writeln!(f, "prefilter: {:?}", self.prefilter)?;
        writeln!(f, "{}", "-".repeat(79))?;
        for (id, s) in self.states.iter().enumerate() {
            let mut trans = vec![];
            s.trans.iter(|byte, next| {
                // The start state has a bunch of uninteresting transitions
                // back into itself. It's questionable to hide them since they
                // are critical to understanding the automaton, but they are
                // very noisy without better formatting for contiguous ranges
                // to the same state.
                if id == self.start_id.to_usize() && next == self.start_id {
                    return;
                }
                // Similarly, the dead state has a bunch of uninteresting
                // transitions too.
                if id == dead_id() {
                    return;
                }
                trans.push(format!("{} => {}", escape(byte), next.to_usize()));
            });
            writeln!(f, "{:04}: {}", id, trans.join(", "))?;

            // Only the pattern IDs are shown; the second tuple element
            // (the pattern length) is deliberately dropped here.
            let matches: Vec<String> = s
                .matches
                .iter()
                .map(|&(pattern_id, _)| pattern_id.to_string())
                .collect();
            writeln!(f, " matches: {}", matches.join(", "))?;
            writeln!(f, " fail: {}", s.fail.to_usize())?;
            writeln!(f, " depth: {}", s.depth)?;
        }
        writeln!(f, "{}", "-".repeat(79))?;
        writeln!(f, ")")?;
        Ok(())
    }
}
1316
1317/// Iterate over all possible byte transitions given a sparse set.
1318fn sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F) {
1319 let mut byte = 0u16;
1320 for &(b, id) in trans {
1321 while byte < (b as u16) {
1322 f(byte as u8, fail_id());
1323 byte += 1;
1324 }
1325 f(b, id);
1326 byte += 1;
1327 }
1328 for b in byte..256 {
1329 f(b as u8, fail_id());
1330 }
1331}
1332
/// Safely return two mutable borrows to two different locations in the given
/// slice, in the order the indices were given.
///
/// This panics if i == j.
fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
    assert!(i != j, "{} must not be equal to {}", i, j);
    // Split at the larger index so each borrow lives in a disjoint half.
    let (lo, hi) = if i < j { (i, j) } else { (j, i) };
    let (head, tail) = xs.split_at_mut(hi);
    let (lo_ref, hi_ref) = (&mut head[lo], &mut tail[0]);
    // Restore the caller's ordering: first element corresponds to `i`.
    if i < j {
        (lo_ref, hi_ref)
    } else {
        (hi_ref, lo_ref)
    }
}
1347
/// Return the given byte as its escaped string form.
fn escape(b: u8) -> String {
    // `escape_default` only ever yields printable ASCII bytes, so casting
    // each one straight to `char` is lossless.
    std::ascii::escape_default(b).map(|b| b as char).collect()
}
1354
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): this is a developer scratch test. It asserts nothing
    // beyond successful construction; the `println!` output is meant for
    // manual inspection of the automaton under `cargo test -- --nocapture`.
    // The commented-out configurations are alternate pattern sets used while
    // developing the leftmost failure-transition algorithm above.
    #[test]
    fn scratch() {
        let nfa: NFA<usize> = Builder::new()
            .dense_depth(0)
            // .match_kind(MatchKind::LeftmostShortest)
            // .match_kind(MatchKind::LeftmostLongest)
            .match_kind(MatchKind::LeftmostFirst)
            // .build(&["abcd", "ce", "b"])
            // .build(&["ab", "bc"])
            // .build(&["b", "bcd", "ce"])
            // .build(&["abc", "bx"])
            // .build(&["abc", "bd", "ab"])
            // .build(&["abcdefghi", "hz", "abcdefgh"])
            // .build(&["abcd", "bce", "b"])
            .build(&["abcdefg", "bcde", "bcdef"])
            .unwrap();
        println!("{:?}", nfa);
    }
}