aho_corasick/
ahocorasick.rs

1use std::io;
2
3use crate::automaton::Automaton;
4use crate::buffer::Buffer;
5use crate::dfa::{self, DFA};
6use crate::error::Result;
7use crate::nfa::{self, NFA};
8use crate::packed;
9use crate::prefilter::{Prefilter, PrefilterState};
10use crate::state_id::StateID;
11use crate::Match;
12
13/// An automaton for searching multiple strings in linear time.
14///
15/// The `AhoCorasick` type supports a few basic ways of constructing an
16/// automaton, including
17/// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new)
18/// and
19/// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
20/// However, there are a fair number of configurable options that can be set
21/// by using
22/// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
23/// instead. Such options include, but are not limited to, how matches are
24/// determined, simple case insensitivity, whether to use a DFA or not and
25/// various knobs for controlling the space-vs-time trade offs taken when
26/// building the automaton.
27///
28/// If you aren't sure where to start, try beginning with
29/// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
30///
31/// # Resource usage
32///
33/// Aho-Corasick automatons are always constructed in `O(p)` time, where `p`
34/// is the combined length of all patterns being searched. With that said,
35/// building an automaton can be fairly costly because of high constant
36/// factors, particularly when enabling the
37/// [DFA](struct.AhoCorasickBuilder.html#method.dfa)
38/// option (which is disabled by default). For this reason, it's generally a
39/// good idea to build an automaton once and reuse it as much as possible.
40///
41/// Aho-Corasick automatons can also use a fair bit of memory. To get a
42/// concrete idea of how much memory is being used, try using the
43/// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes)
44/// method.
45///
46/// # Examples
47///
48/// This example shows how to search for occurrences of multiple patterns
49/// simultaneously in a case insensitive fashion. Each match includes the
50/// pattern that matched along with the byte offsets of the match.
51///
52/// ```
53/// use aho_corasick::AhoCorasickBuilder;
54///
55/// let patterns = &["apple", "maple", "snapple"];
56/// let haystack = "Nobody likes maple in their apple flavored Snapple.";
57///
58/// let ac = AhoCorasickBuilder::new()
59///     .ascii_case_insensitive(true)
60///     .build(patterns);
61/// let mut matches = vec![];
62/// for mat in ac.find_iter(haystack) {
63///     matches.push((mat.pattern(), mat.start(), mat.end()));
64/// }
65/// assert_eq!(matches, vec![
66///     (1, 13, 18),
67///     (0, 28, 33),
68///     (2, 43, 50),
69/// ]);
70/// ```
71///
72/// This example shows how to replace matches with some other string:
73///
74/// ```
75/// use aho_corasick::AhoCorasick;
76///
77/// let patterns = &["fox", "brown", "quick"];
78/// let haystack = "The quick brown fox.";
79/// let replace_with = &["sloth", "grey", "slow"];
80///
81/// let ac = AhoCorasick::new(patterns);
82/// let result = ac.replace_all(haystack, replace_with);
83/// assert_eq!(result, "The slow grey sloth.");
84/// ```
85#[derive(Clone, Debug)]
86pub struct AhoCorasick<S: StateID = usize> {
87    imp: Imp<S>,
88    match_kind: MatchKind,
89}
90
91impl AhoCorasick {
92    /// Create a new Aho-Corasick automaton using the default configuration.
93    ///
94    /// The default configuration optimizes for less space usage, but at the
95    /// expense of longer search times. To change the configuration, use
96    /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
97    /// for fine-grained control, or
98    /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
99    /// for automatic configuration if you aren't sure which settings to pick.
100    ///
101    /// This uses the default
102    /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
103    /// match semantics, which reports a match as soon as it is found. This
104    /// corresponds to the standard match semantics supported by textbook
105    /// descriptions of the Aho-Corasick algorithm.
106    ///
107    /// # Examples
108    ///
109    /// Basic usage:
110    ///
111    /// ```
112    /// use aho_corasick::AhoCorasick;
113    ///
114    /// let ac = AhoCorasick::new(&[
115    ///     "foo", "bar", "baz",
116    /// ]);
117    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
118    /// ```
119    pub fn new<I, P>(patterns: I) -> AhoCorasick
120    where
121        I: IntoIterator<Item = P>,
122        P: AsRef<[u8]>,
123    {
124        AhoCorasickBuilder::new().build(patterns)
125    }
126
127    /// Build an Aho-Corasick automaton with an automatically determined
128    /// configuration.
129    ///
130    /// Specifically, this requires a slice of patterns instead of an iterator
131    /// since the configuration is determined by looking at the patterns before
132    /// constructing the automaton. The idea here is to balance space and time
133    /// automatically. That is, when searching a small number of patterns, this
134    /// will attempt to use the fastest possible configuration since the total
135    /// space required will be small anyway. As the number of patterns grows,
136    /// this will fall back to slower configurations that use less space.
137    ///
138    /// If you want auto configuration but with match semantics different from
139    /// the default `MatchKind::Standard`, then use
140    /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure).
141    ///
142    /// # Examples
143    ///
144    /// Basic usage is just like `new`, except you must provide a slice:
145    ///
146    /// ```
147    /// use aho_corasick::AhoCorasick;
148    ///
149    /// let ac = AhoCorasick::new_auto_configured(&[
150    ///     "foo", "bar", "baz",
151    /// ]);
152    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
153    /// ```
154    pub fn new_auto_configured<B>(patterns: &[B]) -> AhoCorasick
155    where
156        B: AsRef<[u8]>,
157    {
158        AhoCorasickBuilder::new().auto_configure(patterns).build(patterns)
159    }
160}
161
162impl<S: StateID> AhoCorasick<S> {
163    /// Returns true if and only if this automaton matches the haystack at any
164    /// position.
165    ///
166    /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
167    /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
168    /// `&[u8]` itself.
169    ///
170    /// # Examples
171    ///
172    /// Basic usage:
173    ///
174    /// ```
175    /// use aho_corasick::AhoCorasick;
176    ///
177    /// let ac = AhoCorasick::new(&[
178    ///     "foo", "bar", "quux", "baz",
179    /// ]);
180    /// assert!(ac.is_match("xxx bar xxx"));
181    /// assert!(!ac.is_match("xxx qux xxx"));
182    /// ```
183    pub fn is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool {
184        self.earliest_find(haystack).is_some()
185    }
186
187    /// Returns the location of the first detected match in `haystack`.
188    ///
189    /// This method has the same behavior regardless of the
190    /// [`MatchKind`](enum.MatchKind.html)
191    /// of this automaton.
192    ///
193    /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
194    /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
195    /// `&[u8]` itself.
196    ///
197    /// # Examples
198    ///
199    /// Basic usage:
200    ///
201    /// ```
202    /// use aho_corasick::AhoCorasick;
203    ///
204    /// let ac = AhoCorasick::new(&[
205    ///     "abc", "b",
206    /// ]);
207    /// let mat = ac.earliest_find("abcd").expect("should have match");
208    /// assert_eq!(1, mat.pattern());
209    /// assert_eq!((1, 2), (mat.start(), mat.end()));
210    /// ```
211    pub fn earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
212        let mut prestate = PrefilterState::new(self.max_pattern_len());
213        let mut start = self.imp.start_state();
214        self.imp.earliest_find_at(
215            &mut prestate,
216            haystack.as_ref(),
217            0,
218            &mut start,
219        )
220    }
221
222    /// Returns the location of the first match according to the match
223    /// semantics that this automaton was constructed with.
224    ///
225    /// When using `MatchKind::Standard`, this corresponds precisely to the
226    /// same behavior as
227    /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find).
228    /// Otherwise, match semantics correspond to either
229    /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst)
230    /// or
231    /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest).
232    ///
233    /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
234    /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
235    /// `&[u8]` itself.
236    ///
237    /// # Examples
238    ///
239    /// Basic usage, with standard semantics:
240    ///
241    /// ```
242    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
243    ///
244    /// let patterns = &["b", "abc", "abcd"];
245    /// let haystack = "abcd";
246    ///
247    /// let ac = AhoCorasickBuilder::new()
248    ///     .match_kind(MatchKind::Standard) // default, not necessary
249    ///     .build(patterns);
250    /// let mat = ac.find(haystack).expect("should have a match");
251    /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
252    /// ```
253    ///
254    /// Now with leftmost-first semantics:
255    ///
256    /// ```
257    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
258    ///
259    /// let patterns = &["b", "abc", "abcd"];
260    /// let haystack = "abcd";
261    ///
262    /// let ac = AhoCorasickBuilder::new()
263    ///     .match_kind(MatchKind::LeftmostFirst)
264    ///     .build(patterns);
265    /// let mat = ac.find(haystack).expect("should have a match");
266    /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
267    /// ```
268    ///
269    /// And finally, leftmost-longest semantics:
270    ///
271    /// ```
272    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
273    ///
274    /// let patterns = &["b", "abc", "abcd"];
275    /// let haystack = "abcd";
276    ///
277    /// let ac = AhoCorasickBuilder::new()
278    ///     .match_kind(MatchKind::LeftmostLongest)
279    ///     .build(patterns);
280    /// let mat = ac.find(haystack).expect("should have a match");
281    /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
282    /// ```
283    pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
284        let mut prestate = PrefilterState::new(self.max_pattern_len());
285        self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0)
286    }
287
288    /// Returns an iterator of non-overlapping matches, using the match
289    /// semantics that this automaton was constructed with.
290    ///
291    /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
292    /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
293    /// `&[u8]` itself.
294    ///
295    /// # Examples
296    ///
297    /// Basic usage, with standard semantics:
298    ///
299    /// ```
300    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
301    ///
302    /// let patterns = &["append", "appendage", "app"];
303    /// let haystack = "append the app to the appendage";
304    ///
305    /// let ac = AhoCorasickBuilder::new()
306    ///     .match_kind(MatchKind::Standard) // default, not necessary
307    ///     .build(patterns);
308    /// let matches: Vec<usize> = ac
309    ///     .find_iter(haystack)
310    ///     .map(|mat| mat.pattern())
311    ///     .collect();
312    /// assert_eq!(vec![2, 2, 2], matches);
313    /// ```
314    ///
315    /// Now with leftmost-first semantics:
316    ///
317    /// ```
318    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
319    ///
320    /// let patterns = &["append", "appendage", "app"];
321    /// let haystack = "append the app to the appendage";
322    ///
323    /// let ac = AhoCorasickBuilder::new()
324    ///     .match_kind(MatchKind::LeftmostFirst)
325    ///     .build(patterns);
326    /// let matches: Vec<usize> = ac
327    ///     .find_iter(haystack)
328    ///     .map(|mat| mat.pattern())
329    ///     .collect();
330    /// assert_eq!(vec![0, 2, 0], matches);
331    /// ```
332    ///
333    /// And finally, leftmost-longest semantics:
334    ///
335    /// ```
336    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
337    ///
338    /// let patterns = &["append", "appendage", "app"];
339    /// let haystack = "append the app to the appendage";
340    ///
341    /// let ac = AhoCorasickBuilder::new()
342    ///     .match_kind(MatchKind::LeftmostLongest)
343    ///     .build(patterns);
344    /// let matches: Vec<usize> = ac
345    ///     .find_iter(haystack)
346    ///     .map(|mat| mat.pattern())
347    ///     .collect();
348    /// assert_eq!(vec![0, 2, 1], matches);
349    /// ```
350    pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
351        &'a self,
352        haystack: &'b B,
353    ) -> FindIter<'a, 'b, S> {
354        FindIter::new(self, haystack.as_ref())
355    }
356
357    /// Returns an iterator of overlapping matches in the given `haystack`.
358    ///
359    /// Overlapping matches can _only_ be detected using
360    /// `MatchKind::Standard` semantics. If this automaton was constructed with
361    /// leftmost semantics, then this method will panic. To determine whether
362    /// this will panic at runtime, use the
363    /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping)
364    /// method.
365    ///
366    /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
367    /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
368    /// `&[u8]` itself.
369    ///
370    /// # Panics
371    ///
372    /// This panics when `AhoCorasick::supports_overlapping` returns `false`.
373    /// That is, this panics when this automaton's match semantics are not
374    /// `MatchKind::Standard`.
375    ///
376    /// # Examples
377    ///
378    /// Basic usage, with standard semantics:
379    ///
380    /// ```
381    /// use aho_corasick::AhoCorasick;
382    ///
383    /// let patterns = &["append", "appendage", "app"];
384    /// let haystack = "append the app to the appendage";
385    ///
386    /// let ac = AhoCorasick::new(patterns);
387    /// let matches: Vec<usize> = ac
388    ///     .find_overlapping_iter(haystack)
389    ///     .map(|mat| mat.pattern())
390    ///     .collect();
391    /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches);
392    /// ```
393    pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
394        &'a self,
395        haystack: &'b B,
396    ) -> FindOverlappingIter<'a, 'b, S> {
397        FindOverlappingIter::new(self, haystack.as_ref())
398    }
399
400    /// Replace all matches with a corresponding value in the `replace_with`
401    /// slice given. Matches correspond to the same matches as reported by
402    /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
403    ///
404    /// Replacements are determined by the index of the matching pattern.
405    /// For example, if the pattern with index `2` is found, then it is
406    /// replaced by `replace_with[2]`.
407    ///
408    /// # Panics
409    ///
410    /// This panics when `replace_with.len()` does not equal the total number
411    /// of patterns that are matched by this automaton.
412    ///
413    /// # Examples
414    ///
415    /// Basic usage:
416    ///
417    /// ```
418    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
419    ///
420    /// let patterns = &["append", "appendage", "app"];
421    /// let haystack = "append the app to the appendage";
422    ///
423    /// let ac = AhoCorasickBuilder::new()
424    ///     .match_kind(MatchKind::LeftmostFirst)
425    ///     .build(patterns);
426    /// let result = ac.replace_all(haystack, &["x", "y", "z"]);
427    /// assert_eq!("x the z to the xage", result);
428    /// ```
429    pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String
430    where
431        B: AsRef<str>,
432    {
433        assert_eq!(
434            replace_with.len(),
435            self.pattern_count(),
436            "replace_all requires a replacement for every pattern \
437             in the automaton"
438        );
439        let mut dst = String::with_capacity(haystack.len());
440        self.replace_all_with(haystack, &mut dst, |mat, _, dst| {
441            dst.push_str(replace_with[mat.pattern()].as_ref());
442            true
443        });
444        dst
445    }
446
447    /// Replace all matches using raw bytes with a corresponding value in the
448    /// `replace_with` slice given. Matches correspond to the same matches as
449    /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter).
450    ///
451    /// Replacements are determined by the index of the matching pattern.
452    /// For example, if the pattern with index `2` is found, then it is
453    /// replaced by `replace_with[2]`.
454    ///
455    /// # Panics
456    ///
457    /// This panics when `replace_with.len()` does not equal the total number
458    /// of patterns that are matched by this automaton.
459    ///
460    /// # Examples
461    ///
462    /// Basic usage:
463    ///
464    /// ```
465    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
466    ///
467    /// let patterns = &["append", "appendage", "app"];
468    /// let haystack = b"append the app to the appendage";
469    ///
470    /// let ac = AhoCorasickBuilder::new()
471    ///     .match_kind(MatchKind::LeftmostFirst)
472    ///     .build(patterns);
473    /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]);
474    /// assert_eq!(b"x the z to the xage".to_vec(), result);
475    /// ```
476    pub fn replace_all_bytes<B>(
477        &self,
478        haystack: &[u8],
479        replace_with: &[B],
480    ) -> Vec<u8>
481    where
482        B: AsRef<[u8]>,
483    {
484        assert_eq!(
485            replace_with.len(),
486            self.pattern_count(),
487            "replace_all_bytes requires a replacement for every pattern \
488             in the automaton"
489        );
490        let mut dst = Vec::with_capacity(haystack.len());
491        self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| {
492            dst.extend(replace_with[mat.pattern()].as_ref());
493            true
494        });
495        dst
496    }
497
498    /// Replace all matches using a closure called on each match.
499    /// Matches correspond to the same matches as reported by
500    /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
501    ///
502    /// The closure accepts three parameters: the match found, the text of
503    /// the match and a string buffer with which to write the replaced text
504    /// (if any). If the closure returns `true`, then it continues to the next
505    /// match. If the closure returns `false`, then searching is stopped.
506    ///
507    /// # Examples
508    ///
509    /// Basic usage:
510    ///
511    /// ```
512    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
513    ///
514    /// let patterns = &["append", "appendage", "app"];
515    /// let haystack = "append the app to the appendage";
516    ///
517    /// let ac = AhoCorasickBuilder::new()
518    ///     .match_kind(MatchKind::LeftmostFirst)
519    ///     .build(patterns);
520    /// let mut result = String::new();
521    /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
522    ///     dst.push_str(&mat.pattern().to_string());
523    ///     true
524    /// });
525    /// assert_eq!("0 the 2 to the 0age", result);
526    /// ```
527    ///
528    /// Stopping the replacement by returning `false` (continued from the
529    /// example above):
530    ///
531    /// ```
532    /// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
533    /// # let patterns = &["append", "appendage", "app"];
534    /// # let haystack = "append the app to the appendage";
535    /// # let ac = AhoCorasickBuilder::new()
536    /// #    .match_kind(MatchKind::LeftmostFirst)
537    /// #    .build(patterns);
538    /// let mut result = String::new();
539    /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
540    ///     dst.push_str(&mat.pattern().to_string());
541    ///     mat.pattern() != 2
542    /// });
543    /// assert_eq!("0 the 2 to the appendage", result);
544    /// ```
545    pub fn replace_all_with<F>(
546        &self,
547        haystack: &str,
548        dst: &mut String,
549        mut replace_with: F,
550    ) where
551        F: FnMut(&Match, &str, &mut String) -> bool,
552    {
553        let mut last_match = 0;
554        for mat in self.find_iter(haystack) {
555            dst.push_str(&haystack[last_match..mat.start()]);
556            last_match = mat.end();
557            if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
558                break;
559            };
560        }
561        dst.push_str(&haystack[last_match..]);
562    }
563
564    /// Replace all matches using raw bytes with a closure called on each
565    /// match. Matches correspond to the same matches as reported by
566    /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
567    ///
568    /// The closure accepts three parameters: the match found, the text of
569    /// the match and a byte buffer with which to write the replaced text
570    /// (if any). If the closure returns `true`, then it continues to the next
571    /// match. If the closure returns `false`, then searching is stopped.
572    ///
573    /// # Examples
574    ///
575    /// Basic usage:
576    ///
577    /// ```
578    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
579    ///
580    /// let patterns = &["append", "appendage", "app"];
581    /// let haystack = b"append the app to the appendage";
582    ///
583    /// let ac = AhoCorasickBuilder::new()
584    ///     .match_kind(MatchKind::LeftmostFirst)
585    ///     .build(patterns);
586    /// let mut result = vec![];
587    /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
588    ///     dst.extend(mat.pattern().to_string().bytes());
589    ///     true
590    /// });
591    /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
592    /// ```
593    ///
594    /// Stopping the replacement by returning `false` (continued from the
595    /// example above):
596    ///
597    /// ```
598    /// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
599    /// # let patterns = &["append", "appendage", "app"];
600    /// # let haystack = b"append the app to the appendage";
601    /// # let ac = AhoCorasickBuilder::new()
602    /// #    .match_kind(MatchKind::LeftmostFirst)
603    /// #    .build(patterns);
604    /// let mut result = vec![];
605    /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
606    ///     dst.extend(mat.pattern().to_string().bytes());
607    ///     mat.pattern() != 2
608    /// });
609    /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result);
610    /// ```
611    pub fn replace_all_with_bytes<F>(
612        &self,
613        haystack: &[u8],
614        dst: &mut Vec<u8>,
615        mut replace_with: F,
616    ) where
617        F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
618    {
619        let mut last_match = 0;
620        for mat in self.find_iter(haystack) {
621            dst.extend(&haystack[last_match..mat.start()]);
622            last_match = mat.end();
623            if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
624                break;
625            };
626        }
627        dst.extend(&haystack[last_match..]);
628    }
629
630    /// Returns an iterator of non-overlapping matches in the given
631    /// stream. Matches correspond to the same matches as reported by
632    /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
633    ///
634    /// The matches yielded by this iterator use absolute position offsets in
635    /// the stream given, where the first byte has index `0`. Matches are
636    /// yieled until the stream is exhausted.
637    ///
638    /// Each item yielded by the iterator is an `io::Result<Match>`, where an
639    /// error is yielded if there was a problem reading from the reader given.
640    ///
641    /// When searching a stream, an internal buffer is used. Therefore, callers
642    /// should avoiding providing a buffered reader, if possible.
643    ///
644    /// Searching a stream requires that the automaton was built with
645    /// `MatchKind::Standard` semantics. If this automaton was constructed
646    /// with leftmost semantics, then this method will panic. To determine
647    /// whether this will panic at runtime, use the
648    /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
649    /// method.
650    ///
651    /// # Memory usage
652    ///
653    /// In general, searching streams will use a constant amount of memory for
654    /// its internal buffer. The one requirement is that the internal buffer
655    /// must be at least the size of the longest possible match. In most use
656    /// cases, the default buffer size will be much larger than any individual
657    /// match.
658    ///
659    /// # Panics
660    ///
661    /// This panics when `AhoCorasick::supports_stream` returns `false`.
662    /// That is, this panics when this automaton's match semantics are not
663    /// `MatchKind::Standard`. This restriction may be lifted in the future.
664    ///
665    /// # Examples
666    ///
667    /// Basic usage:
668    ///
669    /// ```
670    /// use aho_corasick::AhoCorasick;
671    ///
672    /// # fn example() -> Result<(), ::std::io::Error> {
673    /// let patterns = &["append", "appendage", "app"];
674    /// let haystack = "append the app to the appendage";
675    ///
676    /// let ac = AhoCorasick::new(patterns);
677    /// let mut matches = vec![];
678    /// for result in ac.stream_find_iter(haystack.as_bytes()) {
679    ///     let mat = result?;
680    ///     matches.push(mat.pattern());
681    /// }
682    /// assert_eq!(vec![2, 2, 2], matches);
683    /// # Ok(()) }; example().unwrap()
684    /// ```
685    pub fn stream_find_iter<'a, R: io::Read>(
686        &'a self,
687        rdr: R,
688    ) -> StreamFindIter<'a, R, S> {
689        StreamFindIter::new(self, rdr)
690    }
691
692    /// Search for and replace all matches of this automaton in
693    /// the given reader, and write the replacements to the given
694    /// writer. Matches correspond to the same matches as reported by
695    /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
696    ///
697    /// Replacements are determined by the index of the matching pattern.
698    /// For example, if the pattern with index `2` is found, then it is
699    /// replaced by `replace_with[2]`.
700    ///
701    /// After all matches are replaced, the writer is _not_ flushed.
702    ///
703    /// If there was a problem reading from the given reader or writing to the
704    /// given writer, then the corresponding `io::Error` is returned and all
705    /// replacement is stopped.
706    ///
707    /// When searching a stream, an internal buffer is used. Therefore, callers
708    /// should avoiding providing a buffered reader, if possible. However,
709    /// callers may want to provide a buffered writer.
710    ///
711    /// Searching a stream requires that the automaton was built with
712    /// `MatchKind::Standard` semantics. If this automaton was constructed
713    /// with leftmost semantics, then this method will panic. To determine
714    /// whether this will panic at runtime, use the
715    /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
716    /// method.
717    ///
718    /// # Memory usage
719    ///
720    /// In general, searching streams will use a constant amount of memory for
721    /// its internal buffer. The one requirement is that the internal buffer
722    /// must be at least the size of the longest possible match. In most use
723    /// cases, the default buffer size will be much larger than any individual
724    /// match.
725    ///
726    /// # Panics
727    ///
728    /// This panics when `AhoCorasick::supports_stream` returns `false`.
729    /// That is, this panics when this automaton's match semantics are not
730    /// `MatchKind::Standard`. This restriction may be lifted in the future.
731    ///
732    /// # Examples
733    ///
734    /// Basic usage:
735    ///
736    /// ```
737    /// use aho_corasick::AhoCorasick;
738    ///
739    /// # fn example() -> Result<(), ::std::io::Error> {
740    /// let patterns = &["fox", "brown", "quick"];
741    /// let haystack = "The quick brown fox.";
742    /// let replace_with = &["sloth", "grey", "slow"];
743    ///
744    /// let ac = AhoCorasick::new(patterns);
745    /// let mut result = vec![];
746    /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?;
747    /// assert_eq!(b"The slow grey sloth.".to_vec(), result);
748    /// # Ok(()) }; example().unwrap()
749    /// ```
750    pub fn stream_replace_all<R, W, B>(
751        &self,
752        rdr: R,
753        wtr: W,
754        replace_with: &[B],
755    ) -> io::Result<()>
756    where
757        R: io::Read,
758        W: io::Write,
759        B: AsRef<[u8]>,
760    {
761        assert_eq!(
762            replace_with.len(),
763            self.pattern_count(),
764            "stream_replace_all requires a replacement for every pattern \
765             in the automaton"
766        );
767        self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| {
768            wtr.write_all(replace_with[mat.pattern()].as_ref())
769        })
770    }
771
772    /// Search the given reader and replace all matches of this automaton
773    /// using the given closure. The result is written to the given
774    /// writer. Matches correspond to the same matches as reported by
775    /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
776    ///
777    /// The closure accepts three parameters: the match found, the text of
778    /// the match and the writer with which to write the replaced text (if any).
779    ///
780    /// After all matches are replaced, the writer is _not_ flushed.
781    ///
782    /// If there was a problem reading from the given reader or writing to the
783    /// given writer, then the corresponding `io::Error` is returned and all
784    /// replacement is stopped.
785    ///
786    /// When searching a stream, an internal buffer is used. Therefore, callers
787    /// should avoiding providing a buffered reader, if possible. However,
788    /// callers may want to provide a buffered writer.
789    ///
790    /// Searching a stream requires that the automaton was built with
791    /// `MatchKind::Standard` semantics. If this automaton was constructed
792    /// with leftmost semantics, then this method will panic. To determine
793    /// whether this will panic at runtime, use the
794    /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
795    /// method.
796    ///
797    /// # Memory usage
798    ///
799    /// In general, searching streams will use a constant amount of memory for
800    /// its internal buffer. The one requirement is that the internal buffer
801    /// must be at least the size of the longest possible match. In most use
802    /// cases, the default buffer size will be much larger than any individual
803    /// match.
804    ///
805    /// # Panics
806    ///
807    /// This panics when `AhoCorasick::supports_stream` returns `false`.
808    /// That is, this panics when this automaton's match semantics are not
809    /// `MatchKind::Standard`. This restriction may be lifted in the future.
810    ///
811    /// # Examples
812    ///
813    /// Basic usage:
814    ///
815    /// ```
816    /// use std::io::Write;
817    /// use aho_corasick::AhoCorasick;
818    ///
819    /// # fn example() -> Result<(), ::std::io::Error> {
820    /// let patterns = &["fox", "brown", "quick"];
821    /// let haystack = "The quick brown fox.";
822    ///
823    /// let ac = AhoCorasick::new(patterns);
824    /// let mut result = vec![];
825    /// ac.stream_replace_all_with(
826    ///     haystack.as_bytes(),
827    ///     &mut result,
828    ///     |mat, _, wtr| {
829    ///         wtr.write_all(mat.pattern().to_string().as_bytes())
830    ///     },
831    /// )?;
832    /// assert_eq!(b"The 2 1 0.".to_vec(), result);
833    /// # Ok(()) }; example().unwrap()
834    /// ```
835    pub fn stream_replace_all_with<R, W, F>(
836        &self,
837        rdr: R,
838        mut wtr: W,
839        mut replace_with: F,
840    ) -> io::Result<()>
841    where
842        R: io::Read,
843        W: io::Write,
844        F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>,
845    {
846        let mut it = StreamChunkIter::new(self, rdr);
847        while let Some(result) = it.next() {
848            let chunk = result?;
849            match chunk {
850                StreamChunk::NonMatch { bytes, .. } => {
851                    wtr.write_all(bytes)?;
852                }
853                StreamChunk::Match { bytes, mat } => {
854                    replace_with(&mat, bytes, &mut wtr)?;
855                }
856            }
857        }
858        Ok(())
859    }
860
861    /// Returns the match kind used by this automaton.
862    ///
863    /// # Examples
864    ///
865    /// Basic usage:
866    ///
867    /// ```
868    /// use aho_corasick::{AhoCorasick, MatchKind};
869    ///
870    /// let ac = AhoCorasick::new(&[
871    ///     "foo", "bar", "quux", "baz",
872    /// ]);
873    /// assert_eq!(&MatchKind::Standard, ac.match_kind());
874    /// ```
875    pub fn match_kind(&self) -> &MatchKind {
876        self.imp.match_kind()
877    }
878
879    /// Returns the length of the longest pattern matched by this automaton.
880    ///
881    /// # Examples
882    ///
883    /// Basic usage:
884    ///
885    /// ```
886    /// use aho_corasick::AhoCorasick;
887    ///
888    /// let ac = AhoCorasick::new(&[
889    ///     "foo", "bar", "quux", "baz",
890    /// ]);
891    /// assert_eq!(4, ac.max_pattern_len());
892    /// ```
893    pub fn max_pattern_len(&self) -> usize {
894        self.imp.max_pattern_len()
895    }
896
897    /// Return the total number of patterns matched by this automaton.
898    ///
899    /// This includes patterns that may never participate in a match. For
900    /// example, if
901    /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst)
902    /// match semantics are used, and the patterns `Sam` and `Samwise` were
903    /// used to build the automaton, then `Samwise` can never participate in a
904    /// match because `Sam` will always take priority.
905    ///
906    /// # Examples
907    ///
908    /// Basic usage:
909    ///
910    /// ```
911    /// use aho_corasick::AhoCorasick;
912    ///
913    /// let ac = AhoCorasick::new(&[
914    ///     "foo", "bar", "baz",
915    /// ]);
916    /// assert_eq!(3, ac.pattern_count());
917    /// ```
918    pub fn pattern_count(&self) -> usize {
919        self.imp.pattern_count()
920    }
921
922    /// Returns true if and only if this automaton supports reporting
923    /// overlapping matches.
924    ///
925    /// If this returns false and overlapping matches are requested, then it
926    /// will result in a panic.
927    ///
928    /// Since leftmost matching is inherently incompatible with overlapping
929    /// matches, only
930    /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
931    /// supports overlapping matches. This is unlikely to change in the future.
932    ///
933    /// # Examples
934    ///
935    /// Basic usage:
936    ///
937    /// ```
938    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
939    ///
940    /// let ac = AhoCorasickBuilder::new()
941    ///     .match_kind(MatchKind::Standard)
942    ///     .build(&["foo", "bar", "baz"]);
943    /// assert!(ac.supports_overlapping());
944    ///
945    /// let ac = AhoCorasickBuilder::new()
946    ///     .match_kind(MatchKind::LeftmostFirst)
947    ///     .build(&["foo", "bar", "baz"]);
948    /// assert!(!ac.supports_overlapping());
949    /// ```
950    pub fn supports_overlapping(&self) -> bool {
951        self.match_kind.supports_overlapping()
952    }
953
954    /// Returns true if and only if this automaton supports stream searching.
955    ///
956    /// If this returns false and stream searching (or replacing) is attempted,
957    /// then it will result in a panic.
958    ///
959    /// Currently, only
960    /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
961    /// supports streaming. This may be expanded in the future.
962    ///
963    /// # Examples
964    ///
965    /// Basic usage:
966    ///
967    /// ```
968    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
969    ///
970    /// let ac = AhoCorasickBuilder::new()
971    ///     .match_kind(MatchKind::Standard)
972    ///     .build(&["foo", "bar", "baz"]);
973    /// assert!(ac.supports_stream());
974    ///
975    /// let ac = AhoCorasickBuilder::new()
976    ///     .match_kind(MatchKind::LeftmostFirst)
977    ///     .build(&["foo", "bar", "baz"]);
978    /// assert!(!ac.supports_stream());
979    /// ```
980    pub fn supports_stream(&self) -> bool {
981        self.match_kind.supports_stream()
982    }
983
984    /// Returns the approximate total amount of heap used by this automaton, in
985    /// units of bytes.
986    ///
987    /// # Examples
988    ///
989    /// This example shows the difference in heap usage between a few
990    /// configurations:
991    ///
992    /// ```ignore
993    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
994    ///
995    /// let ac = AhoCorasickBuilder::new()
996    ///     .dfa(false) // default
997    ///     .build(&["foo", "bar", "baz"]);
998    /// assert_eq!(10_336, ac.heap_bytes());
999    ///
1000    /// let ac = AhoCorasickBuilder::new()
1001    ///     .dfa(false) // default
1002    ///     .ascii_case_insensitive(true)
1003    ///     .build(&["foo", "bar", "baz"]);
1004    /// assert_eq!(10_384, ac.heap_bytes());
1005    ///
1006    /// let ac = AhoCorasickBuilder::new()
1007    ///     .dfa(true)
1008    ///     .ascii_case_insensitive(true)
1009    ///     .build(&["foo", "bar", "baz"]);
1010    /// assert_eq!(1_248, ac.heap_bytes());
1011    /// ```
1012    pub fn heap_bytes(&self) -> usize {
1013        match self.imp {
1014            Imp::NFA(ref nfa) => nfa.heap_bytes(),
1015            Imp::DFA(ref dfa) => dfa.heap_bytes(),
1016        }
1017    }
1018}
1019
1020/// The internal implementation of Aho-Corasick, which is either an NFA or
1021/// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses
1022/// more memory.
1023#[derive(Clone, Debug)]
1024enum Imp<S: StateID> {
1025    NFA(NFA<S>),
1026    DFA(DFA<S>),
1027}
1028
1029impl<S: StateID> Imp<S> {
1030    /// Returns the type of match semantics implemented by this automaton.
1031    fn match_kind(&self) -> &MatchKind {
1032        match *self {
1033            Imp::NFA(ref nfa) => nfa.match_kind(),
1034            Imp::DFA(ref dfa) => dfa.match_kind(),
1035        }
1036    }
1037
1038    /// Returns the identifier of the start state.
1039    fn start_state(&self) -> S {
1040        match *self {
1041            Imp::NFA(ref nfa) => nfa.start_state(),
1042            Imp::DFA(ref dfa) => dfa.start_state(),
1043        }
1044    }
1045
1046    /// The length, in bytes, of the longest pattern in this automaton. This
1047    /// information is useful for maintaining correct buffer sizes when
1048    /// searching on streams.
1049    fn max_pattern_len(&self) -> usize {
1050        match *self {
1051            Imp::NFA(ref nfa) => nfa.max_pattern_len(),
1052            Imp::DFA(ref dfa) => dfa.max_pattern_len(),
1053        }
1054    }
1055
1056    /// The total number of patterns added to this automaton. This includes
1057    /// patterns that may never match. The maximum matching pattern that can be
1058    /// reported is exactly one less than this number.
1059    fn pattern_count(&self) -> usize {
1060        match *self {
1061            Imp::NFA(ref nfa) => nfa.pattern_count(),
1062            Imp::DFA(ref dfa) => dfa.pattern_count(),
1063        }
1064    }
1065
1066    /// Returns the prefilter object, if one exists, for the underlying
1067    /// automaton.
1068    fn prefilter(&self) -> Option<&dyn Prefilter> {
1069        match *self {
1070            Imp::NFA(ref nfa) => nfa.prefilter(),
1071            Imp::DFA(ref dfa) => dfa.prefilter(),
1072        }
1073    }
1074
1075    /// Returns true if and only if we should attempt to use a prefilter.
1076    fn use_prefilter(&self) -> bool {
1077        let p = match self.prefilter() {
1078            None => return false,
1079            Some(p) => p,
1080        };
1081        !p.looks_for_non_start_of_match()
1082    }
1083
1084    #[inline(always)]
1085    fn overlapping_find_at(
1086        &self,
1087        prestate: &mut PrefilterState,
1088        haystack: &[u8],
1089        at: usize,
1090        state_id: &mut S,
1091        match_index: &mut usize,
1092    ) -> Option<Match> {
1093        match *self {
1094            Imp::NFA(ref nfa) => nfa.overlapping_find_at(
1095                prestate,
1096                haystack,
1097                at,
1098                state_id,
1099                match_index,
1100            ),
1101            Imp::DFA(ref dfa) => dfa.overlapping_find_at(
1102                prestate,
1103                haystack,
1104                at,
1105                state_id,
1106                match_index,
1107            ),
1108        }
1109    }
1110
1111    #[inline(always)]
1112    fn earliest_find_at(
1113        &self,
1114        prestate: &mut PrefilterState,
1115        haystack: &[u8],
1116        at: usize,
1117        state_id: &mut S,
1118    ) -> Option<Match> {
1119        match *self {
1120            Imp::NFA(ref nfa) => {
1121                nfa.earliest_find_at(prestate, haystack, at, state_id)
1122            }
1123            Imp::DFA(ref dfa) => {
1124                dfa.earliest_find_at(prestate, haystack, at, state_id)
1125            }
1126        }
1127    }
1128
1129    #[inline(always)]
1130    fn find_at_no_state(
1131        &self,
1132        prestate: &mut PrefilterState,
1133        haystack: &[u8],
1134        at: usize,
1135    ) -> Option<Match> {
1136        match *self {
1137            Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at),
1138            Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at),
1139        }
1140    }
1141}
1142
1143/// An iterator of non-overlapping matches in a particular haystack.
1144///
1145/// This iterator yields matches according to the
1146/// [`MatchKind`](enum.MatchKind.html)
1147/// used by this automaton.
1148///
1149/// This iterator is constructed via the
1150/// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter)
1151/// method.
1152///
1153/// The type variable `S` refers to the representation used for state
1154/// identifiers. (By default, this is `usize`.)
1155///
1156/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1157///
1158/// The lifetime `'b` refers to the lifetime of the haystack being searched.
1159#[derive(Debug)]
1160pub struct FindIter<'a, 'b, S: StateID> {
1161    fsm: &'a Imp<S>,
1162    prestate: PrefilterState,
1163    haystack: &'b [u8],
1164    pos: usize,
1165}
1166
1167impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> {
1168    fn new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S> {
1169        let prestate = PrefilterState::new(ac.max_pattern_len());
1170        FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 }
1171    }
1172}
1173
1174impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> {
1175    type Item = Match;
1176
1177    fn next(&mut self) -> Option<Match> {
1178        if self.pos > self.haystack.len() {
1179            return None;
1180        }
1181        let result = self.fsm.find_at_no_state(
1182            &mut self.prestate,
1183            self.haystack,
1184            self.pos,
1185        );
1186        let mat = match result {
1187            None => return None,
1188            Some(mat) => mat,
1189        };
1190        if mat.end() == self.pos {
1191            // If the automaton can match the empty string and if we found an
1192            // empty match, then we need to forcefully move the position.
1193            self.pos += 1;
1194        } else {
1195            self.pos = mat.end();
1196        }
1197        Some(mat)
1198    }
1199}
1200
1201/// An iterator of overlapping matches in a particular haystack.
1202///
1203/// This iterator will report all possible matches in a particular haystack,
1204/// even when the matches overlap.
1205///
1206/// This iterator is constructed via the
1207/// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter)
1208/// method.
1209///
1210/// The type variable `S` refers to the representation used for state
1211/// identifiers. (By default, this is `usize`.)
1212///
1213/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1214///
1215/// The lifetime `'b` refers to the lifetime of the haystack being searched.
1216#[derive(Debug)]
1217pub struct FindOverlappingIter<'a, 'b, S: StateID> {
1218    fsm: &'a Imp<S>,
1219    prestate: PrefilterState,
1220    haystack: &'b [u8],
1221    pos: usize,
1222    last_match_end: usize,
1223    state_id: S,
1224    match_index: usize,
1225}
1226
1227impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> {
1228    fn new(
1229        ac: &'a AhoCorasick<S>,
1230        haystack: &'b [u8],
1231    ) -> FindOverlappingIter<'a, 'b, S> {
1232        assert!(
1233            ac.supports_overlapping(),
1234            "automaton does not support overlapping searches"
1235        );
1236        let prestate = PrefilterState::new(ac.max_pattern_len());
1237        FindOverlappingIter {
1238            fsm: &ac.imp,
1239            prestate,
1240            haystack,
1241            pos: 0,
1242            last_match_end: 0,
1243            state_id: ac.imp.start_state(),
1244            match_index: 0,
1245        }
1246    }
1247}
1248
1249impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> {
1250    type Item = Match;
1251
1252    fn next(&mut self) -> Option<Match> {
1253        let result = self.fsm.overlapping_find_at(
1254            &mut self.prestate,
1255            self.haystack,
1256            self.pos,
1257            &mut self.state_id,
1258            &mut self.match_index,
1259        );
1260        match result {
1261            None => return None,
1262            Some(m) => {
1263                self.pos = m.end();
1264                Some(m)
1265            }
1266        }
1267    }
1268}
1269
1270/// An iterator that reports Aho-Corasick matches in a stream.
1271///
1272/// This iterator yields elements of type `io::Result<Match>`, where an error
1273/// is reported if there was a problem reading from the underlying stream.
1274/// The iterator terminates only when the underlying stream reaches `EOF`.
1275///
1276/// This iterator is constructed via the
1277/// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter)
1278/// method.
1279///
1280/// The type variable `R` refers to the `io::Read` stream that is being read
1281/// from.
1282///
1283/// The type variable `S` refers to the representation used for state
1284/// identifiers. (By default, this is `usize`.)
1285///
1286/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1287#[derive(Debug)]
1288pub struct StreamFindIter<'a, R, S: StateID> {
1289    it: StreamChunkIter<'a, R, S>,
1290}
1291
1292impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> {
1293    fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S> {
1294        StreamFindIter { it: StreamChunkIter::new(ac, rdr) }
1295    }
1296}
1297
1298impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> {
1299    type Item = io::Result<Match>;
1300
1301    fn next(&mut self) -> Option<io::Result<Match>> {
1302        loop {
1303            match self.it.next() {
1304                None => return None,
1305                Some(Err(err)) => return Some(Err(err)),
1306                Some(Ok(StreamChunk::NonMatch { .. })) => {}
1307                Some(Ok(StreamChunk::Match { mat, .. })) => {
1308                    return Some(Ok(mat));
1309                }
1310            }
1311        }
1312    }
1313}
1314
1315/// An iterator over chunks in an underlying reader. Each chunk either
1316/// corresponds to non-matching bytes or matching bytes, but all bytes from
1317/// the underlying reader are reported in sequence. There may be an arbitrary
1318/// number of non-matching chunks before seeing a matching chunk.
1319///
1320/// N.B. This does not actually implement Iterator because we need to borrow
1321/// from the underlying reader. But conceptually, it's still an iterator.
1322#[derive(Debug)]
1323struct StreamChunkIter<'a, R, S: StateID> {
1324    /// The AC automaton.
1325    fsm: &'a Imp<S>,
1326    /// State associated with this automaton's prefilter. It is a heuristic
1327    /// for stopping the prefilter if it's deemed ineffective.
1328    prestate: PrefilterState,
1329    /// The source of bytes we read from.
1330    rdr: R,
1331    /// A fixed size buffer. This is what we actually search. There are some
1332    /// invariants around the buffer's size, namely, it must be big enough to
1333    /// contain the longest possible match.
1334    buf: Buffer,
1335    /// The ID of the FSM state we're currently in.
1336    state_id: S,
1337    /// The current position at which to start the next search in `buf`.
1338    search_pos: usize,
1339    /// The absolute position of `search_pos`, where `0` corresponds to the
1340    /// position of the first byte read from `rdr`.
1341    absolute_pos: usize,
1342    /// The ending position of the last StreamChunk that was returned to the
1343    /// caller. This position is used to determine whether we need to emit
1344    /// non-matching bytes before emitting a match.
1345    report_pos: usize,
1346    /// A match that should be reported on the next call.
1347    pending_match: Option<Match>,
1348    /// Enabled only when the automaton can match the empty string. When
1349    /// enabled, we need to execute one final search after consuming the
1350    /// reader to find the trailing empty match.
1351    has_empty_match_at_end: bool,
1352}
1353
1354/// A single chunk yielded by the stream chunk iterator.
1355///
1356/// The `'r` lifetime refers to the lifetime of the stream chunk iterator.
1357#[derive(Debug)]
1358enum StreamChunk<'r> {
1359    /// A chunk that does not contain any matches.
1360    NonMatch { bytes: &'r [u8], start: usize },
1361    /// A chunk that precisely contains a match.
1362    Match { bytes: &'r [u8], mat: Match },
1363}
1364
1365impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
1366    fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S> {
1367        assert!(
1368            ac.supports_stream(),
1369            "stream searching is only supported for Standard match semantics"
1370        );
1371
1372        let prestate = if ac.imp.use_prefilter() {
1373            PrefilterState::new(ac.max_pattern_len())
1374        } else {
1375            PrefilterState::disabled()
1376        };
1377        let buf = Buffer::new(ac.imp.max_pattern_len());
1378        let state_id = ac.imp.start_state();
1379        StreamChunkIter {
1380            fsm: &ac.imp,
1381            prestate,
1382            rdr,
1383            buf,
1384            state_id,
1385            absolute_pos: 0,
1386            report_pos: 0,
1387            search_pos: 0,
1388            pending_match: None,
1389            has_empty_match_at_end: ac.is_match(""),
1390        }
1391    }
1392
1393    fn next<'r>(&'r mut self) -> Option<io::Result<StreamChunk<'r>>> {
1394        loop {
1395            if let Some(mut mat) = self.pending_match.take() {
1396                let bytes = &self.buf.buffer()[mat.start()..mat.end()];
1397                self.report_pos = mat.end();
1398                mat = mat.increment(self.absolute_pos);
1399                return Some(Ok(StreamChunk::Match { bytes, mat }));
1400            }
1401            if self.search_pos >= self.buf.len() {
1402                if let Some(end) = self.unreported() {
1403                    let bytes = &self.buf.buffer()[self.report_pos..end];
1404                    let start = self.absolute_pos + self.report_pos;
1405                    self.report_pos = end;
1406                    return Some(Ok(StreamChunk::NonMatch { bytes, start }));
1407                }
1408                if self.buf.len() >= self.buf.min_buffer_len() {
1409                    // This is the point at which we roll our buffer, which we
1410                    // only do if our buffer has at least the minimum amount of
1411                    // bytes in it. Before rolling, we update our various
1412                    // positions to be consistent with the buffer after it has
1413                    // been rolled.
1414
1415                    self.report_pos -=
1416                        self.buf.len() - self.buf.min_buffer_len();
1417                    self.absolute_pos +=
1418                        self.search_pos - self.buf.min_buffer_len();
1419                    self.search_pos = self.buf.min_buffer_len();
1420                    self.buf.roll();
1421                }
1422                match self.buf.fill(&mut self.rdr) {
1423                    Err(err) => return Some(Err(err)),
1424                    Ok(false) => {
1425                        // We've hit EOF, but if there are still some
1426                        // unreported bytes remaining, return them now.
1427                        if self.report_pos < self.buf.len() {
1428                            let bytes = &self.buf.buffer()[self.report_pos..];
1429                            let start = self.absolute_pos + self.report_pos;
1430                            self.report_pos = self.buf.len();
1431
1432                            let chunk = StreamChunk::NonMatch { bytes, start };
1433                            return Some(Ok(chunk));
1434                        } else {
1435                            // We've reported everything, but there might still
1436                            // be a match at the very last position.
1437                            if !self.has_empty_match_at_end {
1438                                return None;
1439                            }
1440                            // fallthrough for another search to get trailing
1441                            // empty matches
1442                            self.has_empty_match_at_end = false;
1443                        }
1444                    }
1445                    Ok(true) => {}
1446                }
1447            }
1448            let result = self.fsm.earliest_find_at(
1449                &mut self.prestate,
1450                self.buf.buffer(),
1451                self.search_pos,
1452                &mut self.state_id,
1453            );
1454            match result {
1455                None => {
1456                    self.search_pos = self.buf.len();
1457                }
1458                Some(mat) => {
1459                    self.state_id = self.fsm.start_state();
1460                    if mat.end() == self.search_pos {
1461                        // If the automaton can match the empty string and if
1462                        // we found an empty match, then we need to forcefully
1463                        // move the position.
1464                        self.search_pos += 1;
1465                    } else {
1466                        self.search_pos = mat.end();
1467                    }
1468                    self.pending_match = Some(mat.clone());
1469                    if self.report_pos < mat.start() {
1470                        let bytes =
1471                            &self.buf.buffer()[self.report_pos..mat.start()];
1472                        let start = self.absolute_pos + self.report_pos;
1473                        self.report_pos = mat.start();
1474
1475                        let chunk = StreamChunk::NonMatch { bytes, start };
1476                        return Some(Ok(chunk));
1477                    }
1478                }
1479            }
1480        }
1481    }
1482
1483    fn unreported(&self) -> Option<usize> {
1484        let end = self.search_pos.saturating_sub(self.buf.min_buffer_len());
1485        if self.report_pos < end {
1486            Some(end)
1487        } else {
1488            None
1489        }
1490    }
1491}
1492
1493/// A builder for configuring an Aho-Corasick automaton.
1494#[derive(Clone, Debug)]
1495pub struct AhoCorasickBuilder {
1496    nfa_builder: nfa::Builder,
1497    dfa_builder: dfa::Builder,
1498    dfa: bool,
1499}
1500
1501impl Default for AhoCorasickBuilder {
1502    fn default() -> AhoCorasickBuilder {
1503        AhoCorasickBuilder::new()
1504    }
1505}
1506
1507impl AhoCorasickBuilder {
1508    /// Create a new builder for configuring an Aho-Corasick automaton.
1509    ///
1510    /// If you don't need fine grained configuration or aren't sure which knobs
1511    /// to set, try using
1512    /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
1513    /// instead.
1514    pub fn new() -> AhoCorasickBuilder {
1515        AhoCorasickBuilder {
1516            nfa_builder: nfa::Builder::new(),
1517            dfa_builder: dfa::Builder::new(),
1518            dfa: false,
1519        }
1520    }
1521
1522    /// Build an Aho-Corasick automaton using the configuration set on this
1523    /// builder.
1524    ///
1525    /// A builder may be reused to create more automatons.
1526    ///
1527    /// This method will use the default for representing internal state
1528    /// identifiers, which is `usize`. This guarantees that building the
1529    /// automaton will succeed and is generally a good default, but can make
1530    /// the size of the automaton 2-8 times bigger than it needs to be,
1531    /// depending on your target platform.
1532    ///
1533    /// # Examples
1534    ///
1535    /// Basic usage:
1536    ///
1537    /// ```
1538    /// use aho_corasick::AhoCorasickBuilder;
1539    ///
1540    /// let patterns = &["foo", "bar", "baz"];
1541    /// let ac = AhoCorasickBuilder::new()
1542    ///     .build(patterns);
1543    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1544    /// ```
1545    pub fn build<I, P>(&self, patterns: I) -> AhoCorasick
1546    where
1547        I: IntoIterator<Item = P>,
1548        P: AsRef<[u8]>,
1549    {
1550        // The builder only returns an error if the chosen state ID
1551        // representation is too small to fit all of the given patterns. In
1552        // this case, since we fix the representation to usize, it will always
1553        // work because it's impossible to overflow usize since the underlying
1554        // storage would OOM long before that happens.
1555        self.build_with_size::<usize, I, P>(patterns)
1556            .expect("usize state ID type should always work")
1557    }
1558
1559    /// Build an Aho-Corasick automaton using the configuration set on this
1560    /// builder with a specific state identifier representation. This only has
1561    /// an effect when the `dfa` option is enabled.
1562    ///
1563    /// Generally, the choices for a state identifier representation are
1564    /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default.
1565    /// The advantage of choosing a smaller state identifier representation
1566    /// is that the automaton produced will be smaller. This might be
1567    /// beneficial for just generally using less space, or might even allow it
1568    /// to fit more of the automaton in your CPU's cache, leading to overall
1569    /// better search performance.
1570    ///
1571    /// Unlike the standard `build` method, this can report an error if the
1572    /// state identifier representation cannot support the size of the
1573    /// automaton.
1574    ///
1575    /// Note that the state identifier representation is determined by the
1576    /// `S` type variable. This requires a type hint of some sort, either
1577    /// by specifying the return type or using the turbofish, e.g.,
1578    /// `build_with_size::<u16, _, _>(...)`.
1579    ///
1580    /// # Examples
1581    ///
1582    /// Basic usage:
1583    ///
1584    /// ```
1585    /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
1586    ///
1587    /// # fn example() -> Result<(), ::aho_corasick::Error> {
1588    /// let patterns = &["foo", "bar", "baz"];
1589    /// let ac: AhoCorasick<u8> = AhoCorasickBuilder::new()
1590    ///     .build_with_size(patterns)?;
1591    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1592    /// # Ok(()) }; example().unwrap()
1593    /// ```
1594    ///
1595    /// Or alternatively, with turbofish:
1596    ///
1597    /// ```
1598    /// use aho_corasick::AhoCorasickBuilder;
1599    ///
1600    /// # fn example() -> Result<(), ::aho_corasick::Error> {
1601    /// let patterns = &["foo", "bar", "baz"];
1602    /// let ac = AhoCorasickBuilder::new()
1603    ///     .build_with_size::<u8, _, _>(patterns)?;
1604    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1605    /// # Ok(()) }; example().unwrap()
1606    /// ```
1607    pub fn build_with_size<S, I, P>(
1608        &self,
1609        patterns: I,
1610    ) -> Result<AhoCorasick<S>>
1611    where
1612        S: StateID,
1613        I: IntoIterator<Item = P>,
1614        P: AsRef<[u8]>,
1615    {
1616        let nfa = self.nfa_builder.build(patterns)?;
1617        let match_kind = nfa.match_kind().clone();
1618        let imp = if self.dfa {
1619            let dfa = self.dfa_builder.build(&nfa)?;
1620            Imp::DFA(dfa)
1621        } else {
1622            Imp::NFA(nfa)
1623        };
1624        Ok(AhoCorasick { imp, match_kind })
1625    }
1626
1627    /// Automatically configure the settings on this builder according to the
1628    /// patterns that will be used to construct the automaton.
1629    ///
1630    /// The idea here is to balance space and time automatically. That is, when
1631    /// searching a small number of patterns, this will attempt to use the
1632    /// fastest possible configuration since the total space required will be
1633    /// small anyway. As the number of patterns grows, this will fall back to
1634    /// slower configurations that use less space.
1635    ///
1636    /// This is guaranteed to never set `match_kind`, but any other option may
1637    /// be overridden.
1638    ///
1639    /// # Examples
1640    ///
1641    /// Basic usage:
1642    ///
1643    /// ```
1644    /// use aho_corasick::AhoCorasickBuilder;
1645    ///
1646    /// let patterns = &["foo", "bar", "baz"];
1647    /// let ac = AhoCorasickBuilder::new()
1648    ///     .auto_configure(patterns)
1649    ///     .build(patterns);
1650    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1651    /// ```
1652    pub fn auto_configure<B: AsRef<[u8]>>(
1653        &mut self,
1654        patterns: &[B],
1655    ) -> &mut AhoCorasickBuilder {
1656        // N.B. Currently we only use the length of `patterns` to make a
1657        // decision here, and could therefore ask for an `ExactSizeIterator`
1658        // instead. But it's conceivable that we might adapt this to look at
1659        // the total number of bytes, which would requires a second pass.
1660        //
1661        // The logic here is fairly rudimentary at the moment, but probably
1662        // OK. The idea here is to use the fastest thing possible for a small
1663        // number of patterns. That is, a DFA with no byte classes, since byte
1664        // classes require an extra indirection for every byte searched. With a
1665        // moderate number of patterns, we still want a DFA, but save on both
1666        // space and compilation time by enabling byte classes. Finally, fall
1667        // back to the slower but smaller NFA.
1668        if patterns.len() <= 100 {
1669            // N.B. Using byte classes can actually be faster by improving
1670            // locality, but this only really applies for multi-megabyte
1671            // automata (i.e., automata that don't fit in your CPU's cache).
1672            self.dfa(true);
1673        } else if patterns.len() <= 5000 {
1674            self.dfa(true);
1675        }
1676        self
1677    }
1678
1679    /// Set the desired match semantics.
1680    ///
1681    /// The default is `MatchKind::Standard`, which corresponds to the match
1682    /// semantics supported by the standard textbook description of the
1683    /// Aho-Corasick algorithm. Namely, matches are reported as soon as they
1684    /// are found. Moreover, this is the only way to get overlapping matches
1685    /// or do stream searching.
1686    ///
1687    /// The other kinds of match semantics that are supported are
1688    /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former
1689    /// corresponds to the match you would get if you were to try to match
1690    /// each pattern at each position in the haystack in the same order that
1691    /// you give to the automaton. That is, it returns the leftmost match
1692    /// corresponding the earliest pattern given to the automaton. The latter
1693    /// corresponds to finding the longest possible match among all leftmost
1694    /// matches.
1695    ///
1696    /// For more details on match semantics, see the
1697    /// [documentation for `MatchKind`](enum.MatchKind.html).
1698    ///
1699    /// # Examples
1700    ///
1701    /// In these examples, we demonstrate the differences between match
1702    /// semantics for a particular set of patterns in a specific order:
1703    /// `b`, `abc`, `abcd`.
1704    ///
1705    /// Standard semantics:
1706    ///
1707    /// ```
1708    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1709    ///
1710    /// let patterns = &["b", "abc", "abcd"];
1711    /// let haystack = "abcd";
1712    ///
1713    /// let ac = AhoCorasickBuilder::new()
1714    ///     .match_kind(MatchKind::Standard) // default, not necessary
1715    ///     .build(patterns);
1716    /// let mat = ac.find(haystack).expect("should have a match");
1717    /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
1718    /// ```
1719    ///
1720    /// Leftmost-first semantics:
1721    ///
1722    /// ```
1723    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1724    ///
1725    /// let patterns = &["b", "abc", "abcd"];
1726    /// let haystack = "abcd";
1727    ///
1728    /// let ac = AhoCorasickBuilder::new()
1729    ///     .match_kind(MatchKind::LeftmostFirst)
1730    ///     .build(patterns);
1731    /// let mat = ac.find(haystack).expect("should have a match");
1732    /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
1733    /// ```
1734    ///
1735    /// Leftmost-longest semantics:
1736    ///
1737    /// ```
1738    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1739    ///
1740    /// let patterns = &["b", "abc", "abcd"];
1741    /// let haystack = "abcd";
1742    ///
1743    /// let ac = AhoCorasickBuilder::new()
1744    ///     .match_kind(MatchKind::LeftmostLongest)
1745    ///     .build(patterns);
1746    /// let mat = ac.find(haystack).expect("should have a match");
1747    /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
1748    /// ```
1749    pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder {
1750        self.nfa_builder.match_kind(kind);
1751        self
1752    }
1753
1754    /// Enable anchored mode, which requires all matches to start at the
1755    /// first position in a haystack.
1756    ///
1757    /// This option is disabled by default.
1758    ///
1759    /// # Examples
1760    ///
1761    /// Basic usage:
1762    ///
1763    /// ```
1764    /// use aho_corasick::AhoCorasickBuilder;
1765    ///
1766    /// let patterns = &["foo", "bar"];
1767    /// let haystack = "foobar";
1768    ///
1769    /// let ac = AhoCorasickBuilder::new()
1770    ///     .anchored(true)
1771    ///     .build(patterns);
1772    /// assert_eq!(1, ac.find_iter(haystack).count());
1773    /// ```
1774    ///
1775    /// When searching for overlapping matches, all matches that start at
1776    /// the beginning of a haystack will be reported:
1777    ///
1778    /// ```
1779    /// use aho_corasick::AhoCorasickBuilder;
1780    ///
1781    /// let patterns = &["foo", "foofoo"];
1782    /// let haystack = "foofoo";
1783    ///
1784    /// let ac = AhoCorasickBuilder::new()
1785    ///     .anchored(true)
1786    ///     .build(patterns);
1787    /// assert_eq!(2, ac.find_overlapping_iter(haystack).count());
1788    /// // A non-anchored search would return 3 matches.
1789    /// ```
1790    pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1791        self.nfa_builder.anchored(yes);
1792        self
1793    }
1794
1795    /// Enable ASCII-aware case insensitive matching.
1796    ///
1797    /// When this option is enabled, searching will be performed without
1798    /// respect to case for ASCII letters (`a-z` and `A-Z`) only.
1799    ///
1800    /// Enabling this option does not change the search algorithm, but it may
1801    /// increase the size of the automaton.
1802    ///
1803    /// **NOTE:** In the future, support for full Unicode case insensitivity
1804    /// may be added, but ASCII case insensitivity is comparatively much
1805    /// simpler to add.
1806    ///
1807    /// # Examples
1808    ///
1809    /// Basic usage:
1810    ///
1811    /// ```
1812    /// use aho_corasick::AhoCorasickBuilder;
1813    ///
1814    /// let patterns = &["FOO", "bAr", "BaZ"];
1815    /// let haystack = "foo bar baz";
1816    ///
1817    /// let ac = AhoCorasickBuilder::new()
1818    ///     .ascii_case_insensitive(true)
1819    ///     .build(patterns);
1820    /// assert_eq!(3, ac.find_iter(haystack).count());
1821    /// ```
1822    pub fn ascii_case_insensitive(
1823        &mut self,
1824        yes: bool,
1825    ) -> &mut AhoCorasickBuilder {
1826        self.nfa_builder.ascii_case_insensitive(yes);
1827        self
1828    }
1829
1830    /// Set the limit on how many NFA states use a dense representation for
1831    /// their transitions.
1832    ///
1833    /// A dense representation uses more space, but supports faster access to
1834    /// transitions at search time. Thus, this setting permits the control of a
1835    /// space vs time trade off when using the NFA variant of Aho-Corasick.
1836    ///
1837    /// This limit is expressed in terms of the depth of a state, i.e., the
1838    /// number of transitions from the starting state of the NFA. The idea is
1839    /// that most of the time searching will be spent near the starting state
1840    /// of the automaton, so states near the start state should use a dense
1841    /// representation. States further away from the start state would then use
1842    /// a sparse representation, which uses less space but is slower to access
1843    /// transitions at search time.
1844    ///
1845    /// By default, this is set to a low but non-zero number.
1846    ///
1847    /// This setting has no effect if the `dfa` option is enabled.
1848    pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder {
1849        self.nfa_builder.dense_depth(depth);
1850        self
1851    }
1852
1853    /// Compile the standard Aho-Corasick automaton into a deterministic finite
1854    /// automaton (DFA).
1855    ///
1856    /// When this is disabled (which is the default), then a non-deterministic
1857    /// finite automaton (NFA) is used instead.
1858    ///
1859    /// The main benefit to a DFA is that it can execute searches more quickly
1860    /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the
1861    /// DFA uses more space and can take much longer to build.
1862    ///
1863    /// Enabling this option does not change the time complexity for
1864    /// constructing the Aho-Corasick automaton (which is `O(p)` where
1865    /// `p` is the total number of patterns being compiled). Enabling this
1866    /// option does however reduce the time complexity of non-overlapping
1867    /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the
1868    /// haystack.
1869    ///
1870    /// In general, it's a good idea to enable this if you're searching a
1871    /// small number of fairly short patterns (~1000), or if you want the
1872    /// fastest possible search without regard to compilation time or space
1873    /// usage.
1874    pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1875        self.dfa = yes;
1876        self
1877    }
1878
1879    /// Enable heuristic prefilter optimizations.
1880    ///
1881    /// When enabled, searching will attempt to quickly skip to match
1882    /// candidates using specialized literal search routines. A prefilter
1883    /// cannot always be used, and is generally treated as a heuristic. It
1884    /// can be useful to disable this if the prefilter is observed to be
1885    /// sub-optimal for a particular workload.
1886    ///
1887    /// This is enabled by default.
1888    pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1889        self.nfa_builder.prefilter(yes);
1890        self
1891    }
1892
1893    /// Shrink the size of the transition alphabet by mapping bytes to their
1894    /// equivalence classes. This only has an effect when the `dfa` option is
1895    /// enabled.
1896    ///
1897    /// When enabled, each a DFA will use a map from all possible bytes
1898    /// to their corresponding equivalence class. Each equivalence class
1899    /// represents a set of bytes that does not discriminate between a match
1900    /// and a non-match in the DFA. For example, the patterns `bar` and `baz`
1901    /// have at least five equivalence classes: singleton sets of `b`, `a`, `r`
1902    /// and `z`, and a final set that contains every other byte.
1903    ///
1904    /// The advantage of this map is that the size of the transition table can
1905    /// be reduced drastically from `#states * 256 * sizeof(id)` to
1906    /// `#states * k * sizeof(id)` where `k` is the number of equivalence
1907    /// classes. As a result, total space usage can decrease substantially.
1908    /// Moreover, since a smaller alphabet is used, compilation becomes faster
1909    /// as well.
1910    ///
1911    /// The disadvantage of this map is that every byte searched must be
1912    /// passed through this map before it can be used to determine the next
1913    /// transition. This has a small match time performance cost. However, if
1914    /// the DFA is otherwise very large without byte classes, then using byte
1915    /// classes can greatly improve memory locality and thus lead to better
1916    /// overall performance.
1917    ///
1918    /// This option is enabled by default.
1919    #[deprecated(
1920        since = "0.7.16",
1921        note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57"
1922    )]
1923    pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1924        self.dfa_builder.byte_classes(yes);
1925        self
1926    }
1927
1928    /// Premultiply state identifiers in the transition table. This only has
1929    /// an effect when the `dfa` option is enabled.
1930    ///
1931    /// When enabled, state identifiers are premultiplied to point to their
1932    /// corresponding row in the transition table. That is, given the `i`th
1933    /// state, its corresponding premultiplied identifier is `i * k` where `k`
1934    /// is the alphabet size of the automaton. (The alphabet size is at most
1935    /// 256, but is in practice smaller if byte classes is enabled.)
1936    ///
1937    /// When state identifiers are not premultiplied, then the identifier of
1938    /// the `i`th state is `i`.
1939    ///
1940    /// The advantage of premultiplying state identifiers is that is saves a
1941    /// multiplication instruction per byte when searching with a DFA. This has
1942    /// been observed to lead to a 20% performance benefit in micro-benchmarks.
1943    ///
1944    /// The primary disadvantage of premultiplying state identifiers is
1945    /// that they require a larger integer size to represent. For example,
1946    /// if the DFA has 200 states, then its premultiplied form requires 16
1947    /// bits to represent every possible state identifier, where as its
1948    /// non-premultiplied form only requires 8 bits.
1949    ///
1950    /// This option is enabled by default.
1951    #[deprecated(
1952        since = "0.7.16",
1953        note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57"
1954    )]
1955    pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1956        self.dfa_builder.premultiply(yes);
1957        self
1958    }
1959}
1960
1961/// A knob for controlling the match semantics of an Aho-Corasick automaton.
1962///
1963/// There are two generally different ways that Aho-Corasick automatons can
1964/// report matches. The first way is the "standard" approach that results from
1965/// implementing most textbook explanations of Aho-Corasick. The second way is
1966/// to report only the leftmost non-overlapping matches. The leftmost approach
1967/// is in turn split into two different ways of resolving ambiguous matches:
1968/// leftmost-first and leftmost-longest.
1969///
1970/// The `Standard` match kind is the default and is the only one that supports
1971/// overlapping matches and stream searching. (Trying to find overlapping
1972/// or streaming matches using leftmost match semantics will result in a
1973/// panic.) The `Standard` match kind will report matches as they are seen.
1974/// When searching for overlapping matches, then all possible matches are
1975/// reported. When searching for non-overlapping matches, the first match seen
1976/// is reported. For example, for non-overlapping matches, given the patterns
1977/// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is
1978/// reported since it is detected first. The `abcd` match is never reported
1979/// since it overlaps with the `b` match.
1980///
1981/// In contrast, the leftmost match kind always prefers the leftmost match
1982/// among all possible matches. Given the same example as above with `abcd` and
1983/// `b` as patterns and `abcdef` as the subject string, the leftmost match is
1984/// `abcd` since it begins before the `b` match, even though the `b` match is
1985/// detected before the `abcd` match. In this case, the `b` match is not
1986/// reported at all since it overlaps with the `abcd` match.
1987///
1988/// The difference between leftmost-first and leftmost-longest is in how they
1989/// resolve ambiguous matches when there are multiple leftmost matches to
1990/// choose from. Leftmost-first always chooses the pattern that was provided
1991/// earliest, where as leftmost-longest always chooses the longest matching
1992/// pattern. For example, given the patterns `a` and `ab` and the subject
1993/// string `ab`, the leftmost-first match is `a` but the leftmost-longest match
1994/// is `ab`. Conversely, if the patterns were given in reverse order, i.e.,
1995/// `ab` and `a`, then both the leftmost-first and leftmost-longest matches
1996/// would be `ab`. Stated differently, the leftmost-first match depends on the
1997/// order in which the patterns were given to the Aho-Corasick automaton.
1998/// Because of that, when leftmost-first matching is used, if a pattern `A`
1999/// that appears before a pattern `B` is a prefix of `B`, then it is impossible
2000/// to ever observe a match of `B`.
2001///
2002/// If you're not sure which match kind to pick, then stick with the standard
2003/// kind, which is the default. In particular, if you need overlapping or
2004/// streaming matches, then you _must_ use the standard kind. The leftmost
2005/// kinds are useful in specific circumstances. For example, leftmost-first can
2006/// be very useful as a way to implement match priority based on the order of
2007/// patterns given and leftmost-longest can be useful for dictionary searching
2008/// such that only the longest matching words are reported.
2009///
2010/// # Relationship with regular expression alternations
2011///
2012/// Understanding match semantics can be a little tricky, and one easy way
2013/// to conceptualize non-overlapping matches from an Aho-Corasick automaton
2014/// is to think about them as a simple alternation of literals in a regular
2015/// expression. For example, let's say we wanted to match the strings
2016/// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It
2017/// turns out that regular expression engines have two different ways of
2018/// matching this alternation. The first way, leftmost-longest, is commonly
2019/// found in POSIX compatible implementations of regular expressions (such as
2020/// `grep`). The second way, leftmost-first, is commonly found in backtracking
2021/// implementations such as Perl. (Some regex engines, such as RE2 and Rust's
2022/// regex engine do not use backtracking, but still implement leftmost-first
2023/// semantics in an effort to match the behavior of dominant backtracking
2024/// regex engines such as those found in Perl, Ruby, Python, Javascript and
2025/// PHP.)
2026///
2027/// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex
2028/// will match `Samwise` because it is the longest possible match, but a
2029/// Perl-like regex will match `Sam` since it appears earlier in the
2030/// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine
2031/// will never match `Samwise` since `Sam` will always have higher priority.
2032/// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to
2033/// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is
2034/// still longest match, but it also appears earlier than `Sam`.
2035///
2036/// The "standard" match semantics of Aho-Corasick generally don't correspond
2037/// to the match semantics of any large group of regex implementations, so
2038/// there's no direct analogy that can be made here. Standard match semantics
2039/// are generally useful for overlapping matches, or if you just want to see
2040/// matches as they are detected.
2041///
2042/// The main conclusion to draw from this section is that the match semantics
2043/// can be tweaked to precisely match either Perl-like regex alternations or
2044/// POSIX regex alternations.
2045#[derive(Clone, Copy, Debug, Eq, PartialEq)]
2046pub enum MatchKind {
2047    /// Use standard match semantics, which support overlapping matches. When
2048    /// used with non-overlapping matches, matches are reported as they are
2049    /// seen.
2050    Standard,
2051    /// Use leftmost-first match semantics, which reports leftmost matches.
2052    /// When there are multiple possible leftmost matches, the match
2053    /// corresponding to the pattern that appeared earlier when constructing
2054    /// the automaton is reported.
2055    ///
2056    /// This does **not** support overlapping matches or stream searching. If
2057    /// this match kind is used, attempting to find overlapping matches or
2058    /// stream matches will panic.
2059    LeftmostFirst,
2060    /// Use leftmost-longest match semantics, which reports leftmost matches.
2061    /// When there are multiple possible leftmost matches, the longest match
2062    /// is chosen.
2063    ///
2064    /// This does **not** support overlapping matches or stream searching. If
2065    /// this match kind is used, attempting to find overlapping matches or
2066    /// stream matches will panic.
2067    LeftmostLongest,
2068    /// Hints that destructuring should not be exhaustive.
2069    ///
2070    /// This enum may grow additional variants, so this makes sure clients
2071    /// don't count on exhaustive matching. (Otherwise, adding a new variant
2072    /// could break existing code.)
2073    #[doc(hidden)]
2074    __Nonexhaustive,
2075}
2076
2077/// The default match kind is `MatchKind::Standard`.
2078impl Default for MatchKind {
2079    fn default() -> MatchKind {
2080        MatchKind::Standard
2081    }
2082}
2083
2084impl MatchKind {
2085    fn supports_overlapping(&self) -> bool {
2086        self.is_standard()
2087    }
2088
2089    fn supports_stream(&self) -> bool {
2090        // TODO: It may be possible to support this. It's hard.
2091        //
2092        // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838
2093        self.is_standard()
2094    }
2095
2096    pub(crate) fn is_standard(&self) -> bool {
2097        *self == MatchKind::Standard
2098    }
2099
2100    pub(crate) fn is_leftmost(&self) -> bool {
2101        *self == MatchKind::LeftmostFirst
2102            || *self == MatchKind::LeftmostLongest
2103    }
2104
2105    pub(crate) fn is_leftmost_first(&self) -> bool {
2106        *self == MatchKind::LeftmostFirst
2107    }
2108
2109    /// Convert this match kind into a packed match kind. If this match kind
2110    /// corresponds to standard semantics, then this returns None, since
2111    /// packed searching does not support standard semantics.
2112    pub(crate) fn as_packed(&self) -> Option<packed::MatchKind> {
2113        match *self {
2114            MatchKind::Standard => None,
2115            MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst),
2116            MatchKind::LeftmostLongest => {
2117                Some(packed::MatchKind::LeftmostLongest)
2118            }
2119            MatchKind::__Nonexhaustive => unreachable!(),
2120        }
2121    }
2122}
2123
2124#[cfg(test)]
2125mod tests {
2126    use super::*;
2127
2128    #[test]
2129    fn oibits() {
2130        use std::panic::{RefUnwindSafe, UnwindSafe};
2131
2132        fn assert_send<T: Send>() {}
2133        fn assert_sync<T: Sync>() {}
2134        fn assert_unwind_safe<T: RefUnwindSafe + UnwindSafe>() {}
2135
2136        assert_send::<AhoCorasick>();
2137        assert_sync::<AhoCorasick>();
2138        assert_unwind_safe::<AhoCorasick>();
2139        assert_send::<AhoCorasickBuilder>();
2140        assert_sync::<AhoCorasickBuilder>();
2141        assert_unwind_safe::<AhoCorasickBuilder>();
2142    }
2143}