aho_corasick/packed/
api.rs

1use std::u16;
2
3use crate::packed::pattern::Patterns;
4use crate::packed::rabinkarp::RabinKarp;
5use crate::packed::teddy::{self, Teddy};
6use crate::Match;
7
8/// This is a limit placed on the total number of patterns we're willing to try
9/// and match at once. As more sophisticated algorithms are added, this number
10/// may be increased.
11const PATTERN_LIMIT: usize = 128;
12
13/// A knob for controlling the match semantics of a packed multiple string
14/// searcher.
15///
16/// This differs from the
17/// [`MatchKind`](../enum.MatchKind.html)
18/// type in the top-level crate module in that it doesn't support
19/// "standard" match semantics, and instead only supports leftmost-first or
20/// leftmost-longest. Namely, "standard" semantics cannot be easily supported
21/// by packed searchers.
22///
23/// For more information on the distinction between leftmost-first and
24/// leftmost-longest, see the docs on the top-level `MatchKind` type.
25///
26/// Unlike the top-level `MatchKind` type, the default match semantics for this
27/// type are leftmost-first.
28#[derive(Clone, Copy, Debug, Eq, PartialEq)]
29pub enum MatchKind {
30    /// Use leftmost-first match semantics, which reports leftmost matches.
31    /// When there are multiple possible leftmost matches, the match
32    /// corresponding to the pattern that appeared earlier when constructing
33    /// the automaton is reported.
34    ///
35    /// This is the default.
36    LeftmostFirst,
37    /// Use leftmost-longest match semantics, which reports leftmost matches.
38    /// When there are multiple possible leftmost matches, the longest match
39    /// is chosen.
40    LeftmostLongest,
41    /// Hints that destructuring should not be exhaustive.
42    ///
43    /// This enum may grow additional variants, so this makes sure clients
44    /// don't count on exhaustive matching. (Otherwise, adding a new variant
45    /// could break existing code.)
46    #[doc(hidden)]
47    __Nonexhaustive,
48}
49
50impl Default for MatchKind {
51    fn default() -> MatchKind {
52        MatchKind::LeftmostFirst
53    }
54}
55
56/// The configuration for a packed multiple pattern searcher.
57///
58/// The configuration is currently limited only to being able to select the
59/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
60/// future, more knobs may be made available.
61///
62/// A configuration produces a [`packed::Builder`](struct.Builder.html), which
63/// in turn can be used to construct a
64/// [`packed::Searcher`](struct.Searcher.html) for searching.
65///
66/// # Example
67///
68/// This example shows how to use leftmost-longest semantics instead of the
69/// default (leftmost-first).
70///
71/// ```
72/// use aho_corasick::packed::{Config, MatchKind};
73///
74/// # fn example() -> Option<()> {
75/// let searcher = Config::new()
76///     .match_kind(MatchKind::LeftmostLongest)
77///     .builder()
78///     .add("foo")
79///     .add("foobar")
80///     .build()?;
81/// let matches: Vec<usize> = searcher
82///     .find_iter("foobar")
83///     .map(|mat| mat.pattern())
84///     .collect();
85/// assert_eq!(vec![1], matches);
86/// # Some(()) }
87/// # if cfg!(target_arch = "x86_64") {
88/// #     example().unwrap()
89/// # } else {
90/// #     assert!(example().is_none());
91/// # }
92/// ```
93#[derive(Clone, Debug)]
94pub struct Config {
95    kind: MatchKind,
96    force: Option<ForceAlgorithm>,
97    force_teddy_fat: Option<bool>,
98    force_avx: Option<bool>,
99}
100
101/// An internal option for forcing the use of a particular packed algorithm.
102///
103/// When an algorithm is forced, if a searcher could not be constructed for it,
104/// then no searcher will be returned even if an alternative algorithm would
105/// work.
106#[derive(Clone, Debug)]
107enum ForceAlgorithm {
108    Teddy,
109    RabinKarp,
110}
111
112impl Default for Config {
113    fn default() -> Config {
114        Config::new()
115    }
116}
117
118impl Config {
119    /// Create a new default configuration. A default configuration uses
120    /// leftmost-first match semantics.
121    pub fn new() -> Config {
122        Config {
123            kind: MatchKind::LeftmostFirst,
124            force: None,
125            force_teddy_fat: None,
126            force_avx: None,
127        }
128    }
129
130    /// Create a packed builder from this configuration. The builder can be
131    /// used to accumulate patterns and create a
132    /// [`Searcher`](struct.Searcher.html)
133    /// from them.
134    pub fn builder(&self) -> Builder {
135        Builder::from_config(self.clone())
136    }
137
138    /// Set the match semantics for this configuration.
139    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
140        self.kind = kind;
141        self
142    }
143
144    /// An undocumented method for forcing the use of the Teddy algorithm.
145    ///
146    /// This is only exposed for more precise testing and benchmarks. Callers
147    /// should not use it as it is not part of the API stability guarantees of
148    /// this crate.
149    #[doc(hidden)]
150    pub fn force_teddy(&mut self, yes: bool) -> &mut Config {
151        if yes {
152            self.force = Some(ForceAlgorithm::Teddy);
153        } else {
154            self.force = None;
155        }
156        self
157    }
158
159    /// An undocumented method for forcing the use of the Fat Teddy algorithm.
160    ///
161    /// This is only exposed for more precise testing and benchmarks. Callers
162    /// should not use it as it is not part of the API stability guarantees of
163    /// this crate.
164    #[doc(hidden)]
165    pub fn force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
166        self.force_teddy_fat = yes;
167        self
168    }
169
170    /// An undocumented method for forcing the use of SSE (`Some(false)`) or
171    /// AVX (`Some(true)`) algorithms.
172    ///
173    /// This is only exposed for more precise testing and benchmarks. Callers
174    /// should not use it as it is not part of the API stability guarantees of
175    /// this crate.
176    #[doc(hidden)]
177    pub fn force_avx(&mut self, yes: Option<bool>) -> &mut Config {
178        self.force_avx = yes;
179        self
180    }
181
182    /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
183    ///
184    /// This is only exposed for more precise testing and benchmarks. Callers
185    /// should not use it as it is not part of the API stability guarantees of
186    /// this crate.
187    #[doc(hidden)]
188    pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config {
189        if yes {
190            self.force = Some(ForceAlgorithm::RabinKarp);
191        } else {
192            self.force = None;
193        }
194        self
195    }
196}
197
198/// A builder for constructing a packed searcher from a collection of patterns.
199///
200/// # Example
201///
202/// This example shows how to use a builder to construct a searcher. By
203/// default, leftmost-first match semantics are used.
204///
205/// ```
206/// use aho_corasick::packed::{Builder, MatchKind};
207///
208/// # fn example() -> Option<()> {
209/// let searcher = Builder::new()
210///     .add("foobar")
211///     .add("foo")
212///     .build()?;
213/// let matches: Vec<usize> = searcher
214///     .find_iter("foobar")
215///     .map(|mat| mat.pattern())
216///     .collect();
217/// assert_eq!(vec![0], matches);
218/// # Some(()) }
219/// # if cfg!(target_arch = "x86_64") {
220/// #     example().unwrap()
221/// # } else {
222/// #     assert!(example().is_none());
223/// # }
224/// ```
225#[derive(Clone, Debug)]
226pub struct Builder {
227    /// The configuration of this builder and subsequent matcher.
228    config: Config,
229    /// Set to true if the builder detects that a matcher cannot be built.
230    inert: bool,
231    /// The patterns provided by the caller.
232    patterns: Patterns,
233}
234
235impl Builder {
236    /// Create a new builder for constructing a multi-pattern searcher. This
237    /// constructor uses the default configuration.
238    pub fn new() -> Builder {
239        Builder::from_config(Config::new())
240    }
241
242    fn from_config(config: Config) -> Builder {
243        Builder { config, inert: false, patterns: Patterns::new() }
244    }
245
246    /// Build a searcher from the patterns added to this builder so far.
247    pub fn build(&self) -> Option<Searcher> {
248        if self.inert || self.patterns.is_empty() {
249            return None;
250        }
251        let mut patterns = self.patterns.clone();
252        patterns.set_match_kind(self.config.kind);
253        let rabinkarp = RabinKarp::new(&patterns);
254        // Effectively, we only want to return a searcher if we can use Teddy,
255        // since Teddy is our only fast packed searcher at the moment.
256        // Rabin-Karp is only used when searching haystacks smaller than what
257        // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
258        // is to force it using undocumented APIs (for tests/benchmarks).
259        let (search_kind, minimum_len) = match self.config.force {
260            None | Some(ForceAlgorithm::Teddy) => {
261                let teddy = match self.build_teddy(&patterns) {
262                    None => return None,
263                    Some(teddy) => teddy,
264                };
265                let minimum_len = teddy.minimum_len();
266                (SearchKind::Teddy(teddy), minimum_len)
267            }
268            Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0),
269        };
270        Some(Searcher {
271            config: self.config.clone(),
272            patterns,
273            rabinkarp,
274            search_kind,
275            minimum_len,
276        })
277    }
278
279    fn build_teddy(&self, patterns: &Patterns) -> Option<Teddy> {
280        teddy::Builder::new()
281            .avx(self.config.force_avx)
282            .fat(self.config.force_teddy_fat)
283            .build(&patterns)
284    }
285
286    /// Add the given pattern to this set to match.
287    ///
288    /// The order in which patterns are added is significant. Namely, when
289    /// using leftmost-first match semantics, then when multiple patterns can
290    /// match at a particular location, the pattern that was added first is
291    /// used as the match.
292    ///
293    /// If the number of patterns added exceeds the amount supported by packed
294    /// searchers, then the builder will stop accumulating patterns and render
295    /// itself inert. At this point, constructing a searcher will always return
296    /// `None`.
297    pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
298        if self.inert {
299            return self;
300        } else if self.patterns.len() >= PATTERN_LIMIT {
301            self.inert = true;
302            self.patterns.reset();
303            return self;
304        }
305        // Just in case PATTERN_LIMIT increases beyond u16::MAX.
306        assert!(self.patterns.len() <= u16::MAX as usize);
307
308        let pattern = pattern.as_ref();
309        if pattern.is_empty() {
310            self.inert = true;
311            self.patterns.reset();
312            return self;
313        }
314        self.patterns.add(pattern);
315        self
316    }
317
318    /// Add the given iterator of patterns to this set to match.
319    ///
320    /// The iterator must yield elements that can be converted into a `&[u8]`.
321    ///
322    /// The order in which patterns are added is significant. Namely, when
323    /// using leftmost-first match semantics, then when multiple patterns can
324    /// match at a particular location, the pattern that was added first is
325    /// used as the match.
326    ///
327    /// If the number of patterns added exceeds the amount supported by packed
328    /// searchers, then the builder will stop accumulating patterns and render
329    /// itself inert. At this point, constructing a searcher will always return
330    /// `None`.
331    pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
332    where
333        I: IntoIterator<Item = P>,
334        P: AsRef<[u8]>,
335    {
336        for p in patterns {
337            self.add(p);
338        }
339        self
340    }
341}
342
343impl Default for Builder {
344    fn default() -> Builder {
345        Builder::new()
346    }
347}
348
349/// A packed searcher for quickly finding occurrences of multiple patterns.
350///
351/// If callers need more flexible construction, or if one wants to change the
352/// match semantics (either leftmost-first or leftmost-longest), then one can
353/// use the [`Config`](struct.Config.html) and/or
354/// [`Builder`](struct.Builder.html) types for more fine grained control.
355///
356/// # Example
357///
358/// This example shows how to create a searcher from an iterator of patterns.
359/// By default, leftmost-first match semantics are used.
360///
361/// ```
362/// use aho_corasick::packed::{MatchKind, Searcher};
363///
364/// # fn example() -> Option<()> {
365/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
366/// let matches: Vec<usize> = searcher
367///     .find_iter("foobar")
368///     .map(|mat| mat.pattern())
369///     .collect();
370/// assert_eq!(vec![0], matches);
371/// # Some(()) }
372/// # if cfg!(target_arch = "x86_64") {
373/// #     example().unwrap()
374/// # } else {
375/// #     assert!(example().is_none());
376/// # }
377/// ```
378#[derive(Clone, Debug)]
379pub struct Searcher {
380    config: Config,
381    patterns: Patterns,
382    rabinkarp: RabinKarp,
383    search_kind: SearchKind,
384    minimum_len: usize,
385}
386
387#[derive(Clone, Debug)]
388enum SearchKind {
389    Teddy(Teddy),
390    RabinKarp,
391}
392
393impl Searcher {
394    /// A convenience function for constructing a searcher from an iterator
395    /// of things that can be converted to a `&[u8]`.
396    ///
397    /// If a searcher could not be constructed (either because of an
398    /// unsupported CPU or because there are too many patterns), then `None`
399    /// is returned.
400    ///
401    /// # Example
402    ///
403    /// Basic usage:
404    ///
405    /// ```
406    /// use aho_corasick::packed::{MatchKind, Searcher};
407    ///
408    /// # fn example() -> Option<()> {
409    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
410    /// let matches: Vec<usize> = searcher
411    ///     .find_iter("foobar")
412    ///     .map(|mat| mat.pattern())
413    ///     .collect();
414    /// assert_eq!(vec![0], matches);
415    /// # Some(()) }
416    /// # if cfg!(target_arch = "x86_64") {
417    /// #     example().unwrap()
418    /// # } else {
419    /// #     assert!(example().is_none());
420    /// # }
421    /// ```
422    pub fn new<I, P>(patterns: I) -> Option<Searcher>
423    where
424        I: IntoIterator<Item = P>,
425        P: AsRef<[u8]>,
426    {
427        Builder::new().extend(patterns).build()
428    }
429
430    /// Return the first occurrence of any of the patterns in this searcher,
431    /// according to its match semantics, in the given haystack. The `Match`
432    /// returned will include the identifier of the pattern that matched, which
433    /// corresponds to the index of the pattern (starting from `0`) in which it
434    /// was added.
435    ///
436    /// # Example
437    ///
438    /// Basic usage:
439    ///
440    /// ```
441    /// use aho_corasick::packed::{MatchKind, Searcher};
442    ///
443    /// # fn example() -> Option<()> {
444    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
445    /// let mat = searcher.find("foobar")?;
446    /// assert_eq!(0, mat.pattern());
447    /// assert_eq!(0, mat.start());
448    /// assert_eq!(6, mat.end());
449    /// # Some(()) }
450    /// # if cfg!(target_arch = "x86_64") {
451    /// #     example().unwrap()
452    /// # } else {
453    /// #     assert!(example().is_none());
454    /// # }
455    /// ```
456    pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
457        self.find_at(haystack, 0)
458    }
459
460    /// Return the first occurrence of any of the patterns in this searcher,
461    /// according to its match semantics, in the given haystack starting from
462    /// the given position.
463    ///
464    /// The `Match` returned will include the identifier of the pattern that
465    /// matched, which corresponds to the index of the pattern (starting from
466    /// `0`) in which it was added. The offsets in the `Match` will be relative
467    /// to the start of `haystack` (and not `at`).
468    ///
469    /// # Example
470    ///
471    /// Basic usage:
472    ///
473    /// ```
474    /// use aho_corasick::packed::{MatchKind, Searcher};
475    ///
476    /// # fn example() -> Option<()> {
477    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
478    /// let mat = searcher.find_at("foofoobar", 3)?;
479    /// assert_eq!(0, mat.pattern());
480    /// assert_eq!(3, mat.start());
481    /// assert_eq!(9, mat.end());
482    /// # Some(()) }
483    /// # if cfg!(target_arch = "x86_64") {
484    /// #     example().unwrap()
485    /// # } else {
486    /// #     assert!(example().is_none());
487    /// # }
488    /// ```
489    pub fn find_at<B: AsRef<[u8]>>(
490        &self,
491        haystack: B,
492        at: usize,
493    ) -> Option<Match> {
494        let haystack = haystack.as_ref();
495        match self.search_kind {
496            SearchKind::Teddy(ref teddy) => {
497                if haystack[at..].len() < teddy.minimum_len() {
498                    return self.slow_at(haystack, at);
499                }
500                teddy.find_at(&self.patterns, haystack, at)
501            }
502            SearchKind::RabinKarp => {
503                self.rabinkarp.find_at(&self.patterns, haystack, at)
504            }
505        }
506    }
507
508    /// Return an iterator of non-overlapping occurrences of the patterns in
509    /// this searcher, according to its match semantics, in the given haystack.
510    ///
511    /// # Example
512    ///
513    /// Basic usage:
514    ///
515    /// ```
516    /// use aho_corasick::packed::{MatchKind, Searcher};
517    ///
518    /// # fn example() -> Option<()> {
519    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
520    /// let matches: Vec<usize> = searcher
521    ///     .find_iter("foobar fooba foofoo")
522    ///     .map(|mat| mat.pattern())
523    ///     .collect();
524    /// assert_eq!(vec![0, 1, 1, 1], matches);
525    /// # Some(()) }
526    /// # if cfg!(target_arch = "x86_64") {
527    /// #     example().unwrap()
528    /// # } else {
529    /// #     assert!(example().is_none());
530    /// # }
531    /// ```
532    pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
533        &'a self,
534        haystack: &'b B,
535    ) -> FindIter<'a, 'b> {
536        FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 }
537    }
538
539    /// Returns the match kind used by this packed searcher.
540    ///
541    /// # Examples
542    ///
543    /// Basic usage:
544    ///
545    /// ```
546    /// use aho_corasick::packed::{MatchKind, Searcher};
547    ///
548    /// # fn example() -> Option<()> {
549    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
550    /// // leftmost-first is the default.
551    /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
552    /// # Some(()) }
553    /// # if cfg!(target_arch = "x86_64") {
554    /// #     example().unwrap()
555    /// # } else {
556    /// #     assert!(example().is_none());
557    /// # }
558    /// ```
559    pub fn match_kind(&self) -> &MatchKind {
560        self.patterns.match_kind()
561    }
562
563    /// Returns the minimum length of a haystack that is required in order for
564    /// packed searching to be effective.
565    ///
566    /// In some cases, the underlying packed searcher may not be able to search
567    /// very short haystacks. When that occurs, the implementation will defer
568    /// to a slower non-packed searcher (which is still generally faster than
569    /// Aho-Corasick for a small number of patterns). However, callers may
570    /// want to avoid ever using the slower variant, which one can do by
571    /// never passing a haystack shorter than the minimum length returned by
572    /// this method.
573    pub fn minimum_len(&self) -> usize {
574        self.minimum_len
575    }
576
577    /// Returns the approximate total amount of heap used by this searcher, in
578    /// units of bytes.
579    pub fn heap_bytes(&self) -> usize {
580        self.patterns.heap_bytes()
581            + self.rabinkarp.heap_bytes()
582            + self.search_kind.heap_bytes()
583    }
584
585    /// Use a slow (non-packed) searcher.
586    ///
587    /// This is useful when a packed searcher could be constructed, but could
588    /// not be used to search a specific haystack. For example, if Teddy was
589    /// built but the haystack is smaller than ~34 bytes, then Teddy might not
590    /// be able to run.
591    fn slow_at(&self, haystack: &[u8], at: usize) -> Option<Match> {
592        self.rabinkarp.find_at(&self.patterns, haystack, at)
593    }
594}
595
596impl SearchKind {
597    fn heap_bytes(&self) -> usize {
598        match *self {
599            SearchKind::Teddy(ref ted) => ted.heap_bytes(),
600            SearchKind::RabinKarp => 0,
601        }
602    }
603}
604
605/// An iterator over non-overlapping matches from a packed searcher.
606///
607/// The lifetime `'s` refers to the lifetime of the underlying
608/// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the
609/// lifetime of the haystack being searched.
610#[derive(Debug)]
611pub struct FindIter<'s, 'h> {
612    searcher: &'s Searcher,
613    haystack: &'h [u8],
614    at: usize,
615}
616
617impl<'s, 'h> Iterator for FindIter<'s, 'h> {
618    type Item = Match;
619
620    fn next(&mut self) -> Option<Match> {
621        if self.at > self.haystack.len() {
622            return None;
623        }
624        match self.searcher.find_at(&self.haystack, self.at) {
625            None => None,
626            Some(c) => {
627                self.at = c.end;
628                Some(c)
629            }
630        }
631    }
632}