// aho_corasick/prefilter.rs

use std::cmp;
use std::fmt;
use std::panic::{RefUnwindSafe, UnwindSafe};
use std::u8;

use memchr::{memchr, memchr2, memchr3};

use crate::ahocorasick::MatchKind;
use crate::packed;
use crate::Match;

/// A candidate is the result of running a prefilter on a haystack at a
/// particular position. The result is either no match, a confirmed match or
/// a possible match.
///
/// When no match is returned, the prefilter is guaranteeing that no possible
/// match can be found in the haystack, and the caller may trust this. That is,
/// all correct prefilters must never report false negatives.
///
/// In some cases, a prefilter can confirm a match very quickly, in which case,
/// the caller may use this to stop what it's doing and report the match. In
/// this case, prefilter implementations must never report a false positive.
/// In other cases, the prefilter can only report a potential match, in which
/// case the callers must attempt to confirm the match. In this case, prefilter
/// implementations are permitted to return false positives.
#[derive(Clone, Debug)]
pub enum Candidate {
    None,
    Match(Match),
    PossibleStartOfMatch(usize),
}

impl Candidate {
    /// Convert this candidate into an option. This is useful when callers
    /// do not distinguish between true positives and false positives (i.e.,
    /// the caller must always confirm the match in order to update some other
    /// state).
    pub fn into_option(self) -> Option<usize> {
        match self {
            Candidate::None => None,
            Candidate::Match(ref m) => Some(m.start()),
            Candidate::PossibleStartOfMatch(start) => Some(start),
        }
    }
}

/// A prefilter describes the behavior of fast literal scanners for quickly
/// skipping past bytes in the haystack that we know cannot possibly
/// participate in a match.
pub trait Prefilter:
    Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug
{
    /// Returns the next possible match candidate. This may yield false
    /// positives, so callers must confirm a match starting at the position
    /// returned. This, however, must never produce false negatives. That is,
    /// this must, at minimum, return the starting position of the next match
    /// in the given haystack after or at the given position.
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate;

    /// A method for cloning a prefilter, to work around the fact that Clone
    /// is not object-safe.
    fn clone_prefilter(&self) -> Box<dyn Prefilter>;

    /// Returns the approximate total amount of heap used by this prefilter, in
    /// units of bytes.
    fn heap_bytes(&self) -> usize;

    /// Returns true if and only if this prefilter never returns false
    /// positives. This is useful for completely avoiding the automaton
    /// when the prefilter can quickly confirm its own matches.
    ///
    /// By default, this returns true, which is conservative; it is always
    /// correct to return `true`. Returning `false` here and reporting a false
    /// positive will result in incorrect searches.
    fn reports_false_positives(&self) -> bool {
        true
    }

    /// Returns true if and only if this prefilter may look for a non-starting
    /// position of a match.
    ///
    /// This is useful in a streaming context where prefilters that don't look
    /// for a starting position of a match can be quite difficult to deal with.
    ///
    /// This returns false by default.
    fn looks_for_non_start_of_match(&self) -> bool {
        false
    }
}

impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
    #[inline]
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        (**self).next_candidate(state, haystack, at)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        (**self).clone_prefilter()
    }

    fn heap_bytes(&self) -> usize {
        (**self).heap_bytes()
    }

    fn reports_false_positives(&self) -> bool {
        (**self).reports_false_positives()
    }
}

/// A convenience object for representing any type that implements Prefilter
/// and is cloneable.
#[derive(Debug)]
pub struct PrefilterObj(Box<dyn Prefilter>);

impl Clone for PrefilterObj {
    fn clone(&self) -> Self {
        PrefilterObj(self.0.clone_prefilter())
    }
}

impl PrefilterObj {
    /// Create a new prefilter object.
    pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj {
        PrefilterObj(Box::new(t))
    }

    /// Return the underlying prefilter trait object.
    pub fn as_ref(&self) -> &dyn Prefilter {
        &*self.0
    }
}

/// PrefilterState tracks state associated with the effectiveness of a
/// prefilter. It is used to track how many bytes, on average, are skipped by
/// the prefilter. If this average dips below a certain threshold over time,
/// then the state renders the prefilter inert and stops using it.
///
/// A prefilter state should be created for each search. (Where creating an
/// iterator via, e.g., `find_iter`, is treated as a single search.)
#[derive(Clone, Debug)]
pub struct PrefilterState {
    /// The number of skips that has been executed.
    skips: usize,
    /// The total number of bytes that have been skipped.
    skipped: usize,
    /// The maximum length of a match. This is used to help determine how many
    /// bytes on average should be skipped in order for a prefilter to be
    /// effective.
    max_match_len: usize,
    /// Once this heuristic has been deemed permanently ineffective, it will be
    /// inert throughout the rest of its lifetime. This serves as a cheap way
    /// to check inertness.
    inert: bool,
    /// The last (absolute) position at which a prefilter scanned to.
    /// Prefilters can use this position to determine whether to re-scan or
    /// not.
    ///
    /// Unlike other things that impact effectiveness, this is a fleeting
    /// condition. That is, a prefilter can be considered ineffective if it is
    /// at a position before `last_scan_at`, but can become effective again
    /// once the search moves past `last_scan_at`.
    ///
    /// The utility of this is to both avoid additional overhead from calling
    /// the prefilter and to avoid quadratic behavior. This ensures that a
    /// prefilter will scan any particular byte at most once. (Note that some
    /// prefilters, like the start-byte prefilter, do not need to use this
    /// field at all, since it only looks for starting bytes.)
    last_scan_at: usize,
}

impl PrefilterState {
    /// The minimum number of skip attempts to try before considering whether
    /// a prefilter is effective or not.
    const MIN_SKIPS: usize = 40;

    /// The minimum amount of bytes that skipping must average, expressed as a
    /// multiple of the maximum match length.
    ///
    /// That is, after MIN_SKIPS have occurred, if the average number of bytes
    /// skipped ever falls below MIN_AVG_FACTOR * max-match-length, then the
    /// prefilter is rendered inert.
    const MIN_AVG_FACTOR: usize = 2;

    /// Create a fresh prefilter state.
    pub fn new(max_match_len: usize) -> PrefilterState {
        PrefilterState {
            skips: 0,
            skipped: 0,
            max_match_len,
            inert: false,
            last_scan_at: 0,
        }
    }

    /// Create a prefilter state that always disables the prefilter.
    pub fn disabled() -> PrefilterState {
        PrefilterState {
            skips: 0,
            skipped: 0,
            max_match_len: 0,
            inert: true,
            last_scan_at: 0,
        }
    }

    /// Update this state with the number of bytes skipped on the last
    /// invocation of the prefilter.
    #[inline]
    fn update_skipped_bytes(&mut self, skipped: usize) {
        self.skips += 1;
        self.skipped += skipped;
    }

    /// Updates the position at which the last scan stopped. This may be
    /// greater than the position of the last candidate reported. For example,
    /// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz`
    /// will report a candidate at position `0`, but the end of its last scan
    /// will be at position `3`.
    ///
    /// This position factors into the effectiveness of this prefilter. If the
    /// current position is less than the last position at which a scan ended,
    /// then the prefilter should not be re-run until the search moves past
    /// that position.
    #[inline]
    fn update_at(&mut self, at: usize) {
        if at > self.last_scan_at {
            self.last_scan_at = at;
        }
    }

    /// Return true if and only if this state indicates that a prefilter is
    /// still effective.
    ///
    /// The given pos should correspond to the current starting position of the
    /// search.
    #[inline]
    pub fn is_effective(&mut self, at: usize) -> bool {
        if self.inert {
            return false;
        }
        if at < self.last_scan_at {
            return false;
        }
        if self.skips < PrefilterState::MIN_SKIPS {
            return true;
        }

        let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len;
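        // For example, with a maximum match length of 8, the prefilter must
        // have skipped an average of at least 16 bytes per attempt (once
        // MIN_SKIPS attempts have been made) in order to remain in use.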
        if self.skipped >= min_avg * self.skips {
            return true;
        }

        // We're inert.
        self.inert = true;
        false
    }
}

/// A builder for constructing the best possible prefilter. When constructed,
/// this builder will heuristically select the best prefilter it can build,
/// if any, and discard the rest.
#[derive(Debug)]
pub struct Builder {
    count: usize,
    ascii_case_insensitive: bool,
    start_bytes: StartBytesBuilder,
    rare_bytes: RareBytesBuilder,
    packed: Option<packed::Builder>,
}

impl Builder {
    /// Create a new builder for constructing the best possible prefilter.
    pub fn new(kind: MatchKind) -> Builder {
        let pbuilder = kind
            .as_packed()
            .map(|kind| packed::Config::new().match_kind(kind).builder());
        Builder {
            count: 0,
            ascii_case_insensitive: false,
            start_bytes: StartBytesBuilder::new(),
            rare_bytes: RareBytesBuilder::new(),
            packed: pbuilder,
        }
    }

    /// Enable ASCII case insensitivity. When set, byte strings added to this
    /// builder will be interpreted without respect to ASCII case.
    pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
        self.ascii_case_insensitive = yes;
        self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
        self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
        self
    }

    /// Return a prefilter suitable for quickly finding potential matches.
    ///
    /// All patterns added to an Aho-Corasick automaton should be added to this
    /// builder before attempting to construct the prefilter.
    pub fn build(&self) -> Option<PrefilterObj> {
        match (self.start_bytes.build(), self.rare_bytes.build()) {
            // If we could build both start and rare prefilters, then there are
            // a few cases in which we'd want to use the start-byte prefilter
            // over the rare-byte prefilter, since the former has lower
            // overhead.
            (prestart @ Some(_), prerare @ Some(_)) => {
                // If the start-byte prefilter can scan for a smaller number
                // of bytes than the rare-byte prefilter, then it's probably
                // faster.
                let has_fewer_bytes =
                    self.start_bytes.count < self.rare_bytes.count;
                // Otherwise, if the combined frequency rank of the detected
                // bytes in the start-byte prefilter is "close" to the combined
                // frequency rank of the rare-byte prefilter, then we pick
                // the start-byte prefilter even if the rare-byte prefilter
                // heuristically searches for rare bytes. This is because the
                // rare-byte prefilter has higher constant costs, so we tend to
                // prefer the start-byte prefilter when we can.
                let has_rarer_bytes =
                    self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
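                // (Lower rank sums correspond to rarer bytes, so the `+ 50`
                // above is a fudge factor that lets the start bytes be a bit
                // more common overall and still be preferred.)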
                if has_fewer_bytes || has_rarer_bytes {
                    prestart
                } else {
                    prerare
                }
            }
            (prestart @ Some(_), None) => prestart,
            (None, prerare @ Some(_)) => prerare,
            (None, None) if self.ascii_case_insensitive => None,
            (None, None) => self
                .packed
                .as_ref()
                .and_then(|b| b.build())
                .map(|s| PrefilterObj::new(Packed(s))),
        }
    }

    /// Add a literal string to this prefilter builder.
    pub fn add(&mut self, bytes: &[u8]) {
        self.count += 1;
        self.start_bytes.add(bytes);
        self.rare_bytes.add(bytes);
        if let Some(ref mut pbuilder) = self.packed {
            pbuilder.add(bytes);
        }
    }
}

/// A type that wraps a packed searcher and implements the `Prefilter`
/// interface.
#[derive(Clone, Debug)]
struct Packed(packed::Searcher);

impl Prefilter for Packed {
    fn next_candidate(
        &self,
        _state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        self.0.find_at(haystack, at).map_or(Candidate::None, Candidate::Match)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        self.0.heap_bytes()
    }

    fn reports_false_positives(&self) -> bool {
        false
    }
}

/// A builder for constructing a rare byte prefilter.
///
/// A rare byte prefilter attempts to pick out a small set of rare bytes that
/// occur in the patterns, and then quickly scan for occurrences of those rare
/// bytes.
#[derive(Clone, Debug)]
struct RareBytesBuilder {
    /// Whether this prefilter should account for ASCII case insensitivity or
    /// not.
    ascii_case_insensitive: bool,
    /// A set of rare bytes, indexed by byte value.
    rare_set: ByteSet,
    /// A set of byte offsets associated with bytes in a pattern. An entry
    /// corresponds to a particular byte (its index) and is only non-zero if
    /// the byte occurred at an offset greater than 0 in at least one pattern.
    ///
    /// If a byte's offset is not representable in 8 bits, then the rare bytes
    /// prefilter becomes inert.
    byte_offsets: RareByteOffsets,
    /// Whether this is available as a prefilter or not. This can be set to
    /// false during construction if a condition is seen that invalidates the
    /// use of the rare-byte prefilter.
    available: bool,
    /// The number of distinct rare bytes collected in `rare_set`.
    count: usize,
    /// The sum of frequency ranks for the rare bytes detected. This is
    /// intended to give a heuristic notion of how rare the bytes are.
    rank_sum: u16,
}

/// A set of bytes.
#[derive(Clone, Copy)]
struct ByteSet([bool; 256]);

impl ByteSet {
    fn empty() -> ByteSet {
        ByteSet([false; 256])
    }

    fn insert(&mut self, b: u8) -> bool {
        let new = !self.contains(b);
        self.0[b as usize] = true;
        new
    }

    fn contains(&self, b: u8) -> bool {
        self.0[b as usize]
    }
}

impl fmt::Debug for ByteSet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut bytes = vec![];
        for b in 0..=255 {
            if self.contains(b) {
                bytes.push(b);
            }
        }
        f.debug_struct("ByteSet").field("set", &bytes).finish()
    }
}

/// A set of byte offsets, keyed by byte.
#[derive(Clone, Copy)]
struct RareByteOffsets {
    /// Each entry corresponds to the maximum offset of the corresponding
    /// byte across all patterns seen.
    set: [RareByteOffset; 256],
}

impl RareByteOffsets {
    /// Create a new empty set of rare byte offsets.
    pub fn empty() -> RareByteOffsets {
        RareByteOffsets { set: [RareByteOffset::default(); 256] }
    }

    /// Add the given offset for the given byte to this set. If the given
    /// offset is greater than the offset currently recorded for that byte,
    /// then it replaces the recorded offset; otherwise, the existing offset
    /// is kept.
    pub fn set(&mut self, byte: u8, off: RareByteOffset) {
        self.set[byte as usize].max =
            cmp::max(self.set[byte as usize].max, off.max);
    }
}

impl fmt::Debug for RareByteOffsets {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut offsets = vec![];
        for off in self.set.iter() {
            if off.max > 0 {
                offsets.push(off);
            }
        }
        f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
    }
}

/// Offsets associated with an occurrence of a "rare" byte in any of the
/// patterns used to construct a single Aho-Corasick automaton.
#[derive(Clone, Copy, Debug)]
struct RareByteOffset {
    /// The maximum offset at which a particular byte occurs from the start
    /// of any pattern. This is used as a shift amount. That is, when an
    /// occurrence of this byte is found, the candidate position reported by
    /// the prefilter is `position_of_byte - max`, such that the automaton
    /// will begin its search at a position that is guaranteed to observe a
    /// match.
    ///
    /// To avoid accidentally quadratic behavior, a prefilter is considered
    /// ineffective when it is asked to start scanning from a position that it
    /// has already scanned past.
    ///
    /// Using a `u8` here means that if we ever see a pattern that's longer
    /// than 255 bytes, then the entire rare byte prefilter is disabled.
    max: u8,
}

impl Default for RareByteOffset {
    fn default() -> RareByteOffset {
        RareByteOffset { max: 0 }
    }
}

impl RareByteOffset {
    /// Create a new rare byte offset. If the given offset is too big, then
    /// None is returned. In that case, callers should render the rare bytes
    /// prefilter inert.
    fn new(max: usize) -> Option<RareByteOffset> {
        if max > u8::MAX as usize {
            None
        } else {
            Some(RareByteOffset { max: max as u8 })
        }
    }
}

impl RareBytesBuilder {
    /// Create a new builder for constructing a rare byte prefilter.
    fn new() -> RareBytesBuilder {
        RareBytesBuilder {
            ascii_case_insensitive: false,
            rare_set: ByteSet::empty(),
            byte_offsets: RareByteOffsets::empty(),
            available: true,
            count: 0,
            rank_sum: 0,
        }
    }

    /// Enable ASCII case insensitivity. When set, byte strings added to this
    /// builder will be interpreted without respect to ASCII case.
    fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
        self.ascii_case_insensitive = yes;
        self
    }

    /// Build the rare bytes prefilter.
    ///
    /// If there are more than 3 distinct rare bytes, or if heuristics
    /// otherwise determine that this prefilter should not be used, then `None`
    /// is returned.
    fn build(&self) -> Option<PrefilterObj> {
        if !self.available || self.count > 3 {
            return None;
        }
        let (mut bytes, mut len) = ([0; 3], 0);
        for b in 0..=255 {
            if self.rare_set.contains(b) {
                bytes[len] = b as u8;
                len += 1;
            }
        }
        match len {
            0 => None,
            1 => Some(PrefilterObj::new(RareBytesOne {
                byte1: bytes[0],
                offset: self.byte_offsets.set[bytes[0] as usize],
            })),
            2 => Some(PrefilterObj::new(RareBytesTwo {
                offsets: self.byte_offsets,
                byte1: bytes[0],
                byte2: bytes[1],
            })),
            3 => Some(PrefilterObj::new(RareBytesThree {
                offsets: self.byte_offsets,
                byte1: bytes[0],
                byte2: bytes[1],
                byte3: bytes[2],
            })),
            _ => unreachable!(),
        }
    }

    /// Add a byte string to this builder.
    ///
    /// All patterns added to an Aho-Corasick automaton should be added to this
    /// builder before attempting to construct the prefilter.
    fn add(&mut self, bytes: &[u8]) {
        // If we've already given up, then do nothing.
        if !self.available {
            return;
        }
        // If we've already blown our budget, then don't waste time looking
        // for more rare bytes.
        if self.count > 3 {
            self.available = false;
            return;
        }
        // If the pattern is too long, then our offset table is bunk, so
        // give up.
        if bytes.len() >= 256 {
            self.available = false;
            return;
        }
        let mut rarest = match bytes.get(0) {
            None => return,
            Some(&b) => (b, freq_rank(b)),
        };
        // The idea here is to look for the rarest byte in each pattern, and
        // add that to our set. As a special exception, if we see a byte that
        // we've already added, then we immediately stop and choose that byte,
        // even if there's another rare byte in the pattern. This helps us
        // apply the rare byte optimization in more cases by attempting to pick
        // bytes that are in common between patterns. So for example, if we
        // were searching for `Sherlock` and `lockjaw`, then this would pick
        // `k` for both patterns, resulting in the use of `memchr` instead of
        // `memchr2` for `k` and `j`.
        let mut found = false;
        for (pos, &b) in bytes.iter().enumerate() {
            self.set_offset(pos, b);
            if found {
                continue;
            }
            if self.rare_set.contains(b) {
                found = true;
                continue;
            }
            let rank = freq_rank(b);
            if rank < rarest.1 {
                rarest = (b, rank);
            }
        }
        if !found {
            self.add_rare_byte(rarest.0);
        }
    }

    fn set_offset(&mut self, pos: usize, byte: u8) {
        // This unwrap is OK because pos is never bigger than our max.
        let offset = RareByteOffset::new(pos).unwrap();
        self.byte_offsets.set(byte, offset);
        if self.ascii_case_insensitive {
            self.byte_offsets.set(opposite_ascii_case(byte), offset);
        }
    }

    fn add_rare_byte(&mut self, byte: u8) {
        self.add_one_rare_byte(byte);
        if self.ascii_case_insensitive {
            self.add_one_rare_byte(opposite_ascii_case(byte));
        }
    }

    fn add_one_rare_byte(&mut self, byte: u8) {
        if self.rare_set.insert(byte) {
            self.count += 1;
            self.rank_sum += freq_rank(byte) as u16;
        }
    }
}

/// A prefilter for scanning for a single "rare" byte.
#[derive(Clone, Debug)]
struct RareBytesOne {
    byte1: u8,
    offset: RareByteOffset,
}

impl Prefilter for RareBytesOne {
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr(self.byte1, &haystack[at..])
            .map(|i| {
                let pos = at + i;
                state.last_scan_at = pos;
                cmp::max(at, pos.saturating_sub(self.offset.max as usize))
            })
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }

    fn looks_for_non_start_of_match(&self) -> bool {
        // TODO: It should be possible to use a rare byte prefilter in a
        // streaming context. The main problem is that we usually assume that
        // if a prefilter has scanned some text and not found anything, then no
        // match *starts* in that text. This doesn't matter in non-streaming
        // contexts, but in a streaming context, if we're looking for a byte
        // that doesn't start at the beginning of a match and don't find it,
        // then it's still possible for a match to start at the end of the
        // current buffer content. In order to fix this, the streaming searcher
        // would need to become aware of prefilters that do this and use the
        // appropriate offset in various places. It is quite a delicate change
        // and probably shouldn't be attempted until streaming search has a
        // better testing strategy. In particular, we'd really like to be able
        // to vary the buffer size to force strange cases that occur at the
        // edge of the buffer. If we make the buffer size minimal, then these
        // cases occur more frequently and are easier to reproduce.
        //
        // This is also a bummer because this means that if the prefilter
        // builder chose a rare byte prefilter, then a streaming search won't
        // use any prefilter at all because the builder doesn't know how it's
        // going to be used. Assuming we don't make streaming search aware of
        // these special types of prefilters as described above, we could fix
        // this by building a "backup" prefilter that could be used when the
        // rare byte prefilter could not. But that's a band-aid. Sigh.
        true
    }
}

/// A prefilter for scanning for two "rare" bytes.
#[derive(Clone, Debug)]
struct RareBytesTwo {
    offsets: RareByteOffsets,
    byte1: u8,
    byte2: u8,
}

impl Prefilter for RareBytesTwo {
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr2(self.byte1, self.byte2, &haystack[at..])
            .map(|i| {
                let pos = at + i;
                state.update_at(pos);
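                // The shift to apply depends on which of the rare bytes was
                // found at `pos`, so look up that byte's maximum pattern
                // offset.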
                let offset = self.offsets.set[haystack[pos] as usize].max;
                cmp::max(at, pos.saturating_sub(offset as usize))
            })
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }

    fn looks_for_non_start_of_match(&self) -> bool {
        // TODO: See Prefilter impl for RareBytesOne.
        true
    }
}

/// A prefilter for scanning for three "rare" bytes.
#[derive(Clone, Debug)]
struct RareBytesThree {
    offsets: RareByteOffsets,
    byte1: u8,
    byte2: u8,
    byte3: u8,
}

impl Prefilter for RareBytesThree {
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
            .map(|i| {
                let pos = at + i;
                state.update_at(pos);
                let offset = self.offsets.set[haystack[pos] as usize].max;
                cmp::max(at, pos.saturating_sub(offset as usize))
            })
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }

    fn looks_for_non_start_of_match(&self) -> bool {
        // TODO: See Prefilter impl for RareBytesOne.
        true
    }
}

/// A builder for constructing a starting byte prefilter.
///
/// A starting byte prefilter is a simplistic prefilter that looks for possible
/// matches by reporting all positions corresponding to a particular byte. This
/// generally only takes effect when there are at most 3 distinct possible
/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
/// distinct starting bytes (`f` and `b`), and this prefilter returns all
/// occurrences of either `f` or `b`.
///
/// In some cases, a heuristic frequency analysis may determine that it would
/// be better not to use this prefilter even when there are 3 or fewer distinct
/// starting bytes.
#[derive(Clone, Debug)]
struct StartBytesBuilder {
    /// Whether this prefilter should account for ASCII case insensitivity or
    /// not.
    ascii_case_insensitive: bool,
    /// The set of starting bytes observed.
    byteset: Vec<bool>,
    /// The number of bytes set to true in `byteset`.
    count: usize,
    /// The sum of frequency ranks for the starting bytes detected. This is
    /// intended to give a heuristic notion of how rare the bytes are.
    rank_sum: u16,
}

impl StartBytesBuilder {
    /// Create a new builder for constructing a start byte prefilter.
    fn new() -> StartBytesBuilder {
        StartBytesBuilder {
            ascii_case_insensitive: false,
            byteset: vec![false; 256],
            count: 0,
            rank_sum: 0,
        }
    }

    /// Enable ASCII case insensitivity. When set, byte strings added to this
    /// builder will be interpreted without respect to ASCII case.
    fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
        self.ascii_case_insensitive = yes;
        self
    }

    /// Build the starting bytes prefilter.
    ///
    /// If there are more than 3 distinct starting bytes, or if heuristics
    /// otherwise determine that this prefilter should not be used, then `None`
    /// is returned.
    fn build(&self) -> Option<PrefilterObj> {
        if self.count > 3 {
            return None;
        }
        let (mut bytes, mut len) = ([0; 3], 0);
        for b in 0..256 {
            if !self.byteset[b] {
                continue;
            }
            // We don't handle non-ASCII bytes for now. Getting non-ASCII
            // bytes right is trickier, since we generally don't want to put
            // a leading UTF-8 code unit into a prefilter that isn't ASCII,
            // since such bytes can occur frequently. Instead, it would be
            // better to use a continuation byte, but this requires more
            // sophisticated analysis of the automaton and a richer prefilter
            // API.
            if b > 0x7F {
                return None;
            }
            bytes[len] = b as u8;
            len += 1;
        }
        match len {
            0 => None,
            1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })),
            2 => Some(PrefilterObj::new(StartBytesTwo {
                byte1: bytes[0],
                byte2: bytes[1],
            })),
            3 => Some(PrefilterObj::new(StartBytesThree {
                byte1: bytes[0],
                byte2: bytes[1],
                byte3: bytes[2],
            })),
            _ => unreachable!(),
        }
    }

    /// Add a byte string to this builder.
    ///
    /// All patterns added to an Aho-Corasick automaton should be added to this
    /// builder before attempting to construct the prefilter.
    fn add(&mut self, bytes: &[u8]) {
        if self.count > 3 {
            return;
        }
        if let Some(&byte) = bytes.get(0) {
            self.add_one_byte(byte);
            if self.ascii_case_insensitive {
                self.add_one_byte(opposite_ascii_case(byte));
            }
        }
    }

    fn add_one_byte(&mut self, byte: u8) {
        if !self.byteset[byte as usize] {
            self.byteset[byte as usize] = true;
            self.count += 1;
            self.rank_sum += freq_rank(byte) as u16;
        }
    }
}

/// A prefilter for scanning for a single starting byte.
#[derive(Clone, Debug)]
struct StartBytesOne {
    byte1: u8,
}

impl Prefilter for StartBytesOne {
    fn next_candidate(
        &self,
        _state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr(self.byte1, &haystack[at..])
            .map(|i| at + i)
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}

/// A prefilter for scanning for two starting bytes.
#[derive(Clone, Debug)]
struct StartBytesTwo {
    byte1: u8,
    byte2: u8,
}

impl Prefilter for StartBytesTwo {
    fn next_candidate(
        &self,
        _state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr2(self.byte1, self.byte2, &haystack[at..])
            .map(|i| at + i)
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}

/// A prefilter for scanning for three starting bytes.
#[derive(Clone, Debug)]
struct StartBytesThree {
    byte1: u8,
    byte2: u8,
    byte3: u8,
}

impl Prefilter for StartBytesThree {
    fn next_candidate(
        &self,
        _state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
            .map(|i| at + i)
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}

/// Return the next candidate reported by the given prefilter while
/// simultaneously updating the given prestate.
///
/// The caller is responsible for checking the prestate before deciding whether
/// to initiate a search.
#[inline]
pub fn next<P: Prefilter>(
    prestate: &mut PrefilterState,
    prefilter: P,
    haystack: &[u8],
    at: usize,
) -> Candidate {
    let cand = prefilter.next_candidate(prestate, haystack, at);
    match cand {
        Candidate::None => {
            prestate.update_skipped_bytes(haystack.len() - at);
        }
        Candidate::Match(ref m) => {
            prestate.update_skipped_bytes(m.start() - at);
        }
        Candidate::PossibleStartOfMatch(i) => {
            prestate.update_skipped_bytes(i - at);
        }
    }
    cand
}

/// If the given byte is an ASCII letter, then return it in the opposite case.
/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
/// `b'A'`. If the given byte is not an ASCII letter, then it is returned
/// unchanged.
pub fn opposite_ascii_case(b: u8) -> u8 {
    if b'A' <= b && b <= b'Z' {
        b.to_ascii_lowercase()
    } else if b'a' <= b && b <= b'z' {
        b.to_ascii_uppercase()
    } else {
        b
    }
}

/// Return the frequency rank of the given byte. The higher the rank, the more
/// common the byte (heuristically speaking).
fn freq_rank(b: u8) -> u8 {
    use crate::byte_frequencies::BYTE_FREQUENCIES;
    BYTE_FREQUENCIES[b as usize]
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn scratch() {
        let mut b = Builder::new(MatchKind::LeftmostFirst);
        b.add(b"Sherlock");
        b.add(b"locjaw");
        // b.add(b"Sherlock");
        // b.add(b"Holmes");
        // b.add(b"Watson");
        // b.add("Шерлок Холмс".as_bytes());
        // b.add("Джон Уотсон".as_bytes());

        let s = b.build().unwrap();
        println!("{:?}", s);
    }
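
    // The tests below are small illustrative sketches of behavior documented
    // above; they are examples for clarity, not an exhaustive suite.

    // `opposite_ascii_case` flips ASCII letters and leaves all other bytes
    // unchanged.
    #[test]
    fn opposite_ascii_case_examples() {
        assert_eq!(b'a', opposite_ascii_case(b'A'));
        assert_eq!(b'A', opposite_ascii_case(b'a'));
        assert_eq!(b'1', opposite_ascii_case(b'1'));
    }

    // A start-byte prefilter reports the position of the next occurrence of
    // its byte as a possible (unconfirmed) start of a match, and `next`
    // records how many bytes were skipped.
    #[test]
    fn start_bytes_candidate_position() {
        let mut state = PrefilterState::new(3);
        let pre = StartBytesOne { byte1: b'z' };
        let cand = next(&mut state, &pre, b"abcz", 0);
        assert_eq!(Some(3), cand.into_option());
        assert_eq!(1, state.skips);
        assert_eq!(3, state.skipped);
    }

    // The `abczdef`/`abcz` example from the docs on `update_at`: the rare
    // byte `z` is found at position 3, but the reported candidate is the
    // possible start of a match at position 0 (3 minus the byte's maximum
    // offset of 3), while the scan itself ends at position 3.
    #[test]
    fn rare_byte_candidate_is_shifted_back() {
        let mut state = PrefilterState::new(4);
        let pre =
            RareBytesOne { byte1: b'z', offset: RareByteOffset { max: 3 } };
        let cand = pre.next_candidate(&mut state, b"abczdef", 0);
        assert_eq!(Some(0), cand.into_option());
        assert_eq!(3, state.last_scan_at);
    }

    // A disabled state is never effective, while a fresh state remains
    // effective until MIN_SKIPS unproductive attempts have been recorded.
    #[test]
    fn prefilter_state_effectiveness() {
        assert!(!PrefilterState::disabled().is_effective(0));

        let mut state = PrefilterState::new(5);
        assert!(state.is_effective(0));
        for _ in 0..PrefilterState::MIN_SKIPS {
            state.update_skipped_bytes(0);
        }
        // The average skip is now 0, below MIN_AVG_FACTOR * max_match_len,
        // so the state becomes inert.
        assert!(!state.is_effective(0));
    }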
}