regex/literal/
imp.rs

1use std::mem;
2
3use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
4use memchr::{memchr, memchr2, memchr3, memmem};
5use regex_syntax::hir::literal::{Literal, Literals};
6
7/// A prefix extracted from a compiled regular expression.
8///
9/// A regex prefix is a set of literal strings that *must* be matched at the
10/// beginning of a regex in order for the entire regex to match. Similarly
11/// for a regex suffix.
12#[derive(Clone, Debug)]
13pub struct LiteralSearcher {
14    complete: bool,
15    lcp: Memmem,
16    lcs: Memmem,
17    matcher: Matcher,
18}
19
20#[derive(Clone, Debug)]
21enum Matcher {
22    /// No literals. (Never advances through the input.)
23    Empty,
24    /// A set of four or more single byte literals.
25    Bytes(SingleByteSet),
26    /// A single substring, using vector accelerated routines when available.
27    Memmem(Memmem),
28    /// An Aho-Corasick automaton.
29    AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
30    /// A packed multiple substring searcher, using SIMD.
31    ///
32    /// Note that Aho-Corasick will actually use this packed searcher
33    /// internally automatically, however, there is some overhead associated
34    /// with going through the Aho-Corasick machinery. So using the packed
35    /// searcher directly results in some gains.
36    Packed { s: packed::Searcher, lits: Vec<Literal> },
37}
38
39impl LiteralSearcher {
40    /// Returns a matcher that never matches and never advances the input.
41    pub fn empty() -> Self {
42        Self::new(Literals::empty(), Matcher::Empty)
43    }
44
45    /// Returns a matcher for literal prefixes from the given set.
46    pub fn prefixes(lits: Literals) -> Self {
47        let matcher = Matcher::prefixes(&lits);
48        Self::new(lits, matcher)
49    }
50
51    /// Returns a matcher for literal suffixes from the given set.
52    pub fn suffixes(lits: Literals) -> Self {
53        let matcher = Matcher::suffixes(&lits);
54        Self::new(lits, matcher)
55    }
56
57    fn new(lits: Literals, matcher: Matcher) -> Self {
58        let complete = lits.all_complete();
59        LiteralSearcher {
60            complete,
61            lcp: Memmem::new(lits.longest_common_prefix()),
62            lcs: Memmem::new(lits.longest_common_suffix()),
63            matcher,
64        }
65    }
66
67    /// Returns true if all matches comprise the entire regular expression.
68    ///
69    /// This does not necessarily mean that a literal match implies a match
70    /// of the regular expression. For example, the regular expression `^a`
71    /// is comprised of a single complete literal `a`, but the regular
72    /// expression demands that it only match at the beginning of a string.
73    pub fn complete(&self) -> bool {
74        self.complete && !self.is_empty()
75    }
76
77    /// Find the position of a literal in `haystack` if it exists.
78    #[cfg_attr(feature = "perf-inline", inline(always))]
79    pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
80        use self::Matcher::*;
81        match self.matcher {
82            Empty => Some((0, 0)),
83            Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
84            Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
85            AC { ref ac, .. } => {
86                ac.find(haystack).map(|m| (m.start(), m.end()))
87            }
88            Packed { ref s, .. } => {
89                s.find(haystack).map(|m| (m.start(), m.end()))
90            }
91        }
92    }
93
94    /// Like find, except matches must start at index `0`.
95    pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
96        for lit in self.iter() {
97            if lit.len() > haystack.len() {
98                continue;
99            }
100            if lit == &haystack[0..lit.len()] {
101                return Some((0, lit.len()));
102            }
103        }
104        None
105    }
106
107    /// Like find, except matches must end at index `haystack.len()`.
108    pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
109        for lit in self.iter() {
110            if lit.len() > haystack.len() {
111                continue;
112            }
113            if lit == &haystack[haystack.len() - lit.len()..] {
114                return Some((haystack.len() - lit.len(), haystack.len()));
115            }
116        }
117        None
118    }
119
120    /// Returns an iterator over all literals to be matched.
121    pub fn iter(&self) -> LiteralIter<'_> {
122        match self.matcher {
123            Matcher::Empty => LiteralIter::Empty,
124            Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
125            Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
126            Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
127            Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
128        }
129    }
130
131    /// Returns a matcher for the longest common prefix of this matcher.
132    pub fn lcp(&self) -> &Memmem {
133        &self.lcp
134    }
135
136    /// Returns a matcher for the longest common suffix of this matcher.
137    pub fn lcs(&self) -> &Memmem {
138        &self.lcs
139    }
140
141    /// Returns true iff this prefix is empty.
142    pub fn is_empty(&self) -> bool {
143        self.len() == 0
144    }
145
146    /// Returns the number of prefixes in this machine.
147    pub fn len(&self) -> usize {
148        use self::Matcher::*;
149        match self.matcher {
150            Empty => 0,
151            Bytes(ref sset) => sset.dense.len(),
152            Memmem(_) => 1,
153            AC { ref ac, .. } => ac.pattern_count(),
154            Packed { ref lits, .. } => lits.len(),
155        }
156    }
157
158    /// Return the approximate heap usage of literals in bytes.
159    pub fn approximate_size(&self) -> usize {
160        use self::Matcher::*;
161        match self.matcher {
162            Empty => 0,
163            Bytes(ref sset) => sset.approximate_size(),
164            Memmem(ref single) => single.approximate_size(),
165            AC { ref ac, .. } => ac.heap_bytes(),
166            Packed { ref s, .. } => s.heap_bytes(),
167        }
168    }
169}
170
171impl Matcher {
172    fn prefixes(lits: &Literals) -> Self {
173        let sset = SingleByteSet::prefixes(lits);
174        Matcher::new(lits, sset)
175    }
176
177    fn suffixes(lits: &Literals) -> Self {
178        let sset = SingleByteSet::suffixes(lits);
179        Matcher::new(lits, sset)
180    }
181
182    fn new(lits: &Literals, sset: SingleByteSet) -> Self {
183        if lits.literals().is_empty() {
184            return Matcher::Empty;
185        }
186        if sset.dense.len() >= 26 {
187            // Avoid trying to match a large number of single bytes.
188            // This is *very* sensitive to a frequency analysis comparison
189            // between the bytes in sset and the composition of the haystack.
190            // No matter the size of sset, if its members all are rare in the
191            // haystack, then it'd be worth using it. How to tune this... IDK.
192            // ---AG
193            return Matcher::Empty;
194        }
195        if sset.complete {
196            return Matcher::Bytes(sset);
197        }
198        if lits.literals().len() == 1 {
199            return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
200        }
201
202        let pats = lits.literals().to_owned();
203        let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
204        if lits.literals().len() <= 100 && !is_aho_corasick_fast {
205            let mut builder = packed::Config::new()
206                .match_kind(packed::MatchKind::LeftmostFirst)
207                .builder();
208            if let Some(s) = builder.extend(&pats).build() {
209                return Matcher::Packed { s, lits: pats };
210            }
211        }
212        let ac = AhoCorasickBuilder::new()
213            .match_kind(aho_corasick::MatchKind::LeftmostFirst)
214            .dfa(true)
215            .build_with_size::<u32, _, _>(&pats)
216            .unwrap();
217        Matcher::AC { ac, lits: pats }
218    }
219}
220
221#[derive(Debug)]
222pub enum LiteralIter<'a> {
223    Empty,
224    Bytes(&'a [u8]),
225    Single(&'a [u8]),
226    AC(&'a [Literal]),
227    Packed(&'a [Literal]),
228}
229
230impl<'a> Iterator for LiteralIter<'a> {
231    type Item = &'a [u8];
232
233    fn next(&mut self) -> Option<Self::Item> {
234        match *self {
235            LiteralIter::Empty => None,
236            LiteralIter::Bytes(ref mut many) => {
237                if many.is_empty() {
238                    None
239                } else {
240                    let next = &many[0..1];
241                    *many = &many[1..];
242                    Some(next)
243                }
244            }
245            LiteralIter::Single(ref mut one) => {
246                if one.is_empty() {
247                    None
248                } else {
249                    let next = &one[..];
250                    *one = &[];
251                    Some(next)
252                }
253            }
254            LiteralIter::AC(ref mut lits) => {
255                if lits.is_empty() {
256                    None
257                } else {
258                    let next = &lits[0];
259                    *lits = &lits[1..];
260                    Some(&**next)
261                }
262            }
263            LiteralIter::Packed(ref mut lits) => {
264                if lits.is_empty() {
265                    None
266                } else {
267                    let next = &lits[0];
268                    *lits = &lits[1..];
269                    Some(&**next)
270                }
271            }
272        }
273    }
274}
275
276#[derive(Clone, Debug)]
277struct SingleByteSet {
278    sparse: Vec<bool>,
279    dense: Vec<u8>,
280    complete: bool,
281    all_ascii: bool,
282}
283
284impl SingleByteSet {
285    fn new() -> SingleByteSet {
286        SingleByteSet {
287            sparse: vec![false; 256],
288            dense: vec![],
289            complete: true,
290            all_ascii: true,
291        }
292    }
293
294    fn prefixes(lits: &Literals) -> SingleByteSet {
295        let mut sset = SingleByteSet::new();
296        for lit in lits.literals() {
297            sset.complete = sset.complete && lit.len() == 1;
298            if let Some(&b) = lit.get(0) {
299                if !sset.sparse[b as usize] {
300                    if b > 0x7F {
301                        sset.all_ascii = false;
302                    }
303                    sset.dense.push(b);
304                    sset.sparse[b as usize] = true;
305                }
306            }
307        }
308        sset
309    }
310
311    fn suffixes(lits: &Literals) -> SingleByteSet {
312        let mut sset = SingleByteSet::new();
313        for lit in lits.literals() {
314            sset.complete = sset.complete && lit.len() == 1;
315            if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
316                if !sset.sparse[b as usize] {
317                    if b > 0x7F {
318                        sset.all_ascii = false;
319                    }
320                    sset.dense.push(b);
321                    sset.sparse[b as usize] = true;
322                }
323            }
324        }
325        sset
326    }
327
328    /// Faster find that special cases certain sizes to use memchr.
329    #[cfg_attr(feature = "perf-inline", inline(always))]
330    fn find(&self, text: &[u8]) -> Option<usize> {
331        match self.dense.len() {
332            0 => None,
333            1 => memchr(self.dense[0], text),
334            2 => memchr2(self.dense[0], self.dense[1], text),
335            3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
336            _ => self._find(text),
337        }
338    }
339
340    /// Generic find that works on any sized set.
341    fn _find(&self, haystack: &[u8]) -> Option<usize> {
342        for (i, &b) in haystack.iter().enumerate() {
343            if self.sparse[b as usize] {
344                return Some(i);
345            }
346        }
347        None
348    }
349
350    fn approximate_size(&self) -> usize {
351        (self.dense.len() * mem::size_of::<u8>())
352            + (self.sparse.len() * mem::size_of::<bool>())
353    }
354}
355
356/// A simple wrapper around the memchr crate's memmem implementation.
357///
358/// The API this exposes mirrors the API of previous substring searchers that
359/// this supplanted.
360#[derive(Clone, Debug)]
361pub struct Memmem {
362    finder: memmem::Finder<'static>,
363    char_len: usize,
364}
365
366impl Memmem {
367    fn new(pat: &[u8]) -> Memmem {
368        Memmem {
369            finder: memmem::Finder::new(pat).into_owned(),
370            char_len: char_len_lossy(pat),
371        }
372    }
373
374    #[cfg_attr(feature = "perf-inline", inline(always))]
375    pub fn find(&self, haystack: &[u8]) -> Option<usize> {
376        self.finder.find(haystack)
377    }
378
379    #[cfg_attr(feature = "perf-inline", inline(always))]
380    pub fn is_suffix(&self, text: &[u8]) -> bool {
381        if text.len() < self.len() {
382            return false;
383        }
384        &text[text.len() - self.len()..] == self.finder.needle()
385    }
386
387    pub fn len(&self) -> usize {
388        self.finder.needle().len()
389    }
390
391    pub fn char_len(&self) -> usize {
392        self.char_len
393    }
394
395    fn approximate_size(&self) -> usize {
396        self.finder.needle().len() * mem::size_of::<u8>()
397    }
398}
399
400fn char_len_lossy(bytes: &[u8]) -> usize {
401    String::from_utf8_lossy(bytes).chars().count()
402}