regex/
exec.rs

Help
1use std::cell::RefCell;
2use std::collections::HashMap;
3use std::panic::AssertUnwindSafe;
4use std::sync::Arc;
5
6#[cfg(feature = "perf-literal")]
7use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
8use regex_syntax::hir::literal::Literals;
9use regex_syntax::hir::Hir;
10use regex_syntax::ParserBuilder;
11
12use crate::backtrack;
13use crate::compile::Compiler;
14#[cfg(feature = "perf-dfa")]
15use crate::dfa;
16use crate::error::Error;
17use crate::input::{ByteInput, CharInput};
18use crate::literal::LiteralSearcher;
19use crate::pikevm;
20use crate::pool::{Pool, PoolGuard};
21use crate::prog::Program;
22use crate::re_builder::RegexOptions;
23use crate::re_bytes;
24use crate::re_set;
25use crate::re_trait::{Locations, RegularExpression, Slot};
26use crate::re_unicode;
27use crate::utf8::next_utf8;
28
29/// `Exec` manages the execution of a regular expression.
30///
31/// In particular, this manages the various compiled forms of a single regular
32/// expression and the choice of which matching engine to use to execute a
33/// regular expression.
34#[derive(Debug)]
35pub struct Exec {
36    /// All read only state.
37    ro: Arc<ExecReadOnly>,
38    /// A pool of reusable values for the various matching engines.
39    ///
40    /// Note that boxing this value is not strictly necessary, but it is an
41    /// easy way to ensure that T does not bloat the stack sized used by a pool
42    /// in the case where T is big. And this turns out to be the case at the
43    /// time of writing for regex's use of this pool. At the time of writing,
44    /// the size of a Regex on the stack is 856 bytes. Boxing this value
45    /// reduces that size to 16 bytes.
46    pool: Box<Pool<ProgramCache>>,
47}
48
49/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
50/// means it is no longer Sync, but we can now avoid the overhead of
51/// synchronization to fetch the cache.
52#[derive(Debug)]
53pub struct ExecNoSync<'c> {
54    /// All read only state.
55    ro: &'c Arc<ExecReadOnly>,
56    /// Caches for the various matching engines.
57    cache: PoolGuard<'c, ProgramCache>,
58}
59
60/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
61#[derive(Debug)]
62pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
63
64/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
65/// state is determined at compile time and never changes during search.
66#[derive(Debug)]
67struct ExecReadOnly {
68    /// The original regular expressions given by the caller to compile.
69    res: Vec<String>,
70    /// A compiled program that is used in the NFA simulation and backtracking.
71    /// It can be byte-based or Unicode codepoint based.
72    ///
73    /// N.B. It is not possibly to make this byte-based from the public API.
74    /// It is only used for testing byte based programs in the NFA simulations.
75    nfa: Program,
76    /// A compiled byte based program for DFA execution. This is only used
77    /// if a DFA can be executed. (Currently, only word boundary assertions are
78    /// not supported.) Note that this program contains an embedded `.*?`
79    /// preceding the first capture group, unless the regex is anchored at the
80    /// beginning.
81    dfa: Program,
82    /// The same as above, except the program is reversed (and there is no
83    /// preceding `.*?`). This is used by the DFA to find the starting location
84    /// of matches.
85    dfa_reverse: Program,
86    /// A set of suffix literals extracted from the regex.
87    ///
88    /// Prefix literals are stored on the `Program`, since they are used inside
89    /// the matching engines.
90    suffixes: LiteralSearcher,
91    /// An Aho-Corasick automaton with leftmost-first match semantics.
92    ///
93    /// This is only set when the entire regex is a simple unanchored
94    /// alternation of literals. We could probably use it more circumstances,
95    /// but this is already hacky enough in this architecture.
96    ///
97    /// N.B. We use u32 as a state ID representation under the assumption that
98    /// if we were to exhaust the ID space, we probably would have long
99    /// surpassed the compilation size limit.
100    #[cfg(feature = "perf-literal")]
101    ac: Option<AhoCorasick<u32>>,
102    /// match_type encodes as much upfront knowledge about how we're going to
103    /// execute a search as possible.
104    match_type: MatchType,
105}
106
107/// Facilitates the construction of an executor by exposing various knobs
108/// to control how a regex is executed and what kinds of resources it's
109/// permitted to use.
110// `ExecBuilder` is only public via the `internal` module, so avoid deriving
111// `Debug`.
112#[allow(missing_debug_implementations)]
113pub struct ExecBuilder {
114    options: RegexOptions,
115    match_type: Option<MatchType>,
116    bytes: bool,
117    only_utf8: bool,
118}
119
120/// Parsed represents a set of parsed regular expressions and their detected
121/// literals.
122struct Parsed {
123    exprs: Vec<Hir>,
124    prefixes: Literals,
125    suffixes: Literals,
126    bytes: bool,
127}
128
129impl ExecBuilder {
130    /// Create a regex execution builder.
131    ///
132    /// This uses default settings for everything except the regex itself,
133    /// which must be provided. Further knobs can be set by calling methods,
134    /// and then finally, `build` to actually create the executor.
135    pub fn new(re: &str) -> Self {
136        Self::new_many(&[re])
137    }
138
139    /// Like new, but compiles the union of the given regular expressions.
140    ///
141    /// Note that when compiling 2 or more regular expressions, capture groups
142    /// are completely unsupported. (This means both `find` and `captures`
143    /// won't work.)
144    pub fn new_many<I, S>(res: I) -> Self
145    where
146        S: AsRef<str>,
147        I: IntoIterator<Item = S>,
148    {
149        let mut opts = RegexOptions::default();
150        opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
151        Self::new_options(opts)
152    }
153
154    /// Create a regex execution builder.
155    pub fn new_options(opts: RegexOptions) -> Self {
156        ExecBuilder {
157            options: opts,
158            match_type: None,
159            bytes: false,
160            only_utf8: true,
161        }
162    }
163
164    /// Set the matching engine to be automatically determined.
165    ///
166    /// This is the default state and will apply whatever optimizations are
167    /// possible, such as running a DFA.
168    ///
169    /// This overrides whatever was previously set via the `nfa` or
170    /// `bounded_backtracking` methods.
171    pub fn automatic(mut self) -> Self {
172        self.match_type = None;
173        self
174    }
175
176    /// Sets the matching engine to use the NFA algorithm no matter what
177    /// optimizations are possible.
178    ///
179    /// This overrides whatever was previously set via the `automatic` or
180    /// `bounded_backtracking` methods.
181    pub fn nfa(mut self) -> Self {
182        self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
183        self
184    }
185
186    /// Sets the matching engine to use a bounded backtracking engine no
187    /// matter what optimizations are possible.
188    ///
189    /// One must use this with care, since the bounded backtracking engine
190    /// uses memory proportion to `len(regex) * len(text)`.
191    ///
192    /// This overrides whatever was previously set via the `automatic` or
193    /// `nfa` methods.
194    pub fn bounded_backtracking(mut self) -> Self {
195        self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
196        self
197    }
198
199    /// Compiles byte based programs for use with the NFA matching engines.
200    ///
201    /// By default, the NFA engines match on Unicode scalar values. They can
202    /// be made to use byte based programs instead. In general, the byte based
203    /// programs are slower because of a less efficient encoding of character
204    /// classes.
205    ///
206    /// Note that this does not impact DFA matching engines, which always
207    /// execute on bytes.
208    pub fn bytes(mut self, yes: bool) -> Self {
209        self.bytes = yes;
210        self
211    }
212
213    /// When disabled, the program compiled may match arbitrary bytes.
214    ///
215    /// When enabled (the default), all compiled programs exclusively match
216    /// valid UTF-8 bytes.
217    pub fn only_utf8(mut self, yes: bool) -> Self {
218        self.only_utf8 = yes;
219        self
220    }
221
222    /// Set the Unicode flag.
223    pub fn unicode(mut self, yes: bool) -> Self {
224        self.options.unicode = yes;
225        self
226    }
227
228    /// Parse the current set of patterns into their AST and extract literals.
229    fn parse(&self) -> Result<Parsed, Error> {
230        let mut exprs = Vec::with_capacity(self.options.pats.len());
231        let mut prefixes = Some(Literals::empty());
232        let mut suffixes = Some(Literals::empty());
233        let mut bytes = false;
234        let is_set = self.options.pats.len() > 1;
235        // If we're compiling a regex set and that set has any anchored
236        // expressions, then disable all literal optimizations.
237        for pat in &self.options.pats {
238            let mut parser = ParserBuilder::new()
239                .octal(self.options.octal)
240                .case_insensitive(self.options.case_insensitive)
241                .multi_line(self.options.multi_line)
242                .dot_matches_new_line(self.options.dot_matches_new_line)
243                .swap_greed(self.options.swap_greed)
244                .ignore_whitespace(self.options.ignore_whitespace)
245                .unicode(self.options.unicode)
246                .allow_invalid_utf8(!self.only_utf8)
247                .nest_limit(self.options.nest_limit)
248                .build();
249            let expr =
250                parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
251            bytes = bytes || !expr.is_always_utf8();
252
253            if cfg!(feature = "perf-literal") {
254                if !expr.is_anchored_start() && expr.is_any_anchored_start() {
255                    // Partial anchors unfortunately make it hard to use
256                    // prefixes, so disable them.
257                    prefixes = None;
258                } else if is_set && expr.is_anchored_start() {
259                    // Regex sets with anchors do not go well with literal
260                    // optimizations.
261                    prefixes = None;
262                }
263                prefixes = prefixes.and_then(|mut prefixes| {
264                    if !prefixes.union_prefixes(&expr) {
265                        None
266                    } else {
267                        Some(prefixes)
268                    }
269                });
270
271                if !expr.is_anchored_end() && expr.is_any_anchored_end() {
272                    // Partial anchors unfortunately make it hard to use
273                    // suffixes, so disable them.
274                    suffixes = None;
275                } else if is_set && expr.is_anchored_end() {
276                    // Regex sets with anchors do not go well with literal
277                    // optimizations.
278                    suffixes = None;
279                }
280                suffixes = suffixes.and_then(|mut suffixes| {
281                    if !suffixes.union_suffixes(&expr) {
282                        None
283                    } else {
284                        Some(suffixes)
285                    }
286                });
287            }
288            exprs.push(expr);
289        }
290        Ok(Parsed {
291            exprs,
292            prefixes: prefixes.unwrap_or_else(Literals::empty),
293            suffixes: suffixes.unwrap_or_else(Literals::empty),
294            bytes,
295        })
296    }
297
298    /// Build an executor that can run a regular expression.
299    pub fn build(self) -> Result<Exec, Error> {
300        // Special case when we have no patterns to compile.
301        // This can happen when compiling a regex set.
302        if self.options.pats.is_empty() {
303            let ro = Arc::new(ExecReadOnly {
304                res: vec![],
305                nfa: Program::new(),
306                dfa: Program::new(),
307                dfa_reverse: Program::new(),
308                suffixes: LiteralSearcher::empty(),
309                #[cfg(feature = "perf-literal")]
310                ac: None,
311                match_type: MatchType::Nothing,
312            });
313            let pool = ExecReadOnly::new_pool(&ro);
314            return Ok(Exec { ro, pool });
315        }
316        let parsed = self.parse()?;
317        let mut nfa = Compiler::new()
318            .size_limit(self.options.size_limit)
319            .bytes(self.bytes || parsed.bytes)
320            .only_utf8(self.only_utf8)
321            .compile(&parsed.exprs)?;
322        let mut dfa = Compiler::new()
323            .size_limit(self.options.size_limit)
324            .dfa(true)
325            .only_utf8(self.only_utf8)
326            .compile(&parsed.exprs)?;
327        let mut dfa_reverse = Compiler::new()
328            .size_limit(self.options.size_limit)
329            .dfa(true)
330            .only_utf8(self.only_utf8)
331            .reverse(true)
332            .compile(&parsed.exprs)?;
333
334        #[cfg(feature = "perf-literal")]
335        let ac = self.build_aho_corasick(&parsed);
336        nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
337        dfa.prefixes = nfa.prefixes.clone();
338        dfa.dfa_size_limit = self.options.dfa_size_limit;
339        dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
340
341        let mut ro = ExecReadOnly {
342            res: self.options.pats,
343            nfa,
344            dfa,
345            dfa_reverse,
346            suffixes: LiteralSearcher::suffixes(parsed.suffixes),
347            #[cfg(feature = "perf-literal")]
348            ac,
349            match_type: MatchType::Nothing,
350        };
351        ro.match_type = ro.choose_match_type(self.match_type);
352
353        let ro = Arc::new(ro);
354        let pool = ExecReadOnly::new_pool(&ro);
355        Ok(Exec { ro, pool })
356    }
357
358    #[cfg(feature = "perf-literal")]
359    fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
360        if parsed.exprs.len() != 1 {
361            return None;
362        }
363        let lits = match alternation_literals(&parsed.exprs[0]) {
364            None => return None,
365            Some(lits) => lits,
366        };
367        // If we have a small number of literals, then let Teddy handle
368        // things (see literal/mod.rs).
369        if lits.len() <= 32 {
370            return None;
371        }
372        Some(
373            AhoCorasickBuilder::new()
374                .match_kind(MatchKind::LeftmostFirst)
375                .auto_configure(&lits)
376                .build_with_size::<u32, _, _>(&lits)
377                // This should never happen because we'd long exceed the
378                // compilation limit for regexes first.
379                .expect("AC automaton too big"),
380        )
381    }
382}
383
384impl<'c> RegularExpression for ExecNoSyncStr<'c> {
385    type Text = str;
386
387    fn slots_len(&self) -> usize {
388        self.0.slots_len()
389    }
390
391    fn next_after_empty(&self, text: &str, i: usize) -> usize {
392        next_utf8(text.as_bytes(), i)
393    }
394
395    #[cfg_attr(feature = "perf-inline", inline(always))]
396    fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
397        self.0.shortest_match_at(text.as_bytes(), start)
398    }
399
400    #[cfg_attr(feature = "perf-inline", inline(always))]
401    fn is_match_at(&self, text: &str, start: usize) -> bool {
402        self.0.is_match_at(text.as_bytes(), start)
403    }
404
405    #[cfg_attr(feature = "perf-inline", inline(always))]
406    fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
407        self.0.find_at(text.as_bytes(), start)
408    }
409
410    #[cfg_attr(feature = "perf-inline", inline(always))]
411    fn captures_read_at(
412        &self,
413        locs: &mut Locations,
414        text: &str,
415        start: usize,
416    ) -> Option<(usize, usize)> {
417        self.0.captures_read_at(locs, text.as_bytes(), start)
418    }
419}
420
421impl<'c> RegularExpression for ExecNoSync<'c> {
422    type Text = [u8];
423
424    /// Returns the number of capture slots in the regular expression. (There
425    /// are two slots for every capture group, corresponding to possibly empty
426    /// start and end locations of the capture.)
427    fn slots_len(&self) -> usize {
428        self.ro.nfa.captures.len() * 2
429    }
430
431    fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
432        i + 1
433    }
434
435    /// Returns the end of a match location, possibly occurring before the
436    /// end location of the correct leftmost-first match.
437    #[cfg_attr(feature = "perf-inline", inline(always))]
438    fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
439        if !self.is_anchor_end_match(text) {
440            return None;
441        }
442        match self.ro.match_type {
443            #[cfg(feature = "perf-literal")]
444            MatchType::Literal(ty) => {
445                self.find_literals(ty, text, start).map(|(_, e)| e)
446            }
447            #[cfg(feature = "perf-dfa")]
448            MatchType::Dfa | MatchType::DfaMany => {
449                match self.shortest_dfa(text, start) {
450                    dfa::Result::Match(end) => Some(end),
451                    dfa::Result::NoMatch(_) => None,
452                    dfa::Result::Quit => self.shortest_nfa(text, start),
453                }
454            }
455            #[cfg(feature = "perf-dfa")]
456            MatchType::DfaAnchoredReverse => {
457                match dfa::Fsm::reverse(
458                    &self.ro.dfa_reverse,
459                    self.cache.value(),
460                    true,
461                    &text[start..],
462                    text.len() - start,
463                ) {
464                    dfa::Result::Match(_) => Some(text.len()),
465                    dfa::Result::NoMatch(_) => None,
466                    dfa::Result::Quit => self.shortest_nfa(text, start),
467                }
468            }
469            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
470            MatchType::DfaSuffix => {
471                match self.shortest_dfa_reverse_suffix(text, start) {
472                    dfa::Result::Match(e) => Some(e),
473                    dfa::Result::NoMatch(_) => None,
474                    dfa::Result::Quit => self.shortest_nfa(text, start),
475                }
476            }
477            MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
478            MatchType::Nothing => None,
479        }
480    }
481
482    /// Returns true if and only if the regex matches text.
483    ///
484    /// For single regular expressions, this is equivalent to calling
485    /// shortest_match(...).is_some().
486    #[cfg_attr(feature = "perf-inline", inline(always))]
487    fn is_match_at(&self, text: &[u8], start: usize) -> bool {
488        if !self.is_anchor_end_match(text) {
489            return false;
490        }
491        // We need to do this dance because shortest_match relies on the NFA
492        // filling in captures[1], but a RegexSet has no captures. In other
493        // words, a RegexSet can't (currently) use shortest_match. ---AG
494        match self.ro.match_type {
495            #[cfg(feature = "perf-literal")]
496            MatchType::Literal(ty) => {
497                self.find_literals(ty, text, start).is_some()
498            }
499            #[cfg(feature = "perf-dfa")]
500            MatchType::Dfa | MatchType::DfaMany => {
501                match self.shortest_dfa(text, start) {
502                    dfa::Result::Match(_) => true,
503                    dfa::Result::NoMatch(_) => false,
504                    dfa::Result::Quit => self.match_nfa(text, start),
505                }
506            }
507            #[cfg(feature = "perf-dfa")]
508            MatchType::DfaAnchoredReverse => {
509                match dfa::Fsm::reverse(
510                    &self.ro.dfa_reverse,
511                    self.cache.value(),
512                    true,
513                    &text[start..],
514                    text.len() - start,
515                ) {
516                    dfa::Result::Match(_) => true,
517                    dfa::Result::NoMatch(_) => false,
518                    dfa::Result::Quit => self.match_nfa(text, start),
519                }
520            }
521            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
522            MatchType::DfaSuffix => {
523                match self.shortest_dfa_reverse_suffix(text, start) {
524                    dfa::Result::Match(_) => true,
525                    dfa::Result::NoMatch(_) => false,
526                    dfa::Result::Quit => self.match_nfa(text, start),
527                }
528            }
529            MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
530            MatchType::Nothing => false,
531        }
532    }
533
534    /// Finds the start and end location of the leftmost-first match, starting
535    /// at the given location.
536    #[cfg_attr(feature = "perf-inline", inline(always))]
537    fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
538        if !self.is_anchor_end_match(text) {
539            return None;
540        }
541        match self.ro.match_type {
542            #[cfg(feature = "perf-literal")]
543            MatchType::Literal(ty) => self.find_literals(ty, text, start),
544            #[cfg(feature = "perf-dfa")]
545            MatchType::Dfa => match self.find_dfa_forward(text, start) {
546                dfa::Result::Match((s, e)) => Some((s, e)),
547                dfa::Result::NoMatch(_) => None,
548                dfa::Result::Quit => {
549                    self.find_nfa(MatchNfaType::Auto, text, start)
550                }
551            },
552            #[cfg(feature = "perf-dfa")]
553            MatchType::DfaAnchoredReverse => {
554                match self.find_dfa_anchored_reverse(text, start) {
555                    dfa::Result::Match((s, e)) => Some((s, e)),
556                    dfa::Result::NoMatch(_) => None,
557                    dfa::Result::Quit => {
558                        self.find_nfa(MatchNfaType::Auto, text, start)
559                    }
560                }
561            }
562            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
563            MatchType::DfaSuffix => {
564                match self.find_dfa_reverse_suffix(text, start) {
565                    dfa::Result::Match((s, e)) => Some((s, e)),
566                    dfa::Result::NoMatch(_) => None,
567                    dfa::Result::Quit => {
568                        self.find_nfa(MatchNfaType::Auto, text, start)
569                    }
570                }
571            }
572            MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
573            MatchType::Nothing => None,
574            #[cfg(feature = "perf-dfa")]
575            MatchType::DfaMany => {
576                unreachable!("BUG: RegexSet cannot be used with find")
577            }
578        }
579    }
580
581    /// Finds the start and end location of the leftmost-first match and also
582    /// fills in all matching capture groups.
583    ///
584    /// The number of capture slots given should be equal to the total number
585    /// of capture slots in the compiled program.
586    ///
587    /// Note that the first two slots always correspond to the start and end
588    /// locations of the overall match.
589    fn captures_read_at(
590        &self,
591        locs: &mut Locations,
592        text: &[u8],
593        start: usize,
594    ) -> Option<(usize, usize)> {
595        let slots = locs.as_slots();
596        for slot in slots.iter_mut() {
597            *slot = None;
598        }
599        // If the caller unnecessarily uses this, then we try to save them
600        // from themselves.
601        match slots.len() {
602            0 => return self.find_at(text, start),
603            2 => {
604                return self.find_at(text, start).map(|(s, e)| {
605                    slots[0] = Some(s);
606                    slots[1] = Some(e);
607                    (s, e)
608                });
609            }
610            _ => {} // fallthrough
611        }
612        if !self.is_anchor_end_match(text) {
613            return None;
614        }
615        match self.ro.match_type {
616            #[cfg(feature = "perf-literal")]
617            MatchType::Literal(ty) => {
618                self.find_literals(ty, text, start).and_then(|(s, e)| {
619                    self.captures_nfa_type(
620                        MatchNfaType::Auto,
621                        slots,
622                        text,
623                        s,
624                        e,
625                    )
626                })
627            }
628            #[cfg(feature = "perf-dfa")]
629            MatchType::Dfa => {
630                if self.ro.nfa.is_anchored_start {
631                    self.captures_nfa(slots, text, start)
632                } else {
633                    match self.find_dfa_forward(text, start) {
634                        dfa::Result::Match((s, e)) => self.captures_nfa_type(
635                            MatchNfaType::Auto,
636                            slots,
637                            text,
638                            s,
639                            e,
640                        ),
641                        dfa::Result::NoMatch(_) => None,
642                        dfa::Result::Quit => {
643                            self.captures_nfa(slots, text, start)
644                        }
645                    }
646                }
647            }
648            #[cfg(feature = "perf-dfa")]
649            MatchType::DfaAnchoredReverse => {
650                match self.find_dfa_anchored_reverse(text, start) {
651                    dfa::Result::Match((s, e)) => self.captures_nfa_type(
652                        MatchNfaType::Auto,
653                        slots,
654                        text,
655                        s,
656                        e,
657                    ),
658                    dfa::Result::NoMatch(_) => None,
659                    dfa::Result::Quit => self.captures_nfa(slots, text, start),
660                }
661            }
662            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
663            MatchType::DfaSuffix => {
664                match self.find_dfa_reverse_suffix(text, start) {
665                    dfa::Result::Match((s, e)) => self.captures_nfa_type(
666                        MatchNfaType::Auto,
667                        slots,
668                        text,
669                        s,
670                        e,
671                    ),
672                    dfa::Result::NoMatch(_) => None,
673                    dfa::Result::Quit => self.captures_nfa(slots, text, start),
674                }
675            }
676            MatchType::Nfa(ty) => {
677                self.captures_nfa_type(ty, slots, text, start, text.len())
678            }
679            MatchType::Nothing => None,
680            #[cfg(feature = "perf-dfa")]
681            MatchType::DfaMany => {
682                unreachable!("BUG: RegexSet cannot be used with captures")
683            }
684        }
685    }
686}
687
688impl<'c> ExecNoSync<'c> {
689    /// Finds the leftmost-first match using only literal search.
690    #[cfg(feature = "perf-literal")]
691    #[cfg_attr(feature = "perf-inline", inline(always))]
692    fn find_literals(
693        &self,
694        ty: MatchLiteralType,
695        text: &[u8],
696        start: usize,
697    ) -> Option<(usize, usize)> {
698        use self::MatchLiteralType::*;
699        match ty {
700            Unanchored => {
701                let lits = &self.ro.nfa.prefixes;
702                lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
703            }
704            AnchoredStart => {
705                let lits = &self.ro.nfa.prefixes;
706                if start == 0 || !self.ro.nfa.is_anchored_start {
707                    lits.find_start(&text[start..])
708                        .map(|(s, e)| (start + s, start + e))
709                } else {
710                    None
711                }
712            }
713            AnchoredEnd => {
714                let lits = &self.ro.suffixes;
715                lits.find_end(&text[start..])
716                    .map(|(s, e)| (start + s, start + e))
717            }
718            AhoCorasick => self
719                .ro
720                .ac
721                .as_ref()
722                .unwrap()
723                .find(&text[start..])
724                .map(|m| (start + m.start(), start + m.end())),
725        }
726    }
727
728    /// Finds the leftmost-first match (start and end) using only the DFA.
729    ///
730    /// If the result returned indicates that the DFA quit, then another
731    /// matching engine should be used.
732    #[cfg(feature = "perf-dfa")]
733    #[cfg_attr(feature = "perf-inline", inline(always))]
734    fn find_dfa_forward(
735        &self,
736        text: &[u8],
737        start: usize,
738    ) -> dfa::Result<(usize, usize)> {
739        use crate::dfa::Result::*;
740        let end = match dfa::Fsm::forward(
741            &self.ro.dfa,
742            self.cache.value(),
743            false,
744            text,
745            start,
746        ) {
747            NoMatch(i) => return NoMatch(i),
748            Quit => return Quit,
749            Match(end) if start == end => return Match((start, start)),
750            Match(end) => end,
751        };
752        // Now run the DFA in reverse to find the start of the match.
753        match dfa::Fsm::reverse(
754            &self.ro.dfa_reverse,
755            self.cache.value(),
756            false,
757            &text[start..],
758            end - start,
759        ) {
760            Match(s) => Match((start + s, end)),
761            NoMatch(i) => NoMatch(i),
762            Quit => Quit,
763        }
764    }
765
766    /// Finds the leftmost-first match (start and end) using only the DFA,
767    /// but assumes the regex is anchored at the end and therefore starts at
768    /// the end of the regex and matches in reverse.
769    ///
770    /// If the result returned indicates that the DFA quit, then another
771    /// matching engine should be used.
772    #[cfg(feature = "perf-dfa")]
773    #[cfg_attr(feature = "perf-inline", inline(always))]
774    fn find_dfa_anchored_reverse(
775        &self,
776        text: &[u8],
777        start: usize,
778    ) -> dfa::Result<(usize, usize)> {
779        use crate::dfa::Result::*;
780        match dfa::Fsm::reverse(
781            &self.ro.dfa_reverse,
782            self.cache.value(),
783            false,
784            &text[start..],
785            text.len() - start,
786        ) {
787            Match(s) => Match((start + s, text.len())),
788            NoMatch(i) => NoMatch(i),
789            Quit => Quit,
790        }
791    }
792
793    /// Finds the end of the shortest match using only the DFA.
794    #[cfg(feature = "perf-dfa")]
795    #[cfg_attr(feature = "perf-inline", inline(always))]
796    fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
797        dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
798    }
799
800    /// Finds the end of the shortest match using only the DFA by scanning for
801    /// suffix literals.
802    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
803    #[cfg_attr(feature = "perf-inline", inline(always))]
804    fn shortest_dfa_reverse_suffix(
805        &self,
806        text: &[u8],
807        start: usize,
808    ) -> dfa::Result<usize> {
809        match self.exec_dfa_reverse_suffix(text, start) {
810            None => self.shortest_dfa(text, start),
811            Some(r) => r.map(|(_, end)| end),
812        }
813    }
814
815    /// Finds the end of the shortest match using only the DFA by scanning for
816    /// suffix literals. It also reports the start of the match.
817    ///
818    /// Note that if None is returned, then the optimization gave up to avoid
819    /// worst case quadratic behavior. A forward scanning DFA should be tried
820    /// next.
821    ///
822    /// If a match is returned and the full leftmost-first match is desired,
823    /// then a forward scan starting from the beginning of the match must be
824    /// done.
825    ///
826    /// If the result returned indicates that the DFA quit, then another
827    /// matching engine should be used.
828    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
829    #[cfg_attr(feature = "perf-inline", inline(always))]
830    fn exec_dfa_reverse_suffix(
831        &self,
832        text: &[u8],
833        original_start: usize,
834    ) -> Option<dfa::Result<(usize, usize)>> {
835        use crate::dfa::Result::*;
836
837        let lcs = self.ro.suffixes.lcs();
838        debug_assert!(lcs.len() >= 1);
839        let mut start = original_start;
840        let mut end = start;
841        let mut last_literal = start;
842        while end <= text.len() {
843            last_literal += match lcs.find(&text[last_literal..]) {
844                None => return Some(NoMatch(text.len())),
845                Some(i) => i,
846            };
847            end = last_literal + lcs.len();
848            match dfa::Fsm::reverse(
849                &self.ro.dfa_reverse,
850                self.cache.value(),
851                false,
852                &text[start..end],
853                end - start,
854            ) {
855                Match(0) | NoMatch(0) => return None,
856                Match(i) => return Some(Match((start + i, end))),
857                NoMatch(i) => {
858                    start += i;
859                    last_literal += 1;
860                    continue;
861                }
862                Quit => return Some(Quit),
863            };
864        }
865        Some(NoMatch(text.len()))
866    }
867
868    /// Finds the leftmost-first match (start and end) using only the DFA
869    /// by scanning for suffix literals.
870    ///
871    /// If the result returned indicates that the DFA quit, then another
872    /// matching engine should be used.
873    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
874    #[cfg_attr(feature = "perf-inline", inline(always))]
875    fn find_dfa_reverse_suffix(
876        &self,
877        text: &[u8],
878        start: usize,
879    ) -> dfa::Result<(usize, usize)> {
880        use crate::dfa::Result::*;
881
882        let match_start = match self.exec_dfa_reverse_suffix(text, start) {
883            None => return self.find_dfa_forward(text, start),
884            Some(Match((start, _))) => start,
885            Some(r) => return r,
886        };
887        // At this point, we've found a match. The only way to quit now
888        // without a match is if the DFA gives up (seems unlikely).
889        //
890        // Now run the DFA forwards to find the proper end of the match.
891        // (The suffix literal match can only indicate the earliest
892        // possible end location, which may appear before the end of the
893        // leftmost-first match.)
894        match dfa::Fsm::forward(
895            &self.ro.dfa,
896            self.cache.value(),
897            false,
898            text,
899            match_start,
900        ) {
901            NoMatch(_) => panic!("BUG: reverse match implies forward match"),
902            Quit => Quit,
903            Match(e) => Match((match_start, e)),
904        }
905    }
906
907    /// Executes the NFA engine to return whether there is a match or not.
908    ///
909    /// Ideally, we could use shortest_nfa(...).is_some() and get the same
910    /// performance characteristics, but regex sets don't have captures, which
911    /// shortest_nfa depends on.
912    #[cfg(feature = "perf-dfa")]
913    fn match_nfa(&self, text: &[u8], start: usize) -> bool {
914        self.match_nfa_type(MatchNfaType::Auto, text, start)
915    }
916
917    /// Like match_nfa, but allows specification of the type of NFA engine.
918    fn match_nfa_type(
919        &self,
920        ty: MatchNfaType,
921        text: &[u8],
922        start: usize,
923    ) -> bool {
924        self.exec_nfa(
925            ty,
926            &mut [false],
927            &mut [],
928            true,
929            false,
930            text,
931            start,
932            text.len(),
933        )
934    }
935
936    /// Finds the shortest match using an NFA.
937    #[cfg(feature = "perf-dfa")]
938    fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
939        self.shortest_nfa_type(MatchNfaType::Auto, text, start)
940    }
941
942    /// Like shortest_nfa, but allows specification of the type of NFA engine.
943    fn shortest_nfa_type(
944        &self,
945        ty: MatchNfaType,
946        text: &[u8],
947        start: usize,
948    ) -> Option<usize> {
949        let mut slots = [None, None];
950        if self.exec_nfa(
951            ty,
952            &mut [false],
953            &mut slots,
954            true,
955            true,
956            text,
957            start,
958            text.len(),
959        ) {
960            slots[1]
961        } else {
962            None
963        }
964    }
965
966    /// Like find, but executes an NFA engine.
967    fn find_nfa(
968        &self,
969        ty: MatchNfaType,
970        text: &[u8],
971        start: usize,
972    ) -> Option<(usize, usize)> {
973        let mut slots = [None, None];
974        if self.exec_nfa(
975            ty,
976            &mut [false],
977            &mut slots,
978            false,
979            false,
980            text,
981            start,
982            text.len(),
983        ) {
984            match (slots[0], slots[1]) {
985                (Some(s), Some(e)) => Some((s, e)),
986                _ => None,
987            }
988        } else {
989            None
990        }
991    }
992
993    /// Like find_nfa, but fills in captures.
994    ///
995    /// `slots` should have length equal to `2 * nfa.captures.len()`.
996    #[cfg(feature = "perf-dfa")]
997    fn captures_nfa(
998        &self,
999        slots: &mut [Slot],
1000        text: &[u8],
1001        start: usize,
1002    ) -> Option<(usize, usize)> {
1003        self.captures_nfa_type(
1004            MatchNfaType::Auto,
1005            slots,
1006            text,
1007            start,
1008            text.len(),
1009        )
1010    }
1011
1012    /// Like captures_nfa, but allows specification of type of NFA engine.
1013    fn captures_nfa_type(
1014        &self,
1015        ty: MatchNfaType,
1016        slots: &mut [Slot],
1017        text: &[u8],
1018        start: usize,
1019        end: usize,
1020    ) -> Option<(usize, usize)> {
1021        if self.exec_nfa(
1022            ty,
1023            &mut [false],
1024            slots,
1025            false,
1026            false,
1027            text,
1028            start,
1029            end,
1030        ) {
1031            match (slots[0], slots[1]) {
1032                (Some(s), Some(e)) => Some((s, e)),
1033                _ => None,
1034            }
1035        } else {
1036            None
1037        }
1038    }
1039
1040    fn exec_nfa(
1041        &self,
1042        mut ty: MatchNfaType,
1043        matches: &mut [bool],
1044        slots: &mut [Slot],
1045        quit_after_match: bool,
1046        quit_after_match_with_pos: bool,
1047        text: &[u8],
1048        start: usize,
1049        end: usize,
1050    ) -> bool {
1051        use self::MatchNfaType::*;
1052        if let Auto = ty {
1053            if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1054                ty = Backtrack;
1055            } else {
1056                ty = PikeVM;
1057            }
1058        }
1059        // The backtracker can't return the shortest match position as it is
1060        // implemented today. So if someone calls `shortest_match` and we need
1061        // to run an NFA, then use the PikeVM.
1062        if quit_after_match_with_pos || ty == PikeVM {
1063            self.exec_pikevm(
1064                matches,
1065                slots,
1066                quit_after_match,
1067                text,
1068                start,
1069                end,
1070            )
1071        } else {
1072            self.exec_backtrack(matches, slots, text, start, end)
1073        }
1074    }
1075
1076    /// Always run the NFA algorithm.
1077    fn exec_pikevm(
1078        &self,
1079        matches: &mut [bool],
1080        slots: &mut [Slot],
1081        quit_after_match: bool,
1082        text: &[u8],
1083        start: usize,
1084        end: usize,
1085    ) -> bool {
1086        if self.ro.nfa.uses_bytes() {
1087            pikevm::Fsm::exec(
1088                &self.ro.nfa,
1089                self.cache.value(),
1090                matches,
1091                slots,
1092                quit_after_match,
1093                ByteInput::new(text, self.ro.nfa.only_utf8),
1094                start,
1095                end,
1096            )
1097        } else {
1098            pikevm::Fsm::exec(
1099                &self.ro.nfa,
1100                self.cache.value(),
1101                matches,
1102                slots,
1103                quit_after_match,
1104                CharInput::new(text),
1105                start,
1106                end,
1107            )
1108        }
1109    }
1110
1111    /// Always runs the NFA using bounded backtracking.
1112    fn exec_backtrack(
1113        &self,
1114        matches: &mut [bool],
1115        slots: &mut [Slot],
1116        text: &[u8],
1117        start: usize,
1118        end: usize,
1119    ) -> bool {
1120        if self.ro.nfa.uses_bytes() {
1121            backtrack::Bounded::exec(
1122                &self.ro.nfa,
1123                self.cache.value(),
1124                matches,
1125                slots,
1126                ByteInput::new(text, self.ro.nfa.only_utf8),
1127                start,
1128                end,
1129            )
1130        } else {
1131            backtrack::Bounded::exec(
1132                &self.ro.nfa,
1133                self.cache.value(),
1134                matches,
1135                slots,
1136                CharInput::new(text),
1137                start,
1138                end,
1139            )
1140        }
1141    }
1142
1143    /// Finds which regular expressions match the given text.
1144    ///
1145    /// `matches` should have length equal to the number of regexes being
1146    /// searched.
1147    ///
1148    /// This is only useful when one wants to know which regexes in a set
1149    /// match some text.
1150    pub fn many_matches_at(
1151        &self,
1152        matches: &mut [bool],
1153        text: &[u8],
1154        start: usize,
1155    ) -> bool {
1156        use self::MatchType::*;
1157        if !self.is_anchor_end_match(text) {
1158            return false;
1159        }
1160        match self.ro.match_type {
1161            #[cfg(feature = "perf-literal")]
1162            Literal(ty) => {
1163                debug_assert_eq!(matches.len(), 1);
1164                matches[0] = self.find_literals(ty, text, start).is_some();
1165                matches[0]
1166            }
1167            #[cfg(feature = "perf-dfa")]
1168            Dfa | DfaAnchoredReverse | DfaMany => {
1169                match dfa::Fsm::forward_many(
1170                    &self.ro.dfa,
1171                    self.cache.value(),
1172                    matches,
1173                    text,
1174                    start,
1175                ) {
1176                    dfa::Result::Match(_) => true,
1177                    dfa::Result::NoMatch(_) => false,
1178                    dfa::Result::Quit => self.exec_nfa(
1179                        MatchNfaType::Auto,
1180                        matches,
1181                        &mut [],
1182                        false,
1183                        false,
1184                        text,
1185                        start,
1186                        text.len(),
1187                    ),
1188                }
1189            }
1190            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1191            DfaSuffix => {
1192                match dfa::Fsm::forward_many(
1193                    &self.ro.dfa,
1194                    self.cache.value(),
1195                    matches,
1196                    text,
1197                    start,
1198                ) {
1199                    dfa::Result::Match(_) => true,
1200                    dfa::Result::NoMatch(_) => false,
1201                    dfa::Result::Quit => self.exec_nfa(
1202                        MatchNfaType::Auto,
1203                        matches,
1204                        &mut [],
1205                        false,
1206                        false,
1207                        text,
1208                        start,
1209                        text.len(),
1210                    ),
1211                }
1212            }
1213            Nfa(ty) => self.exec_nfa(
1214                ty,
1215                matches,
1216                &mut [],
1217                false,
1218                false,
1219                text,
1220                start,
1221                text.len(),
1222            ),
1223            Nothing => false,
1224        }
1225    }
1226
1227    #[cfg_attr(feature = "perf-inline", inline(always))]
1228    fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1229        #[cfg(not(feature = "perf-literal"))]
1230        fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1231            true
1232        }
1233
1234        #[cfg(feature = "perf-literal")]
1235        fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1236            // Only do this check if the haystack is big (>1MB).
1237            if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1238                let lcs = ro.suffixes.lcs();
1239                if lcs.len() >= 1 && !lcs.is_suffix(text) {
1240                    return false;
1241                }
1242            }
1243            true
1244        }
1245
1246        imp(&self.ro, text)
1247    }
1248
1249    pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1250        &self.ro.nfa.capture_name_idx
1251    }
1252}
1253
1254impl<'c> ExecNoSyncStr<'c> {
1255    pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1256        self.0.capture_name_idx()
1257    }
1258}
1259
1260impl Exec {
1261    /// Get a searcher that isn't Sync.
1262    #[cfg_attr(feature = "perf-inline", inline(always))]
1263    pub fn searcher(&self) -> ExecNoSync<'_> {
1264        ExecNoSync {
1265            ro: &self.ro, // a clone is too expensive here! (and not needed)
1266            cache: self.pool.get(),
1267        }
1268    }
1269
1270    /// Get a searcher that isn't Sync and can match on &str.
1271    #[cfg_attr(feature = "perf-inline", inline(always))]
1272    pub fn searcher_str(&self) -> ExecNoSyncStr<'_> {
1273        ExecNoSyncStr(self.searcher())
1274    }
1275
1276    /// Build a Regex from this executor.
1277    pub fn into_regex(self) -> re_unicode::Regex {
1278        re_unicode::Regex::from(self)
1279    }
1280
1281    /// Build a RegexSet from this executor.
1282    pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1283        re_set::unicode::RegexSet::from(self)
1284    }
1285
1286    /// Build a Regex from this executor that can match arbitrary bytes.
1287    pub fn into_byte_regex(self) -> re_bytes::Regex {
1288        re_bytes::Regex::from(self)
1289    }
1290
1291    /// Build a RegexSet from this executor that can match arbitrary bytes.
1292    pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1293        re_set::bytes::RegexSet::from(self)
1294    }
1295
1296    /// The original regular expressions given by the caller that were
1297    /// compiled.
1298    pub fn regex_strings(&self) -> &[String] {
1299        &self.ro.res
1300    }
1301
1302    /// Return a slice of capture names.
1303    ///
1304    /// Any capture that isn't named is None.
1305    pub fn capture_names(&self) -> &[Option<String>] {
1306        &self.ro.nfa.captures
1307    }
1308
1309    /// Return a reference to named groups mapping (from group name to
1310    /// group position).
1311    pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1312        &self.ro.nfa.capture_name_idx
1313    }
1314}
1315
1316impl Clone for Exec {
1317    fn clone(&self) -> Exec {
1318        let pool = ExecReadOnly::new_pool(&self.ro);
1319        Exec { ro: self.ro.clone(), pool }
1320    }
1321}
1322
1323impl ExecReadOnly {
1324    fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1325        if let Some(MatchType::Nfa(_)) = hint {
1326            return hint.unwrap();
1327        }
1328        // If the NFA is empty, then we'll never match anything.
1329        if self.nfa.insts.is_empty() {
1330            return MatchType::Nothing;
1331        }
1332        if let Some(literalty) = self.choose_literal_match_type() {
1333            return literalty;
1334        }
1335        if let Some(dfaty) = self.choose_dfa_match_type() {
1336            return dfaty;
1337        }
1338        // We're so totally hosed.
1339        MatchType::Nfa(MatchNfaType::Auto)
1340    }
1341
1342    /// If a plain literal scan can be used, then a corresponding literal
1343    /// search type is returned.
1344    fn choose_literal_match_type(&self) -> Option<MatchType> {
1345        #[cfg(not(feature = "perf-literal"))]
1346        fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1347            None
1348        }
1349
1350        #[cfg(feature = "perf-literal")]
1351        fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1352            // If our set of prefixes is complete, then we can use it to find
1353            // a match in lieu of a regex engine. This doesn't quite work well
1354            // in the presence of multiple regexes, so only do it when there's
1355            // one.
1356            //
1357            // TODO(burntsushi): Also, don't try to match literals if the regex
1358            // is partially anchored. We could technically do it, but we'd need
1359            // to create two sets of literals: all of them and then the subset
1360            // that aren't anchored. We would then only search for all of them
1361            // when at the beginning of the input and use the subset in all
1362            // other cases.
1363            if ro.res.len() != 1 {
1364                return None;
1365            }
1366            if ro.ac.is_some() {
1367                return Some(MatchType::Literal(
1368                    MatchLiteralType::AhoCorasick,
1369                ));
1370            }
1371            if ro.nfa.prefixes.complete() {
1372                return if ro.nfa.is_anchored_start {
1373                    Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1374                } else {
1375                    Some(MatchType::Literal(MatchLiteralType::Unanchored))
1376                };
1377            }
1378            if ro.suffixes.complete() {
1379                return if ro.nfa.is_anchored_end {
1380                    Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1381                } else {
1382                    // This case shouldn't happen. When the regex isn't
1383                    // anchored, then complete prefixes should imply complete
1384                    // suffixes.
1385                    Some(MatchType::Literal(MatchLiteralType::Unanchored))
1386                };
1387            }
1388            None
1389        }
1390
1391        imp(self)
1392    }
1393
1394    /// If a DFA scan can be used, then choose the appropriate DFA strategy.
1395    fn choose_dfa_match_type(&self) -> Option<MatchType> {
1396        #[cfg(not(feature = "perf-dfa"))]
1397        fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1398            None
1399        }
1400
1401        #[cfg(feature = "perf-dfa")]
1402        fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1403            if !dfa::can_exec(&ro.dfa) {
1404                return None;
1405            }
1406            // Regex sets require a slightly specialized path.
1407            if ro.res.len() >= 2 {
1408                return Some(MatchType::DfaMany);
1409            }
1410            // If the regex is anchored at the end but not the start, then
1411            // just match in reverse from the end of the haystack.
1412            if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1413                return Some(MatchType::DfaAnchoredReverse);
1414            }
1415            #[cfg(feature = "perf-literal")]
1416            {
1417                // If there's a longish suffix literal, then it might be faster
1418                // to look for that first.
1419                if ro.should_suffix_scan() {
1420                    return Some(MatchType::DfaSuffix);
1421                }
1422            }
1423            // Fall back to your garden variety forward searching lazy DFA.
1424            Some(MatchType::Dfa)
1425        }
1426
1427        imp(self)
1428    }
1429
1430    /// Returns true if the program is amenable to suffix scanning.
1431    ///
1432    /// When this is true, as a heuristic, we assume it is OK to quickly scan
1433    /// for suffix literals and then do a *reverse* DFA match from any matches
1434    /// produced by the literal scan. (And then followed by a forward DFA
1435    /// search, since the previously found suffix literal maybe not actually be
1436    /// the end of a match.)
1437    ///
1438    /// This is a bit of a specialized optimization, but can result in pretty
1439    /// big performance wins if 1) there are no prefix literals and 2) the
1440    /// suffix literals are pretty rare in the text. (1) is obviously easy to
1441    /// account for but (2) is harder. As a proxy, we assume that longer
1442    /// strings are generally rarer, so we only enable this optimization when
1443    /// we have a meaty suffix.
1444    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1445    fn should_suffix_scan(&self) -> bool {
1446        if self.suffixes.is_empty() {
1447            return false;
1448        }
1449        let lcs_len = self.suffixes.lcs().char_len();
1450        lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1451    }
1452
1453    fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
1454        let ro = ro.clone();
1455        Box::new(Pool::new(Box::new(move || {
1456            AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
1457        })))
1458    }
1459}
1460
1461#[derive(Clone, Copy, Debug)]
1462enum MatchType {
1463    /// A single or multiple literal search. This is only used when the regex
1464    /// can be decomposed into a literal search.
1465    #[cfg(feature = "perf-literal")]
1466    Literal(MatchLiteralType),
1467    /// A normal DFA search.
1468    #[cfg(feature = "perf-dfa")]
1469    Dfa,
1470    /// A reverse DFA search starting from the end of a haystack.
1471    #[cfg(feature = "perf-dfa")]
1472    DfaAnchoredReverse,
1473    /// A reverse DFA search with suffix literal scanning.
1474    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1475    DfaSuffix,
1476    /// Use the DFA on two or more regular expressions.
1477    #[cfg(feature = "perf-dfa")]
1478    DfaMany,
1479    /// An NFA variant.
1480    Nfa(MatchNfaType),
1481    /// No match is ever possible, so don't ever try to search.
1482    Nothing,
1483}
1484
1485#[derive(Clone, Copy, Debug)]
1486#[cfg(feature = "perf-literal")]
1487enum MatchLiteralType {
1488    /// Match literals anywhere in text.
1489    Unanchored,
1490    /// Match literals only at the start of text.
1491    AnchoredStart,
1492    /// Match literals only at the end of text.
1493    AnchoredEnd,
1494    /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1495    /// ExecReadOnly.
1496    AhoCorasick,
1497}
1498
1499#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1500enum MatchNfaType {
1501    /// Choose between Backtrack and PikeVM.
1502    Auto,
1503    /// NFA bounded backtracking.
1504    ///
1505    /// (This is only set by tests, since it never makes sense to always want
1506    /// backtracking.)
1507    Backtrack,
1508    /// The Pike VM.
1509    ///
1510    /// (This is only set by tests, since it never makes sense to always want
1511    /// the Pike VM.)
1512    PikeVM,
1513}
1514
1515/// `ProgramCache` maintains reusable allocations for each matching engine
1516/// available to a particular program.
1517///
1518/// We declare this as unwind safe since it's a cache that's only used for
1519/// performance purposes. If a panic occurs, it is (or should be) always safe
1520/// to continue using the same regex object.
1521pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
1522
1523#[derive(Debug)]
1524pub struct ProgramCacheInner {
1525    pub pikevm: pikevm::Cache,
1526    pub backtrack: backtrack::Cache,
1527    #[cfg(feature = "perf-dfa")]
1528    pub dfa: dfa::Cache,
1529    #[cfg(feature = "perf-dfa")]
1530    pub dfa_reverse: dfa::Cache,
1531}
1532
1533impl ProgramCacheInner {
1534    fn new(ro: &ExecReadOnly) -> Self {
1535        ProgramCacheInner {
1536            pikevm: pikevm::Cache::new(&ro.nfa),
1537            backtrack: backtrack::Cache::new(&ro.nfa),
1538            #[cfg(feature = "perf-dfa")]
1539            dfa: dfa::Cache::new(&ro.dfa),
1540            #[cfg(feature = "perf-dfa")]
1541            dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1542        }
1543    }
1544}
1545
1546/// Alternation literals checks if the given HIR is a simple alternation of
1547/// literals, and if so, returns them. Otherwise, this returns None.
1548#[cfg(feature = "perf-literal")]
1549fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1550    use regex_syntax::hir::{HirKind, Literal};
1551
1552    // This is pretty hacky, but basically, if `is_alternation_literal` is
1553    // true, then we can make several assumptions about the structure of our
1554    // HIR. This is what justifies the `unreachable!` statements below.
1555    //
1556    // This code should be refactored once we overhaul this crate's
1557    // optimization pipeline, because this is a terribly inflexible way to go
1558    // about things.
1559
1560    if !expr.is_alternation_literal() {
1561        return None;
1562    }
1563    let alts = match *expr.kind() {
1564        HirKind::Alternation(ref alts) => alts,
1565        _ => return None, // one literal isn't worth it
1566    };
1567
1568    let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
1569        Literal::Unicode(c) => {
1570            let mut buf = [0; 4];
1571            dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1572        }
1573        Literal::Byte(b) => {
1574            dst.push(b);
1575        }
1576    };
1577
1578    let mut lits = vec![];
1579    for alt in alts {
1580        let mut lit = vec![];
1581        match *alt.kind() {
1582            HirKind::Literal(ref x) => extendlit(x, &mut lit),
1583            HirKind::Concat(ref exprs) => {
1584                for e in exprs {
1585                    match *e.kind() {
1586                        HirKind::Literal(ref x) => extendlit(x, &mut lit),
1587                        _ => unreachable!("expected literal, got {:?}", e),
1588                    }
1589                }
1590            }
1591            _ => unreachable!("expected literal or concat, got {:?}", alt),
1592        }
1593        lits.push(lit);
1594    }
1595    Some(lits)
1596}
1597
1598#[cfg(test)]
1599mod test {
1600    #[test]
1601    fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1602        use crate::internal::ExecBuilder;
1603
1604        let backtrack_bytes_re = ExecBuilder::new("^S")
1605            .bounded_backtracking()
1606            .only_utf8(false)
1607            .build()
1608            .map(|exec| exec.into_byte_regex())
1609            .map_err(|err| format!("{}", err))
1610            .unwrap();
1611
1612        let default_bytes_re = ExecBuilder::new("^S")
1613            .only_utf8(false)
1614            .build()
1615            .map(|exec| exec.into_byte_regex())
1616            .map_err(|err| format!("{}", err))
1617            .unwrap();
1618
1619        let input = vec![83, 83];
1620
1621        let s1 = backtrack_bytes_re.split(&input);
1622        let s2 = default_bytes_re.split(&input);
1623        for (chunk1, chunk2) in s1.zip(s2) {
1624            assert_eq!(chunk1, chunk2);
1625        }
1626    }
1627
1628    #[test]
1629    fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1630        use crate::internal::ExecBuilder;
1631
1632        let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1633            .bounded_backtracking()
1634            .bytes(true)
1635            .build()
1636            .map(|exec| exec.into_regex())
1637            .map_err(|err| format!("{}", err))
1638            .unwrap();
1639
1640        let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1641            .bytes(true)
1642            .build()
1643            .map(|exec| exec.into_regex())
1644            .map_err(|err| format!("{}", err))
1645            .unwrap();
1646
1647        let input = "**";
1648
1649        let s1 = backtrack_bytes_re.split(input);
1650        let s2 = default_bytes_re.split(input);
1651        for (chunk1, chunk2) in s1.zip(s2) {
1652            assert_eq!(chunk1, chunk2);
1653        }
1654    }
1655}
regex/exec.rs

regex/
exec.rs