regex/
compile.rs

Help
1use std::collections::HashMap;
2use std::fmt;
3use std::iter;
4use std::result;
5use std::sync::Arc;
6
7use regex_syntax::hir::{self, Hir};
8use regex_syntax::is_word_byte;
9use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10
11use crate::prog::{
12    EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13    InstSave, InstSplit, Program,
14};
15
16use crate::Error;
17
18type Result = result::Result<Patch, Error>;
19type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20
21#[derive(Debug)]
22struct Patch {
23    hole: Hole,
24    entry: InstPtr,
25}
26
27/// A compiler translates a regular expression AST to a sequence of
28/// instructions. The sequence of instructions represents an NFA.
29// `Compiler` is only public via the `internal` module, so avoid deriving
30// `Debug`.
31#[allow(missing_debug_implementations)]
32pub struct Compiler {
33    insts: Vec<MaybeInst>,
34    compiled: Program,
35    capture_name_idx: HashMap<String, usize>,
36    num_exprs: usize,
37    size_limit: usize,
38    suffix_cache: SuffixCache,
39    utf8_seqs: Option<Utf8Sequences>,
40    byte_classes: ByteClassSet,
41    // This keeps track of extra bytes allocated while compiling the regex
42    // program. Currently, this corresponds to two things. First is the heap
43    // memory allocated by Unicode character classes ('InstRanges'). Second is
44    // a "fake" amount of memory used by empty sub-expressions, so that enough
45    // empty sub-expressions will ultimately trigger the compiler to bail
46    // because of a size limit restriction. (That empty sub-expressions don't
47    // add to heap memory usage is more-or-less an implementation detail.) In
48    // the second case, if we don't bail, then an excessively large repetition
49    // on an empty sub-expression can result in the compiler using a very large
50    // amount of CPU time.
51    extra_inst_bytes: usize,
52}
53
54impl Compiler {
55    /// Create a new regular expression compiler.
56    ///
57    /// Various options can be set before calling `compile` on an expression.
58    pub fn new() -> Self {
59        Compiler {
60            insts: vec![],
61            compiled: Program::new(),
62            capture_name_idx: HashMap::new(),
63            num_exprs: 0,
64            size_limit: 10 * (1 << 20),
65            suffix_cache: SuffixCache::new(1000),
66            utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
67            byte_classes: ByteClassSet::new(),
68            extra_inst_bytes: 0,
69        }
70    }
71
72    /// The size of the resulting program is limited by size_limit. If
73    /// the program approximately exceeds the given size (in bytes), then
74    /// compilation will stop and return an error.
75    pub fn size_limit(mut self, size_limit: usize) -> Self {
76        self.size_limit = size_limit;
77        self
78    }
79
80    /// If bytes is true, then the program is compiled as a byte based
81    /// automaton, which incorporates UTF-8 decoding into the machine. If it's
82    /// false, then the automaton is Unicode scalar value based, e.g., an
83    /// engine utilizing such an automaton is responsible for UTF-8 decoding.
84    ///
85    /// The specific invariant is that when returning a byte based machine,
86    /// the neither the `Char` nor `Ranges` instructions are produced.
87    /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
88    /// instruction is never produced.
89    ///
90    /// Note that `dfa(true)` implies `bytes(true)`.
91    pub fn bytes(mut self, yes: bool) -> Self {
92        self.compiled.is_bytes = yes;
93        self
94    }
95
96    /// When disabled, the program compiled may match arbitrary bytes.
97    ///
98    /// When enabled (the default), all compiled programs exclusively match
99    /// valid UTF-8 bytes.
100    pub fn only_utf8(mut self, yes: bool) -> Self {
101        self.compiled.only_utf8 = yes;
102        self
103    }
104
105    /// When set, the machine returned is suitable for use in the DFA matching
106    /// engine.
107    ///
108    /// In particular, this ensures that if the regex is not anchored in the
109    /// beginning, then a preceding `.*?` is included in the program. (The NFA
110    /// based engines handle the preceding `.*?` explicitly, which is difficult
111    /// or impossible in the DFA engine.)
112    pub fn dfa(mut self, yes: bool) -> Self {
113        self.compiled.is_dfa = yes;
114        self
115    }
116
117    /// When set, the machine returned is suitable for matching text in
118    /// reverse. In particular, all concatenations are flipped.
119    pub fn reverse(mut self, yes: bool) -> Self {
120        self.compiled.is_reverse = yes;
121        self
122    }
123
124    /// Compile a regular expression given its AST.
125    ///
126    /// The compiler is guaranteed to succeed unless the program exceeds the
127    /// specified size limit. If the size limit is exceeded, then compilation
128    /// stops and returns an error.
129    pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
130        debug_assert!(!exprs.is_empty());
131        self.num_exprs = exprs.len();
132        if exprs.len() == 1 {
133            self.compile_one(&exprs[0])
134        } else {
135            self.compile_many(exprs)
136        }
137    }
138
139    fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
140        // If we're compiling a forward DFA and we aren't anchored, then
141        // add a `.*?` before the first capture group.
142        // Other matching engines handle this by baking the logic into the
143        // matching engine itself.
144        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
145        self.compiled.is_anchored_start = expr.is_anchored_start();
146        self.compiled.is_anchored_end = expr.is_anchored_end();
147        if self.compiled.needs_dotstar() {
148            dotstar_patch = self.c_dotstar()?;
149            self.compiled.start = dotstar_patch.entry;
150        }
151        self.compiled.captures = vec![None];
152        let patch =
153            self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
154        if self.compiled.needs_dotstar() {
155            self.fill(dotstar_patch.hole, patch.entry);
156        } else {
157            self.compiled.start = patch.entry;
158        }
159        self.fill_to_next(patch.hole);
160        self.compiled.matches = vec![self.insts.len()];
161        self.push_compiled(Inst::Match(0));
162        self.compile_finish()
163    }
164
165    fn compile_many(
166        mut self,
167        exprs: &[Hir],
168    ) -> result::Result<Program, Error> {
169        debug_assert!(exprs.len() > 1);
170
171        self.compiled.is_anchored_start =
172            exprs.iter().all(|e| e.is_anchored_start());
173        self.compiled.is_anchored_end =
174            exprs.iter().all(|e| e.is_anchored_end());
175        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
176        if self.compiled.needs_dotstar() {
177            dotstar_patch = self.c_dotstar()?;
178            self.compiled.start = dotstar_patch.entry;
179        } else {
180            self.compiled.start = 0; // first instruction is always split
181        }
182        self.fill_to_next(dotstar_patch.hole);
183
184        let mut prev_hole = Hole::None;
185        for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
186            self.fill_to_next(prev_hole);
187            let split = self.push_split_hole();
188            let Patch { hole, entry } =
189                self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
190            self.fill_to_next(hole);
191            self.compiled.matches.push(self.insts.len());
192            self.push_compiled(Inst::Match(i));
193            prev_hole = self.fill_split(split, Some(entry), None);
194        }
195        let i = exprs.len() - 1;
196        let Patch { hole, entry } =
197            self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
198        self.fill(prev_hole, entry);
199        self.fill_to_next(hole);
200        self.compiled.matches.push(self.insts.len());
201        self.push_compiled(Inst::Match(i));
202        self.compile_finish()
203    }
204
205    fn compile_finish(mut self) -> result::Result<Program, Error> {
206        self.compiled.insts =
207            self.insts.into_iter().map(|inst| inst.unwrap()).collect();
208        self.compiled.byte_classes = self.byte_classes.byte_classes();
209        self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
210        Ok(self.compiled)
211    }
212
213    /// Compile expr into self.insts, returning a patch on success,
214    /// or an error if we run out of memory.
215    ///
216    /// All of the c_* methods of the compiler share the contract outlined
217    /// here.
218    ///
219    /// The main thing that a c_* method does is mutate `self.insts`
220    /// to add a list of mostly compiled instructions required to execute
221    /// the given expression. `self.insts` contains MaybeInsts rather than
222    /// Insts because there is some backpatching required.
223    ///
224    /// The `Patch` value returned by each c_* method provides metadata
225    /// about the compiled instructions emitted to `self.insts`. The
226    /// `entry` member of the patch refers to the first instruction
227    /// (the entry point), while the `hole` member contains zero or
228    /// more offsets to partial instructions that need to be backpatched.
229    /// The c_* routine can't know where its list of instructions are going to
230    /// jump to after execution, so it is up to the caller to patch
231    /// these jumps to point to the right place. So compiling some
232    /// expression, e, we would end up with a situation that looked like:
233    ///
234    /// ```text
235    /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
236    ///                     ^              ^             ^
237    ///                     |                \         /
238    ///                   entry                \     /
239    ///                                         hole
240    /// ```
241    ///
242    /// To compile two expressions, e1 and e2, concatenated together we
243    /// would do:
244    ///
245    /// ```ignore
246    /// let patch1 = self.c(e1);
247    /// let patch2 = self.c(e2);
248    /// ```
249    ///
250    /// while leaves us with a situation that looks like
251    ///
252    /// ```text
253    /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
254    ///                     ^        ^            ^        ^
255    ///                     |        |            |        |
256    ///                entry1        hole1   entry2        hole2
257    /// ```
258    ///
259    /// Then to merge the two patches together into one we would backpatch
260    /// hole1 with entry2 and return a new patch that enters at entry1
261    /// and has hole2 for a hole. In fact, if you look at the c_concat
262    /// method you will see that it does exactly this, though it handles
263    /// a list of expressions rather than just the two that we use for
264    /// an example.
265    ///
266    /// Ok(None) is returned when an expression is compiled to no
267    /// instruction, and so no patch.entry value makes sense.
268    fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
269        use crate::prog;
270        use regex_syntax::hir::HirKind::*;
271
272        self.check_size()?;
273        match *expr.kind() {
274            Empty => self.c_empty(),
275            Literal(hir::Literal::Unicode(c)) => self.c_char(c),
276            Literal(hir::Literal::Byte(b)) => {
277                assert!(self.compiled.uses_bytes());
278                self.c_byte(b)
279            }
280            Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
281            Class(hir::Class::Bytes(ref cls)) => {
282                if self.compiled.uses_bytes() {
283                    self.c_class_bytes(cls.ranges())
284                } else {
285                    assert!(cls.is_all_ascii());
286                    let mut char_ranges = vec![];
287                    for r in cls.iter() {
288                        let (s, e) = (r.start() as char, r.end() as char);
289                        char_ranges.push(hir::ClassUnicodeRange::new(s, e));
290                    }
291                    self.c_class(&char_ranges)
292                }
293            }
294            Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
295                self.byte_classes.set_range(b'\n', b'\n');
296                self.c_empty_look(prog::EmptyLook::EndLine)
297            }
298            Anchor(hir::Anchor::StartLine) => {
299                self.byte_classes.set_range(b'\n', b'\n');
300                self.c_empty_look(prog::EmptyLook::StartLine)
301            }
302            Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
303                self.byte_classes.set_range(b'\n', b'\n');
304                self.c_empty_look(prog::EmptyLook::StartLine)
305            }
306            Anchor(hir::Anchor::EndLine) => {
307                self.byte_classes.set_range(b'\n', b'\n');
308                self.c_empty_look(prog::EmptyLook::EndLine)
309            }
310            Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
311                self.c_empty_look(prog::EmptyLook::EndText)
312            }
313            Anchor(hir::Anchor::StartText) => {
314                self.c_empty_look(prog::EmptyLook::StartText)
315            }
316            Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
317                self.c_empty_look(prog::EmptyLook::StartText)
318            }
319            Anchor(hir::Anchor::EndText) => {
320                self.c_empty_look(prog::EmptyLook::EndText)
321            }
322            WordBoundary(hir::WordBoundary::Unicode) => {
323                if !cfg!(feature = "unicode-perl") {
324                    return Err(Error::Syntax(
325                        "Unicode word boundaries are unavailable when \
326                         the unicode-perl feature is disabled"
327                            .to_string(),
328                    ));
329                }
330                self.compiled.has_unicode_word_boundary = true;
331                self.byte_classes.set_word_boundary();
332                // We also make sure that all ASCII bytes are in a different
333                // class from non-ASCII bytes. Otherwise, it's possible for
334                // ASCII bytes to get lumped into the same class as non-ASCII
335                // bytes. This in turn may cause the lazy DFA to falsely start
336                // when it sees an ASCII byte that maps to a byte class with
337                // non-ASCII bytes. This ensures that never happens.
338                self.byte_classes.set_range(0, 0x7F);
339                self.c_empty_look(prog::EmptyLook::WordBoundary)
340            }
341            WordBoundary(hir::WordBoundary::UnicodeNegate) => {
342                if !cfg!(feature = "unicode-perl") {
343                    return Err(Error::Syntax(
344                        "Unicode word boundaries are unavailable when \
345                         the unicode-perl feature is disabled"
346                            .to_string(),
347                    ));
348                }
349                self.compiled.has_unicode_word_boundary = true;
350                self.byte_classes.set_word_boundary();
351                // See comments above for why we set the ASCII range here.
352                self.byte_classes.set_range(0, 0x7F);
353                self.c_empty_look(prog::EmptyLook::NotWordBoundary)
354            }
355            WordBoundary(hir::WordBoundary::Ascii) => {
356                self.byte_classes.set_word_boundary();
357                self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
358            }
359            WordBoundary(hir::WordBoundary::AsciiNegate) => {
360                self.byte_classes.set_word_boundary();
361                self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
362            }
363            Group(ref g) => match g.kind {
364                hir::GroupKind::NonCapturing => self.c(&g.hir),
365                hir::GroupKind::CaptureIndex(index) => {
366                    if index as usize >= self.compiled.captures.len() {
367                        self.compiled.captures.push(None);
368                    }
369                    self.c_capture(2 * index as usize, &g.hir)
370                }
371                hir::GroupKind::CaptureName { index, ref name } => {
372                    if index as usize >= self.compiled.captures.len() {
373                        let n = name.to_string();
374                        self.compiled.captures.push(Some(n.clone()));
375                        self.capture_name_idx.insert(n, index as usize);
376                    }
377                    self.c_capture(2 * index as usize, &g.hir)
378                }
379            },
380            Concat(ref es) => {
381                if self.compiled.is_reverse {
382                    self.c_concat(es.iter().rev())
383                } else {
384                    self.c_concat(es)
385                }
386            }
387            Alternation(ref es) => self.c_alternate(&**es),
388            Repetition(ref rep) => self.c_repeat(rep),
389        }
390    }
391
392    fn c_empty(&mut self) -> ResultOrEmpty {
393        // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
394        // See: CVE-2022-24713
395        //
396        // Since 'empty' sub-expressions don't increase the size of
397        // the actual compiled object, we "fake" an increase in its
398        // size so that our 'check_size_limit' routine will eventually
399        // stop compilation if there are too many empty sub-expressions
400        // (e.g., via a large repetition).
401        self.extra_inst_bytes += std::mem::size_of::<Inst>();
402        Ok(None)
403    }
404
405    fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
406        if self.num_exprs > 1 || self.compiled.is_dfa {
407            // Don't ever compile Save instructions for regex sets because
408            // they are never used. They are also never used in DFA programs
409            // because DFAs can't handle captures.
410            self.c(expr)
411        } else {
412            let entry = self.insts.len();
413            let hole = self.push_hole(InstHole::Save { slot: first_slot });
414            let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
415            self.fill(hole, patch.entry);
416            self.fill_to_next(patch.hole);
417            let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
418            Ok(Some(Patch { hole, entry }))
419        }
420    }
421
422    fn c_dotstar(&mut self) -> Result {
423        Ok(if !self.compiled.only_utf8() {
424            self.c(&Hir::repetition(hir::Repetition {
425                kind: hir::RepetitionKind::ZeroOrMore,
426                greedy: false,
427                hir: Box::new(Hir::any(true)),
428            }))?
429            .unwrap()
430        } else {
431            self.c(&Hir::repetition(hir::Repetition {
432                kind: hir::RepetitionKind::ZeroOrMore,
433                greedy: false,
434                hir: Box::new(Hir::any(false)),
435            }))?
436            .unwrap()
437        })
438    }
439
440    fn c_char(&mut self, c: char) -> ResultOrEmpty {
441        if self.compiled.uses_bytes() {
442            if c.is_ascii() {
443                let b = c as u8;
444                let hole =
445                    self.push_hole(InstHole::Bytes { start: b, end: b });
446                self.byte_classes.set_range(b, b);
447                Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
448            } else {
449                self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
450            }
451        } else {
452            let hole = self.push_hole(InstHole::Char { c });
453            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
454        }
455    }
456
457    fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
458        use std::mem::size_of;
459
460        assert!(!ranges.is_empty());
461        if self.compiled.uses_bytes() {
462            Ok(Some(CompileClass { c: self, ranges }.compile()?))
463        } else {
464            let ranges: Vec<(char, char)> =
465                ranges.iter().map(|r| (r.start(), r.end())).collect();
466            let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
467                self.push_hole(InstHole::Char { c: ranges[0].0 })
468            } else {
469                self.extra_inst_bytes +=
470                    ranges.len() * (size_of::<char>() * 2);
471                self.push_hole(InstHole::Ranges { ranges })
472            };
473            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
474        }
475    }
476
477    fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
478        self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
479    }
480
481    fn c_class_bytes(
482        &mut self,
483        ranges: &[hir::ClassBytesRange],
484    ) -> ResultOrEmpty {
485        debug_assert!(!ranges.is_empty());
486
487        let first_split_entry = self.insts.len();
488        let mut holes = vec![];
489        let mut prev_hole = Hole::None;
490        for r in &ranges[0..ranges.len() - 1] {
491            self.fill_to_next(prev_hole);
492            let split = self.push_split_hole();
493            let next = self.insts.len();
494            self.byte_classes.set_range(r.start(), r.end());
495            holes.push(self.push_hole(InstHole::Bytes {
496                start: r.start(),
497                end: r.end(),
498            }));
499            prev_hole = self.fill_split(split, Some(next), None);
500        }
501        let next = self.insts.len();
502        let r = &ranges[ranges.len() - 1];
503        self.byte_classes.set_range(r.start(), r.end());
504        holes.push(
505            self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
506        );
507        self.fill(prev_hole, next);
508        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
509    }
510
511    fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
512        let hole = self.push_hole(InstHole::EmptyLook { look });
513        Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
514    }
515
516    fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
517    where
518        I: IntoIterator<Item = &'a Hir>,
519    {
520        let mut exprs = exprs.into_iter();
521        let Patch { mut hole, entry } = loop {
522            match exprs.next() {
523                None => return self.c_empty(),
524                Some(e) => {
525                    if let Some(p) = self.c(e)? {
526                        break p;
527                    }
528                }
529            }
530        };
531        for e in exprs {
532            if let Some(p) = self.c(e)? {
533                self.fill(hole, p.entry);
534                hole = p.hole;
535            }
536        }
537        Ok(Some(Patch { hole, entry }))
538    }
539
540    fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
541        debug_assert!(
542            exprs.len() >= 2,
543            "alternates must have at least 2 exprs"
544        );
545
546        // Initial entry point is always the first split.
547        let first_split_entry = self.insts.len();
548
549        // Save up all of the holes from each alternate. They will all get
550        // patched to point to the same location.
551        let mut holes = vec![];
552
553        // true indicates that the hole is a split where we want to fill
554        // the second branch.
555        let mut prev_hole = (Hole::None, false);
556        for e in &exprs[0..exprs.len() - 1] {
557            if prev_hole.1 {
558                let next = self.insts.len();
559                self.fill_split(prev_hole.0, None, Some(next));
560            } else {
561                self.fill_to_next(prev_hole.0);
562            }
563            let split = self.push_split_hole();
564            if let Some(Patch { hole, entry }) = self.c(e)? {
565                holes.push(hole);
566                prev_hole = (self.fill_split(split, Some(entry), None), false);
567            } else {
568                let (split1, split2) = split.dup_one();
569                holes.push(split1);
570                prev_hole = (split2, true);
571            }
572        }
573        if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
574            holes.push(hole);
575            if prev_hole.1 {
576                self.fill_split(prev_hole.0, None, Some(entry));
577            } else {
578                self.fill(prev_hole.0, entry);
579            }
580        } else {
581            // We ignore prev_hole.1. When it's true, it means we have two
582            // empty branches both pushing prev_hole.0 into holes, so both
583            // branches will go to the same place anyway.
584            holes.push(prev_hole.0);
585        }
586        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
587    }
588
589    fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
590        use regex_syntax::hir::RepetitionKind::*;
591        match rep.kind {
592            ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
593            ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
594            OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
595            Range(hir::RepetitionRange::Exactly(min_max)) => {
596                self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
597            }
598            Range(hir::RepetitionRange::AtLeast(min)) => {
599                self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
600            }
601            Range(hir::RepetitionRange::Bounded(min, max)) => {
602                self.c_repeat_range(&rep.hir, rep.greedy, min, max)
603            }
604        }
605    }
606
607    fn c_repeat_zero_or_one(
608        &mut self,
609        expr: &Hir,
610        greedy: bool,
611    ) -> ResultOrEmpty {
612        let split_entry = self.insts.len();
613        let split = self.push_split_hole();
614        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
615            Some(p) => p,
616            None => return self.pop_split_hole(),
617        };
618        let split_hole = if greedy {
619            self.fill_split(split, Some(entry_rep), None)
620        } else {
621            self.fill_split(split, None, Some(entry_rep))
622        };
623        let holes = vec![hole_rep, split_hole];
624        Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
625    }
626
627    fn c_repeat_zero_or_more(
628        &mut self,
629        expr: &Hir,
630        greedy: bool,
631    ) -> ResultOrEmpty {
632        let split_entry = self.insts.len();
633        let split = self.push_split_hole();
634        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
635            Some(p) => p,
636            None => return self.pop_split_hole(),
637        };
638
639        self.fill(hole_rep, split_entry);
640        let split_hole = if greedy {
641            self.fill_split(split, Some(entry_rep), None)
642        } else {
643            self.fill_split(split, None, Some(entry_rep))
644        };
645        Ok(Some(Patch { hole: split_hole, entry: split_entry }))
646    }
647
648    fn c_repeat_one_or_more(
649        &mut self,
650        expr: &Hir,
651        greedy: bool,
652    ) -> ResultOrEmpty {
653        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
654            Some(p) => p,
655            None => return Ok(None),
656        };
657        self.fill_to_next(hole_rep);
658        let split = self.push_split_hole();
659
660        let split_hole = if greedy {
661            self.fill_split(split, Some(entry_rep), None)
662        } else {
663            self.fill_split(split, None, Some(entry_rep))
664        };
665        Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
666    }
667
668    fn c_repeat_range_min_or_more(
669        &mut self,
670        expr: &Hir,
671        greedy: bool,
672        min: u32,
673    ) -> ResultOrEmpty {
674        let min = u32_to_usize(min);
675        // Using next_inst() is ok, because we can't return it (concat would
676        // have to return Some(_) while c_repeat_range_min_or_more returns
677        // None).
678        let patch_concat = self
679            .c_concat(iter::repeat(expr).take(min))?
680            .unwrap_or_else(|| self.next_inst());
681        if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
682            self.fill(patch_concat.hole, patch_rep.entry);
683            Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
684        } else {
685            Ok(None)
686        }
687    }
688
689    fn c_repeat_range(
690        &mut self,
691        expr: &Hir,
692        greedy: bool,
693        min: u32,
694        max: u32,
695    ) -> ResultOrEmpty {
696        let (min, max) = (u32_to_usize(min), u32_to_usize(max));
697        debug_assert!(min <= max);
698        let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
699        if min == max {
700            return Ok(patch_concat);
701        }
702        // Same reasoning as in c_repeat_range_min_or_more (we know that min <
703        // max at this point).
704        let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
705        let initial_entry = patch_concat.entry;
706        // It is much simpler to compile, e.g., `a{2,5}` as:
707        //
708        //     aaa?a?a?
709        //
710        // But you end up with a sequence of instructions like this:
711        //
712        //     0: 'a'
713        //     1: 'a',
714        //     2: split(3, 4)
715        //     3: 'a'
716        //     4: split(5, 6)
717        //     5: 'a'
718        //     6: split(7, 8)
719        //     7: 'a'
720        //     8: MATCH
721        //
722        // This is *incredibly* inefficient because the splits end
723        // up forming a chain, which has to be resolved everything a
724        // transition is followed.
725        let mut holes = vec![];
726        let mut prev_hole = patch_concat.hole;
727        for _ in min..max {
728            self.fill_to_next(prev_hole);
729            let split = self.push_split_hole();
730            let Patch { hole, entry } = match self.c(expr)? {
731                Some(p) => p,
732                None => return self.pop_split_hole(),
733            };
734            prev_hole = hole;
735            if greedy {
736                holes.push(self.fill_split(split, Some(entry), None));
737            } else {
738                holes.push(self.fill_split(split, None, Some(entry)));
739            }
740        }
741        holes.push(prev_hole);
742        Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
743    }
744
745    /// Can be used as a default value for the c_* functions when the call to
746    /// c_function is followed by inserting at least one instruction that is
747    /// always executed after the ones written by the c* function.
748    fn next_inst(&self) -> Patch {
749        Patch { hole: Hole::None, entry: self.insts.len() }
750    }
751
752    fn fill(&mut self, hole: Hole, goto: InstPtr) {
753        match hole {
754            Hole::None => {}
755            Hole::One(pc) => {
756                self.insts[pc].fill(goto);
757            }
758            Hole::Many(holes) => {
759                for hole in holes {
760                    self.fill(hole, goto);
761                }
762            }
763        }
764    }
765
766    fn fill_to_next(&mut self, hole: Hole) {
767        let next = self.insts.len();
768        self.fill(hole, next);
769    }
770
771    fn fill_split(
772        &mut self,
773        hole: Hole,
774        goto1: Option<InstPtr>,
775        goto2: Option<InstPtr>,
776    ) -> Hole {
777        match hole {
778            Hole::None => Hole::None,
779            Hole::One(pc) => match (goto1, goto2) {
780                (Some(goto1), Some(goto2)) => {
781                    self.insts[pc].fill_split(goto1, goto2);
782                    Hole::None
783                }
784                (Some(goto1), None) => {
785                    self.insts[pc].half_fill_split_goto1(goto1);
786                    Hole::One(pc)
787                }
788                (None, Some(goto2)) => {
789                    self.insts[pc].half_fill_split_goto2(goto2);
790                    Hole::One(pc)
791                }
792                (None, None) => unreachable!(
793                    "at least one of the split \
794                     holes must be filled"
795                ),
796            },
797            Hole::Many(holes) => {
798                let mut new_holes = vec![];
799                for hole in holes {
800                    new_holes.push(self.fill_split(hole, goto1, goto2));
801                }
802                if new_holes.is_empty() {
803                    Hole::None
804                } else if new_holes.len() == 1 {
805                    new_holes.pop().unwrap()
806                } else {
807                    Hole::Many(new_holes)
808                }
809            }
810        }
811    }
812
813    fn push_compiled(&mut self, inst: Inst) {
814        self.insts.push(MaybeInst::Compiled(inst));
815    }
816
817    fn push_hole(&mut self, inst: InstHole) -> Hole {
818        let hole = self.insts.len();
819        self.insts.push(MaybeInst::Uncompiled(inst));
820        Hole::One(hole)
821    }
822
823    fn push_split_hole(&mut self) -> Hole {
824        let hole = self.insts.len();
825        self.insts.push(MaybeInst::Split);
826        Hole::One(hole)
827    }
828
829    fn pop_split_hole(&mut self) -> ResultOrEmpty {
830        self.insts.pop();
831        Ok(None)
832    }
833
834    fn check_size(&self) -> result::Result<(), Error> {
835        use std::mem::size_of;
836
837        let size =
838            self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
839        if size > self.size_limit {
840            Err(Error::CompiledTooBig(self.size_limit))
841        } else {
842            Ok(())
843        }
844    }
845}
846
847#[derive(Debug)]
848enum Hole {
849    None,
850    One(InstPtr),
851    Many(Vec<Hole>),
852}
853
854impl Hole {
855    fn dup_one(self) -> (Self, Self) {
856        match self {
857            Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
858            Hole::None | Hole::Many(_) => {
859                unreachable!("must be called on single hole")
860            }
861        }
862    }
863}
864
865#[derive(Clone, Debug)]
866enum MaybeInst {
867    Compiled(Inst),
868    Uncompiled(InstHole),
869    Split,
870    Split1(InstPtr),
871    Split2(InstPtr),
872}
873
874impl MaybeInst {
875    fn fill(&mut self, goto: InstPtr) {
876        let maybeinst = match *self {
877            MaybeInst::Split => MaybeInst::Split1(goto),
878            MaybeInst::Uncompiled(ref inst) => {
879                MaybeInst::Compiled(inst.fill(goto))
880            }
881            MaybeInst::Split1(goto1) => {
882                MaybeInst::Compiled(Inst::Split(InstSplit {
883                    goto1,
884                    goto2: goto,
885                }))
886            }
887            MaybeInst::Split2(goto2) => {
888                MaybeInst::Compiled(Inst::Split(InstSplit {
889                    goto1: goto,
890                    goto2,
891                }))
892            }
893            _ => unreachable!(
894                "not all instructions were compiled! \
895                 found uncompiled instruction: {:?}",
896                self
897            ),
898        };
899        *self = maybeinst;
900    }
901
902    fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
903        let filled = match *self {
904            MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
905            _ => unreachable!(
906                "must be called on Split instruction, \
907                 instead it was called on: {:?}",
908                self
909            ),
910        };
911        *self = MaybeInst::Compiled(filled);
912    }
913
914    fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
915        let half_filled = match *self {
916            MaybeInst::Split => goto1,
917            _ => unreachable!(
918                "must be called on Split instruction, \
919                 instead it was called on: {:?}",
920                self
921            ),
922        };
923        *self = MaybeInst::Split1(half_filled);
924    }
925
926    fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
927        let half_filled = match *self {
928            MaybeInst::Split => goto2,
929            _ => unreachable!(
930                "must be called on Split instruction, \
931                 instead it was called on: {:?}",
932                self
933            ),
934        };
935        *self = MaybeInst::Split2(half_filled);
936    }
937
938    fn unwrap(self) -> Inst {
939        match self {
940            MaybeInst::Compiled(inst) => inst,
941            _ => unreachable!(
942                "must be called on a compiled instruction, \
943                 instead it was called on: {:?}",
944                self
945            ),
946        }
947    }
948}
949
950#[derive(Clone, Debug)]
951enum InstHole {
952    Save { slot: usize },
953    EmptyLook { look: EmptyLook },
954    Char { c: char },
955    Ranges { ranges: Vec<(char, char)> },
956    Bytes { start: u8, end: u8 },
957}
958
959impl InstHole {
960    fn fill(&self, goto: InstPtr) -> Inst {
961        match *self {
962            InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
963            InstHole::EmptyLook { look } => {
964                Inst::EmptyLook(InstEmptyLook { goto, look })
965            }
966            InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
967            InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
968                goto,
969                ranges: ranges.clone().into_boxed_slice(),
970            }),
971            InstHole::Bytes { start, end } => {
972                Inst::Bytes(InstBytes { goto, start, end })
973            }
974        }
975    }
976}
977
978struct CompileClass<'a, 'b> {
979    c: &'a mut Compiler,
980    ranges: &'b [hir::ClassUnicodeRange],
981}
982
983impl<'a, 'b> CompileClass<'a, 'b> {
984    fn compile(mut self) -> Result {
985        let mut holes = vec![];
986        let mut initial_entry = None;
987        let mut last_split = Hole::None;
988        let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
989        self.c.suffix_cache.clear();
990
991        for (i, range) in self.ranges.iter().enumerate() {
992            let is_last_range = i + 1 == self.ranges.len();
993            utf8_seqs.reset(range.start(), range.end());
994            let mut it = (&mut utf8_seqs).peekable();
995            loop {
996                let utf8_seq = match it.next() {
997                    None => break,
998                    Some(utf8_seq) => utf8_seq,
999                };
1000                if is_last_range && it.peek().is_none() {
1001                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1002                    holes.push(hole);
1003                    self.c.fill(last_split, entry);
1004                    last_split = Hole::None;
1005                    if initial_entry.is_none() {
1006                        initial_entry = Some(entry);
1007                    }
1008                } else {
1009                    if initial_entry.is_none() {
1010                        initial_entry = Some(self.c.insts.len());
1011                    }
1012                    self.c.fill_to_next(last_split);
1013                    last_split = self.c.push_split_hole();
1014                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1015                    holes.push(hole);
1016                    last_split =
1017                        self.c.fill_split(last_split, Some(entry), None);
1018                }
1019            }
1020        }
1021        self.c.utf8_seqs = Some(utf8_seqs);
1022        Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
1023    }
1024
1025    fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
1026        if self.c.compiled.is_reverse {
1027            self.c_utf8_seq_(seq)
1028        } else {
1029            self.c_utf8_seq_(seq.into_iter().rev())
1030        }
1031    }
1032
1033    fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
1034    where
1035        I: IntoIterator<Item = &'r Utf8Range>,
1036    {
1037        // The initial instruction for each UTF-8 sequence should be the same.
1038        let mut from_inst = ::std::usize::MAX;
1039        let mut last_hole = Hole::None;
1040        for byte_range in seq {
1041            let key = SuffixCacheKey {
1042                from_inst,
1043                start: byte_range.start,
1044                end: byte_range.end,
1045            };
1046            {
1047                let pc = self.c.insts.len();
1048                if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1049                    from_inst = cached_pc;
1050                    continue;
1051                }
1052            }
1053            self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1054            if from_inst == ::std::usize::MAX {
1055                last_hole = self.c.push_hole(InstHole::Bytes {
1056                    start: byte_range.start,
1057                    end: byte_range.end,
1058                });
1059            } else {
1060                self.c.push_compiled(Inst::Bytes(InstBytes {
1061                    goto: from_inst,
1062                    start: byte_range.start,
1063                    end: byte_range.end,
1064                }));
1065            }
1066            from_inst = self.c.insts.len().checked_sub(1).unwrap();
1067            debug_assert!(from_inst < ::std::usize::MAX);
1068        }
1069        debug_assert!(from_inst < ::std::usize::MAX);
1070        Ok(Patch { hole: last_hole, entry: from_inst })
1071    }
1072}
1073
1074/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1075/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1076/// The set of byte ranges looks like this:
1077///
1078/// [0-7F]
1079/// [C2-DF][80-BF]
1080/// [E0][A0-BF][80-BF]
1081/// [E1-EC][80-BF][80-BF]
1082/// [ED][80-9F][80-BF]
1083/// [EE-EF][80-BF][80-BF]
1084///
1085/// Each line above translates to one alternate in the compiled regex program.
1086/// However, all but one of the alternates end in the same suffix, which is
1087/// a waste of an instruction. The suffix cache facilitates reusing them across
1088/// alternates.
1089///
1090/// Note that a HashMap could be trivially used for this, but we don't need its
1091/// overhead. Some small bounded space (LRU style) is more than enough.
1092///
1093/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1094/// except it uses hashes as original indices and then compares full keys for
1095/// validation against `dense` array.
1096#[derive(Debug)]
1097struct SuffixCache {
1098    sparse: Box<[usize]>,
1099    dense: Vec<SuffixCacheEntry>,
1100}
1101
1102#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1103struct SuffixCacheEntry {
1104    key: SuffixCacheKey,
1105    pc: InstPtr,
1106}
1107
1108#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1109struct SuffixCacheKey {
1110    from_inst: InstPtr,
1111    start: u8,
1112    end: u8,
1113}
1114
1115impl SuffixCache {
1116    fn new(size: usize) -> Self {
1117        SuffixCache {
1118            sparse: vec![0usize; size].into(),
1119            dense: Vec::with_capacity(size),
1120        }
1121    }
1122
1123    fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1124        let hash = self.hash(&key);
1125        let pos = &mut self.sparse[hash];
1126        if let Some(entry) = self.dense.get(*pos) {
1127            if entry.key == key {
1128                return Some(entry.pc);
1129            }
1130        }
1131        *pos = self.dense.len();
1132        self.dense.push(SuffixCacheEntry { key, pc });
1133        None
1134    }
1135
1136    fn clear(&mut self) {
1137        self.dense.clear();
1138    }
1139
1140    fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1141        // Basic FNV-1a hash as described:
1142        // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1143        const FNV_PRIME: u64 = 1_099_511_628_211;
1144        let mut h = 14_695_981_039_346_656_037;
1145        h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1146        h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1147        h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1148        (h as usize) % self.sparse.len()
1149    }
1150}
1151
1152struct ByteClassSet([bool; 256]);
1153
1154impl ByteClassSet {
1155    fn new() -> Self {
1156        ByteClassSet([false; 256])
1157    }
1158
1159    fn set_range(&mut self, start: u8, end: u8) {
1160        debug_assert!(start <= end);
1161        if start > 0 {
1162            self.0[start as usize - 1] = true;
1163        }
1164        self.0[end as usize] = true;
1165    }
1166
1167    fn set_word_boundary(&mut self) {
1168        // We need to mark all ranges of bytes whose pairs result in
1169        // evaluating \b differently.
1170        let iswb = is_word_byte;
1171        let mut b1: u16 = 0;
1172        let mut b2: u16;
1173        while b1 <= 255 {
1174            b2 = b1 + 1;
1175            while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1176                b2 += 1;
1177            }
1178            self.set_range(b1 as u8, (b2 - 1) as u8);
1179            b1 = b2;
1180        }
1181    }
1182
1183    fn byte_classes(&self) -> Vec<u8> {
1184        // N.B. If you're debugging the DFA, it's useful to simply return
1185        // `(0..256).collect()`, which effectively removes the byte classes
1186        // and makes the transitions easier to read.
1187        // (0usize..256).map(|x| x as u8).collect()
1188        let mut byte_classes = vec![0; 256];
1189        let mut class = 0u8;
1190        let mut i = 0;
1191        loop {
1192            byte_classes[i] = class as u8;
1193            if i >= 255 {
1194                break;
1195            }
1196            if self.0[i] {
1197                class = class.checked_add(1).unwrap();
1198            }
1199            i += 1;
1200        }
1201        byte_classes
1202    }
1203}
1204
1205impl fmt::Debug for ByteClassSet {
1206    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1207        f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
1208    }
1209}
1210
1211fn u32_to_usize(n: u32) -> usize {
1212    // In case usize is less than 32 bits, we need to guard against overflow.
1213    // On most platforms this compiles to nothing.
1214    // TODO Use `std::convert::TryFrom` once it's stable.
1215    if (n as u64) > (::std::usize::MAX as u64) {
1216        panic!("BUG: {} is too big to be pointer sized", n)
1217    }
1218    n as usize
1219}
1220
1221#[cfg(test)]
1222mod tests {
1223    use super::ByteClassSet;
1224
1225    #[test]
1226    fn byte_classes() {
1227        let mut set = ByteClassSet::new();
1228        set.set_range(b'a', b'z');
1229        let classes = set.byte_classes();
1230        assert_eq!(classes[0], 0);
1231        assert_eq!(classes[1], 0);
1232        assert_eq!(classes[2], 0);
1233        assert_eq!(classes[b'a' as usize - 1], 0);
1234        assert_eq!(classes[b'a' as usize], 1);
1235        assert_eq!(classes[b'm' as usize], 1);
1236        assert_eq!(classes[b'z' as usize], 1);
1237        assert_eq!(classes[b'z' as usize + 1], 2);
1238        assert_eq!(classes[254], 2);
1239        assert_eq!(classes[255], 2);
1240
1241        let mut set = ByteClassSet::new();
1242        set.set_range(0, 2);
1243        set.set_range(4, 6);
1244        let classes = set.byte_classes();
1245        assert_eq!(classes[0], 0);
1246        assert_eq!(classes[1], 0);
1247        assert_eq!(classes[2], 0);
1248        assert_eq!(classes[3], 1);
1249        assert_eq!(classes[4], 2);
1250        assert_eq!(classes[5], 2);
1251        assert_eq!(classes[6], 2);
1252        assert_eq!(classes[7], 3);
1253        assert_eq!(classes[255], 3);
1254    }
1255
1256    #[test]
1257    fn full_byte_classes() {
1258        let mut set = ByteClassSet::new();
1259        for i in 0..256u16 {
1260            set.set_range(i as u8, i as u8);
1261        }
1262        assert_eq!(set.byte_classes().len(), 256);
1263    }
1264}
regex/compile.rs

regex/
compile.rs