regex/
prog.rs

1use std::cmp::Ordering;
2use std::collections::HashMap;
3use std::fmt;
4use std::mem;
5use std::ops::Deref;
6use std::slice;
7use std::sync::Arc;
8
9use crate::input::Char;
10use crate::literal::LiteralSearcher;
11
12/// `InstPtr` represents the index of an instruction in a regex program.
13pub type InstPtr = usize;
14
15/// Program is a sequence of instructions and various facts about thos
16/// instructions.
17#[derive(Clone)]
18pub struct Program {
19    /// A sequence of instructions that represents an NFA.
20    pub insts: Vec<Inst>,
21    /// Pointers to each Match instruction in the sequence.
22    ///
23    /// This is always length 1 unless this program represents a regex set.
24    pub matches: Vec<InstPtr>,
25    /// The ordered sequence of all capture groups extracted from the AST.
26    /// Unnamed groups are `None`.
27    pub captures: Vec<Option<String>>,
28    /// Pointers to all named capture groups into `captures`.
29    pub capture_name_idx: Arc<HashMap<String, usize>>,
30    /// A pointer to the start instruction. This can vary depending on how
31    /// the program was compiled. For example, programs for use with the DFA
32    /// engine have a `.*?` inserted at the beginning of unanchored regular
33    /// expressions. The actual starting point of the program is after the
34    /// `.*?`.
35    pub start: InstPtr,
36    /// A set of equivalence classes for discriminating bytes in the compiled
37    /// program.
38    pub byte_classes: Vec<u8>,
39    /// When true, this program can only match valid UTF-8.
40    pub only_utf8: bool,
41    /// When true, this program uses byte range instructions instead of Unicode
42    /// range instructions.
43    pub is_bytes: bool,
44    /// When true, the program is compiled for DFA matching. For example, this
45    /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
46    /// regexes.
47    pub is_dfa: bool,
48    /// When true, the program matches text in reverse (for use only in the
49    /// DFA).
50    pub is_reverse: bool,
51    /// Whether the regex must match from the start of the input.
52    pub is_anchored_start: bool,
53    /// Whether the regex must match at the end of the input.
54    pub is_anchored_end: bool,
55    /// Whether this program contains a Unicode word boundary instruction.
56    pub has_unicode_word_boundary: bool,
57    /// A possibly empty machine for very quickly matching prefix literals.
58    pub prefixes: LiteralSearcher,
59    /// A limit on the size of the cache that the DFA is allowed to use while
60    /// matching.
61    ///
62    /// The cache limit specifies approximately how much space we're willing to
63    /// give to the state cache. Once the state cache exceeds the size, it is
64    /// wiped and all states must be re-computed.
65    ///
66    /// Note that this value does not impact correctness. It can be set to 0
67    /// and the DFA will run just fine. (It will only ever store exactly one
68    /// state in the cache, and will likely run very slowly, but it will work.)
69    ///
70    /// Also note that this limit is *per thread of execution*. That is,
71    /// if the same regex is used to search text across multiple threads
72    /// simultaneously, then the DFA cache is not shared. Instead, copies are
73    /// made.
74    pub dfa_size_limit: usize,
75}
76
77impl Program {
78    /// Creates an empty instruction sequence. Fields are given default
79    /// values.
80    pub fn new() -> Self {
81        Program {
82            insts: vec![],
83            matches: vec![],
84            captures: vec![],
85            capture_name_idx: Arc::new(HashMap::new()),
86            start: 0,
87            byte_classes: vec![0; 256],
88            only_utf8: true,
89            is_bytes: false,
90            is_dfa: false,
91            is_reverse: false,
92            is_anchored_start: false,
93            is_anchored_end: false,
94            has_unicode_word_boundary: false,
95            prefixes: LiteralSearcher::empty(),
96            dfa_size_limit: 2 * (1 << 20),
97        }
98    }
99
100    /// If pc is an index to a no-op instruction (like Save), then return the
101    /// next pc that is not a no-op instruction.
102    pub fn skip(&self, mut pc: usize) -> usize {
103        loop {
104            match self[pc] {
105                Inst::Save(ref i) => pc = i.goto,
106                _ => return pc,
107            }
108        }
109    }
110
111    /// Return true if and only if an execution engine at instruction `pc` will
112    /// always lead to a match.
113    pub fn leads_to_match(&self, pc: usize) -> bool {
114        if self.matches.len() > 1 {
115            // If we have a regex set, then we have more than one ending
116            // state, so leading to one of those states is generally
117            // meaningless.
118            return false;
119        }
120        match self[self.skip(pc)] {
121            Inst::Match(_) => true,
122            _ => false,
123        }
124    }
125
126    /// Returns true if the current configuration demands that an implicit
127    /// `.*?` be prepended to the instruction sequence.
128    pub fn needs_dotstar(&self) -> bool {
129        self.is_dfa && !self.is_reverse && !self.is_anchored_start
130    }
131
132    /// Returns true if this program uses Byte instructions instead of
133    /// Char/Range instructions.
134    pub fn uses_bytes(&self) -> bool {
135        self.is_bytes || self.is_dfa
136    }
137
138    /// Returns true if this program exclusively matches valid UTF-8 bytes.
139    ///
140    /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
141    pub fn only_utf8(&self) -> bool {
142        self.only_utf8
143    }
144
145    /// Return the approximate heap usage of this instruction sequence in
146    /// bytes.
147    pub fn approximate_size(&self) -> usize {
148        // The only instruction that uses heap space is Ranges (for
149        // Unicode codepoint programs) to store non-overlapping codepoint
150        // ranges. To keep this operation constant time, we ignore them.
151        (self.len() * mem::size_of::<Inst>())
152            + (self.matches.len() * mem::size_of::<InstPtr>())
153            + (self.captures.len() * mem::size_of::<Option<String>>())
154            + (self.capture_name_idx.len()
155                * (mem::size_of::<String>() + mem::size_of::<usize>()))
156            + (self.byte_classes.len() * mem::size_of::<u8>())
157            + self.prefixes.approximate_size()
158    }
159}
160
161impl Deref for Program {
162    type Target = [Inst];
163
164    #[cfg_attr(feature = "perf-inline", inline(always))]
165    fn deref(&self) -> &Self::Target {
166        &*self.insts
167    }
168}
169
170impl fmt::Debug for Program {
171    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
172        use self::Inst::*;
173
174        fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
175            if goto == cur + 1 {
176                fmtd
177            } else {
178                format!("{} (goto: {})", fmtd, goto)
179            }
180        }
181
182        fn visible_byte(b: u8) -> String {
183            use std::ascii::escape_default;
184            let escaped = escape_default(b).collect::<Vec<u8>>();
185            String::from_utf8_lossy(&escaped).into_owned()
186        }
187
188        for (pc, inst) in self.iter().enumerate() {
189            match *inst {
190                Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
191                Save(ref inst) => {
192                    let s = format!("{:04} Save({})", pc, inst.slot);
193                    write!(f, "{}", with_goto(pc, inst.goto, s))?;
194                }
195                Split(ref inst) => {
196                    write!(
197                        f,
198                        "{:04} Split({}, {})",
199                        pc, inst.goto1, inst.goto2
200                    )?;
201                }
202                EmptyLook(ref inst) => {
203                    let s = format!("{:?}", inst.look);
204                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
205                }
206                Char(ref inst) => {
207                    let s = format!("{:?}", inst.c);
208                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
209                }
210                Ranges(ref inst) => {
211                    let ranges = inst
212                        .ranges
213                        .iter()
214                        .map(|r| format!("{:?}-{:?}", r.0, r.1))
215                        .collect::<Vec<String>>()
216                        .join(", ");
217                    write!(
218                        f,
219                        "{:04} {}",
220                        pc,
221                        with_goto(pc, inst.goto, ranges)
222                    )?;
223                }
224                Bytes(ref inst) => {
225                    let s = format!(
226                        "Bytes({}, {})",
227                        visible_byte(inst.start),
228                        visible_byte(inst.end)
229                    );
230                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
231                }
232            }
233            if pc == self.start {
234                write!(f, " (start)")?;
235            }
236            writeln!(f)?;
237        }
238        Ok(())
239    }
240}
241
242impl<'a> IntoIterator for &'a Program {
243    type Item = &'a Inst;
244    type IntoIter = slice::Iter<'a, Inst>;
245    fn into_iter(self) -> Self::IntoIter {
246        self.iter()
247    }
248}
249
250/// Inst is an instruction code in a Regex program.
251///
252/// Regrettably, a regex program either contains Unicode codepoint
253/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
254/// A regex program can never contain both.
255///
256/// It would be worth investigating splitting this into two distinct types and
257/// then figuring out how to make the matching engines polymorphic over those
258/// types without sacrificing performance.
259///
260/// Other than the benefit of moving invariants into the type system, another
261/// benefit is the decreased size. If we remove the `Char` and `Ranges`
262/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to
263/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges`
264/// variant.) Given that byte based machines are typically much bigger than
265/// their Unicode analogues (because they can decode UTF-8 directly), this ends
266/// up being a pretty significant savings.
267#[derive(Clone, Debug)]
268pub enum Inst {
269    /// Match indicates that the program has reached a match state.
270    ///
271    /// The number in the match corresponds to the Nth logical regular
272    /// expression in this program. This index is always 0 for normal regex
273    /// programs. Values greater than 0 appear when compiling regex sets, and
274    /// each match instruction gets its own unique value. The value corresponds
275    /// to the Nth regex in the set.
276    Match(usize),
277    /// Save causes the program to save the current location of the input in
278    /// the slot indicated by InstSave.
279    Save(InstSave),
280    /// Split causes the program to diverge to one of two paths in the
281    /// program, preferring goto1 in InstSplit.
282    Split(InstSplit),
283    /// EmptyLook represents a zero-width assertion in a regex program. A
284    /// zero-width assertion does not consume any of the input text.
285    EmptyLook(InstEmptyLook),
286    /// Char requires the regex program to match the character in InstChar at
287    /// the current position in the input.
288    Char(InstChar),
289    /// Ranges requires the regex program to match the character at the current
290    /// position in the input with one of the ranges specified in InstRanges.
291    Ranges(InstRanges),
292    /// Bytes is like Ranges, except it expresses a single byte range. It is
293    /// used in conjunction with Split instructions to implement multi-byte
294    /// character classes.
295    Bytes(InstBytes),
296}
297
298impl Inst {
299    /// Returns true if and only if this is a match instruction.
300    pub fn is_match(&self) -> bool {
301        match *self {
302            Inst::Match(_) => true,
303            _ => false,
304        }
305    }
306}
307
308/// Representation of the Save instruction.
309#[derive(Clone, Debug)]
310pub struct InstSave {
311    /// The next location to execute in the program.
312    pub goto: InstPtr,
313    /// The capture slot (there are two slots for every capture in a regex,
314    /// including the zeroth capture for the entire match).
315    pub slot: usize,
316}
317
318/// Representation of the Split instruction.
319#[derive(Clone, Debug)]
320pub struct InstSplit {
321    /// The first instruction to try. A match resulting from following goto1
322    /// has precedence over a match resulting from following goto2.
323    pub goto1: InstPtr,
324    /// The second instruction to try. A match resulting from following goto1
325    /// has precedence over a match resulting from following goto2.
326    pub goto2: InstPtr,
327}
328
329/// Representation of the `EmptyLook` instruction.
330#[derive(Clone, Debug)]
331pub struct InstEmptyLook {
332    /// The next location to execute in the program if this instruction
333    /// succeeds.
334    pub goto: InstPtr,
335    /// The type of zero-width assertion to check.
336    pub look: EmptyLook,
337}
338
339/// The set of zero-width match instructions.
340#[derive(Clone, Copy, Debug, PartialEq, Eq)]
341pub enum EmptyLook {
342    /// Start of line or input.
343    StartLine,
344    /// End of line or input.
345    EndLine,
346    /// Start of input.
347    StartText,
348    /// End of input.
349    EndText,
350    /// Word character on one side and non-word character on other.
351    WordBoundary,
352    /// Word character on both sides or non-word character on both sides.
353    NotWordBoundary,
354    /// ASCII word boundary.
355    WordBoundaryAscii,
356    /// Not ASCII word boundary.
357    NotWordBoundaryAscii,
358}
359
360/// Representation of the Char instruction.
361#[derive(Clone, Debug)]
362pub struct InstChar {
363    /// The next location to execute in the program if this instruction
364    /// succeeds.
365    pub goto: InstPtr,
366    /// The character to test.
367    pub c: char,
368}
369
370/// Representation of the Ranges instruction.
371#[derive(Clone, Debug)]
372pub struct InstRanges {
373    /// The next location to execute in the program if this instruction
374    /// succeeds.
375    pub goto: InstPtr,
376    /// The set of Unicode scalar value ranges to test.
377    pub ranges: Box<[(char, char)]>,
378}
379
380impl InstRanges {
381    /// Tests whether the given input character matches this instruction.
382    pub fn matches(&self, c: Char) -> bool {
383        // This speeds up the `match_class_unicode` benchmark by checking
384        // some common cases quickly without binary search. e.g., Matching
385        // a Unicode class on predominantly ASCII text.
386        for r in self.ranges.iter().take(4) {
387            if c < r.0 {
388                return false;
389            }
390            if c <= r.1 {
391                return true;
392            }
393        }
394        self.ranges
395            .binary_search_by(|r| {
396                if r.1 < c {
397                    Ordering::Less
398                } else if r.0 > c {
399                    Ordering::Greater
400                } else {
401                    Ordering::Equal
402                }
403            })
404            .is_ok()
405    }
406
407    /// Return the number of distinct characters represented by all of the
408    /// ranges.
409    pub fn num_chars(&self) -> usize {
410        self.ranges
411            .iter()
412            .map(|&(s, e)| 1 + (e as u32) - (s as u32))
413            .sum::<u32>() as usize
414    }
415}
416
417/// Representation of the Bytes instruction.
418#[derive(Clone, Debug)]
419pub struct InstBytes {
420    /// The next location to execute in the program if this instruction
421    /// succeeds.
422    pub goto: InstPtr,
423    /// The start (inclusive) of this byte range.
424    pub start: u8,
425    /// The end (inclusive) of this byte range.
426    pub end: u8,
427}
428
429impl InstBytes {
430    /// Returns true if and only if the given byte is in this range.
431    pub fn matches(&self, byte: u8) -> bool {
432        self.start <= byte && byte <= self.end
433    }
434}
435
436#[cfg(test)]
437mod test {
438    #[test]
439    #[cfg(target_pointer_width = "64")]
440    fn test_size_of_inst() {
441        use std::mem::size_of;
442
443        use super::Inst;
444
445        assert_eq!(32, size_of::<Inst>());
446    }
447}