regex/
pikevm.rs

Help
1// This module implements the Pike VM. That is, it guarantees linear time
2// search of a regex on any text with memory use proportional to the size of
3// the regex.
4//
5// It is equal in power to the backtracking engine in this crate, except the
6// backtracking engine is typically faster on small regexes/texts at the
7// expense of a bigger memory footprint.
8//
9// It can do more than the DFA can (specifically, record capture locations
10// and execute Unicode word boundary assertions), but at a slower speed.
11// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
12// epsilon transitions. That is, the Pike VM engine can be in multiple states
13// at once where as the DFA is only ever in one state at a time.
14//
15// Therefore, the Pike VM is generally treated as the fallback when the other
16// matching engines either aren't feasible to run or are insufficient.
17
18use std::mem;
19
20use crate::exec::ProgramCache;
21use crate::input::{Input, InputAt};
22use crate::prog::{InstPtr, Program};
23use crate::re_trait::Slot;
24use crate::sparse::SparseSet;
25
26/// An NFA simulation matching engine.
27#[derive(Debug)]
28pub struct Fsm<'r, I> {
29    /// The sequence of opcodes (among other things) that is actually executed.
30    ///
31    /// The program may be byte oriented or Unicode codepoint oriented.
32    prog: &'r Program,
33    /// An explicit stack used for following epsilon transitions. (This is
34    /// borrowed from the cache.)
35    stack: &'r mut Vec<FollowEpsilon>,
36    /// The input to search.
37    input: I,
38}
39
40/// A cached allocation that can be reused on each execution.
41#[derive(Clone, Debug)]
42pub struct Cache {
43    /// A pair of ordered sets for tracking NFA states.
44    clist: Threads,
45    nlist: Threads,
46    /// An explicit stack used for following epsilon transitions.
47    stack: Vec<FollowEpsilon>,
48}
49
50/// An ordered set of NFA states and their captures.
51#[derive(Clone, Debug)]
52struct Threads {
53    /// An ordered set of opcodes (each opcode is an NFA state).
54    set: SparseSet,
55    /// Captures for every NFA state.
56    ///
57    /// It is stored in row-major order, where the columns are the capture
58    /// slots and the rows are the states.
59    caps: Vec<Slot>,
60    /// The number of capture slots stored per thread. (Every capture has
61    /// two slots.)
62    slots_per_thread: usize,
63}
64
65/// A representation of an explicit stack frame when following epsilon
66/// transitions. This is used to avoid recursion.
67#[derive(Clone, Debug)]
68enum FollowEpsilon {
69    /// Follow transitions at the given instruction pointer.
70    IP(InstPtr),
71    /// Restore the capture slot with the given position in the input.
72    Capture { slot: usize, pos: Slot },
73}
74
75impl Cache {
76    /// Create a new allocation used by the NFA machine to record execution
77    /// and captures.
78    pub fn new(_prog: &Program) -> Self {
79        Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
80    }
81}
82
83impl<'r, I: Input> Fsm<'r, I> {
84    /// Execute the NFA matching engine.
85    ///
86    /// If there's a match, `exec` returns `true` and populates the given
87    /// captures accordingly.
88    pub fn exec(
89        prog: &'r Program,
90        cache: &ProgramCache,
91        matches: &mut [bool],
92        slots: &mut [Slot],
93        quit_after_match: bool,
94        input: I,
95        start: usize,
96        end: usize,
97    ) -> bool {
98        let mut cache = cache.borrow_mut();
99        let cache = &mut cache.pikevm;
100        cache.clist.resize(prog.len(), prog.captures.len());
101        cache.nlist.resize(prog.len(), prog.captures.len());
102        let at = input.at(start);
103        Fsm { prog, stack: &mut cache.stack, input }.exec_(
104            &mut cache.clist,
105            &mut cache.nlist,
106            matches,
107            slots,
108            quit_after_match,
109            at,
110            end,
111        )
112    }
113
114    fn exec_(
115        &mut self,
116        mut clist: &mut Threads,
117        mut nlist: &mut Threads,
118        matches: &mut [bool],
119        slots: &mut [Slot],
120        quit_after_match: bool,
121        mut at: InputAt,
122        end: usize,
123    ) -> bool {
124        let mut matched = false;
125        let mut all_matched = false;
126        clist.set.clear();
127        nlist.set.clear();
128        'LOOP: loop {
129            if clist.set.is_empty() {
130                // Three ways to bail out when our current set of threads is
131                // empty.
132                //
133                // 1. We have a match---so we're done exploring any possible
134                //    alternatives. Time to quit. (We can't do this if we're
135                //    looking for matches for multiple regexes, unless we know
136                //    they all matched.)
137                //
138                // 2. If the expression starts with a '^' we can terminate as
139                //    soon as the last thread dies.
140                if (matched && matches.len() <= 1)
141                    || all_matched
142                    || (!at.is_start() && self.prog.is_anchored_start)
143                {
144                    break;
145                }
146
147                // 3. If there's a literal prefix for the program, try to
148                //    jump ahead quickly. If it can't be found, then we can
149                //    bail out early.
150                if !self.prog.prefixes.is_empty() {
151                    at = match self.input.prefix_at(&self.prog.prefixes, at) {
152                        None => break,
153                        Some(at) => at,
154                    };
155                }
156            }
157
158            // This simulates a preceding '.*?' for every regex by adding
159            // a state starting at the current position in the input for the
160            // beginning of the program only if we don't already have a match.
161            if clist.set.is_empty()
162                || (!self.prog.is_anchored_start && !all_matched)
163            {
164                self.add(&mut clist, slots, 0, at);
165            }
166            // The previous call to "add" actually inspects the position just
167            // before the current character. For stepping through the machine,
168            // we can to look at the current character, so we advance the
169            // input.
170            let at_next = self.input.at(at.next_pos());
171            for i in 0..clist.set.len() {
172                let ip = clist.set[i];
173                if self.step(
174                    &mut nlist,
175                    matches,
176                    slots,
177                    clist.caps(ip),
178                    ip,
179                    at,
180                    at_next,
181                ) {
182                    matched = true;
183                    all_matched = all_matched || matches.iter().all(|&b| b);
184                    if quit_after_match {
185                        // If we only care if a match occurs (not its
186                        // position), then we can quit right now.
187                        break 'LOOP;
188                    }
189                    if self.prog.matches.len() == 1 {
190                        // We don't need to check the rest of the threads
191                        // in this set because we've matched something
192                        // ("leftmost-first"). However, we still need to check
193                        // threads in the next set to support things like
194                        // greedy matching.
195                        //
196                        // This is only true on normal regexes. For regex sets,
197                        // we need to mush on to observe other matches.
198                        break;
199                    }
200                }
201            }
202            if at.pos() >= end {
203                break;
204            }
205            at = at_next;
206            mem::swap(clist, nlist);
207            nlist.set.clear();
208        }
209        matched
210    }
211
212    /// Step through the input, one token (byte or codepoint) at a time.
213    ///
214    /// nlist is the set of states that will be processed on the next token
215    /// in the input.
216    ///
217    /// caps is the set of captures passed by the caller of the NFA. They are
218    /// written to only when a match state is visited.
219    ///
220    /// thread_caps is the set of captures set for the current NFA state, ip.
221    ///
222    /// at and at_next are the current and next positions in the input. at or
223    /// at_next may be EOF.
224    fn step(
225        &mut self,
226        nlist: &mut Threads,
227        matches: &mut [bool],
228        slots: &mut [Slot],
229        thread_caps: &mut [Option<usize>],
230        ip: usize,
231        at: InputAt,
232        at_next: InputAt,
233    ) -> bool {
234        use crate::prog::Inst::*;
235        match self.prog[ip] {
236            Match(match_slot) => {
237                if match_slot < matches.len() {
238                    matches[match_slot] = true;
239                }
240                for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
241                    *slot = *val;
242                }
243                true
244            }
245            Char(ref inst) => {
246                if inst.c == at.char() {
247                    self.add(nlist, thread_caps, inst.goto, at_next);
248                }
249                false
250            }
251            Ranges(ref inst) => {
252                if inst.matches(at.char()) {
253                    self.add(nlist, thread_caps, inst.goto, at_next);
254                }
255                false
256            }
257            Bytes(ref inst) => {
258                if let Some(b) = at.byte() {
259                    if inst.matches(b) {
260                        self.add(nlist, thread_caps, inst.goto, at_next);
261                    }
262                }
263                false
264            }
265            EmptyLook(_) | Save(_) | Split(_) => false,
266        }
267    }
268
269    /// Follows epsilon transitions and adds them for processing to nlist,
270    /// starting at and including ip.
271    fn add(
272        &mut self,
273        nlist: &mut Threads,
274        thread_caps: &mut [Option<usize>],
275        ip: usize,
276        at: InputAt,
277    ) {
278        self.stack.push(FollowEpsilon::IP(ip));
279        while let Some(frame) = self.stack.pop() {
280            match frame {
281                FollowEpsilon::IP(ip) => {
282                    self.add_step(nlist, thread_caps, ip, at);
283                }
284                FollowEpsilon::Capture { slot, pos } => {
285                    thread_caps[slot] = pos;
286                }
287            }
288        }
289    }
290
291    /// A helper function for add that avoids excessive pushing to the stack.
292    fn add_step(
293        &mut self,
294        nlist: &mut Threads,
295        thread_caps: &mut [Option<usize>],
296        mut ip: usize,
297        at: InputAt,
298    ) {
299        // Instead of pushing and popping to the stack, we mutate ip as we
300        // traverse the set of states. We only push to the stack when we
301        // absolutely need recursion (restoring captures or following a
302        // branch).
303        use crate::prog::Inst::*;
304        loop {
305            // Don't visit states we've already added.
306            if nlist.set.contains(ip) {
307                return;
308            }
309            nlist.set.insert(ip);
310            match self.prog[ip] {
311                EmptyLook(ref inst) => {
312                    if self.input.is_empty_match(at, inst) {
313                        ip = inst.goto;
314                    }
315                }
316                Save(ref inst) => {
317                    if inst.slot < thread_caps.len() {
318                        self.stack.push(FollowEpsilon::Capture {
319                            slot: inst.slot,
320                            pos: thread_caps[inst.slot],
321                        });
322                        thread_caps[inst.slot] = Some(at.pos());
323                    }
324                    ip = inst.goto;
325                }
326                Split(ref inst) => {
327                    self.stack.push(FollowEpsilon::IP(inst.goto2));
328                    ip = inst.goto1;
329                }
330                Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
331                    let t = &mut nlist.caps(ip);
332                    for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
333                        *slot = *val;
334                    }
335                    return;
336                }
337            }
338        }
339    }
340}
341
342impl Threads {
343    fn new() -> Self {
344        Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
345    }
346
347    fn resize(&mut self, num_insts: usize, ncaps: usize) {
348        if num_insts == self.set.capacity() {
349            return;
350        }
351        self.slots_per_thread = ncaps * 2;
352        self.set = SparseSet::new(num_insts);
353        self.caps = vec![None; self.slots_per_thread * num_insts];
354    }
355
356    fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
357        let i = pc * self.slots_per_thread;
358        &mut self.caps[i..i + self.slots_per_thread]
359    }
360}
regex/pikevm.rs

regex/
pikevm.rs