regex/pikevm.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
// This module implements the Pike VM. That is, it guarantees linear time
// search of a regex on any text with memory use proportional to the size of
// the regex.
//
// It is equal in power to the backtracking engine in this crate, except the
// backtracking engine is typically faster on small regexes/texts at the
// expense of a bigger memory footprint.
//
// It can do more than the DFA can (specifically, record capture locations
// and execute Unicode word boundary assertions), but at a slower speed.
// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
// epsilon transitions. That is, the Pike VM engine can be in multiple states
// at once where as the DFA is only ever in one state at a time.
//
// Therefore, the Pike VM is generally treated as the fallback when the other
// matching engines either aren't feasible to run or are insufficient.
use std::mem;
use crate::exec::ProgramCache;
use crate::input::{Input, InputAt};
use crate::prog::{InstPtr, Program};
use crate::re_trait::Slot;
use crate::sparse::SparseSet;
/// An NFA simulation matching engine.
#[derive(Debug)]
pub struct Fsm<'r, I> {
/// The sequence of opcodes (among other things) that is actually executed.
///
/// The program may be byte oriented or Unicode codepoint oriented.
prog: &'r Program,
/// An explicit stack used for following epsilon transitions. (This is
/// borrowed from the cache.)
stack: &'r mut Vec<FollowEpsilon>,
/// The input to search.
input: I,
}
/// A cached allocation that can be reused on each execution.
#[derive(Clone, Debug)]
pub struct Cache {
/// A pair of ordered sets for tracking NFA states.
clist: Threads,
nlist: Threads,
/// An explicit stack used for following epsilon transitions.
stack: Vec<FollowEpsilon>,
}
/// An ordered set of NFA states and their captures.
#[derive(Clone, Debug)]
struct Threads {
/// An ordered set of opcodes (each opcode is an NFA state).
set: SparseSet,
/// Captures for every NFA state.
///
/// It is stored in row-major order, where the columns are the capture
/// slots and the rows are the states.
caps: Vec<Slot>,
/// The number of capture slots stored per thread. (Every capture has
/// two slots.)
slots_per_thread: usize,
}
/// A representation of an explicit stack frame when following epsilon
/// transitions. This is used to avoid recursion.
#[derive(Clone, Debug)]
enum FollowEpsilon {
/// Follow transitions at the given instruction pointer.
IP(InstPtr),
/// Restore the capture slot with the given position in the input.
Capture { slot: usize, pos: Slot },
}
impl Cache {
/// Create a new allocation used by the NFA machine to record execution
/// and captures.
pub fn new(_prog: &Program) -> Self {
Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
}
}
impl<'r, I: Input> Fsm<'r, I> {
/// Execute the NFA matching engine.
///
/// If there's a match, `exec` returns `true` and populates the given
/// captures accordingly.
pub fn exec(
prog: &'r Program,
cache: &ProgramCache,
matches: &mut [bool],
slots: &mut [Slot],
quit_after_match: bool,
input: I,
start: usize,
end: usize,
) -> bool {
let mut cache = cache.borrow_mut();
let cache = &mut cache.pikevm;
cache.clist.resize(prog.len(), prog.captures.len());
cache.nlist.resize(prog.len(), prog.captures.len());
let at = input.at(start);
Fsm { prog, stack: &mut cache.stack, input }.exec_(
&mut cache.clist,
&mut cache.nlist,
matches,
slots,
quit_after_match,
at,
end,
)
}
fn exec_(
&mut self,
mut clist: &mut Threads,
mut nlist: &mut Threads,
matches: &mut [bool],
slots: &mut [Slot],
quit_after_match: bool,
mut at: InputAt,
end: usize,
) -> bool {
let mut matched = false;
let mut all_matched = false;
clist.set.clear();
nlist.set.clear();
'LOOP: loop {
if clist.set.is_empty() {
// Three ways to bail out when our current set of threads is
// empty.
//
// 1. We have a match---so we're done exploring any possible
// alternatives. Time to quit. (We can't do this if we're
// looking for matches for multiple regexes, unless we know
// they all matched.)
//
// 2. If the expression starts with a '^' we can terminate as
// soon as the last thread dies.
if (matched && matches.len() <= 1)
|| all_matched
|| (!at.is_start() && self.prog.is_anchored_start)
{
break;
}
// 3. If there's a literal prefix for the program, try to
// jump ahead quickly. If it can't be found, then we can
// bail out early.
if !self.prog.prefixes.is_empty() {
at = match self.input.prefix_at(&self.prog.prefixes, at) {
None => break,
Some(at) => at,
};
}
}
// This simulates a preceding '.*?' for every regex by adding
// a state starting at the current position in the input for the
// beginning of the program only if we don't already have a match.
if clist.set.is_empty()
|| (!self.prog.is_anchored_start && !all_matched)
{
self.add(&mut clist, slots, 0, at);
}
// The previous call to "add" actually inspects the position just
// before the current character. For stepping through the machine,
// we can to look at the current character, so we advance the
// input.
let at_next = self.input.at(at.next_pos());
for i in 0..clist.set.len() {
let ip = clist.set[i];
if self.step(
&mut nlist,
matches,
slots,
clist.caps(ip),
ip,
at,
at_next,
) {
matched = true;
all_matched = all_matched || matches.iter().all(|&b| b);
if quit_after_match {
// If we only care if a match occurs (not its
// position), then we can quit right now.
break 'LOOP;
}
if self.prog.matches.len() == 1 {
// We don't need to check the rest of the threads
// in this set because we've matched something
// ("leftmost-first"). However, we still need to check
// threads in the next set to support things like
// greedy matching.
//
// This is only true on normal regexes. For regex sets,
// we need to mush on to observe other matches.
break;
}
}
}
if at.pos() >= end {
break;
}
at = at_next;
mem::swap(clist, nlist);
nlist.set.clear();
}
matched
}
/// Step through the input, one token (byte or codepoint) at a time.
///
/// nlist is the set of states that will be processed on the next token
/// in the input.
///
/// caps is the set of captures passed by the caller of the NFA. They are
/// written to only when a match state is visited.
///
/// thread_caps is the set of captures set for the current NFA state, ip.
///
/// at and at_next are the current and next positions in the input. at or
/// at_next may be EOF.
fn step(
&mut self,
nlist: &mut Threads,
matches: &mut [bool],
slots: &mut [Slot],
thread_caps: &mut [Option<usize>],
ip: usize,
at: InputAt,
at_next: InputAt,
) -> bool {
use crate::prog::Inst::*;
match self.prog[ip] {
Match(match_slot) => {
if match_slot < matches.len() {
matches[match_slot] = true;
}
for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
*slot = *val;
}
true
}
Char(ref inst) => {
if inst.c == at.char() {
self.add(nlist, thread_caps, inst.goto, at_next);
}
false
}
Ranges(ref inst) => {
if inst.matches(at.char()) {
self.add(nlist, thread_caps, inst.goto, at_next);
}
false
}
Bytes(ref inst) => {
if let Some(b) = at.byte() {
if inst.matches(b) {
self.add(nlist, thread_caps, inst.goto, at_next);
}
}
false
}
EmptyLook(_) | Save(_) | Split(_) => false,
}
}
/// Follows epsilon transitions and adds them for processing to nlist,
/// starting at and including ip.
fn add(
&mut self,
nlist: &mut Threads,
thread_caps: &mut [Option<usize>],
ip: usize,
at: InputAt,
) {
self.stack.push(FollowEpsilon::IP(ip));
while let Some(frame) = self.stack.pop() {
match frame {
FollowEpsilon::IP(ip) => {
self.add_step(nlist, thread_caps, ip, at);
}
FollowEpsilon::Capture { slot, pos } => {
thread_caps[slot] = pos;
}
}
}
}
/// A helper function for add that avoids excessive pushing to the stack.
fn add_step(
&mut self,
nlist: &mut Threads,
thread_caps: &mut [Option<usize>],
mut ip: usize,
at: InputAt,
) {
// Instead of pushing and popping to the stack, we mutate ip as we
// traverse the set of states. We only push to the stack when we
// absolutely need recursion (restoring captures or following a
// branch).
use crate::prog::Inst::*;
loop {
// Don't visit states we've already added.
if nlist.set.contains(ip) {
return;
}
nlist.set.insert(ip);
match self.prog[ip] {
EmptyLook(ref inst) => {
if self.input.is_empty_match(at, inst) {
ip = inst.goto;
}
}
Save(ref inst) => {
if inst.slot < thread_caps.len() {
self.stack.push(FollowEpsilon::Capture {
slot: inst.slot,
pos: thread_caps[inst.slot],
});
thread_caps[inst.slot] = Some(at.pos());
}
ip = inst.goto;
}
Split(ref inst) => {
self.stack.push(FollowEpsilon::IP(inst.goto2));
ip = inst.goto1;
}
Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
let t = &mut nlist.caps(ip);
for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
*slot = *val;
}
return;
}
}
}
}
}
impl Threads {
fn new() -> Self {
Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 }
}
fn resize(&mut self, num_insts: usize, ncaps: usize) {
if num_insts == self.set.capacity() {
return;
}
self.slots_per_thread = ncaps * 2;
self.set = SparseSet::new(num_insts);
self.caps = vec![None; self.slots_per_thread * num_insts];
}
fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
let i = pc * self.slots_per_thread;
&mut self.caps[i..i + self.slots_per_thread]
}
}