1use std::collections::HashMap;
2use std::fmt;
3use std::iter;
4use std::result;
5use std::sync::Arc;
6
7use regex_syntax::hir::{self, Hir};
8use regex_syntax::is_word_byte;
9use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10
11use crate::prog::{
12 EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13 InstSave, InstSplit, Program,
14};
15
16use crate::Error;
17
18type Result = result::Result<Patch, Error>;
19type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20
21#[derive(Debug)]
22struct Patch {
23 hole: Hole,
24 entry: InstPtr,
25}
26
27#[allow(missing_debug_implementations)]
32pub struct Compiler {
33 insts: Vec<MaybeInst>,
34 compiled: Program,
35 capture_name_idx: HashMap<String, usize>,
36 num_exprs: usize,
37 size_limit: usize,
38 suffix_cache: SuffixCache,
39 utf8_seqs: Option<Utf8Sequences>,
40 byte_classes: ByteClassSet,
41 extra_inst_bytes: usize,
52}
53
54impl Compiler {
55 pub fn new() -> Self {
59 Compiler {
60 insts: vec![],
61 compiled: Program::new(),
62 capture_name_idx: HashMap::new(),
63 num_exprs: 0,
64 size_limit: 10 * (1 << 20),
65 suffix_cache: SuffixCache::new(1000),
66 utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
67 byte_classes: ByteClassSet::new(),
68 extra_inst_bytes: 0,
69 }
70 }
71
72 pub fn size_limit(mut self, size_limit: usize) -> Self {
76 self.size_limit = size_limit;
77 self
78 }
79
80 pub fn bytes(mut self, yes: bool) -> Self {
92 self.compiled.is_bytes = yes;
93 self
94 }
95
96 pub fn only_utf8(mut self, yes: bool) -> Self {
101 self.compiled.only_utf8 = yes;
102 self
103 }
104
105 pub fn dfa(mut self, yes: bool) -> Self {
113 self.compiled.is_dfa = yes;
114 self
115 }
116
117 pub fn reverse(mut self, yes: bool) -> Self {
120 self.compiled.is_reverse = yes;
121 self
122 }
123
124 pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
130 debug_assert!(!exprs.is_empty());
131 self.num_exprs = exprs.len();
132 if exprs.len() == 1 {
133 self.compile_one(&exprs[0])
134 } else {
135 self.compile_many(exprs)
136 }
137 }
138
139 fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
140 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
145 self.compiled.is_anchored_start = expr.is_anchored_start();
146 self.compiled.is_anchored_end = expr.is_anchored_end();
147 if self.compiled.needs_dotstar() {
148 dotstar_patch = self.c_dotstar()?;
149 self.compiled.start = dotstar_patch.entry;
150 }
151 self.compiled.captures = vec![None];
152 let patch =
153 self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
154 if self.compiled.needs_dotstar() {
155 self.fill(dotstar_patch.hole, patch.entry);
156 } else {
157 self.compiled.start = patch.entry;
158 }
159 self.fill_to_next(patch.hole);
160 self.compiled.matches = vec![self.insts.len()];
161 self.push_compiled(Inst::Match(0));
162 self.compile_finish()
163 }
164
165 fn compile_many(
166 mut self,
167 exprs: &[Hir],
168 ) -> result::Result<Program, Error> {
169 debug_assert!(exprs.len() > 1);
170
171 self.compiled.is_anchored_start =
172 exprs.iter().all(|e| e.is_anchored_start());
173 self.compiled.is_anchored_end =
174 exprs.iter().all(|e| e.is_anchored_end());
175 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
176 if self.compiled.needs_dotstar() {
177 dotstar_patch = self.c_dotstar()?;
178 self.compiled.start = dotstar_patch.entry;
179 } else {
180 self.compiled.start = 0; }
182 self.fill_to_next(dotstar_patch.hole);
183
184 let mut prev_hole = Hole::None;
185 for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
186 self.fill_to_next(prev_hole);
187 let split = self.push_split_hole();
188 let Patch { hole, entry } =
189 self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
190 self.fill_to_next(hole);
191 self.compiled.matches.push(self.insts.len());
192 self.push_compiled(Inst::Match(i));
193 prev_hole = self.fill_split(split, Some(entry), None);
194 }
195 let i = exprs.len() - 1;
196 let Patch { hole, entry } =
197 self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
198 self.fill(prev_hole, entry);
199 self.fill_to_next(hole);
200 self.compiled.matches.push(self.insts.len());
201 self.push_compiled(Inst::Match(i));
202 self.compile_finish()
203 }
204
205 fn compile_finish(mut self) -> result::Result<Program, Error> {
206 self.compiled.insts =
207 self.insts.into_iter().map(|inst| inst.unwrap()).collect();
208 self.compiled.byte_classes = self.byte_classes.byte_classes();
209 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
210 Ok(self.compiled)
211 }
212
213 fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
269 use crate::prog;
270 use regex_syntax::hir::HirKind::*;
271
272 self.check_size()?;
273 match *expr.kind() {
274 Empty => self.c_empty(),
275 Literal(hir::Literal::Unicode(c)) => self.c_char(c),
276 Literal(hir::Literal::Byte(b)) => {
277 assert!(self.compiled.uses_bytes());
278 self.c_byte(b)
279 }
280 Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
281 Class(hir::Class::Bytes(ref cls)) => {
282 if self.compiled.uses_bytes() {
283 self.c_class_bytes(cls.ranges())
284 } else {
285 assert!(cls.is_all_ascii());
286 let mut char_ranges = vec![];
287 for r in cls.iter() {
288 let (s, e) = (r.start() as char, r.end() as char);
289 char_ranges.push(hir::ClassUnicodeRange::new(s, e));
290 }
291 self.c_class(&char_ranges)
292 }
293 }
294 Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
295 self.byte_classes.set_range(b'\n', b'\n');
296 self.c_empty_look(prog::EmptyLook::EndLine)
297 }
298 Anchor(hir::Anchor::StartLine) => {
299 self.byte_classes.set_range(b'\n', b'\n');
300 self.c_empty_look(prog::EmptyLook::StartLine)
301 }
302 Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
303 self.byte_classes.set_range(b'\n', b'\n');
304 self.c_empty_look(prog::EmptyLook::StartLine)
305 }
306 Anchor(hir::Anchor::EndLine) => {
307 self.byte_classes.set_range(b'\n', b'\n');
308 self.c_empty_look(prog::EmptyLook::EndLine)
309 }
310 Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
311 self.c_empty_look(prog::EmptyLook::EndText)
312 }
313 Anchor(hir::Anchor::StartText) => {
314 self.c_empty_look(prog::EmptyLook::StartText)
315 }
316 Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
317 self.c_empty_look(prog::EmptyLook::StartText)
318 }
319 Anchor(hir::Anchor::EndText) => {
320 self.c_empty_look(prog::EmptyLook::EndText)
321 }
322 WordBoundary(hir::WordBoundary::Unicode) => {
323 if !cfg!(feature = "unicode-perl") {
324 return Err(Error::Syntax(
325 "Unicode word boundaries are unavailable when \
326 the unicode-perl feature is disabled"
327 .to_string(),
328 ));
329 }
330 self.compiled.has_unicode_word_boundary = true;
331 self.byte_classes.set_word_boundary();
332 self.byte_classes.set_range(0, 0x7F);
339 self.c_empty_look(prog::EmptyLook::WordBoundary)
340 }
341 WordBoundary(hir::WordBoundary::UnicodeNegate) => {
342 if !cfg!(feature = "unicode-perl") {
343 return Err(Error::Syntax(
344 "Unicode word boundaries are unavailable when \
345 the unicode-perl feature is disabled"
346 .to_string(),
347 ));
348 }
349 self.compiled.has_unicode_word_boundary = true;
350 self.byte_classes.set_word_boundary();
351 self.byte_classes.set_range(0, 0x7F);
353 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
354 }
355 WordBoundary(hir::WordBoundary::Ascii) => {
356 self.byte_classes.set_word_boundary();
357 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
358 }
359 WordBoundary(hir::WordBoundary::AsciiNegate) => {
360 self.byte_classes.set_word_boundary();
361 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
362 }
363 Group(ref g) => match g.kind {
364 hir::GroupKind::NonCapturing => self.c(&g.hir),
365 hir::GroupKind::CaptureIndex(index) => {
366 if index as usize >= self.compiled.captures.len() {
367 self.compiled.captures.push(None);
368 }
369 self.c_capture(2 * index as usize, &g.hir)
370 }
371 hir::GroupKind::CaptureName { index, ref name } => {
372 if index as usize >= self.compiled.captures.len() {
373 let n = name.to_string();
374 self.compiled.captures.push(Some(n.clone()));
375 self.capture_name_idx.insert(n, index as usize);
376 }
377 self.c_capture(2 * index as usize, &g.hir)
378 }
379 },
380 Concat(ref es) => {
381 if self.compiled.is_reverse {
382 self.c_concat(es.iter().rev())
383 } else {
384 self.c_concat(es)
385 }
386 }
387 Alternation(ref es) => self.c_alternate(&**es),
388 Repetition(ref rep) => self.c_repeat(rep),
389 }
390 }
391
392 fn c_empty(&mut self) -> ResultOrEmpty {
393 self.extra_inst_bytes += std::mem::size_of::<Inst>();
402 Ok(None)
403 }
404
405 fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
406 if self.num_exprs > 1 || self.compiled.is_dfa {
407 self.c(expr)
411 } else {
412 let entry = self.insts.len();
413 let hole = self.push_hole(InstHole::Save { slot: first_slot });
414 let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
415 self.fill(hole, patch.entry);
416 self.fill_to_next(patch.hole);
417 let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
418 Ok(Some(Patch { hole, entry }))
419 }
420 }
421
422 fn c_dotstar(&mut self) -> Result {
423 Ok(if !self.compiled.only_utf8() {
424 self.c(&Hir::repetition(hir::Repetition {
425 kind: hir::RepetitionKind::ZeroOrMore,
426 greedy: false,
427 hir: Box::new(Hir::any(true)),
428 }))?
429 .unwrap()
430 } else {
431 self.c(&Hir::repetition(hir::Repetition {
432 kind: hir::RepetitionKind::ZeroOrMore,
433 greedy: false,
434 hir: Box::new(Hir::any(false)),
435 }))?
436 .unwrap()
437 })
438 }
439
440 fn c_char(&mut self, c: char) -> ResultOrEmpty {
441 if self.compiled.uses_bytes() {
442 if c.is_ascii() {
443 let b = c as u8;
444 let hole =
445 self.push_hole(InstHole::Bytes { start: b, end: b });
446 self.byte_classes.set_range(b, b);
447 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
448 } else {
449 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
450 }
451 } else {
452 let hole = self.push_hole(InstHole::Char { c });
453 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
454 }
455 }
456
457 fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
458 use std::mem::size_of;
459
460 assert!(!ranges.is_empty());
461 if self.compiled.uses_bytes() {
462 Ok(Some(CompileClass { c: self, ranges }.compile()?))
463 } else {
464 let ranges: Vec<(char, char)> =
465 ranges.iter().map(|r| (r.start(), r.end())).collect();
466 let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
467 self.push_hole(InstHole::Char { c: ranges[0].0 })
468 } else {
469 self.extra_inst_bytes +=
470 ranges.len() * (size_of::<char>() * 2);
471 self.push_hole(InstHole::Ranges { ranges })
472 };
473 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
474 }
475 }
476
477 fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
478 self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
479 }
480
481 fn c_class_bytes(
482 &mut self,
483 ranges: &[hir::ClassBytesRange],
484 ) -> ResultOrEmpty {
485 debug_assert!(!ranges.is_empty());
486
487 let first_split_entry = self.insts.len();
488 let mut holes = vec![];
489 let mut prev_hole = Hole::None;
490 for r in &ranges[0..ranges.len() - 1] {
491 self.fill_to_next(prev_hole);
492 let split = self.push_split_hole();
493 let next = self.insts.len();
494 self.byte_classes.set_range(r.start(), r.end());
495 holes.push(self.push_hole(InstHole::Bytes {
496 start: r.start(),
497 end: r.end(),
498 }));
499 prev_hole = self.fill_split(split, Some(next), None);
500 }
501 let next = self.insts.len();
502 let r = &ranges[ranges.len() - 1];
503 self.byte_classes.set_range(r.start(), r.end());
504 holes.push(
505 self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
506 );
507 self.fill(prev_hole, next);
508 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
509 }
510
511 fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
512 let hole = self.push_hole(InstHole::EmptyLook { look });
513 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
514 }
515
516 fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
517 where
518 I: IntoIterator<Item = &'a Hir>,
519 {
520 let mut exprs = exprs.into_iter();
521 let Patch { mut hole, entry } = loop {
522 match exprs.next() {
523 None => return self.c_empty(),
524 Some(e) => {
525 if let Some(p) = self.c(e)? {
526 break p;
527 }
528 }
529 }
530 };
531 for e in exprs {
532 if let Some(p) = self.c(e)? {
533 self.fill(hole, p.entry);
534 hole = p.hole;
535 }
536 }
537 Ok(Some(Patch { hole, entry }))
538 }
539
540 fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
541 debug_assert!(
542 exprs.len() >= 2,
543 "alternates must have at least 2 exprs"
544 );
545
546 let first_split_entry = self.insts.len();
548
549 let mut holes = vec![];
552
553 let mut prev_hole = (Hole::None, false);
556 for e in &exprs[0..exprs.len() - 1] {
557 if prev_hole.1 {
558 let next = self.insts.len();
559 self.fill_split(prev_hole.0, None, Some(next));
560 } else {
561 self.fill_to_next(prev_hole.0);
562 }
563 let split = self.push_split_hole();
564 if let Some(Patch { hole, entry }) = self.c(e)? {
565 holes.push(hole);
566 prev_hole = (self.fill_split(split, Some(entry), None), false);
567 } else {
568 let (split1, split2) = split.dup_one();
569 holes.push(split1);
570 prev_hole = (split2, true);
571 }
572 }
573 if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
574 holes.push(hole);
575 if prev_hole.1 {
576 self.fill_split(prev_hole.0, None, Some(entry));
577 } else {
578 self.fill(prev_hole.0, entry);
579 }
580 } else {
581 holes.push(prev_hole.0);
585 }
586 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
587 }
588
589 fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
590 use regex_syntax::hir::RepetitionKind::*;
591 match rep.kind {
592 ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
593 ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
594 OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
595 Range(hir::RepetitionRange::Exactly(min_max)) => {
596 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
597 }
598 Range(hir::RepetitionRange::AtLeast(min)) => {
599 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
600 }
601 Range(hir::RepetitionRange::Bounded(min, max)) => {
602 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
603 }
604 }
605 }
606
607 fn c_repeat_zero_or_one(
608 &mut self,
609 expr: &Hir,
610 greedy: bool,
611 ) -> ResultOrEmpty {
612 let split_entry = self.insts.len();
613 let split = self.push_split_hole();
614 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
615 Some(p) => p,
616 None => return self.pop_split_hole(),
617 };
618 let split_hole = if greedy {
619 self.fill_split(split, Some(entry_rep), None)
620 } else {
621 self.fill_split(split, None, Some(entry_rep))
622 };
623 let holes = vec![hole_rep, split_hole];
624 Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
625 }
626
627 fn c_repeat_zero_or_more(
628 &mut self,
629 expr: &Hir,
630 greedy: bool,
631 ) -> ResultOrEmpty {
632 let split_entry = self.insts.len();
633 let split = self.push_split_hole();
634 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
635 Some(p) => p,
636 None => return self.pop_split_hole(),
637 };
638
639 self.fill(hole_rep, split_entry);
640 let split_hole = if greedy {
641 self.fill_split(split, Some(entry_rep), None)
642 } else {
643 self.fill_split(split, None, Some(entry_rep))
644 };
645 Ok(Some(Patch { hole: split_hole, entry: split_entry }))
646 }
647
648 fn c_repeat_one_or_more(
649 &mut self,
650 expr: &Hir,
651 greedy: bool,
652 ) -> ResultOrEmpty {
653 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
654 Some(p) => p,
655 None => return Ok(None),
656 };
657 self.fill_to_next(hole_rep);
658 let split = self.push_split_hole();
659
660 let split_hole = if greedy {
661 self.fill_split(split, Some(entry_rep), None)
662 } else {
663 self.fill_split(split, None, Some(entry_rep))
664 };
665 Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
666 }
667
668 fn c_repeat_range_min_or_more(
669 &mut self,
670 expr: &Hir,
671 greedy: bool,
672 min: u32,
673 ) -> ResultOrEmpty {
674 let min = u32_to_usize(min);
675 let patch_concat = self
679 .c_concat(iter::repeat(expr).take(min))?
680 .unwrap_or_else(|| self.next_inst());
681 if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
682 self.fill(patch_concat.hole, patch_rep.entry);
683 Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
684 } else {
685 Ok(None)
686 }
687 }
688
689 fn c_repeat_range(
690 &mut self,
691 expr: &Hir,
692 greedy: bool,
693 min: u32,
694 max: u32,
695 ) -> ResultOrEmpty {
696 let (min, max) = (u32_to_usize(min), u32_to_usize(max));
697 debug_assert!(min <= max);
698 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
699 if min == max {
700 return Ok(patch_concat);
701 }
702 let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
705 let initial_entry = patch_concat.entry;
706 let mut holes = vec![];
726 let mut prev_hole = patch_concat.hole;
727 for _ in min..max {
728 self.fill_to_next(prev_hole);
729 let split = self.push_split_hole();
730 let Patch { hole, entry } = match self.c(expr)? {
731 Some(p) => p,
732 None => return self.pop_split_hole(),
733 };
734 prev_hole = hole;
735 if greedy {
736 holes.push(self.fill_split(split, Some(entry), None));
737 } else {
738 holes.push(self.fill_split(split, None, Some(entry)));
739 }
740 }
741 holes.push(prev_hole);
742 Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
743 }
744
745 fn next_inst(&self) -> Patch {
749 Patch { hole: Hole::None, entry: self.insts.len() }
750 }
751
752 fn fill(&mut self, hole: Hole, goto: InstPtr) {
753 match hole {
754 Hole::None => {}
755 Hole::One(pc) => {
756 self.insts[pc].fill(goto);
757 }
758 Hole::Many(holes) => {
759 for hole in holes {
760 self.fill(hole, goto);
761 }
762 }
763 }
764 }
765
766 fn fill_to_next(&mut self, hole: Hole) {
767 let next = self.insts.len();
768 self.fill(hole, next);
769 }
770
771 fn fill_split(
772 &mut self,
773 hole: Hole,
774 goto1: Option<InstPtr>,
775 goto2: Option<InstPtr>,
776 ) -> Hole {
777 match hole {
778 Hole::None => Hole::None,
779 Hole::One(pc) => match (goto1, goto2) {
780 (Some(goto1), Some(goto2)) => {
781 self.insts[pc].fill_split(goto1, goto2);
782 Hole::None
783 }
784 (Some(goto1), None) => {
785 self.insts[pc].half_fill_split_goto1(goto1);
786 Hole::One(pc)
787 }
788 (None, Some(goto2)) => {
789 self.insts[pc].half_fill_split_goto2(goto2);
790 Hole::One(pc)
791 }
792 (None, None) => unreachable!(
793 "at least one of the split \
794 holes must be filled"
795 ),
796 },
797 Hole::Many(holes) => {
798 let mut new_holes = vec![];
799 for hole in holes {
800 new_holes.push(self.fill_split(hole, goto1, goto2));
801 }
802 if new_holes.is_empty() {
803 Hole::None
804 } else if new_holes.len() == 1 {
805 new_holes.pop().unwrap()
806 } else {
807 Hole::Many(new_holes)
808 }
809 }
810 }
811 }
812
813 fn push_compiled(&mut self, inst: Inst) {
814 self.insts.push(MaybeInst::Compiled(inst));
815 }
816
817 fn push_hole(&mut self, inst: InstHole) -> Hole {
818 let hole = self.insts.len();
819 self.insts.push(MaybeInst::Uncompiled(inst));
820 Hole::One(hole)
821 }
822
823 fn push_split_hole(&mut self) -> Hole {
824 let hole = self.insts.len();
825 self.insts.push(MaybeInst::Split);
826 Hole::One(hole)
827 }
828
829 fn pop_split_hole(&mut self) -> ResultOrEmpty {
830 self.insts.pop();
831 Ok(None)
832 }
833
834 fn check_size(&self) -> result::Result<(), Error> {
835 use std::mem::size_of;
836
837 let size =
838 self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
839 if size > self.size_limit {
840 Err(Error::CompiledTooBig(self.size_limit))
841 } else {
842 Ok(())
843 }
844 }
845}
846
847#[derive(Debug)]
848enum Hole {
849 None,
850 One(InstPtr),
851 Many(Vec<Hole>),
852}
853
854impl Hole {
855 fn dup_one(self) -> (Self, Self) {
856 match self {
857 Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
858 Hole::None | Hole::Many(_) => {
859 unreachable!("must be called on single hole")
860 }
861 }
862 }
863}
864
865#[derive(Clone, Debug)]
866enum MaybeInst {
867 Compiled(Inst),
868 Uncompiled(InstHole),
869 Split,
870 Split1(InstPtr),
871 Split2(InstPtr),
872}
873
874impl MaybeInst {
875 fn fill(&mut self, goto: InstPtr) {
876 let maybeinst = match *self {
877 MaybeInst::Split => MaybeInst::Split1(goto),
878 MaybeInst::Uncompiled(ref inst) => {
879 MaybeInst::Compiled(inst.fill(goto))
880 }
881 MaybeInst::Split1(goto1) => {
882 MaybeInst::Compiled(Inst::Split(InstSplit {
883 goto1,
884 goto2: goto,
885 }))
886 }
887 MaybeInst::Split2(goto2) => {
888 MaybeInst::Compiled(Inst::Split(InstSplit {
889 goto1: goto,
890 goto2,
891 }))
892 }
893 _ => unreachable!(
894 "not all instructions were compiled! \
895 found uncompiled instruction: {:?}",
896 self
897 ),
898 };
899 *self = maybeinst;
900 }
901
902 fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
903 let filled = match *self {
904 MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
905 _ => unreachable!(
906 "must be called on Split instruction, \
907 instead it was called on: {:?}",
908 self
909 ),
910 };
911 *self = MaybeInst::Compiled(filled);
912 }
913
914 fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
915 let half_filled = match *self {
916 MaybeInst::Split => goto1,
917 _ => unreachable!(
918 "must be called on Split instruction, \
919 instead it was called on: {:?}",
920 self
921 ),
922 };
923 *self = MaybeInst::Split1(half_filled);
924 }
925
926 fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
927 let half_filled = match *self {
928 MaybeInst::Split => goto2,
929 _ => unreachable!(
930 "must be called on Split instruction, \
931 instead it was called on: {:?}",
932 self
933 ),
934 };
935 *self = MaybeInst::Split2(half_filled);
936 }
937
938 fn unwrap(self) -> Inst {
939 match self {
940 MaybeInst::Compiled(inst) => inst,
941 _ => unreachable!(
942 "must be called on a compiled instruction, \
943 instead it was called on: {:?}",
944 self
945 ),
946 }
947 }
948}
949
950#[derive(Clone, Debug)]
951enum InstHole {
952 Save { slot: usize },
953 EmptyLook { look: EmptyLook },
954 Char { c: char },
955 Ranges { ranges: Vec<(char, char)> },
956 Bytes { start: u8, end: u8 },
957}
958
959impl InstHole {
960 fn fill(&self, goto: InstPtr) -> Inst {
961 match *self {
962 InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
963 InstHole::EmptyLook { look } => {
964 Inst::EmptyLook(InstEmptyLook { goto, look })
965 }
966 InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
967 InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
968 goto,
969 ranges: ranges.clone().into_boxed_slice(),
970 }),
971 InstHole::Bytes { start, end } => {
972 Inst::Bytes(InstBytes { goto, start, end })
973 }
974 }
975 }
976}
977
978struct CompileClass<'a, 'b> {
979 c: &'a mut Compiler,
980 ranges: &'b [hir::ClassUnicodeRange],
981}
982
983impl<'a, 'b> CompileClass<'a, 'b> {
984 fn compile(mut self) -> Result {
985 let mut holes = vec![];
986 let mut initial_entry = None;
987 let mut last_split = Hole::None;
988 let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
989 self.c.suffix_cache.clear();
990
991 for (i, range) in self.ranges.iter().enumerate() {
992 let is_last_range = i + 1 == self.ranges.len();
993 utf8_seqs.reset(range.start(), range.end());
994 let mut it = (&mut utf8_seqs).peekable();
995 loop {
996 let utf8_seq = match it.next() {
997 None => break,
998 Some(utf8_seq) => utf8_seq,
999 };
1000 if is_last_range && it.peek().is_none() {
1001 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1002 holes.push(hole);
1003 self.c.fill(last_split, entry);
1004 last_split = Hole::None;
1005 if initial_entry.is_none() {
1006 initial_entry = Some(entry);
1007 }
1008 } else {
1009 if initial_entry.is_none() {
1010 initial_entry = Some(self.c.insts.len());
1011 }
1012 self.c.fill_to_next(last_split);
1013 last_split = self.c.push_split_hole();
1014 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1015 holes.push(hole);
1016 last_split =
1017 self.c.fill_split(last_split, Some(entry), None);
1018 }
1019 }
1020 }
1021 self.c.utf8_seqs = Some(utf8_seqs);
1022 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
1023 }
1024
1025 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
1026 if self.c.compiled.is_reverse {
1027 self.c_utf8_seq_(seq)
1028 } else {
1029 self.c_utf8_seq_(seq.into_iter().rev())
1030 }
1031 }
1032
1033 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
1034 where
1035 I: IntoIterator<Item = &'r Utf8Range>,
1036 {
1037 let mut from_inst = ::std::usize::MAX;
1039 let mut last_hole = Hole::None;
1040 for byte_range in seq {
1041 let key = SuffixCacheKey {
1042 from_inst,
1043 start: byte_range.start,
1044 end: byte_range.end,
1045 };
1046 {
1047 let pc = self.c.insts.len();
1048 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1049 from_inst = cached_pc;
1050 continue;
1051 }
1052 }
1053 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1054 if from_inst == ::std::usize::MAX {
1055 last_hole = self.c.push_hole(InstHole::Bytes {
1056 start: byte_range.start,
1057 end: byte_range.end,
1058 });
1059 } else {
1060 self.c.push_compiled(Inst::Bytes(InstBytes {
1061 goto: from_inst,
1062 start: byte_range.start,
1063 end: byte_range.end,
1064 }));
1065 }
1066 from_inst = self.c.insts.len().checked_sub(1).unwrap();
1067 debug_assert!(from_inst < ::std::usize::MAX);
1068 }
1069 debug_assert!(from_inst < ::std::usize::MAX);
1070 Ok(Patch { hole: last_hole, entry: from_inst })
1071 }
1072}
1073
1074#[derive(Debug)]
1097struct SuffixCache {
1098 sparse: Box<[usize]>,
1099 dense: Vec<SuffixCacheEntry>,
1100}
1101
1102#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1103struct SuffixCacheEntry {
1104 key: SuffixCacheKey,
1105 pc: InstPtr,
1106}
1107
1108#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1109struct SuffixCacheKey {
1110 from_inst: InstPtr,
1111 start: u8,
1112 end: u8,
1113}
1114
1115impl SuffixCache {
1116 fn new(size: usize) -> Self {
1117 SuffixCache {
1118 sparse: vec![0usize; size].into(),
1119 dense: Vec::with_capacity(size),
1120 }
1121 }
1122
1123 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1124 let hash = self.hash(&key);
1125 let pos = &mut self.sparse[hash];
1126 if let Some(entry) = self.dense.get(*pos) {
1127 if entry.key == key {
1128 return Some(entry.pc);
1129 }
1130 }
1131 *pos = self.dense.len();
1132 self.dense.push(SuffixCacheEntry { key, pc });
1133 None
1134 }
1135
1136 fn clear(&mut self) {
1137 self.dense.clear();
1138 }
1139
1140 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1141 const FNV_PRIME: u64 = 1_099_511_628_211;
1144 let mut h = 14_695_981_039_346_656_037;
1145 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1146 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1147 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1148 (h as usize) % self.sparse.len()
1149 }
1150}
1151
1152struct ByteClassSet([bool; 256]);
1153
1154impl ByteClassSet {
1155 fn new() -> Self {
1156 ByteClassSet([false; 256])
1157 }
1158
1159 fn set_range(&mut self, start: u8, end: u8) {
1160 debug_assert!(start <= end);
1161 if start > 0 {
1162 self.0[start as usize - 1] = true;
1163 }
1164 self.0[end as usize] = true;
1165 }
1166
1167 fn set_word_boundary(&mut self) {
1168 let iswb = is_word_byte;
1171 let mut b1: u16 = 0;
1172 let mut b2: u16;
1173 while b1 <= 255 {
1174 b2 = b1 + 1;
1175 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1176 b2 += 1;
1177 }
1178 self.set_range(b1 as u8, (b2 - 1) as u8);
1179 b1 = b2;
1180 }
1181 }
1182
1183 fn byte_classes(&self) -> Vec<u8> {
1184 let mut byte_classes = vec![0; 256];
1189 let mut class = 0u8;
1190 let mut i = 0;
1191 loop {
1192 byte_classes[i] = class as u8;
1193 if i >= 255 {
1194 break;
1195 }
1196 if self.0[i] {
1197 class = class.checked_add(1).unwrap();
1198 }
1199 i += 1;
1200 }
1201 byte_classes
1202 }
1203}
1204
1205impl fmt::Debug for ByteClassSet {
1206 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1207 f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
1208 }
1209}
1210
1211fn u32_to_usize(n: u32) -> usize {
1212 if (n as u64) > (::std::usize::MAX as u64) {
1216 panic!("BUG: {} is too big to be pointer sized", n)
1217 }
1218 n as usize
1219}
1220
1221#[cfg(test)]
1222mod tests {
1223 use super::ByteClassSet;
1224
1225 #[test]
1226 fn byte_classes() {
1227 let mut set = ByteClassSet::new();
1228 set.set_range(b'a', b'z');
1229 let classes = set.byte_classes();
1230 assert_eq!(classes[0], 0);
1231 assert_eq!(classes[1], 0);
1232 assert_eq!(classes[2], 0);
1233 assert_eq!(classes[b'a' as usize - 1], 0);
1234 assert_eq!(classes[b'a' as usize], 1);
1235 assert_eq!(classes[b'm' as usize], 1);
1236 assert_eq!(classes[b'z' as usize], 1);
1237 assert_eq!(classes[b'z' as usize + 1], 2);
1238 assert_eq!(classes[254], 2);
1239 assert_eq!(classes[255], 2);
1240
1241 let mut set = ByteClassSet::new();
1242 set.set_range(0, 2);
1243 set.set_range(4, 6);
1244 let classes = set.byte_classes();
1245 assert_eq!(classes[0], 0);
1246 assert_eq!(classes[1], 0);
1247 assert_eq!(classes[2], 0);
1248 assert_eq!(classes[3], 1);
1249 assert_eq!(classes[4], 2);
1250 assert_eq!(classes[5], 2);
1251 assert_eq!(classes[6], 2);
1252 assert_eq!(classes[7], 3);
1253 assert_eq!(classes[255], 3);
1254 }
1255
1256 #[test]
1257 fn full_byte_classes() {
1258 let mut set = ByteClassSet::new();
1259 for i in 0..256u16 {
1260 set.set_range(i as u8, i as u8);
1261 }
1262 assert_eq!(set.byte_classes().len(), 256);
1263 }
1264}