regex_syntax/hir/mod.rs
1/*!
2Defines a high-level intermediate (HIR) representation for regular expressions.
3
4The HIR is represented by the [`Hir`] type, and it principally constructed via
5[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
6may use the smart constructors defined on `Hir` to build their own by hand. The
7smart constructors simultaneously simplify and "optimize" the HIR, and are also
8the same routines used by translation.
9
10Most regex engines only have an HIR like this, and usually construct it
11directly from the concrete syntax. This crate however first parses the
12concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
13as mentioned above. It's done this way to facilitate better error reporting,
14and to have a structured representation of a regex that faithfully represents
15its concrete syntax. Namely, while an `Hir` value can be converted back to an
16equivalent regex pattern string, it is unlikely to look like the original due
17to its simplified structure.
18*/
19
20use core::{char, cmp};
21
22use alloc::{
23 boxed::Box,
24 format,
25 string::{String, ToString},
26 vec,
27 vec::Vec,
28};
29
30use crate::{
31 ast::Span,
32 hir::interval::{Interval, IntervalSet, IntervalSetIter},
33 unicode,
34};
35
36pub use crate::{
37 hir::visitor::{visit, Visitor},
38 unicode::CaseFoldError,
39};
40
41mod interval;
42pub mod literal;
43pub mod print;
44pub mod translate;
45mod visitor;
46
47/// An error that can occur while translating an `Ast` to a `Hir`.
48#[derive(Clone, Debug, Eq, PartialEq)]
49pub struct Error {
50 /// The kind of error.
51 kind: ErrorKind,
52 /// The original pattern that the translator's Ast was parsed from. Every
53 /// span in an error is a valid range into this string.
54 pattern: String,
55 /// The span of this error, derived from the Ast given to the translator.
56 span: Span,
57}
58
59impl Error {
60 /// Return the type of this error.
61 pub fn kind(&self) -> &ErrorKind {
62 &self.kind
63 }
64
65 /// The original pattern string in which this error occurred.
66 ///
67 /// Every span reported by this error is reported in terms of this string.
68 pub fn pattern(&self) -> &str {
69 &self.pattern
70 }
71
72 /// Return the span at which this error occurred.
73 pub fn span(&self) -> &Span {
74 &self.span
75 }
76}
77
78/// The type of an error that occurred while building an `Hir`.
79///
80/// This error type is marked as `non_exhaustive`. This means that adding a
81/// new variant is not considered a breaking change.
82#[non_exhaustive]
83#[derive(Clone, Debug, Eq, PartialEq)]
84pub enum ErrorKind {
85 /// This error occurs when a Unicode feature is used when Unicode
86 /// support is disabled. For example `(?-u:\pL)` would trigger this error.
87 UnicodeNotAllowed,
88 /// This error occurs when translating a pattern that could match a byte
89 /// sequence that isn't UTF-8 and `utf8` was enabled.
90 InvalidUtf8,
91 /// This error occurs when one uses a non-ASCII byte for a line terminator,
92 /// but where Unicode mode is enabled and UTF-8 mode is disabled.
93 InvalidLineTerminator,
94 /// This occurs when an unrecognized Unicode property name could not
95 /// be found.
96 UnicodePropertyNotFound,
97 /// This occurs when an unrecognized Unicode property value could not
98 /// be found.
99 UnicodePropertyValueNotFound,
100 /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
101 /// `\d`) could not be found. This can occur when the `unicode-perl`
102 /// crate feature is not enabled.
103 UnicodePerlClassNotFound,
104 /// This occurs when the Unicode simple case mapping tables are not
105 /// available, and the regular expression required Unicode aware case
106 /// insensitivity.
107 UnicodeCaseUnavailable,
108}
109
110#[cfg(feature = "std")]
111impl std::error::Error for Error {}
112
113impl core::fmt::Display for Error {
114 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
115 crate::error::Formatter::from(self).fmt(f)
116 }
117}
118
119impl core::fmt::Display for ErrorKind {
120 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
121 use self::ErrorKind::*;
122
123 let msg = match *self {
124 UnicodeNotAllowed => "Unicode not allowed here",
125 InvalidUtf8 => "pattern can match invalid UTF-8",
126 InvalidLineTerminator => "invalid line terminator, must be ASCII",
127 UnicodePropertyNotFound => "Unicode property not found",
128 UnicodePropertyValueNotFound => "Unicode property value not found",
129 UnicodePerlClassNotFound => {
130 "Unicode-aware Perl class not found \
131 (make sure the unicode-perl feature is enabled)"
132 }
133 UnicodeCaseUnavailable => {
134 "Unicode-aware case insensitivity matching is not available \
135 (make sure the unicode-case feature is enabled)"
136 }
137 };
138 f.write_str(msg)
139 }
140}
141
142/// A high-level intermediate representation (HIR) for a regular expression.
143///
144/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
145/// An `HirKind` indicates what kind of regular expression it is (a literal,
146/// a repetition, a look-around assertion, etc.), where as a `Properties`
147/// describes various facts about the regular expression. For example, whether
148/// it matches UTF-8 or if it matches the empty string.
149///
150/// The HIR of a regular expression represents an intermediate step between
151/// its abstract syntax (a structured description of the concrete syntax) and
152/// an actual regex matcher. The purpose of HIR is to make regular expressions
153/// easier to analyze. In particular, the AST is much more complex than the
154/// HIR. For example, while an AST supports arbitrarily nested character
155/// classes, the HIR will flatten all nested classes into a single set. The HIR
156/// will also "compile away" every flag present in the concrete syntax. For
157/// example, users of HIR expressions never need to worry about case folding;
158/// it is handled automatically by the translator (e.g., by translating
159/// `(?i:A)` to `[aA]`).
160///
161/// The specific type of an HIR expression can be accessed via its `kind`
162/// or `into_kind` methods. This extra level of indirection exists for two
163/// reasons:
164///
165/// 1. Construction of an HIR expression *must* use the constructor methods on
166/// this `Hir` type instead of building the `HirKind` values directly. This
167/// permits construction to enforce invariants like "concatenations always
168/// consist of two or more sub-expressions."
169/// 2. Every HIR expression contains attributes that are defined inductively,
170/// and can be computed cheaply during the construction process. For example,
171/// one such attribute is whether the expression must match at the beginning of
172/// the haystack.
173///
174/// In particular, if you have an `HirKind` value, then there is intentionally
175/// no way to build an `Hir` value from it. You instead need to do case
176/// analysis on the `HirKind` value and build the `Hir` value using its smart
177/// constructors.
178///
179/// # UTF-8
180///
181/// If the HIR was produced by a translator with
182/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
183/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
184/// matches.
185///
186/// For empty matches, those can occur at any position. It is the
187/// responsibility of the regex engine to determine whether empty matches are
188/// permitted between the code units of a single codepoint.
189///
190/// # Stack space
191///
192/// This type defines its own destructor that uses constant stack space and
193/// heap space proportional to the size of the HIR.
194///
195/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
196/// expression pattern string, and uses constant stack space and heap space
197/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
198/// be _semantically_ equivalent to the original concrete syntax, but it may
199/// look very different. (And potentially not practically readable by a human.)
200///
201/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
202/// stack space. The implementation will also suppress some details (such as
203/// the `Properties` inlined into every `Hir` value to make it less noisy).
204#[derive(Clone, Eq, PartialEq)]
205pub struct Hir {
206 /// The underlying HIR kind.
207 kind: HirKind,
208 /// Analysis info about this HIR, computed during construction.
209 props: Properties,
210}
211
212/// Methods for accessing the underlying `HirKind` and `Properties`.
213impl Hir {
214 /// Returns a reference to the underlying HIR kind.
215 pub fn kind(&self) -> &HirKind {
216 &self.kind
217 }
218
219 /// Consumes ownership of this HIR expression and returns its underlying
220 /// `HirKind`.
221 pub fn into_kind(mut self) -> HirKind {
222 core::mem::replace(&mut self.kind, HirKind::Empty)
223 }
224
225 /// Returns the properties computed for this `Hir`.
226 pub fn properties(&self) -> &Properties {
227 &self.props
228 }
229
230 /// Splits this HIR into its constituent parts.
231 ///
232 /// This is useful because `let Hir { kind, props } = hir;` does not work
233 /// because of `Hir`'s custom `Drop` implementation.
234 fn into_parts(mut self) -> (HirKind, Properties) {
235 (
236 core::mem::replace(&mut self.kind, HirKind::Empty),
237 core::mem::replace(&mut self.props, Properties::empty()),
238 )
239 }
240}
241
242/// Smart constructors for HIR values.
243///
244/// These constructors are called "smart" because they do inductive work or
245/// simplifications. For example, calling `Hir::repetition` with a repetition
246/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
247/// since it is equivalent to an empty regex. Another example is calling
248/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
249/// just get back the original `expr` since it's precisely equivalent.
250///
251/// Smart constructors enable maintaining invariants about the HIR data type
252/// while also simulanteously keeping the representation as simple as possible.
253impl Hir {
254 /// Returns an empty HIR expression.
255 ///
256 /// An empty HIR expression always matches, including the empty string.
257 #[inline]
258 pub fn empty() -> Hir {
259 let props = Properties::empty();
260 Hir { kind: HirKind::Empty, props }
261 }
262
263 /// Returns an HIR expression that can never match anything. That is,
264 /// the size of the set of strings in the language described by the HIR
265 /// returned is `0`.
266 ///
267 /// This is distinct from [`Hir::empty`] in that the empty string matches
268 /// the HIR returned by `Hir::empty`. That is, the set of strings in the
269 /// language describe described by `Hir::empty` is non-empty.
270 ///
271 /// Note that currently, the HIR returned uses an empty character class to
272 /// indicate that nothing can match. An equivalent expression that cannot
273 /// match is an empty alternation, but all such "fail" expressions are
274 /// normalized (via smart constructors) to empty character classes. This is
275 /// because empty character classes can be spelled in the concrete syntax
276 /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
277 /// empty alternations cannot.
278 #[inline]
279 pub fn fail() -> Hir {
280 let class = Class::Bytes(ClassBytes::empty());
281 let props = Properties::class(&class);
282 // We can't just call Hir::class here because it defers to Hir::fail
283 // in order to canonicalize the Hir value used to represent "cannot
284 // match."
285 Hir { kind: HirKind::Class(class), props }
286 }
287
288 /// Creates a literal HIR expression.
289 ///
290 /// This accepts anything that can be converted into a `Box<[u8]>`.
291 ///
292 /// Note that there is no mechanism for storing a `char` or a `Box<str>`
293 /// in an HIR. Everything is "just bytes." Whether a `Literal` (or
294 /// any HIR node) matches valid UTF-8 exclusively can be queried via
295 /// [`Properties::is_utf8`].
296 ///
297 /// # Example
298 ///
299 /// This example shows that concatenations of `Literal` HIR values will
300 /// automatically get flattened and combined together. So for example, even
301 /// if you concat multiple `Literal` values that are themselves not valid
302 /// UTF-8, they might add up to valid UTF-8. This also demonstrates just
303 /// how "smart" Hir's smart constructors are.
304 ///
305 /// ```
306 /// use regex_syntax::hir::{Hir, HirKind, Literal};
307 ///
308 /// let literals = vec![
309 /// Hir::literal([0xE2]),
310 /// Hir::literal([0x98]),
311 /// Hir::literal([0x83]),
312 /// ];
313 /// // Each literal, on its own, is invalid UTF-8.
314 /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8()));
315 ///
316 /// let concat = Hir::concat(literals);
317 /// // But the concatenation is valid UTF-8!
318 /// assert!(concat.properties().is_utf8());
319 ///
320 /// // And also notice that the literals have been concatenated into a
321 /// // single `Literal`, to the point where there is no explicit `Concat`!
322 /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
323 /// assert_eq!(&expected, concat.kind());
324 /// ```
325 ///
326 /// # Example: building a literal from a `char`
327 ///
328 /// This example shows how to build a single `Hir` literal from a `char`
329 /// value. Since a [`Literal`] is just bytes, we just need to UTF-8
330 /// encode a `char` value:
331 ///
332 /// ```
333 /// use regex_syntax::hir::{Hir, HirKind, Literal};
334 ///
335 /// let ch = '☃';
336 /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes());
337 ///
338 /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
339 /// assert_eq!(&expected, got.kind());
340 /// ```
341 #[inline]
342 pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
343 let bytes = lit.into();
344 if bytes.is_empty() {
345 return Hir::empty();
346 }
347
348 let lit = Literal(bytes);
349 let props = Properties::literal(&lit);
350 Hir { kind: HirKind::Literal(lit), props }
351 }
352
353 /// Creates a class HIR expression. The class may either be defined over
354 /// ranges of Unicode codepoints or ranges of raw byte values.
355 ///
356 /// Note that an empty class is permitted. An empty class is equivalent to
357 /// `Hir::fail()`.
358 #[inline]
359 pub fn class(class: Class) -> Hir {
360 if class.is_empty() {
361 return Hir::fail();
362 } else if let Some(bytes) = class.literal() {
363 return Hir::literal(bytes);
364 }
365 let props = Properties::class(&class);
366 Hir { kind: HirKind::Class(class), props }
367 }
368
369 /// Creates a look-around assertion HIR expression.
370 #[inline]
371 pub fn look(look: Look) -> Hir {
372 let props = Properties::look(look);
373 Hir { kind: HirKind::Look(look), props }
374 }
375
376 /// Creates a repetition HIR expression.
377 #[inline]
378 pub fn repetition(mut rep: Repetition) -> Hir {
379 // If the sub-expression of a repetition can only match the empty
380 // string, then we force its maximum to be at most 1.
381 if rep.sub.properties().maximum_len() == Some(0) {
382 rep.min = cmp::min(rep.min, 1);
383 rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
384 }
385 // The regex 'a{0}' is always equivalent to the empty regex. This is
386 // true even when 'a' is an expression that never matches anything
387 // (like '\P{any}').
388 //
389 // Additionally, the regex 'a{1}' is always equivalent to 'a'.
390 if rep.min == 0 && rep.max == Some(0) {
391 return Hir::empty();
392 } else if rep.min == 1 && rep.max == Some(1) {
393 return *rep.sub;
394 }
395 let props = Properties::repetition(&rep);
396 Hir { kind: HirKind::Repetition(rep), props }
397 }
398
399 /// Creates a capture HIR expression.
400 ///
401 /// Note that there is no explicit HIR value for a non-capturing group.
402 /// Since a non-capturing group only exists to override precedence in the
403 /// concrete syntax and since an HIR already does its own grouping based on
404 /// what is parsed, there is no need to explicitly represent non-capturing
405 /// groups in the HIR.
406 #[inline]
407 pub fn capture(capture: Capture) -> Hir {
408 let props = Properties::capture(&capture);
409 Hir { kind: HirKind::Capture(capture), props }
410 }
411
412 /// Returns the concatenation of the given expressions.
413 ///
414 /// This attempts to flatten and simplify the concatenation as appropriate.
415 ///
416 /// # Example
417 ///
418 /// This shows a simple example of basic flattening of both concatenations
419 /// and literals.
420 ///
421 /// ```
422 /// use regex_syntax::hir::Hir;
423 ///
424 /// let hir = Hir::concat(vec![
425 /// Hir::concat(vec![
426 /// Hir::literal([b'a']),
427 /// Hir::literal([b'b']),
428 /// Hir::literal([b'c']),
429 /// ]),
430 /// Hir::concat(vec![
431 /// Hir::literal([b'x']),
432 /// Hir::literal([b'y']),
433 /// Hir::literal([b'z']),
434 /// ]),
435 /// ]);
436 /// let expected = Hir::literal("abcxyz".as_bytes());
437 /// assert_eq!(expected, hir);
438 /// ```
439 pub fn concat(subs: Vec<Hir>) -> Hir {
440 // We rebuild the concatenation by simplifying it. Would be nice to do
441 // it in place, but that seems a little tricky?
442 let mut new = vec![];
443 // This gobbles up any adjacent literals in a concatenation and smushes
444 // them together. Basically, when we see a literal, we add its bytes
445 // to 'prior_lit', and whenever we see anything else, we first take
446 // any bytes in 'prior_lit' and add it to the 'new' concatenation.
447 let mut prior_lit: Option<Vec<u8>> = None;
448 for sub in subs {
449 let (kind, props) = sub.into_parts();
450 match kind {
451 HirKind::Literal(Literal(bytes)) => {
452 if let Some(ref mut prior_bytes) = prior_lit {
453 prior_bytes.extend_from_slice(&bytes);
454 } else {
455 prior_lit = Some(bytes.to_vec());
456 }
457 }
458 // We also flatten concats that are direct children of another
459 // concat. We only need to do this one level deep since
460 // Hir::concat is the only way to build concatenations, and so
461 // flattening happens inductively.
462 HirKind::Concat(subs2) => {
463 for sub2 in subs2 {
464 let (kind2, props2) = sub2.into_parts();
465 match kind2 {
466 HirKind::Literal(Literal(bytes)) => {
467 if let Some(ref mut prior_bytes) = prior_lit {
468 prior_bytes.extend_from_slice(&bytes);
469 } else {
470 prior_lit = Some(bytes.to_vec());
471 }
472 }
473 kind2 => {
474 if let Some(prior_bytes) = prior_lit.take() {
475 new.push(Hir::literal(prior_bytes));
476 }
477 new.push(Hir { kind: kind2, props: props2 });
478 }
479 }
480 }
481 }
482 // We can just skip empty HIRs.
483 HirKind::Empty => {}
484 kind => {
485 if let Some(prior_bytes) = prior_lit.take() {
486 new.push(Hir::literal(prior_bytes));
487 }
488 new.push(Hir { kind, props });
489 }
490 }
491 }
492 if let Some(prior_bytes) = prior_lit.take() {
493 new.push(Hir::literal(prior_bytes));
494 }
495 if new.is_empty() {
496 return Hir::empty();
497 } else if new.len() == 1 {
498 return new.pop().unwrap();
499 }
500 let props = Properties::concat(&new);
501 Hir { kind: HirKind::Concat(new), props }
502 }
503
504 /// Returns the alternation of the given expressions.
505 ///
506 /// This flattens and simplifies the alternation as appropriate. This may
507 /// include factoring out common prefixes or even rewriting the alternation
508 /// as a character class.
509 ///
510 /// Note that an empty alternation is equivalent to `Hir::fail()`. (It
511 /// is not possible for one to write an empty alternation, or even an
512 /// alternation with a single sub-expression, in the concrete syntax of a
513 /// regex.)
514 ///
515 /// # Example
516 ///
517 /// This is a simple example showing how an alternation might get
518 /// simplified.
519 ///
520 /// ```
521 /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
522 ///
523 /// let hir = Hir::alternation(vec![
524 /// Hir::literal([b'a']),
525 /// Hir::literal([b'b']),
526 /// Hir::literal([b'c']),
527 /// Hir::literal([b'd']),
528 /// Hir::literal([b'e']),
529 /// Hir::literal([b'f']),
530 /// ]);
531 /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
532 /// ClassUnicodeRange::new('a', 'f'),
533 /// ])));
534 /// assert_eq!(expected, hir);
535 /// ```
536 ///
537 /// And another example showing how common prefixes might get factored
538 /// out.
539 ///
540 /// ```
541 /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
542 ///
543 /// let hir = Hir::alternation(vec![
544 /// Hir::concat(vec![
545 /// Hir::literal("abc".as_bytes()),
546 /// Hir::class(Class::Unicode(ClassUnicode::new([
547 /// ClassUnicodeRange::new('A', 'Z'),
548 /// ]))),
549 /// ]),
550 /// Hir::concat(vec![
551 /// Hir::literal("abc".as_bytes()),
552 /// Hir::class(Class::Unicode(ClassUnicode::new([
553 /// ClassUnicodeRange::new('a', 'z'),
554 /// ]))),
555 /// ]),
556 /// ]);
557 /// let expected = Hir::concat(vec![
558 /// Hir::literal("abc".as_bytes()),
559 /// Hir::alternation(vec![
560 /// Hir::class(Class::Unicode(ClassUnicode::new([
561 /// ClassUnicodeRange::new('A', 'Z'),
562 /// ]))),
563 /// Hir::class(Class::Unicode(ClassUnicode::new([
564 /// ClassUnicodeRange::new('a', 'z'),
565 /// ]))),
566 /// ]),
567 /// ]);
568 /// assert_eq!(expected, hir);
569 /// ```
570 ///
571 /// Note that these sorts of simplifications are not guaranteed.
572 pub fn alternation(subs: Vec<Hir>) -> Hir {
573 // We rebuild the alternation by simplifying it. We proceed similarly
574 // as the concatenation case. But in this case, there's no literal
575 // simplification happening. We're just flattening alternations.
576 let mut new = Vec::with_capacity(subs.len());
577 for sub in subs {
578 let (kind, props) = sub.into_parts();
579 match kind {
580 HirKind::Alternation(subs2) => {
581 new.extend(subs2);
582 }
583 kind => {
584 new.push(Hir { kind, props });
585 }
586 }
587 }
588 if new.is_empty() {
589 return Hir::fail();
590 } else if new.len() == 1 {
591 return new.pop().unwrap();
592 }
593 // Now that it's completely flattened, look for the special case of
594 // 'char1|char2|...|charN' and collapse that into a class. Note that
595 // we look for 'char' first and then bytes. The issue here is that if
596 // we find both non-ASCII codepoints and non-ASCII singleton bytes,
597 // then it isn't actually possible to smush them into a single class.
598 // (Because classes are either "all codepoints" or "all bytes." You
599 // can have a class that both matches non-ASCII but valid UTF-8 and
600 // invalid UTF-8.) So we look for all chars and then all bytes, and
601 // don't handle anything else.
602 if let Some(singletons) = singleton_chars(&new) {
603 let it = singletons
604 .into_iter()
605 .map(|ch| ClassUnicodeRange { start: ch, end: ch });
606 return Hir::class(Class::Unicode(ClassUnicode::new(it)));
607 }
608 if let Some(singletons) = singleton_bytes(&new) {
609 let it = singletons
610 .into_iter()
611 .map(|b| ClassBytesRange { start: b, end: b });
612 return Hir::class(Class::Bytes(ClassBytes::new(it)));
613 }
614 // Similar to singleton chars, we can also look for alternations of
615 // classes. Those can be smushed into a single class.
616 if let Some(cls) = class_chars(&new) {
617 return Hir::class(cls);
618 }
619 if let Some(cls) = class_bytes(&new) {
620 return Hir::class(cls);
621 }
622 // Factor out a common prefix if we can, which might potentially
623 // simplify the expression and unlock other optimizations downstream.
624 // It also might generally make NFA matching and DFA construction
625 // faster by reducing the scope of branching in the regex.
626 new = match lift_common_prefix(new) {
627 Ok(hir) => return hir,
628 Err(unchanged) => unchanged,
629 };
630 let props = Properties::alternation(&new);
631 Hir { kind: HirKind::Alternation(new), props }
632 }
633
634 /// Returns an HIR expression for `.`.
635 ///
636 /// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
637 /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
638 /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
639 /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
640 /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
641 /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
642 ///
643 /// # Example
644 ///
645 /// Note that this is a convenience routine for constructing the correct
646 /// character class based on the value of `Dot`. There is no explicit "dot"
647 /// HIR value. It is just an abbreviation for a common character class.
648 ///
649 /// ```
650 /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
651 ///
652 /// let hir = Hir::dot(Dot::AnyByte);
653 /// let expected = Hir::class(Class::Bytes(ClassBytes::new([
654 /// ClassBytesRange::new(0x00, 0xFF),
655 /// ])));
656 /// assert_eq!(expected, hir);
657 /// ```
658 #[inline]
659 pub fn dot(dot: Dot) -> Hir {
660 match dot {
661 Dot::AnyChar => {
662 let mut cls = ClassUnicode::empty();
663 cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
664 Hir::class(Class::Unicode(cls))
665 }
666 Dot::AnyByte => {
667 let mut cls = ClassBytes::empty();
668 cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
669 Hir::class(Class::Bytes(cls))
670 }
671 Dot::AnyCharExcept(ch) => {
672 let mut cls =
673 ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
674 cls.negate();
675 Hir::class(Class::Unicode(cls))
676 }
677 Dot::AnyCharExceptLF => {
678 let mut cls = ClassUnicode::empty();
679 cls.push(ClassUnicodeRange::new('\0', '\x09'));
680 cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
681 Hir::class(Class::Unicode(cls))
682 }
683 Dot::AnyCharExceptCRLF => {
684 let mut cls = ClassUnicode::empty();
685 cls.push(ClassUnicodeRange::new('\0', '\x09'));
686 cls.push(ClassUnicodeRange::new('\x0B', '\x0C'));
687 cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
688 Hir::class(Class::Unicode(cls))
689 }
690 Dot::AnyByteExcept(byte) => {
691 let mut cls =
692 ClassBytes::new([ClassBytesRange::new(byte, byte)]);
693 cls.negate();
694 Hir::class(Class::Bytes(cls))
695 }
696 Dot::AnyByteExceptLF => {
697 let mut cls = ClassBytes::empty();
698 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
699 cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
700 Hir::class(Class::Bytes(cls))
701 }
702 Dot::AnyByteExceptCRLF => {
703 let mut cls = ClassBytes::empty();
704 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
705 cls.push(ClassBytesRange::new(b'\x0B', b'\x0C'));
706 cls.push(ClassBytesRange::new(b'\x0E', b'\xFF'));
707 Hir::class(Class::Bytes(cls))
708 }
709 }
710 }
711}
712
713/// The underlying kind of an arbitrary [`Hir`] expression.
714///
715/// An `HirKind` is principally useful for doing case analysis on the type
716/// of a regular expression. If you're looking to build new `Hir` values,
717/// then you _must_ use the smart constructors defined on `Hir`, like
718/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
719/// not expose any way of building an `Hir` directly from an `HirKind`.
720#[derive(Clone, Debug, Eq, PartialEq)]
721pub enum HirKind {
722 /// The empty regular expression, which matches everything, including the
723 /// empty string.
724 Empty,
725 /// A literalstring that matches exactly these bytes.
726 Literal(Literal),
727 /// A single character class that matches any of the characters in the
728 /// class. A class can either consist of Unicode scalar values as
729 /// characters, or it can use bytes.
730 ///
731 /// A class may be empty. In which case, it matches nothing.
732 Class(Class),
733 /// A look-around assertion. A look-around match always has zero length.
734 Look(Look),
735 /// A repetition operation applied to a sub-expression.
736 Repetition(Repetition),
737 /// A capturing group, which contains a sub-expression.
738 Capture(Capture),
739 /// A concatenation of expressions.
740 ///
741 /// A concatenation matches only if each of its sub-expressions match one
742 /// after the other.
743 ///
744 /// Concatenations are guaranteed by `Hir`'s smart constructors to always
745 /// have at least two sub-expressions.
746 Concat(Vec<Hir>),
747 /// An alternation of expressions.
748 ///
749 /// An alternation matches only if at least one of its sub-expressions
750 /// match. If multiple sub-expressions match, then the leftmost is
751 /// preferred.
752 ///
753 /// Alternations are guaranteed by `Hir`'s smart constructors to always
754 /// have at least two sub-expressions.
755 Alternation(Vec<Hir>),
756}
757
758impl HirKind {
759 /// Returns a slice of this kind's sub-expressions, if any.
760 pub fn subs(&self) -> &[Hir] {
761 use core::slice::from_ref;
762
763 match *self {
764 HirKind::Empty
765 | HirKind::Literal(_)
766 | HirKind::Class(_)
767 | HirKind::Look(_) => &[],
768 HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
769 HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
770 HirKind::Concat(ref subs) => subs,
771 HirKind::Alternation(ref subs) => subs,
772 }
773 }
774}
775
776impl core::fmt::Debug for Hir {
777 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
778 self.kind.fmt(f)
779 }
780}
781
782/// Print a display representation of this Hir.
783///
784/// The result of this is a valid regular expression pattern string.
785///
786/// This implementation uses constant stack space and heap space proportional
787/// to the size of the `Hir`.
788impl core::fmt::Display for Hir {
789 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
790 crate::hir::print::Printer::new().print(self, f)
791 }
792}
793
794/// The high-level intermediate representation of a literal.
795///
796/// A literal corresponds to `0` or more bytes that should be matched
797/// literally. The smart constructors defined on `Hir` will automatically
798/// concatenate adjacent literals into one literal, and will even automatically
799/// replace empty literals with `Hir::empty()`.
800///
801/// Note that despite a literal being represented by a sequence of bytes, its
802/// `Debug` implementation will attempt to print it as a normal string. (That
803/// is, not a sequence of decimal numbers.)
804#[derive(Clone, Eq, PartialEq)]
805pub struct Literal(pub Box<[u8]>);
806
807impl core::fmt::Debug for Literal {
808 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
809 crate::debug::Bytes(&self.0).fmt(f)
810 }
811}
812
813/// The high-level intermediate representation of a character class.
814///
815/// A character class corresponds to a set of characters. A character is either
816/// defined by a Unicode scalar value or a byte.
817///
818/// A character class, regardless of its character type, is represented by a
819/// sequence of non-overlapping non-adjacent ranges of characters.
820///
821/// There are no guarantees about which class variant is used. Generally
822/// speaking, the Unicode variat is used whenever a class needs to contain
823/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
824/// when Unicode mode is disabled. For example, at the time of writing, the
825/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
826/// `[a\u00A0]` due to optimizations.
827///
828/// Note that `Bytes` variant may be produced even when it exclusively matches
829/// valid UTF-8. This is because a `Bytes` variant represents an intention by
830/// the author of the regular expression to disable Unicode mode, which in turn
831/// impacts the semantics of case insensitive matching. For example, `(?i)k`
832/// and `(?i-u)k` will not match the same set of strings.
833#[derive(Clone, Eq, PartialEq)]
834pub enum Class {
835 /// A set of characters represented by Unicode scalar values.
836 Unicode(ClassUnicode),
837 /// A set of characters represented by arbitrary bytes (one byte per
838 /// character).
839 Bytes(ClassBytes),
840}
841
842impl Class {
843 /// Apply Unicode simple case folding to this character class, in place.
844 /// The character class will be expanded to include all simple case folded
845 /// character variants.
846 ///
847 /// If this is a byte oriented character class, then this will be limited
848 /// to the ASCII ranges `A-Z` and `a-z`.
849 ///
850 /// # Panics
851 ///
852 /// This routine panics when the case mapping data necessary for this
853 /// routine to complete is unavailable. This occurs when the `unicode-case`
854 /// feature is not enabled and the underlying class is Unicode oriented.
855 ///
856 /// Callers should prefer using `try_case_fold_simple` instead, which will
857 /// return an error instead of panicking.
858 pub fn case_fold_simple(&mut self) {
859 match *self {
860 Class::Unicode(ref mut x) => x.case_fold_simple(),
861 Class::Bytes(ref mut x) => x.case_fold_simple(),
862 }
863 }
864
865 /// Apply Unicode simple case folding to this character class, in place.
866 /// The character class will be expanded to include all simple case folded
867 /// character variants.
868 ///
869 /// If this is a byte oriented character class, then this will be limited
870 /// to the ASCII ranges `A-Z` and `a-z`.
871 ///
872 /// # Error
873 ///
874 /// This routine returns an error when the case mapping data necessary
875 /// for this routine to complete is unavailable. This occurs when the
876 /// `unicode-case` feature is not enabled and the underlying class is
877 /// Unicode oriented.
878 pub fn try_case_fold_simple(
879 &mut self,
880 ) -> core::result::Result<(), CaseFoldError> {
881 match *self {
882 Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
883 Class::Bytes(ref mut x) => x.case_fold_simple(),
884 }
885 Ok(())
886 }
887
888 /// Negate this character class in place.
889 ///
890 /// After completion, this character class will contain precisely the
891 /// characters that weren't previously in the class.
892 pub fn negate(&mut self) {
893 match *self {
894 Class::Unicode(ref mut x) => x.negate(),
895 Class::Bytes(ref mut x) => x.negate(),
896 }
897 }
898
899 /// Returns true if and only if this character class will only ever match
900 /// valid UTF-8.
901 ///
902 /// A character class can match invalid UTF-8 only when the following
903 /// conditions are met:
904 ///
905 /// 1. The translator was configured to permit generating an expression
906 /// that can match invalid UTF-8. (By default, this is disabled.)
907 /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
908 /// syntax or in the parser builder. By default, Unicode mode is
909 /// enabled.
910 pub fn is_utf8(&self) -> bool {
911 match *self {
912 Class::Unicode(_) => true,
913 Class::Bytes(ref x) => x.is_ascii(),
914 }
915 }
916
917 /// Returns the length, in bytes, of the smallest string matched by this
918 /// character class.
919 ///
920 /// For non-empty byte oriented classes, this always returns `1`. For
921 /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
922 /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
923 /// be returned.
924 ///
925 /// # Example
926 ///
927 /// This example shows some examples of regexes and their corresponding
928 /// minimum length, if any.
929 ///
930 /// ```
931 /// use regex_syntax::{hir::Properties, parse};
932 ///
933 /// // The empty string has a min length of 0.
934 /// let hir = parse(r"")?;
935 /// assert_eq!(Some(0), hir.properties().minimum_len());
936 /// // As do other types of regexes that only match the empty string.
937 /// let hir = parse(r"^$\b\B")?;
938 /// assert_eq!(Some(0), hir.properties().minimum_len());
939 /// // A regex that can match the empty string but match more is still 0.
940 /// let hir = parse(r"a*")?;
941 /// assert_eq!(Some(0), hir.properties().minimum_len());
942 /// // A regex that matches nothing has no minimum defined.
943 /// let hir = parse(r"[a&&b]")?;
944 /// assert_eq!(None, hir.properties().minimum_len());
945 /// // Character classes usually have a minimum length of 1.
946 /// let hir = parse(r"\w")?;
947 /// assert_eq!(Some(1), hir.properties().minimum_len());
948 /// // But sometimes Unicode classes might be bigger!
949 /// let hir = parse(r"\p{Cyrillic}")?;
950 /// assert_eq!(Some(2), hir.properties().minimum_len());
951 ///
952 /// # Ok::<(), Box<dyn std::error::Error>>(())
953 /// ```
954 pub fn minimum_len(&self) -> Option<usize> {
955 match *self {
956 Class::Unicode(ref x) => x.minimum_len(),
957 Class::Bytes(ref x) => x.minimum_len(),
958 }
959 }
960
961 /// Returns the length, in bytes, of the longest string matched by this
962 /// character class.
963 ///
964 /// For non-empty byte oriented classes, this always returns `1`. For
965 /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
966 /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
967 /// be returned.
968 ///
969 /// # Example
970 ///
971 /// This example shows some examples of regexes and their corresponding
972 /// maximum length, if any.
973 ///
974 /// ```
975 /// use regex_syntax::{hir::Properties, parse};
976 ///
977 /// // The empty string has a max length of 0.
978 /// let hir = parse(r"")?;
979 /// assert_eq!(Some(0), hir.properties().maximum_len());
980 /// // As do other types of regexes that only match the empty string.
981 /// let hir = parse(r"^$\b\B")?;
982 /// assert_eq!(Some(0), hir.properties().maximum_len());
983 /// // A regex that matches nothing has no maximum defined.
984 /// let hir = parse(r"[a&&b]")?;
985 /// assert_eq!(None, hir.properties().maximum_len());
986 /// // Bounded repeats work as you expect.
987 /// let hir = parse(r"x{2,10}")?;
988 /// assert_eq!(Some(10), hir.properties().maximum_len());
989 /// // An unbounded repeat means there is no maximum.
990 /// let hir = parse(r"x{2,}")?;
991 /// assert_eq!(None, hir.properties().maximum_len());
992 /// // With Unicode enabled, \w can match up to 4 bytes!
993 /// let hir = parse(r"\w")?;
994 /// assert_eq!(Some(4), hir.properties().maximum_len());
995 /// // Without Unicode enabled, \w matches at most 1 byte.
996 /// let hir = parse(r"(?-u)\w")?;
997 /// assert_eq!(Some(1), hir.properties().maximum_len());
998 ///
999 /// # Ok::<(), Box<dyn std::error::Error>>(())
1000 /// ```
1001 pub fn maximum_len(&self) -> Option<usize> {
1002 match *self {
1003 Class::Unicode(ref x) => x.maximum_len(),
1004 Class::Bytes(ref x) => x.maximum_len(),
1005 }
1006 }
1007
1008 /// Returns true if and only if this character class is empty. That is,
1009 /// it has no elements.
1010 ///
1011 /// An empty character can never match anything, including an empty string.
1012 pub fn is_empty(&self) -> bool {
1013 match *self {
1014 Class::Unicode(ref x) => x.ranges().is_empty(),
1015 Class::Bytes(ref x) => x.ranges().is_empty(),
1016 }
1017 }
1018
1019 /// If this class consists of exactly one element (whether a codepoint or a
1020 /// byte), then return it as a literal byte string.
1021 ///
1022 /// If this class is empty or contains more than one element, then `None`
1023 /// is returned.
1024 pub fn literal(&self) -> Option<Vec<u8>> {
1025 match *self {
1026 Class::Unicode(ref x) => x.literal(),
1027 Class::Bytes(ref x) => x.literal(),
1028 }
1029 }
1030}
1031
1032impl core::fmt::Debug for Class {
1033 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1034 use crate::debug::Byte;
1035
1036 let mut fmter = f.debug_set();
1037 match *self {
1038 Class::Unicode(ref cls) => {
1039 for r in cls.ranges().iter() {
1040 fmter.entry(&(r.start..=r.end));
1041 }
1042 }
1043 Class::Bytes(ref cls) => {
1044 for r in cls.ranges().iter() {
1045 fmter.entry(&(Byte(r.start)..=Byte(r.end)));
1046 }
1047 }
1048 }
1049 fmter.finish()
1050 }
1051}
1052
1053/// A set of characters represented by Unicode scalar values.
1054#[derive(Clone, Debug, Eq, PartialEq)]
1055pub struct ClassUnicode {
1056 set: IntervalSet<ClassUnicodeRange>,
1057}
1058
1059impl ClassUnicode {
1060 /// Create a new class from a sequence of ranges.
1061 ///
1062 /// The given ranges do not need to be in any specific order, and ranges
1063 /// may overlap. Ranges will automatically be sorted into a canonical
1064 /// non-overlapping order.
1065 pub fn new<I>(ranges: I) -> ClassUnicode
1066 where
1067 I: IntoIterator<Item = ClassUnicodeRange>,
1068 {
1069 ClassUnicode { set: IntervalSet::new(ranges) }
1070 }
1071
1072 /// Create a new class with no ranges.
1073 ///
1074 /// An empty class matches nothing. That is, it is equivalent to
1075 /// [`Hir::fail`].
1076 pub fn empty() -> ClassUnicode {
1077 ClassUnicode::new(vec![])
1078 }
1079
1080 /// Add a new range to this set.
1081 pub fn push(&mut self, range: ClassUnicodeRange) {
1082 self.set.push(range);
1083 }
1084
1085 /// Return an iterator over all ranges in this class.
1086 ///
1087 /// The iterator yields ranges in ascending order.
1088 pub fn iter(&self) -> ClassUnicodeIter<'_> {
1089 ClassUnicodeIter(self.set.iter())
1090 }
1091
1092 /// Return the underlying ranges as a slice.
1093 pub fn ranges(&self) -> &[ClassUnicodeRange] {
1094 self.set.intervals()
1095 }
1096
1097 /// Expand this character class such that it contains all case folded
1098 /// characters, according to Unicode's "simple" mapping. For example, if
1099 /// this class consists of the range `a-z`, then applying case folding will
1100 /// result in the class containing both the ranges `a-z` and `A-Z`.
1101 ///
1102 /// # Panics
1103 ///
1104 /// This routine panics when the case mapping data necessary for this
1105 /// routine to complete is unavailable. This occurs when the `unicode-case`
1106 /// feature is not enabled.
1107 ///
1108 /// Callers should prefer using `try_case_fold_simple` instead, which will
1109 /// return an error instead of panicking.
1110 pub fn case_fold_simple(&mut self) {
1111 self.set
1112 .case_fold_simple()
1113 .expect("unicode-case feature must be enabled");
1114 }
1115
1116 /// Expand this character class such that it contains all case folded
1117 /// characters, according to Unicode's "simple" mapping. For example, if
1118 /// this class consists of the range `a-z`, then applying case folding will
1119 /// result in the class containing both the ranges `a-z` and `A-Z`.
1120 ///
1121 /// # Error
1122 ///
1123 /// This routine returns an error when the case mapping data necessary
1124 /// for this routine to complete is unavailable. This occurs when the
1125 /// `unicode-case` feature is not enabled.
1126 pub fn try_case_fold_simple(
1127 &mut self,
1128 ) -> core::result::Result<(), CaseFoldError> {
1129 self.set.case_fold_simple()
1130 }
1131
1132 /// Negate this character class.
1133 ///
1134 /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
1135 /// set, then it will not be in this set after negation.
1136 pub fn negate(&mut self) {
1137 self.set.negate();
1138 }
1139
1140 /// Union this character class with the given character class, in place.
1141 pub fn union(&mut self, other: &ClassUnicode) {
1142 self.set.union(&other.set);
1143 }
1144
1145 /// Intersect this character class with the given character class, in
1146 /// place.
1147 pub fn intersect(&mut self, other: &ClassUnicode) {
1148 self.set.intersect(&other.set);
1149 }
1150
1151 /// Subtract the given character class from this character class, in place.
1152 pub fn difference(&mut self, other: &ClassUnicode) {
1153 self.set.difference(&other.set);
1154 }
1155
1156 /// Compute the symmetric difference of the given character classes, in
1157 /// place.
1158 ///
1159 /// This computes the symmetric difference of two character classes. This
1160 /// removes all elements in this class that are also in the given class,
1161 /// but all adds all elements from the given class that aren't in this
1162 /// class. That is, the class will contain all elements in either class,
1163 /// but will not contain any elements that are in both classes.
1164 pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
1165 self.set.symmetric_difference(&other.set);
1166 }
1167
1168 /// Returns true if and only if this character class will either match
1169 /// nothing or only ASCII bytes. Stated differently, this returns false
1170 /// if and only if this class contains a non-ASCII codepoint.
1171 pub fn is_ascii(&self) -> bool {
1172 self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
1173 }
1174
1175 /// Returns the length, in bytes, of the smallest string matched by this
1176 /// character class.
1177 ///
1178 /// Returns `None` when the class is empty.
1179 pub fn minimum_len(&self) -> Option<usize> {
1180 let first = self.ranges().get(0)?;
1181 // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1182 Some(first.start.len_utf8())
1183 }
1184
1185 /// Returns the length, in bytes, of the longest string matched by this
1186 /// character class.
1187 ///
1188 /// Returns `None` when the class is empty.
1189 pub fn maximum_len(&self) -> Option<usize> {
1190 let last = self.ranges().last()?;
1191 // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1192 Some(last.end.len_utf8())
1193 }
1194
1195 /// If this class consists of exactly one codepoint, then return it as
1196 /// a literal byte string.
1197 ///
1198 /// If this class is empty or contains more than one codepoint, then `None`
1199 /// is returned.
1200 pub fn literal(&self) -> Option<Vec<u8>> {
1201 let rs = self.ranges();
1202 if rs.len() == 1 && rs[0].start == rs[0].end {
1203 Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes())
1204 } else {
1205 None
1206 }
1207 }
1208
1209 /// If this class consists of only ASCII ranges, then return its
1210 /// corresponding and equivalent byte class.
1211 pub fn to_byte_class(&self) -> Option<ClassBytes> {
1212 if !self.is_ascii() {
1213 return None;
1214 }
1215 Some(ClassBytes::new(self.ranges().iter().map(|r| {
1216 // Since we are guaranteed that our codepoint range is ASCII, the
1217 // 'u8::try_from' calls below are guaranteed to be correct.
1218 ClassBytesRange {
1219 start: u8::try_from(r.start).unwrap(),
1220 end: u8::try_from(r.end).unwrap(),
1221 }
1222 })))
1223 }
1224}
1225
1226/// An iterator over all ranges in a Unicode character class.
1227///
1228/// The lifetime `'a` refers to the lifetime of the underlying class.
1229#[derive(Debug)]
1230pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
1231
1232impl<'a> Iterator for ClassUnicodeIter<'a> {
1233 type Item = &'a ClassUnicodeRange;
1234
1235 fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
1236 self.0.next()
1237 }
1238}
1239
1240/// A single range of characters represented by Unicode scalar values.
1241///
1242/// The range is closed. That is, the start and end of the range are included
1243/// in the range.
1244#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1245pub struct ClassUnicodeRange {
1246 start: char,
1247 end: char,
1248}
1249
1250impl core::fmt::Debug for ClassUnicodeRange {
1251 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1252 let start = if !self.start.is_whitespace() && !self.start.is_control()
1253 {
1254 self.start.to_string()
1255 } else {
1256 format!("0x{:X}", u32::from(self.start))
1257 };
1258 let end = if !self.end.is_whitespace() && !self.end.is_control() {
1259 self.end.to_string()
1260 } else {
1261 format!("0x{:X}", u32::from(self.end))
1262 };
1263 f.debug_struct("ClassUnicodeRange")
1264 .field("start", &start)
1265 .field("end", &end)
1266 .finish()
1267 }
1268}
1269
1270impl Interval for ClassUnicodeRange {
1271 type Bound = char;
1272
1273 #[inline]
1274 fn lower(&self) -> char {
1275 self.start
1276 }
1277 #[inline]
1278 fn upper(&self) -> char {
1279 self.end
1280 }
1281 #[inline]
1282 fn set_lower(&mut self, bound: char) {
1283 self.start = bound;
1284 }
1285 #[inline]
1286 fn set_upper(&mut self, bound: char) {
1287 self.end = bound;
1288 }
1289
1290 /// Apply simple case folding to this Unicode scalar value range.
1291 ///
1292 /// Additional ranges are appended to the given vector. Canonical ordering
1293 /// is *not* maintained in the given vector.
1294 fn case_fold_simple(
1295 &self,
1296 ranges: &mut Vec<ClassUnicodeRange>,
1297 ) -> Result<(), unicode::CaseFoldError> {
1298 let mut folder = unicode::SimpleCaseFolder::new()?;
1299 if !folder.overlaps(self.start, self.end) {
1300 return Ok(());
1301 }
1302 let (start, end) = (u32::from(self.start), u32::from(self.end));
1303 for cp in (start..=end).filter_map(char::from_u32) {
1304 for &cp_folded in folder.mapping(cp) {
1305 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1306 }
1307 }
1308 Ok(())
1309 }
1310}
1311
1312impl ClassUnicodeRange {
1313 /// Create a new Unicode scalar value range for a character class.
1314 ///
1315 /// The returned range is always in a canonical form. That is, the range
1316 /// returned always satisfies the invariant that `start <= end`.
1317 pub fn new(start: char, end: char) -> ClassUnicodeRange {
1318 ClassUnicodeRange::create(start, end)
1319 }
1320
1321 /// Return the start of this range.
1322 ///
1323 /// The start of a range is always less than or equal to the end of the
1324 /// range.
1325 pub fn start(&self) -> char {
1326 self.start
1327 }
1328
1329 /// Return the end of this range.
1330 ///
1331 /// The end of a range is always greater than or equal to the start of the
1332 /// range.
1333 pub fn end(&self) -> char {
1334 self.end
1335 }
1336
1337 /// Returns the number of codepoints in this range.
1338 pub fn len(&self) -> usize {
1339 let diff = 1 + u32::from(self.end) - u32::from(self.start);
1340 // This is likely to panic in 16-bit targets since a usize can only fit
1341 // 2^16. It's not clear what to do here, other than to return an error
1342 // when building a Unicode class that contains a range whose length
1343 // overflows usize. (Which, to be honest, is probably quite common on
1344 // 16-bit targets. For example, this would imply that '.' and '\p{any}'
1345 // would be impossible to build.)
1346 usize::try_from(diff).expect("char class len fits in usize")
1347 }
1348}
1349
1350/// A set of characters represented by arbitrary bytes.
1351///
1352/// Each byte corresponds to one character.
1353#[derive(Clone, Debug, Eq, PartialEq)]
1354pub struct ClassBytes {
1355 set: IntervalSet<ClassBytesRange>,
1356}
1357
1358impl ClassBytes {
1359 /// Create a new class from a sequence of ranges.
1360 ///
1361 /// The given ranges do not need to be in any specific order, and ranges
1362 /// may overlap. Ranges will automatically be sorted into a canonical
1363 /// non-overlapping order.
1364 pub fn new<I>(ranges: I) -> ClassBytes
1365 where
1366 I: IntoIterator<Item = ClassBytesRange>,
1367 {
1368 ClassBytes { set: IntervalSet::new(ranges) }
1369 }
1370
1371 /// Create a new class with no ranges.
1372 ///
1373 /// An empty class matches nothing. That is, it is equivalent to
1374 /// [`Hir::fail`].
1375 pub fn empty() -> ClassBytes {
1376 ClassBytes::new(vec![])
1377 }
1378
1379 /// Add a new range to this set.
1380 pub fn push(&mut self, range: ClassBytesRange) {
1381 self.set.push(range);
1382 }
1383
1384 /// Return an iterator over all ranges in this class.
1385 ///
1386 /// The iterator yields ranges in ascending order.
1387 pub fn iter(&self) -> ClassBytesIter<'_> {
1388 ClassBytesIter(self.set.iter())
1389 }
1390
1391 /// Return the underlying ranges as a slice.
1392 pub fn ranges(&self) -> &[ClassBytesRange] {
1393 self.set.intervals()
1394 }
1395
1396 /// Expand this character class such that it contains all case folded
1397 /// characters. For example, if this class consists of the range `a-z`,
1398 /// then applying case folding will result in the class containing both the
1399 /// ranges `a-z` and `A-Z`.
1400 ///
1401 /// Note that this only applies ASCII case folding, which is limited to the
1402 /// characters `a-z` and `A-Z`.
1403 pub fn case_fold_simple(&mut self) {
1404 self.set.case_fold_simple().expect("ASCII case folding never fails");
1405 }
1406
1407 /// Negate this byte class.
1408 ///
1409 /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1410 /// will not be in this set after negation.
1411 pub fn negate(&mut self) {
1412 self.set.negate();
1413 }
1414
1415 /// Union this byte class with the given byte class, in place.
1416 pub fn union(&mut self, other: &ClassBytes) {
1417 self.set.union(&other.set);
1418 }
1419
1420 /// Intersect this byte class with the given byte class, in place.
1421 pub fn intersect(&mut self, other: &ClassBytes) {
1422 self.set.intersect(&other.set);
1423 }
1424
1425 /// Subtract the given byte class from this byte class, in place.
1426 pub fn difference(&mut self, other: &ClassBytes) {
1427 self.set.difference(&other.set);
1428 }
1429
1430 /// Compute the symmetric difference of the given byte classes, in place.
1431 ///
1432 /// This computes the symmetric difference of two byte classes. This
1433 /// removes all elements in this class that are also in the given class,
1434 /// but all adds all elements from the given class that aren't in this
1435 /// class. That is, the class will contain all elements in either class,
1436 /// but will not contain any elements that are in both classes.
1437 pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1438 self.set.symmetric_difference(&other.set);
1439 }
1440
1441 /// Returns true if and only if this character class will either match
1442 /// nothing or only ASCII bytes. Stated differently, this returns false
1443 /// if and only if this class contains a non-ASCII byte.
1444 pub fn is_ascii(&self) -> bool {
1445 self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1446 }
1447
1448 /// Returns the length, in bytes, of the smallest string matched by this
1449 /// character class.
1450 ///
1451 /// Returns `None` when the class is empty.
1452 pub fn minimum_len(&self) -> Option<usize> {
1453 if self.ranges().is_empty() {
1454 None
1455 } else {
1456 Some(1)
1457 }
1458 }
1459
1460 /// Returns the length, in bytes, of the longest string matched by this
1461 /// character class.
1462 ///
1463 /// Returns `None` when the class is empty.
1464 pub fn maximum_len(&self) -> Option<usize> {
1465 if self.ranges().is_empty() {
1466 None
1467 } else {
1468 Some(1)
1469 }
1470 }
1471
1472 /// If this class consists of exactly one byte, then return it as
1473 /// a literal byte string.
1474 ///
1475 /// If this class is empty or contains more than one byte, then `None`
1476 /// is returned.
1477 pub fn literal(&self) -> Option<Vec<u8>> {
1478 let rs = self.ranges();
1479 if rs.len() == 1 && rs[0].start == rs[0].end {
1480 Some(vec![rs[0].start])
1481 } else {
1482 None
1483 }
1484 }
1485
1486 /// If this class consists of only ASCII ranges, then return its
1487 /// corresponding and equivalent Unicode class.
1488 pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1489 if !self.is_ascii() {
1490 return None;
1491 }
1492 Some(ClassUnicode::new(self.ranges().iter().map(|r| {
1493 // Since we are guaranteed that our byte range is ASCII, the
1494 // 'char::from' calls below are correct and will not erroneously
1495 // convert a raw byte value into its corresponding codepoint.
1496 ClassUnicodeRange {
1497 start: char::from(r.start),
1498 end: char::from(r.end),
1499 }
1500 })))
1501 }
1502}
1503
1504/// An iterator over all ranges in a byte character class.
1505///
1506/// The lifetime `'a` refers to the lifetime of the underlying class.
1507#[derive(Debug)]
1508pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1509
1510impl<'a> Iterator for ClassBytesIter<'a> {
1511 type Item = &'a ClassBytesRange;
1512
1513 fn next(&mut self) -> Option<&'a ClassBytesRange> {
1514 self.0.next()
1515 }
1516}
1517
1518/// A single range of characters represented by arbitrary bytes.
1519///
1520/// The range is closed. That is, the start and end of the range are included
1521/// in the range.
1522#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1523pub struct ClassBytesRange {
1524 start: u8,
1525 end: u8,
1526}
1527
1528impl Interval for ClassBytesRange {
1529 type Bound = u8;
1530
1531 #[inline]
1532 fn lower(&self) -> u8 {
1533 self.start
1534 }
1535 #[inline]
1536 fn upper(&self) -> u8 {
1537 self.end
1538 }
1539 #[inline]
1540 fn set_lower(&mut self, bound: u8) {
1541 self.start = bound;
1542 }
1543 #[inline]
1544 fn set_upper(&mut self, bound: u8) {
1545 self.end = bound;
1546 }
1547
1548 /// Apply simple case folding to this byte range. Only ASCII case mappings
1549 /// (for a-z) are applied.
1550 ///
1551 /// Additional ranges are appended to the given vector. Canonical ordering
1552 /// is *not* maintained in the given vector.
1553 fn case_fold_simple(
1554 &self,
1555 ranges: &mut Vec<ClassBytesRange>,
1556 ) -> Result<(), unicode::CaseFoldError> {
1557 if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1558 let lower = cmp::max(self.start, b'a');
1559 let upper = cmp::min(self.end, b'z');
1560 ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1561 }
1562 if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1563 let lower = cmp::max(self.start, b'A');
1564 let upper = cmp::min(self.end, b'Z');
1565 ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1566 }
1567 Ok(())
1568 }
1569}
1570
1571impl ClassBytesRange {
1572 /// Create a new byte range for a character class.
1573 ///
1574 /// The returned range is always in a canonical form. That is, the range
1575 /// returned always satisfies the invariant that `start <= end`.
1576 pub fn new(start: u8, end: u8) -> ClassBytesRange {
1577 ClassBytesRange::create(start, end)
1578 }
1579
1580 /// Return the start of this range.
1581 ///
1582 /// The start of a range is always less than or equal to the end of the
1583 /// range.
1584 pub fn start(&self) -> u8 {
1585 self.start
1586 }
1587
1588 /// Return the end of this range.
1589 ///
1590 /// The end of a range is always greater than or equal to the start of the
1591 /// range.
1592 pub fn end(&self) -> u8 {
1593 self.end
1594 }
1595
1596 /// Returns the number of bytes in this range.
1597 pub fn len(&self) -> usize {
1598 usize::from(self.end.checked_sub(self.start).unwrap())
1599 .checked_add(1)
1600 .unwrap()
1601 }
1602}
1603
1604impl core::fmt::Debug for ClassBytesRange {
1605 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1606 f.debug_struct("ClassBytesRange")
1607 .field("start", &crate::debug::Byte(self.start))
1608 .field("end", &crate::debug::Byte(self.end))
1609 .finish()
1610 }
1611}
1612
1613/// The high-level intermediate representation for a look-around assertion.
1614///
1615/// An assertion match is always zero-length. Also called an "empty match."
1616#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1617pub enum Look {
1618 /// Match the beginning of text. Specifically, this matches at the starting
1619 /// position of the input.
1620 Start = 1 << 0,
1621 /// Match the end of text. Specifically, this matches at the ending
1622 /// position of the input.
1623 End = 1 << 1,
1624 /// Match the beginning of a line or the beginning of text. Specifically,
1625 /// this matches at the starting position of the input, or at the position
1626 /// immediately following a `\n` character.
1627 StartLF = 1 << 2,
1628 /// Match the end of a line or the end of text. Specifically, this matches
1629 /// at the end position of the input, or at the position immediately
1630 /// preceding a `\n` character.
1631 EndLF = 1 << 3,
1632 /// Match the beginning of a line or the beginning of text. Specifically,
1633 /// this matches at the starting position of the input, or at the position
1634 /// immediately following either a `\r` or `\n` character, but never after
1635 /// a `\r` when a `\n` follows.
1636 StartCRLF = 1 << 4,
1637 /// Match the end of a line or the end of text. Specifically, this matches
1638 /// at the end position of the input, or at the position immediately
1639 /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1640 /// precedes it.
1641 EndCRLF = 1 << 5,
1642 /// Match an ASCII-only word boundary. That is, this matches a position
1643 /// where the left adjacent character and right adjacent character
1644 /// correspond to a word and non-word or a non-word and word character.
1645 WordAscii = 1 << 6,
1646 /// Match an ASCII-only negation of a word boundary.
1647 WordAsciiNegate = 1 << 7,
1648 /// Match a Unicode-aware word boundary. That is, this matches a position
1649 /// where the left adjacent character and right adjacent character
1650 /// correspond to a word and non-word or a non-word and word character.
1651 WordUnicode = 1 << 8,
1652 /// Match a Unicode-aware negation of a word boundary.
1653 WordUnicodeNegate = 1 << 9,
1654 /// Match the start of an ASCII-only word boundary. That is, this matches a
1655 /// position at either the beginning of the haystack or where the previous
1656 /// character is not a word character and the following character is a word
1657 /// character.
1658 WordStartAscii = 1 << 10,
1659 /// Match the end of an ASCII-only word boundary. That is, this matches
1660 /// a position at either the end of the haystack or where the previous
1661 /// character is a word character and the following character is not a word
1662 /// character.
1663 WordEndAscii = 1 << 11,
1664 /// Match the start of a Unicode word boundary. That is, this matches a
1665 /// position at either the beginning of the haystack or where the previous
1666 /// character is not a word character and the following character is a word
1667 /// character.
1668 WordStartUnicode = 1 << 12,
1669 /// Match the end of a Unicode word boundary. That is, this matches a
1670 /// position at either the end of the haystack or where the previous
1671 /// character is a word character and the following character is not a word
1672 /// character.
1673 WordEndUnicode = 1 << 13,
1674 /// Match the start half of an ASCII-only word boundary. That is, this
1675 /// matches a position at either the beginning of the haystack or where the
1676 /// previous character is not a word character.
1677 WordStartHalfAscii = 1 << 14,
1678 /// Match the end half of an ASCII-only word boundary. That is, this
1679 /// matches a position at either the end of the haystack or where the
1680 /// following character is not a word character.
1681 WordEndHalfAscii = 1 << 15,
1682 /// Match the start half of a Unicode word boundary. That is, this matches
1683 /// a position at either the beginning of the haystack or where the
1684 /// previous character is not a word character.
1685 WordStartHalfUnicode = 1 << 16,
1686 /// Match the end half of a Unicode word boundary. That is, this matches
1687 /// a position at either the end of the haystack or where the following
1688 /// character is not a word character.
1689 WordEndHalfUnicode = 1 << 17,
1690}
1691
1692impl Look {
1693 /// Flip the look-around assertion to its equivalent for reverse searches.
1694 /// For example, `StartLF` gets translated to `EndLF`.
1695 ///
1696 /// Some assertions, such as `WordUnicode`, remain the same since they
1697 /// match the same positions regardless of the direction of the search.
1698 #[inline]
1699 pub const fn reversed(self) -> Look {
1700 match self {
1701 Look::Start => Look::End,
1702 Look::End => Look::Start,
1703 Look::StartLF => Look::EndLF,
1704 Look::EndLF => Look::StartLF,
1705 Look::StartCRLF => Look::EndCRLF,
1706 Look::EndCRLF => Look::StartCRLF,
1707 Look::WordAscii => Look::WordAscii,
1708 Look::WordAsciiNegate => Look::WordAsciiNegate,
1709 Look::WordUnicode => Look::WordUnicode,
1710 Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1711 Look::WordStartAscii => Look::WordEndAscii,
1712 Look::WordEndAscii => Look::WordStartAscii,
1713 Look::WordStartUnicode => Look::WordEndUnicode,
1714 Look::WordEndUnicode => Look::WordStartUnicode,
1715 Look::WordStartHalfAscii => Look::WordEndHalfAscii,
1716 Look::WordEndHalfAscii => Look::WordStartHalfAscii,
1717 Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
1718 Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
1719 }
1720 }
1721
1722 /// Return the underlying representation of this look-around enumeration
1723 /// as an integer. Giving the return value to the [`Look::from_repr`]
1724 /// constructor is guaranteed to return the same look-around variant that
1725 /// one started with within a semver compatible release of this crate.
1726 #[inline]
1727 pub const fn as_repr(self) -> u32 {
1728 // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1729 // actual int.
1730 self as u32
1731 }
1732
1733 /// Given the underlying representation of a `Look` value, return the
1734 /// corresponding `Look` value if the representation is valid. Otherwise
1735 /// `None` is returned.
1736 #[inline]
1737 pub const fn from_repr(repr: u32) -> Option<Look> {
1738 match repr {
1739 0b00_0000_0000_0000_0001 => Some(Look::Start),
1740 0b00_0000_0000_0000_0010 => Some(Look::End),
1741 0b00_0000_0000_0000_0100 => Some(Look::StartLF),
1742 0b00_0000_0000_0000_1000 => Some(Look::EndLF),
1743 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
1744 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
1745 0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
1746 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
1747 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
1748 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
1749 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
1750 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
1751 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
1752 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
1753 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
1754 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
1755 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
1756 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
1757 _ => None,
1758 }
1759 }
1760
1761 /// Returns a convenient single codepoint representation of this
1762 /// look-around assertion. Each assertion is guaranteed to be represented
1763 /// by a distinct character.
1764 ///
1765 /// This is useful for succinctly representing a look-around assertion in
1766 /// human friendly but succinct output intended for a programmer working on
1767 /// regex internals.
1768 #[inline]
1769 pub const fn as_char(self) -> char {
1770 match self {
1771 Look::Start => 'A',
1772 Look::End => 'z',
1773 Look::StartLF => '^',
1774 Look::EndLF => '$',
1775 Look::StartCRLF => 'r',
1776 Look::EndCRLF => 'R',
1777 Look::WordAscii => 'b',
1778 Look::WordAsciiNegate => 'B',
1779 Look::WordUnicode => '𝛃',
1780 Look::WordUnicodeNegate => '𝚩',
1781 Look::WordStartAscii => '<',
1782 Look::WordEndAscii => '>',
1783 Look::WordStartUnicode => '〈',
1784 Look::WordEndUnicode => '〉',
1785 Look::WordStartHalfAscii => '◁',
1786 Look::WordEndHalfAscii => '▷',
1787 Look::WordStartHalfUnicode => '◀',
1788 Look::WordEndHalfUnicode => '▶',
1789 }
1790 }
1791}
1792
1793/// The high-level intermediate representation for a capturing group.
1794///
1795/// A capturing group always has an index and a child expression. It may
1796/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1797/// necessary.
1798///
1799/// Note that there is no explicit representation of a non-capturing group
1800/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1801/// the recursive structure of the `Hir` itself.
1802#[derive(Clone, Debug, Eq, PartialEq)]
1803pub struct Capture {
1804 /// The capture index of the capture.
1805 pub index: u32,
1806 /// The name of the capture, if it exists.
1807 pub name: Option<Box<str>>,
1808 /// The expression inside the capturing group, which may be empty.
1809 pub sub: Box<Hir>,
1810}
1811
1812/// The high-level intermediate representation of a repetition operator.
1813///
1814/// A repetition operator permits the repetition of an arbitrary
1815/// sub-expression.
1816#[derive(Clone, Debug, Eq, PartialEq)]
1817pub struct Repetition {
1818 /// The minimum range of the repetition.
1819 ///
1820 /// Note that special cases like `?`, `+` and `*` all get translated into
1821 /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1822 ///
1823 /// When `min` is zero, this expression can match the empty string
1824 /// regardless of what its sub-expression is.
1825 pub min: u32,
1826 /// The maximum range of the repetition.
1827 ///
1828 /// Note that when `max` is `None`, `min` acts as a lower bound but where
1829 /// there is no upper bound. For something like `x{5}` where the min and
1830 /// max are equivalent, `min` will be set to `5` and `max` will be set to
1831 /// `Some(5)`.
1832 pub max: Option<u32>,
1833 /// Whether this repetition operator is greedy or not. A greedy operator
1834 /// will match as much as it can. A non-greedy operator will match as
1835 /// little as it can.
1836 ///
1837 /// Typically, operators are greedy by default and are only non-greedy when
1838 /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1839 /// not. However, this can be inverted via the `U` "ungreedy" flag.
1840 pub greedy: bool,
1841 /// The expression being repeated.
1842 pub sub: Box<Hir>,
1843}
1844
1845impl Repetition {
1846 /// Returns a new repetition with the same `min`, `max` and `greedy`
1847 /// values, but with its sub-expression replaced with the one given.
1848 pub fn with(&self, sub: Hir) -> Repetition {
1849 Repetition {
1850 min: self.min,
1851 max: self.max,
1852 greedy: self.greedy,
1853 sub: Box::new(sub),
1854 }
1855 }
1856}
1857
1858/// A type describing the different flavors of `.`.
1859///
1860/// This type is meant to be used with [`Hir::dot`], which is a convenience
1861/// routine for building HIR values derived from the `.` regex.
1862#[non_exhaustive]
1863#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1864pub enum Dot {
1865 /// Matches the UTF-8 encoding of any Unicode scalar value.
1866 ///
1867 /// This is equivalent to `(?su:.)` and also `\p{any}`.
1868 AnyChar,
1869 /// Matches any byte value.
1870 ///
1871 /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1872 AnyByte,
1873 /// Matches the UTF-8 encoding of any Unicode scalar value except for the
1874 /// `char` given.
1875 ///
1876 /// This is equivalent to using `(?u-s:.)` with the line terminator set
1877 /// to a particular ASCII byte. (Because of peculiarities in the regex
1878 /// engines, a line terminator must be a single byte. It follows that when
1879 /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1880 /// value. That is, ti must be ASCII.)
1881 ///
1882 /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
1883 /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
1884 AnyCharExcept(char),
1885 /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1886 ///
1887 /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1888 AnyCharExceptLF,
1889 /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1890 /// and `\n`.
1891 ///
1892 /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1893 AnyCharExceptCRLF,
1894 /// Matches any byte value except for the `u8` given.
1895 ///
1896 /// This is equivalent to using `(?-us:.)` with the line terminator set
1897 /// to a particular ASCII byte. (Because of peculiarities in the regex
1898 /// engines, a line terminator must be a single byte. It follows that when
1899 /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1900 /// value. That is, ti must be ASCII.)
1901 ///
1902 /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
1903 /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
1904 AnyByteExcept(u8),
1905 /// Matches any byte value except for `\n`.
1906 ///
1907 /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1908 AnyByteExceptLF,
1909 /// Matches any byte value except for `\r` and `\n`.
1910 ///
1911 /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1912 AnyByteExceptCRLF,
1913}
1914
1915/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1916/// space but heap space proportional to the depth of the total `Hir`.
1917impl Drop for Hir {
1918 fn drop(&mut self) {
1919 use core::mem;
1920
1921 match *self.kind() {
1922 HirKind::Empty
1923 | HirKind::Literal(_)
1924 | HirKind::Class(_)
1925 | HirKind::Look(_) => return,
1926 HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
1927 HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
1928 return
1929 }
1930 HirKind::Concat(ref x) if x.is_empty() => return,
1931 HirKind::Alternation(ref x) if x.is_empty() => return,
1932 _ => {}
1933 }
1934
1935 let mut stack = vec![mem::replace(self, Hir::empty())];
1936 while let Some(mut expr) = stack.pop() {
1937 match expr.kind {
1938 HirKind::Empty
1939 | HirKind::Literal(_)
1940 | HirKind::Class(_)
1941 | HirKind::Look(_) => {}
1942 HirKind::Capture(ref mut x) => {
1943 stack.push(mem::replace(&mut x.sub, Hir::empty()));
1944 }
1945 HirKind::Repetition(ref mut x) => {
1946 stack.push(mem::replace(&mut x.sub, Hir::empty()));
1947 }
1948 HirKind::Concat(ref mut x) => {
1949 stack.extend(x.drain(..));
1950 }
1951 HirKind::Alternation(ref mut x) => {
1952 stack.extend(x.drain(..));
1953 }
1954 }
1955 }
1956 }
1957}
1958
1959/// A type that collects various properties of an HIR value.
1960///
1961/// Properties are always scalar values and represent meta data that is
1962/// computed inductively on an HIR value. Properties are defined for all
1963/// HIR values.
1964///
1965/// All methods on a `Properties` value take constant time and are meant to
1966/// be cheap to call.
1967#[derive(Clone, Debug, Eq, PartialEq)]
1968pub struct Properties(Box<PropertiesI>);
1969
1970/// The property definition. It is split out so that we can box it, and
1971/// there by make `Properties` use less stack size. This is kind-of important
1972/// because every HIR value has a `Properties` attached to it.
1973///
1974/// This does have the unfortunate consequence that creating any HIR value
1975/// always leads to at least one alloc for properties, but this is generally
1976/// true anyway (for pretty much all HirKinds except for look-arounds).
1977#[derive(Clone, Debug, Eq, PartialEq)]
1978struct PropertiesI {
1979 minimum_len: Option<usize>,
1980 maximum_len: Option<usize>,
1981 look_set: LookSet,
1982 look_set_prefix: LookSet,
1983 look_set_suffix: LookSet,
1984 look_set_prefix_any: LookSet,
1985 look_set_suffix_any: LookSet,
1986 utf8: bool,
1987 explicit_captures_len: usize,
1988 static_explicit_captures_len: Option<usize>,
1989 literal: bool,
1990 alternation_literal: bool,
1991}
1992
1993impl Properties {
1994 /// Returns the length (in bytes) of the smallest string matched by this
1995 /// HIR.
1996 ///
1997 /// A return value of `0` is possible and occurs when the HIR can match an
1998 /// empty string.
1999 ///
2000 /// `None` is returned when there is no minimum length. This occurs in
2001 /// precisely the cases where the HIR matches nothing. i.e., The language
2002 /// the regex matches is empty. An example of such a regex is `\P{any}`.
2003 #[inline]
2004 pub fn minimum_len(&self) -> Option<usize> {
2005 self.0.minimum_len
2006 }
2007
2008 /// Returns the length (in bytes) of the longest string matched by this
2009 /// HIR.
2010 ///
2011 /// A return value of `0` is possible and occurs when nothing longer than
2012 /// the empty string is in the language described by this HIR.
2013 ///
2014 /// `None` is returned when there is no longest matching string. This
2015 /// occurs when the HIR matches nothing or when there is no upper bound on
2016 /// the length of matching strings. Example of such regexes are `\P{any}`
2017 /// (matches nothing) and `a+` (has no upper bound).
2018 #[inline]
2019 pub fn maximum_len(&self) -> Option<usize> {
2020 self.0.maximum_len
2021 }
2022
2023 /// Returns a set of all look-around assertions that appear at least once
2024 /// in this HIR value.
2025 #[inline]
2026 pub fn look_set(&self) -> LookSet {
2027 self.0.look_set
2028 }
2029
2030 /// Returns a set of all look-around assertions that appear as a prefix for
2031 /// this HIR value. That is, the set returned corresponds to the set of
2032 /// assertions that must be passed before matching any bytes in a haystack.
2033 ///
2034 /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
2035 /// if and only if the HIR is fully anchored at the start.
2036 #[inline]
2037 pub fn look_set_prefix(&self) -> LookSet {
2038 self.0.look_set_prefix
2039 }
2040
2041 /// Returns a set of all look-around assertions that appear as a _possible_
2042 /// prefix for this HIR value. That is, the set returned corresponds to the
2043 /// set of assertions that _may_ be passed before matching any bytes in a
2044 /// haystack.
2045 ///
2046 /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
2047 /// true if and only if it's possible for the regex to match through a
2048 /// anchored assertion before consuming any input.
2049 #[inline]
2050 pub fn look_set_prefix_any(&self) -> LookSet {
2051 self.0.look_set_prefix_any
2052 }
2053
2054 /// Returns a set of all look-around assertions that appear as a suffix for
2055 /// this HIR value. That is, the set returned corresponds to the set of
2056 /// assertions that must be passed in order to be considered a match after
2057 /// all other consuming HIR expressions.
2058 ///
2059 /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
2060 /// if and only if the HIR is fully anchored at the end.
2061 #[inline]
2062 pub fn look_set_suffix(&self) -> LookSet {
2063 self.0.look_set_suffix
2064 }
2065
2066 /// Returns a set of all look-around assertions that appear as a _possible_
2067 /// suffix for this HIR value. That is, the set returned corresponds to the
2068 /// set of assertions that _may_ be passed before matching any bytes in a
2069 /// haystack.
2070 ///
2071 /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
2072 /// true if and only if it's possible for the regex to match through a
2073 /// anchored assertion at the end of a match without consuming any input.
2074 #[inline]
2075 pub fn look_set_suffix_any(&self) -> LookSet {
2076 self.0.look_set_suffix_any
2077 }
2078
2079 /// Return true if and only if the corresponding HIR will always match
2080 /// valid UTF-8.
2081 ///
2082 /// When this returns false, then it is possible for this HIR expression to
2083 /// match invalid UTF-8, including by matching between the code units of
2084 /// a single UTF-8 encoded codepoint.
2085 ///
2086 /// Note that this returns true even when the corresponding HIR can match
2087 /// the empty string. Since an empty string can technically appear between
2088 /// UTF-8 code units, it is possible for a match to be reported that splits
2089 /// a codepoint which could in turn be considered matching invalid UTF-8.
2090 /// However, it is generally assumed that such empty matches are handled
2091 /// specially by the search routine if it is absolutely required that
2092 /// matches not split a codepoint.
2093 ///
2094 /// # Example
2095 ///
2096 /// This code example shows the UTF-8 property of a variety of patterns.
2097 ///
2098 /// ```
2099 /// use regex_syntax::{ParserBuilder, parse};
2100 ///
2101 /// // Examples of 'is_utf8() == true'.
2102 /// assert!(parse(r"a")?.properties().is_utf8());
2103 /// assert!(parse(r"[^a]")?.properties().is_utf8());
2104 /// assert!(parse(r".")?.properties().is_utf8());
2105 /// assert!(parse(r"\W")?.properties().is_utf8());
2106 /// assert!(parse(r"\b")?.properties().is_utf8());
2107 /// assert!(parse(r"\B")?.properties().is_utf8());
2108 /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
2109 /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
2110 /// // Unicode mode is enabled by default, and in
2111 /// // that mode, all \x hex escapes are treated as
2112 /// // codepoints. So this actually matches the UTF-8
2113 /// // encoding of U+00FF.
2114 /// assert!(parse(r"\xFF")?.properties().is_utf8());
2115 ///
2116 /// // Now we show examples of 'is_utf8() == false'.
2117 /// // The only way to do this is to force the parser
2118 /// // to permit invalid UTF-8, otherwise all of these
2119 /// // would fail to parse!
2120 /// let parse = |pattern| {
2121 /// ParserBuilder::new().utf8(false).build().parse(pattern)
2122 /// };
2123 /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
2124 /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
2125 /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
2126 /// // Conversely to the equivalent example above,
2127 /// // when Unicode mode is disabled, \x hex escapes
2128 /// // are treated as their raw byte values.
2129 /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
2130 /// // Note that just because we disabled UTF-8 in the
2131 /// // parser doesn't mean we still can't use Unicode.
2132 /// // It is enabled by default, so \xFF is still
2133 /// // equivalent to matching the UTF-8 encoding of
2134 /// // U+00FF by default.
2135 /// assert!(parse(r"\xFF")?.properties().is_utf8());
2136 /// // Even though we use raw bytes that individually
2137 /// // are not valid UTF-8, when combined together, the
2138 /// // overall expression *does* match valid UTF-8!
2139 /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
2140 ///
2141 /// # Ok::<(), Box<dyn std::error::Error>>(())
2142 /// ```
2143 #[inline]
2144 pub fn is_utf8(&self) -> bool {
2145 self.0.utf8
2146 }
2147
2148 /// Returns the total number of explicit capturing groups in the
2149 /// corresponding HIR.
2150 ///
2151 /// Note that this does not include the implicit capturing group
2152 /// corresponding to the entire match that is typically included by regex
2153 /// engines.
2154 ///
2155 /// # Example
2156 ///
2157 /// This method will return `0` for `a` and `1` for `(a)`:
2158 ///
2159 /// ```
2160 /// use regex_syntax::parse;
2161 ///
2162 /// assert_eq!(0, parse("a")?.properties().explicit_captures_len());
2163 /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len());
2164 ///
2165 /// # Ok::<(), Box<dyn std::error::Error>>(())
2166 /// ```
2167 #[inline]
2168 pub fn explicit_captures_len(&self) -> usize {
2169 self.0.explicit_captures_len
2170 }
2171
2172 /// Returns the total number of explicit capturing groups that appear in
2173 /// every possible match.
2174 ///
2175 /// If the number of capture groups can vary depending on the match, then
2176 /// this returns `None`. That is, a value is only returned when the number
2177 /// of matching groups is invariant or "static."
2178 ///
2179 /// Note that this does not include the implicit capturing group
2180 /// corresponding to the entire match.
2181 ///
2182 /// # Example
2183 ///
2184 /// This shows a few cases where a static number of capture groups is
2185 /// available and a few cases where it is not.
2186 ///
2187 /// ```
2188 /// use regex_syntax::parse;
2189 ///
2190 /// let len = |pattern| {
2191 /// parse(pattern).map(|h| {
2192 /// h.properties().static_explicit_captures_len()
2193 /// })
2194 /// };
2195 ///
2196 /// assert_eq!(Some(0), len("a")?);
2197 /// assert_eq!(Some(1), len("(a)")?);
2198 /// assert_eq!(Some(1), len("(a)|(b)")?);
2199 /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
2200 /// assert_eq!(None, len("(a)|b")?);
2201 /// assert_eq!(None, len("a|(b)")?);
2202 /// assert_eq!(None, len("(b)*")?);
2203 /// assert_eq!(Some(1), len("(b)+")?);
2204 ///
2205 /// # Ok::<(), Box<dyn std::error::Error>>(())
2206 /// ```
2207 #[inline]
2208 pub fn static_explicit_captures_len(&self) -> Option<usize> {
2209 self.0.static_explicit_captures_len
2210 }
2211
2212 /// Return true if and only if this HIR is a simple literal. This is
2213 /// only true when this HIR expression is either itself a `Literal` or a
2214 /// concatenation of only `Literal`s.
2215 ///
2216 /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
2217 /// the empty string are not (even though they contain sub-expressions that
2218 /// are literals).
2219 #[inline]
2220 pub fn is_literal(&self) -> bool {
2221 self.0.literal
2222 }
2223
2224 /// Return true if and only if this HIR is either a simple literal or an
2225 /// alternation of simple literals. This is only
2226 /// true when this HIR expression is either itself a `Literal` or a
2227 /// concatenation of only `Literal`s or an alternation of only `Literal`s.
2228 ///
2229 /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
2230 /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
2231 /// (even though that contain sub-expressions that are literals).
2232 #[inline]
2233 pub fn is_alternation_literal(&self) -> bool {
2234 self.0.alternation_literal
2235 }
2236
2237 /// Returns the total amount of heap memory usage, in bytes, used by this
2238 /// `Properties` value.
2239 #[inline]
2240 pub fn memory_usage(&self) -> usize {
2241 core::mem::size_of::<PropertiesI>()
2242 }
2243
2244 /// Returns a new set of properties that corresponds to the union of the
2245 /// iterator of properties given.
2246 ///
2247 /// This is useful when one has multiple `Hir` expressions and wants
2248 /// to combine them into a single alternation without constructing the
2249 /// corresponding `Hir`. This routine provides a way of combining the
2250 /// properties of each `Hir` expression into one set of properties
2251 /// representing the union of those expressions.
2252 ///
2253 /// # Example: union with HIRs that never match
2254 ///
2255 /// This example shows that unioning properties together with one that
2256 /// represents a regex that never matches will "poison" certain attributes,
2257 /// like the minimum and maximum lengths.
2258 ///
2259 /// ```
2260 /// use regex_syntax::{hir::Properties, parse};
2261 ///
2262 /// let hir1 = parse("ab?c?")?;
2263 /// assert_eq!(Some(1), hir1.properties().minimum_len());
2264 /// assert_eq!(Some(3), hir1.properties().maximum_len());
2265 ///
2266 /// let hir2 = parse(r"[a&&b]")?;
2267 /// assert_eq!(None, hir2.properties().minimum_len());
2268 /// assert_eq!(None, hir2.properties().maximum_len());
2269 ///
2270 /// let hir3 = parse(r"wxy?z?")?;
2271 /// assert_eq!(Some(2), hir3.properties().minimum_len());
2272 /// assert_eq!(Some(4), hir3.properties().maximum_len());
2273 ///
2274 /// let unioned = Properties::union([
2275 /// hir1.properties(),
2276 /// hir2.properties(),
2277 /// hir3.properties(),
2278 /// ]);
2279 /// assert_eq!(None, unioned.minimum_len());
2280 /// assert_eq!(None, unioned.maximum_len());
2281 ///
2282 /// # Ok::<(), Box<dyn std::error::Error>>(())
2283 /// ```
2284 ///
2285 /// The maximum length can also be "poisoned" by a pattern that has no
2286 /// upper bound on the length of a match. The minimum length remains
2287 /// unaffected:
2288 ///
2289 /// ```
2290 /// use regex_syntax::{hir::Properties, parse};
2291 ///
2292 /// let hir1 = parse("ab?c?")?;
2293 /// assert_eq!(Some(1), hir1.properties().minimum_len());
2294 /// assert_eq!(Some(3), hir1.properties().maximum_len());
2295 ///
2296 /// let hir2 = parse(r"a+")?;
2297 /// assert_eq!(Some(1), hir2.properties().minimum_len());
2298 /// assert_eq!(None, hir2.properties().maximum_len());
2299 ///
2300 /// let hir3 = parse(r"wxy?z?")?;
2301 /// assert_eq!(Some(2), hir3.properties().minimum_len());
2302 /// assert_eq!(Some(4), hir3.properties().maximum_len());
2303 ///
2304 /// let unioned = Properties::union([
2305 /// hir1.properties(),
2306 /// hir2.properties(),
2307 /// hir3.properties(),
2308 /// ]);
2309 /// assert_eq!(Some(1), unioned.minimum_len());
2310 /// assert_eq!(None, unioned.maximum_len());
2311 ///
2312 /// # Ok::<(), Box<dyn std::error::Error>>(())
2313 /// ```
2314 pub fn union<I, P>(props: I) -> Properties
2315 where
2316 I: IntoIterator<Item = P>,
2317 P: core::borrow::Borrow<Properties>,
2318 {
2319 let mut it = props.into_iter().peekable();
2320 // While empty alternations aren't possible, we still behave as if they
2321 // are. When we have an empty alternate, then clearly the look-around
2322 // prefix and suffix is empty. Otherwise, it is the intersection of all
2323 // prefixes and suffixes (respectively) of the branches.
2324 let fix = if it.peek().is_none() {
2325 LookSet::empty()
2326 } else {
2327 LookSet::full()
2328 };
2329 // And also, an empty alternate means we have 0 static capture groups,
2330 // but we otherwise start with the number corresponding to the first
2331 // alternate. If any subsequent alternate has a different number of
2332 // static capture groups, then we overall have a variation and not a
2333 // static number of groups.
2334 let static_explicit_captures_len =
2335 it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
2336 // The base case is an empty alternation, which matches nothing.
2337 // Note though that empty alternations aren't possible, because the
2338 // Hir::alternation smart constructor rewrites those as empty character
2339 // classes.
2340 let mut props = PropertiesI {
2341 minimum_len: None,
2342 maximum_len: None,
2343 look_set: LookSet::empty(),
2344 look_set_prefix: fix,
2345 look_set_suffix: fix,
2346 look_set_prefix_any: LookSet::empty(),
2347 look_set_suffix_any: LookSet::empty(),
2348 utf8: true,
2349 explicit_captures_len: 0,
2350 static_explicit_captures_len,
2351 literal: false,
2352 alternation_literal: true,
2353 };
2354 let (mut min_poisoned, mut max_poisoned) = (false, false);
2355 // Handle properties that need to visit every child hir.
2356 for prop in it {
2357 let p = prop.borrow();
2358 props.look_set.set_union(p.look_set());
2359 props.look_set_prefix.set_intersect(p.look_set_prefix());
2360 props.look_set_suffix.set_intersect(p.look_set_suffix());
2361 props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2362 props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2363 props.utf8 = props.utf8 && p.is_utf8();
2364 props.explicit_captures_len = props
2365 .explicit_captures_len
2366 .saturating_add(p.explicit_captures_len());
2367 if props.static_explicit_captures_len
2368 != p.static_explicit_captures_len()
2369 {
2370 props.static_explicit_captures_len = None;
2371 }
2372 props.alternation_literal =
2373 props.alternation_literal && p.is_literal();
2374 if !min_poisoned {
2375 if let Some(xmin) = p.minimum_len() {
2376 if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
2377 props.minimum_len = Some(xmin);
2378 }
2379 } else {
2380 props.minimum_len = None;
2381 min_poisoned = true;
2382 }
2383 }
2384 if !max_poisoned {
2385 if let Some(xmax) = p.maximum_len() {
2386 if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
2387 props.maximum_len = Some(xmax);
2388 }
2389 } else {
2390 props.maximum_len = None;
2391 max_poisoned = true;
2392 }
2393 }
2394 }
2395 Properties(Box::new(props))
2396 }
2397}
2398
2399impl Properties {
2400 /// Create a new set of HIR properties for an empty regex.
2401 fn empty() -> Properties {
2402 let inner = PropertiesI {
2403 minimum_len: Some(0),
2404 maximum_len: Some(0),
2405 look_set: LookSet::empty(),
2406 look_set_prefix: LookSet::empty(),
2407 look_set_suffix: LookSet::empty(),
2408 look_set_prefix_any: LookSet::empty(),
2409 look_set_suffix_any: LookSet::empty(),
2410 // It is debatable whether an empty regex always matches at valid
2411 // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
2412 // it is clearly false. There are, for example, many empty strings
2413 // between the bytes encoding a '☃'.
2414 //
2415 // However, when Unicode mode is enabled, the fundamental atom
2416 // of matching is really a codepoint. And in that scenario, an
2417 // empty regex is defined to only match at valid UTF-8 boundaries
2418 // and to never split a codepoint. It just so happens that this
2419 // enforcement is somewhat tricky to do for regexes that match
2420 // the empty string inside regex engines themselves. It usually
2421 // requires some layer above the regex engine to filter out such
2422 // matches.
2423 //
2424 // In any case, 'true' is really the only coherent option. If it
2425 // were false, for example, then 'a*' would also need to be false
2426 // since it too can match the empty string.
2427 utf8: true,
2428 explicit_captures_len: 0,
2429 static_explicit_captures_len: Some(0),
2430 literal: false,
2431 alternation_literal: false,
2432 };
2433 Properties(Box::new(inner))
2434 }
2435
2436 /// Create a new set of HIR properties for a literal regex.
2437 fn literal(lit: &Literal) -> Properties {
2438 let inner = PropertiesI {
2439 minimum_len: Some(lit.0.len()),
2440 maximum_len: Some(lit.0.len()),
2441 look_set: LookSet::empty(),
2442 look_set_prefix: LookSet::empty(),
2443 look_set_suffix: LookSet::empty(),
2444 look_set_prefix_any: LookSet::empty(),
2445 look_set_suffix_any: LookSet::empty(),
2446 utf8: core::str::from_utf8(&lit.0).is_ok(),
2447 explicit_captures_len: 0,
2448 static_explicit_captures_len: Some(0),
2449 literal: true,
2450 alternation_literal: true,
2451 };
2452 Properties(Box::new(inner))
2453 }
2454
2455 /// Create a new set of HIR properties for a character class.
2456 fn class(class: &Class) -> Properties {
2457 let inner = PropertiesI {
2458 minimum_len: class.minimum_len(),
2459 maximum_len: class.maximum_len(),
2460 look_set: LookSet::empty(),
2461 look_set_prefix: LookSet::empty(),
2462 look_set_suffix: LookSet::empty(),
2463 look_set_prefix_any: LookSet::empty(),
2464 look_set_suffix_any: LookSet::empty(),
2465 utf8: class.is_utf8(),
2466 explicit_captures_len: 0,
2467 static_explicit_captures_len: Some(0),
2468 literal: false,
2469 alternation_literal: false,
2470 };
2471 Properties(Box::new(inner))
2472 }
2473
2474 /// Create a new set of HIR properties for a look-around assertion.
2475 fn look(look: Look) -> Properties {
2476 let inner = PropertiesI {
2477 minimum_len: Some(0),
2478 maximum_len: Some(0),
2479 look_set: LookSet::singleton(look),
2480 look_set_prefix: LookSet::singleton(look),
2481 look_set_suffix: LookSet::singleton(look),
2482 look_set_prefix_any: LookSet::singleton(look),
2483 look_set_suffix_any: LookSet::singleton(look),
2484 // This requires a little explanation. Basically, we don't consider
2485 // matching an empty string to be equivalent to matching invalid
2486 // UTF-8, even though technically matching every empty string will
2487 // split the UTF-8 encoding of a single codepoint when treating a
2488 // UTF-8 encoded string as a sequence of bytes. Our defense here is
2489 // that in such a case, a codepoint should logically be treated as
2490 // the fundamental atom for matching, and thus the only valid match
2491 // points are between codepoints and not bytes.
2492 //
2493 // More practically, this is true here because it's also true
2494 // for 'Hir::empty()', otherwise something like 'a*' would be
2495 // considered to match invalid UTF-8. That in turn makes this
2496 // property borderline useless.
2497 utf8: true,
2498 explicit_captures_len: 0,
2499 static_explicit_captures_len: Some(0),
2500 literal: false,
2501 alternation_literal: false,
2502 };
2503 Properties(Box::new(inner))
2504 }
2505
2506 /// Create a new set of HIR properties for a repetition.
2507 fn repetition(rep: &Repetition) -> Properties {
2508 let p = rep.sub.properties();
2509 let minimum_len = p.minimum_len().map(|child_min| {
2510 let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
2511 child_min.saturating_mul(rep_min)
2512 });
2513 let maximum_len = rep.max.and_then(|rep_max| {
2514 let rep_max = usize::try_from(rep_max).ok()?;
2515 let child_max = p.maximum_len()?;
2516 child_max.checked_mul(rep_max)
2517 });
2518
2519 let mut inner = PropertiesI {
2520 minimum_len,
2521 maximum_len,
2522 look_set: p.look_set(),
2523 look_set_prefix: LookSet::empty(),
2524 look_set_suffix: LookSet::empty(),
2525 look_set_prefix_any: p.look_set_prefix_any(),
2526 look_set_suffix_any: p.look_set_suffix_any(),
2527 utf8: p.is_utf8(),
2528 explicit_captures_len: p.explicit_captures_len(),
2529 static_explicit_captures_len: p.static_explicit_captures_len(),
2530 literal: false,
2531 alternation_literal: false,
2532 };
2533 // If the repetition operator can match the empty string, then its
2534 // lookset prefix and suffixes themselves remain empty since they are
2535 // no longer required to match.
2536 if rep.min > 0 {
2537 inner.look_set_prefix = p.look_set_prefix();
2538 inner.look_set_suffix = p.look_set_suffix();
2539 }
2540 // If the static captures len of the sub-expression is not known or
2541 // is greater than zero, then it automatically propagates to the
2542 // repetition, regardless of the repetition. Otherwise, it might
2543 // change, but only when the repetition can match 0 times.
2544 if rep.min == 0
2545 && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
2546 {
2547 // If we require a match 0 times, then our captures len is
2548 // guaranteed to be zero. Otherwise, if we *can* match the empty
2549 // string, then it's impossible to know how many captures will be
2550 // in the resulting match.
2551 if rep.max == Some(0) {
2552 inner.static_explicit_captures_len = Some(0);
2553 } else {
2554 inner.static_explicit_captures_len = None;
2555 }
2556 }
2557 Properties(Box::new(inner))
2558 }
2559
2560 /// Create a new set of HIR properties for a capture.
2561 fn capture(capture: &Capture) -> Properties {
2562 let p = capture.sub.properties();
2563 Properties(Box::new(PropertiesI {
2564 explicit_captures_len: p.explicit_captures_len().saturating_add(1),
2565 static_explicit_captures_len: p
2566 .static_explicit_captures_len()
2567 .map(|len| len.saturating_add(1)),
2568 literal: false,
2569 alternation_literal: false,
2570 ..*p.0.clone()
2571 }))
2572 }
2573
2574 /// Create a new set of HIR properties for a concatenation.
2575 fn concat(concat: &[Hir]) -> Properties {
2576 // The base case is an empty concatenation, which matches the empty
2577 // string. Note though that empty concatenations aren't possible,
2578 // because the Hir::concat smart constructor rewrites those as
2579 // Hir::empty.
2580 let mut props = PropertiesI {
2581 minimum_len: Some(0),
2582 maximum_len: Some(0),
2583 look_set: LookSet::empty(),
2584 look_set_prefix: LookSet::empty(),
2585 look_set_suffix: LookSet::empty(),
2586 look_set_prefix_any: LookSet::empty(),
2587 look_set_suffix_any: LookSet::empty(),
2588 utf8: true,
2589 explicit_captures_len: 0,
2590 static_explicit_captures_len: Some(0),
2591 literal: true,
2592 alternation_literal: true,
2593 };
2594 // Handle properties that need to visit every child hir.
2595 for x in concat.iter() {
2596 let p = x.properties();
2597 props.look_set.set_union(p.look_set());
2598 props.utf8 = props.utf8 && p.is_utf8();
2599 props.explicit_captures_len = props
2600 .explicit_captures_len
2601 .saturating_add(p.explicit_captures_len());
2602 props.static_explicit_captures_len = p
2603 .static_explicit_captures_len()
2604 .and_then(|len1| {
2605 Some((len1, props.static_explicit_captures_len?))
2606 })
2607 .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
2608 props.literal = props.literal && p.is_literal();
2609 props.alternation_literal =
2610 props.alternation_literal && p.is_alternation_literal();
2611 if let Some(minimum_len) = props.minimum_len {
2612 match p.minimum_len() {
2613 None => props.minimum_len = None,
2614 Some(len) => {
2615 // We use saturating arithmetic here because the
2616 // minimum is just a lower bound. We can't go any
2617 // higher than what our number types permit.
2618 props.minimum_len =
2619 Some(minimum_len.saturating_add(len));
2620 }
2621 }
2622 }
2623 if let Some(maximum_len) = props.maximum_len {
2624 match p.maximum_len() {
2625 None => props.maximum_len = None,
2626 Some(len) => {
2627 props.maximum_len = maximum_len.checked_add(len)
2628 }
2629 }
2630 }
2631 }
2632 // Handle the prefix properties, which only requires visiting
2633 // child exprs until one matches more than the empty string.
2634 let mut it = concat.iter();
2635 while let Some(x) = it.next() {
2636 props.look_set_prefix.set_union(x.properties().look_set_prefix());
2637 props
2638 .look_set_prefix_any
2639 .set_union(x.properties().look_set_prefix_any());
2640 if x.properties().maximum_len().map_or(true, |x| x > 0) {
2641 break;
2642 }
2643 }
2644 // Same thing for the suffix properties, but in reverse.
2645 let mut it = concat.iter().rev();
2646 while let Some(x) = it.next() {
2647 props.look_set_suffix.set_union(x.properties().look_set_suffix());
2648 props
2649 .look_set_suffix_any
2650 .set_union(x.properties().look_set_suffix_any());
2651 if x.properties().maximum_len().map_or(true, |x| x > 0) {
2652 break;
2653 }
2654 }
2655 Properties(Box::new(props))
2656 }
2657
2658 /// Create a new set of HIR properties for a concatenation.
2659 fn alternation(alts: &[Hir]) -> Properties {
2660 Properties::union(alts.iter().map(|hir| hir.properties()))
2661 }
2662}
2663
2664/// A set of look-around assertions.
2665///
2666/// This is useful for efficiently tracking look-around assertions. For
2667/// example, an [`Hir`] provides properties that return `LookSet`s.
2668#[derive(Clone, Copy, Default, Eq, PartialEq)]
2669pub struct LookSet {
2670 /// The underlying representation this set is exposed to make it possible
2671 /// to store it somewhere efficiently. The representation is that
2672 /// of a bitset, where each assertion occupies bit `i` where `i =
2673 /// Look::as_repr()`.
2674 ///
2675 /// Note that users of this internal representation must permit the full
2676 /// range of `u16` values to be represented. For example, even if the
2677 /// current implementation only makes use of the 10 least significant bits,
2678 /// it may use more bits in a future semver compatible release.
2679 pub bits: u32,
2680}
2681
2682impl LookSet {
2683 /// Create an empty set of look-around assertions.
2684 #[inline]
2685 pub fn empty() -> LookSet {
2686 LookSet { bits: 0 }
2687 }
2688
2689 /// Create a full set of look-around assertions.
2690 ///
2691 /// This set contains all possible look-around assertions.
2692 #[inline]
2693 pub fn full() -> LookSet {
2694 LookSet { bits: !0 }
2695 }
2696
2697 /// Create a look-around set containing the look-around assertion given.
2698 ///
2699 /// This is a convenience routine for creating an empty set and inserting
2700 /// one look-around assertions.
2701 #[inline]
2702 pub fn singleton(look: Look) -> LookSet {
2703 LookSet::empty().insert(look)
2704 }
2705
2706 /// Returns the total number of look-around assertions in this set.
2707 #[inline]
2708 pub fn len(self) -> usize {
2709 // OK because max value always fits in a u8, which in turn always
2710 // fits in a usize, regardless of target.
2711 usize::try_from(self.bits.count_ones()).unwrap()
2712 }
2713
2714 /// Returns true if and only if this set is empty.
2715 #[inline]
2716 pub fn is_empty(self) -> bool {
2717 self.len() == 0
2718 }
2719
2720 /// Returns true if and only if the given look-around assertion is in this
2721 /// set.
2722 #[inline]
2723 pub fn contains(self, look: Look) -> bool {
2724 self.bits & look.as_repr() != 0
2725 }
2726
2727 /// Returns true if and only if this set contains any anchor assertions.
2728 /// This includes both "start/end of haystack" and "start/end of line."
2729 #[inline]
2730 pub fn contains_anchor(&self) -> bool {
2731 self.contains_anchor_haystack() || self.contains_anchor_line()
2732 }
2733
2734 /// Returns true if and only if this set contains any "start/end of
2735 /// haystack" anchors. This doesn't include "start/end of line" anchors.
2736 #[inline]
2737 pub fn contains_anchor_haystack(&self) -> bool {
2738 self.contains(Look::Start) || self.contains(Look::End)
2739 }
2740
2741 /// Returns true if and only if this set contains any "start/end of line"
2742 /// anchors. This doesn't include "start/end of haystack" anchors. This
2743 /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
2744 #[inline]
2745 pub fn contains_anchor_line(&self) -> bool {
2746 self.contains(Look::StartLF)
2747 || self.contains(Look::EndLF)
2748 || self.contains(Look::StartCRLF)
2749 || self.contains(Look::EndCRLF)
2750 }
2751
2752 /// Returns true if and only if this set contains any "start/end of line"
2753 /// anchors that only treat `\n` as line terminators. This does not include
2754 /// haystack anchors or CRLF aware line anchors.
2755 #[inline]
2756 pub fn contains_anchor_lf(&self) -> bool {
2757 self.contains(Look::StartLF) || self.contains(Look::EndLF)
2758 }
2759
2760 /// Returns true if and only if this set contains any "start/end of line"
2761 /// anchors that are CRLF-aware. This doesn't include "start/end of
2762 /// haystack" or "start/end of line-feed" anchors.
2763 #[inline]
2764 pub fn contains_anchor_crlf(&self) -> bool {
2765 self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
2766 }
2767
2768 /// Returns true if and only if this set contains any word boundary or
2769 /// negated word boundary assertions. This include both Unicode and ASCII
2770 /// word boundaries.
2771 #[inline]
2772 pub fn contains_word(self) -> bool {
2773 self.contains_word_unicode() || self.contains_word_ascii()
2774 }
2775
2776 /// Returns true if and only if this set contains any Unicode word boundary
2777 /// or negated Unicode word boundary assertions.
2778 #[inline]
2779 pub fn contains_word_unicode(self) -> bool {
2780 self.contains(Look::WordUnicode)
2781 || self.contains(Look::WordUnicodeNegate)
2782 || self.contains(Look::WordStartUnicode)
2783 || self.contains(Look::WordEndUnicode)
2784 || self.contains(Look::WordStartHalfUnicode)
2785 || self.contains(Look::WordEndHalfUnicode)
2786 }
2787
2788 /// Returns true if and only if this set contains any ASCII word boundary
2789 /// or negated ASCII word boundary assertions.
2790 #[inline]
2791 pub fn contains_word_ascii(self) -> bool {
2792 self.contains(Look::WordAscii)
2793 || self.contains(Look::WordAsciiNegate)
2794 || self.contains(Look::WordStartAscii)
2795 || self.contains(Look::WordEndAscii)
2796 || self.contains(Look::WordStartHalfAscii)
2797 || self.contains(Look::WordEndHalfAscii)
2798 }
2799
2800 /// Returns an iterator over all of the look-around assertions in this set.
2801 #[inline]
2802 pub fn iter(self) -> LookSetIter {
2803 LookSetIter { set: self }
2804 }
2805
2806 /// Return a new set that is equivalent to the original, but with the given
2807 /// assertion added to it. If the assertion is already in the set, then the
2808 /// returned set is equivalent to the original.
2809 #[inline]
2810 pub fn insert(self, look: Look) -> LookSet {
2811 LookSet { bits: self.bits | look.as_repr() }
2812 }
2813
2814 /// Updates this set in place with the result of inserting the given
2815 /// assertion into this set.
2816 #[inline]
2817 pub fn set_insert(&mut self, look: Look) {
2818 *self = self.insert(look);
2819 }
2820
2821 /// Return a new set that is equivalent to the original, but with the given
2822 /// assertion removed from it. If the assertion is not in the set, then the
2823 /// returned set is equivalent to the original.
2824 #[inline]
2825 pub fn remove(self, look: Look) -> LookSet {
2826 LookSet { bits: self.bits & !look.as_repr() }
2827 }
2828
2829 /// Updates this set in place with the result of removing the given
2830 /// assertion from this set.
2831 #[inline]
2832 pub fn set_remove(&mut self, look: Look) {
2833 *self = self.remove(look);
2834 }
2835
2836 /// Returns a new set that is the result of subtracting the given set from
2837 /// this set.
2838 #[inline]
2839 pub fn subtract(self, other: LookSet) -> LookSet {
2840 LookSet { bits: self.bits & !other.bits }
2841 }
2842
2843 /// Updates this set in place with the result of subtracting the given set
2844 /// from this set.
2845 #[inline]
2846 pub fn set_subtract(&mut self, other: LookSet) {
2847 *self = self.subtract(other);
2848 }
2849
2850 /// Returns a new set that is the union of this and the one given.
2851 #[inline]
2852 pub fn union(self, other: LookSet) -> LookSet {
2853 LookSet { bits: self.bits | other.bits }
2854 }
2855
2856 /// Updates this set in place with the result of unioning it with the one
2857 /// given.
2858 #[inline]
2859 pub fn set_union(&mut self, other: LookSet) {
2860 *self = self.union(other);
2861 }
2862
2863 /// Returns a new set that is the intersection of this and the one given.
2864 #[inline]
2865 pub fn intersect(self, other: LookSet) -> LookSet {
2866 LookSet { bits: self.bits & other.bits }
2867 }
2868
2869 /// Updates this set in place with the result of intersecting it with the
2870 /// one given.
2871 #[inline]
2872 pub fn set_intersect(&mut self, other: LookSet) {
2873 *self = self.intersect(other);
2874 }
2875
2876 /// Return a `LookSet` from the slice given as a native endian 32-bit
2877 /// integer.
2878 ///
2879 /// # Panics
2880 ///
2881 /// This panics if `slice.len() < 4`.
2882 #[inline]
2883 pub fn read_repr(slice: &[u8]) -> LookSet {
2884 let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
2885 LookSet { bits }
2886 }
2887
2888 /// Write a `LookSet` as a native endian 32-bit integer to the beginning
2889 /// of the slice given.
2890 ///
2891 /// # Panics
2892 ///
2893 /// This panics if `slice.len() < 4`.
2894 #[inline]
2895 pub fn write_repr(self, slice: &mut [u8]) {
2896 let raw = self.bits.to_ne_bytes();
2897 slice[0] = raw[0];
2898 slice[1] = raw[1];
2899 slice[2] = raw[2];
2900 slice[3] = raw[3];
2901 }
2902}
2903
2904impl core::fmt::Debug for LookSet {
2905 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2906 if self.is_empty() {
2907 return write!(f, "∅");
2908 }
2909 for look in self.iter() {
2910 write!(f, "{}", look.as_char())?;
2911 }
2912 Ok(())
2913 }
2914}
2915
2916/// An iterator over all look-around assertions in a [`LookSet`].
2917///
2918/// This iterator is created by [`LookSet::iter`].
2919#[derive(Clone, Debug)]
2920pub struct LookSetIter {
2921 set: LookSet,
2922}
2923
2924impl Iterator for LookSetIter {
2925 type Item = Look;
2926
2927 #[inline]
2928 fn next(&mut self) -> Option<Look> {
2929 if self.set.is_empty() {
2930 return None;
2931 }
2932 // We'll never have more than u8::MAX distinct look-around assertions,
2933 // so 'bit' will always fit into a u16.
2934 let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2935 let look = Look::from_repr(1 << bit)?;
2936 self.set = self.set.remove(look);
2937 Some(look)
2938 }
2939}
2940
2941/// Given a sequence of HIR values where each value corresponds to a Unicode
2942/// class (or an all-ASCII byte class), return a single Unicode class
2943/// corresponding to the union of the classes found.
2944fn class_chars(hirs: &[Hir]) -> Option<Class> {
2945 let mut cls = ClassUnicode::new(vec![]);
2946 for hir in hirs.iter() {
2947 match *hir.kind() {
2948 HirKind::Class(Class::Unicode(ref cls2)) => {
2949 cls.union(cls2);
2950 }
2951 HirKind::Class(Class::Bytes(ref cls2)) => {
2952 cls.union(&cls2.to_unicode_class()?);
2953 }
2954 _ => return None,
2955 };
2956 }
2957 Some(Class::Unicode(cls))
2958}
2959
2960/// Given a sequence of HIR values where each value corresponds to a byte class
2961/// (or an all-ASCII Unicode class), return a single byte class corresponding
2962/// to the union of the classes found.
2963fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2964 let mut cls = ClassBytes::new(vec![]);
2965 for hir in hirs.iter() {
2966 match *hir.kind() {
2967 HirKind::Class(Class::Unicode(ref cls2)) => {
2968 cls.union(&cls2.to_byte_class()?);
2969 }
2970 HirKind::Class(Class::Bytes(ref cls2)) => {
2971 cls.union(cls2);
2972 }
2973 _ => return None,
2974 };
2975 }
2976 Some(Class::Bytes(cls))
2977}
2978
2979/// Given a sequence of HIR values where each value corresponds to a literal
2980/// that is a single `char`, return that sequence of `char`s. Otherwise return
2981/// None. No deduplication is done.
2982fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
2983 let mut singletons = vec![];
2984 for hir in hirs.iter() {
2985 let literal = match *hir.kind() {
2986 HirKind::Literal(Literal(ref bytes)) => bytes,
2987 _ => return None,
2988 };
2989 let ch = match crate::debug::utf8_decode(literal) {
2990 None => return None,
2991 Some(Err(_)) => return None,
2992 Some(Ok(ch)) => ch,
2993 };
2994 if literal.len() != ch.len_utf8() {
2995 return None;
2996 }
2997 singletons.push(ch);
2998 }
2999 Some(singletons)
3000}
3001
3002/// Given a sequence of HIR values where each value corresponds to a literal
3003/// that is a single byte, return that sequence of bytes. Otherwise return
3004/// None. No deduplication is done.
3005fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
3006 let mut singletons = vec![];
3007 for hir in hirs.iter() {
3008 let literal = match *hir.kind() {
3009 HirKind::Literal(Literal(ref bytes)) => bytes,
3010 _ => return None,
3011 };
3012 if literal.len() != 1 {
3013 return None;
3014 }
3015 singletons.push(literal[0]);
3016 }
3017 Some(singletons)
3018}
3019
3020/// Looks for a common prefix in the list of alternation branches given. If one
3021/// is found, then an equivalent but (hopefully) simplified Hir is returned.
3022/// Otherwise, the original given list of branches is returned unmodified.
3023///
3024/// This is not quite as good as it could be. Right now, it requires that
3025/// all branches are 'Concat' expressions. It also doesn't do well with
3026/// literals. For example, given 'foofoo|foobar', it will not refactor it to
3027/// 'foo(?:foo|bar)' because literals are flattened into their own special
3028/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
3029/// instead of a string of bytes because of this. Otherwise, handling the
3030/// current representation in this routine will be pretty gnarly. Sigh.)
3031fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
3032 if hirs.len() <= 1 {
3033 return Err(hirs);
3034 }
3035 let mut prefix = match hirs[0].kind() {
3036 HirKind::Concat(ref xs) => &**xs,
3037 _ => return Err(hirs),
3038 };
3039 if prefix.is_empty() {
3040 return Err(hirs);
3041 }
3042 for h in hirs.iter().skip(1) {
3043 let concat = match h.kind() {
3044 HirKind::Concat(ref xs) => xs,
3045 _ => return Err(hirs),
3046 };
3047 let common_len = prefix
3048 .iter()
3049 .zip(concat.iter())
3050 .take_while(|(x, y)| x == y)
3051 .count();
3052 prefix = &prefix[..common_len];
3053 if prefix.is_empty() {
3054 return Err(hirs);
3055 }
3056 }
3057 let len = prefix.len();
3058 assert_ne!(0, len);
3059 let mut prefix_concat = vec![];
3060 let mut suffix_alts = vec![];
3061 for h in hirs {
3062 let mut concat = match h.into_kind() {
3063 HirKind::Concat(xs) => xs,
3064 // We required all sub-expressions to be
3065 // concats above, so we're only here if we
3066 // have a concat.
3067 _ => unreachable!(),
3068 };
3069 suffix_alts.push(Hir::concat(concat.split_off(len)));
3070 if prefix_concat.is_empty() {
3071 prefix_concat = concat;
3072 }
3073 }
3074 let mut concat = prefix_concat;
3075 concat.push(Hir::alternation(suffix_alts));
3076 Ok(Hir::concat(concat))
3077}
3078
3079#[cfg(test)]
3080mod tests {
3081 use super::*;
3082
3083 fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
3084 let ranges: Vec<ClassUnicodeRange> = ranges
3085 .iter()
3086 .map(|&(s, e)| ClassUnicodeRange::new(s, e))
3087 .collect();
3088 ClassUnicode::new(ranges)
3089 }
3090
3091 fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
3092 let ranges: Vec<ClassBytesRange> =
3093 ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
3094 ClassBytes::new(ranges)
3095 }
3096
3097 fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
3098 cls.iter().map(|x| (x.start(), x.end())).collect()
3099 }
3100
3101 #[cfg(feature = "unicode-case")]
3102 fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
3103 let mut cls_ = cls.clone();
3104 cls_.case_fold_simple();
3105 cls_
3106 }
3107
3108 fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3109 let mut cls_ = cls1.clone();
3110 cls_.union(cls2);
3111 cls_
3112 }
3113
3114 fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3115 let mut cls_ = cls1.clone();
3116 cls_.intersect(cls2);
3117 cls_
3118 }
3119
3120 fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3121 let mut cls_ = cls1.clone();
3122 cls_.difference(cls2);
3123 cls_
3124 }
3125
3126 fn usymdifference(
3127 cls1: &ClassUnicode,
3128 cls2: &ClassUnicode,
3129 ) -> ClassUnicode {
3130 let mut cls_ = cls1.clone();
3131 cls_.symmetric_difference(cls2);
3132 cls_
3133 }
3134
3135 fn unegate(cls: &ClassUnicode) -> ClassUnicode {
3136 let mut cls_ = cls.clone();
3137 cls_.negate();
3138 cls_
3139 }
3140
3141 fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
3142 cls.iter().map(|x| (x.start(), x.end())).collect()
3143 }
3144
3145 fn bcasefold(cls: &ClassBytes) -> ClassBytes {
3146 let mut cls_ = cls.clone();
3147 cls_.case_fold_simple();
3148 cls_
3149 }
3150
3151 fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3152 let mut cls_ = cls1.clone();
3153 cls_.union(cls2);
3154 cls_
3155 }
3156
3157 fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3158 let mut cls_ = cls1.clone();
3159 cls_.intersect(cls2);
3160 cls_
3161 }
3162
3163 fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3164 let mut cls_ = cls1.clone();
3165 cls_.difference(cls2);
3166 cls_
3167 }
3168
3169 fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3170 let mut cls_ = cls1.clone();
3171 cls_.symmetric_difference(cls2);
3172 cls_
3173 }
3174
3175 fn bnegate(cls: &ClassBytes) -> ClassBytes {
3176 let mut cls_ = cls.clone();
3177 cls_.negate();
3178 cls_
3179 }
3180
3181 #[test]
3182 fn class_range_canonical_unicode() {
3183 let range = ClassUnicodeRange::new('\u{00FF}', '\0');
3184 assert_eq!('\0', range.start());
3185 assert_eq!('\u{00FF}', range.end());
3186 }
3187
3188 #[test]
3189 fn class_range_canonical_bytes() {
3190 let range = ClassBytesRange::new(b'\xFF', b'\0');
3191 assert_eq!(b'\0', range.start());
3192 assert_eq!(b'\xFF', range.end());
3193 }
3194
3195 #[test]
3196 fn class_canonicalize_unicode() {
3197 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3198 let expected = vec![('a', 'c'), ('x', 'z')];
3199 assert_eq!(expected, uranges(&cls));
3200
3201 let cls = uclass(&[('x', 'z'), ('a', 'c')]);
3202 let expected = vec![('a', 'c'), ('x', 'z')];
3203 assert_eq!(expected, uranges(&cls));
3204
3205 let cls = uclass(&[('x', 'z'), ('w', 'y')]);
3206 let expected = vec![('w', 'z')];
3207 assert_eq!(expected, uranges(&cls));
3208
3209 let cls = uclass(&[
3210 ('c', 'f'),
3211 ('a', 'g'),
3212 ('d', 'j'),
3213 ('a', 'c'),
3214 ('m', 'p'),
3215 ('l', 's'),
3216 ]);
3217 let expected = vec![('a', 'j'), ('l', 's')];
3218 assert_eq!(expected, uranges(&cls));
3219
3220 let cls = uclass(&[('x', 'z'), ('u', 'w')]);
3221 let expected = vec![('u', 'z')];
3222 assert_eq!(expected, uranges(&cls));
3223
3224 let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
3225 let expected = vec![('\x00', '\u{10FFFF}')];
3226 assert_eq!(expected, uranges(&cls));
3227
3228 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3229 let expected = vec![('a', 'b')];
3230 assert_eq!(expected, uranges(&cls));
3231 }
3232
3233 #[test]
3234 fn class_canonicalize_bytes() {
3235 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3236 let expected = vec![(b'a', b'c'), (b'x', b'z')];
3237 assert_eq!(expected, branges(&cls));
3238
3239 let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
3240 let expected = vec![(b'a', b'c'), (b'x', b'z')];
3241 assert_eq!(expected, branges(&cls));
3242
3243 let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
3244 let expected = vec![(b'w', b'z')];
3245 assert_eq!(expected, branges(&cls));
3246
3247 let cls = bclass(&[
3248 (b'c', b'f'),
3249 (b'a', b'g'),
3250 (b'd', b'j'),
3251 (b'a', b'c'),
3252 (b'm', b'p'),
3253 (b'l', b's'),
3254 ]);
3255 let expected = vec![(b'a', b'j'), (b'l', b's')];
3256 assert_eq!(expected, branges(&cls));
3257
3258 let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
3259 let expected = vec![(b'u', b'z')];
3260 assert_eq!(expected, branges(&cls));
3261
3262 let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
3263 let expected = vec![(b'\x00', b'\xFF')];
3264 assert_eq!(expected, branges(&cls));
3265
3266 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3267 let expected = vec![(b'a', b'b')];
3268 assert_eq!(expected, branges(&cls));
3269 }
3270
3271 #[test]
3272 #[cfg(feature = "unicode-case")]
3273 fn class_case_fold_unicode() {
3274 let cls = uclass(&[
3275 ('C', 'F'),
3276 ('A', 'G'),
3277 ('D', 'J'),
3278 ('A', 'C'),
3279 ('M', 'P'),
3280 ('L', 'S'),
3281 ('c', 'f'),
3282 ]);
3283 let expected = uclass(&[
3284 ('A', 'J'),
3285 ('L', 'S'),
3286 ('a', 'j'),
3287 ('l', 's'),
3288 ('\u{17F}', '\u{17F}'),
3289 ]);
3290 assert_eq!(expected, ucasefold(&cls));
3291
3292 let cls = uclass(&[('A', 'Z')]);
3293 let expected = uclass(&[
3294 ('A', 'Z'),
3295 ('a', 'z'),
3296 ('\u{17F}', '\u{17F}'),
3297 ('\u{212A}', '\u{212A}'),
3298 ]);
3299 assert_eq!(expected, ucasefold(&cls));
3300
3301 let cls = uclass(&[('a', 'z')]);
3302 let expected = uclass(&[
3303 ('A', 'Z'),
3304 ('a', 'z'),
3305 ('\u{17F}', '\u{17F}'),
3306 ('\u{212A}', '\u{212A}'),
3307 ]);
3308 assert_eq!(expected, ucasefold(&cls));
3309
3310 let cls = uclass(&[('A', 'A'), ('_', '_')]);
3311 let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
3312 assert_eq!(expected, ucasefold(&cls));
3313
3314 let cls = uclass(&[('A', 'A'), ('=', '=')]);
3315 let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
3316 assert_eq!(expected, ucasefold(&cls));
3317
3318 let cls = uclass(&[('\x00', '\x10')]);
3319 assert_eq!(cls, ucasefold(&cls));
3320
3321 let cls = uclass(&[('k', 'k')]);
3322 let expected =
3323 uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
3324 assert_eq!(expected, ucasefold(&cls));
3325
3326 let cls = uclass(&[('@', '@')]);
3327 assert_eq!(cls, ucasefold(&cls));
3328 }
3329
3330 #[test]
3331 #[cfg(not(feature = "unicode-case"))]
3332 fn class_case_fold_unicode_disabled() {
3333 let mut cls = uclass(&[
3334 ('C', 'F'),
3335 ('A', 'G'),
3336 ('D', 'J'),
3337 ('A', 'C'),
3338 ('M', 'P'),
3339 ('L', 'S'),
3340 ('c', 'f'),
3341 ]);
3342 assert!(cls.try_case_fold_simple().is_err());
3343 }
3344
3345 #[test]
3346 #[should_panic]
3347 #[cfg(not(feature = "unicode-case"))]
3348 fn class_case_fold_unicode_disabled_panics() {
3349 let mut cls = uclass(&[
3350 ('C', 'F'),
3351 ('A', 'G'),
3352 ('D', 'J'),
3353 ('A', 'C'),
3354 ('M', 'P'),
3355 ('L', 'S'),
3356 ('c', 'f'),
3357 ]);
3358 cls.case_fold_simple();
3359 }
3360
3361 #[test]
3362 fn class_case_fold_bytes() {
3363 let cls = bclass(&[
3364 (b'C', b'F'),
3365 (b'A', b'G'),
3366 (b'D', b'J'),
3367 (b'A', b'C'),
3368 (b'M', b'P'),
3369 (b'L', b'S'),
3370 (b'c', b'f'),
3371 ]);
3372 let expected =
3373 bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
3374 assert_eq!(expected, bcasefold(&cls));
3375
3376 let cls = bclass(&[(b'A', b'Z')]);
3377 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3378 assert_eq!(expected, bcasefold(&cls));
3379
3380 let cls = bclass(&[(b'a', b'z')]);
3381 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3382 assert_eq!(expected, bcasefold(&cls));
3383
3384 let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
3385 let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
3386 assert_eq!(expected, bcasefold(&cls));
3387
3388 let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
3389 let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
3390 assert_eq!(expected, bcasefold(&cls));
3391
3392 let cls = bclass(&[(b'\x00', b'\x10')]);
3393 assert_eq!(cls, bcasefold(&cls));
3394
3395 let cls = bclass(&[(b'k', b'k')]);
3396 let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
3397 assert_eq!(expected, bcasefold(&cls));
3398
3399 let cls = bclass(&[(b'@', b'@')]);
3400 assert_eq!(cls, bcasefold(&cls));
3401 }
3402
3403 #[test]
3404 fn class_negate_unicode() {
3405 let cls = uclass(&[('a', 'a')]);
3406 let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
3407 assert_eq!(expected, unegate(&cls));
3408
3409 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3410 let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
3411 assert_eq!(expected, unegate(&cls));
3412
3413 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3414 let expected = uclass(&[
3415 ('\x00', '\x60'),
3416 ('\x64', '\x77'),
3417 ('\x7B', '\u{10FFFF}'),
3418 ]);
3419 assert_eq!(expected, unegate(&cls));
3420
3421 let cls = uclass(&[('\x00', 'a')]);
3422 let expected = uclass(&[('\x62', '\u{10FFFF}')]);
3423 assert_eq!(expected, unegate(&cls));
3424
3425 let cls = uclass(&[('a', '\u{10FFFF}')]);
3426 let expected = uclass(&[('\x00', '\x60')]);
3427 assert_eq!(expected, unegate(&cls));
3428
3429 let cls = uclass(&[('\x00', '\u{10FFFF}')]);
3430 let expected = uclass(&[]);
3431 assert_eq!(expected, unegate(&cls));
3432
3433 let cls = uclass(&[]);
3434 let expected = uclass(&[('\x00', '\u{10FFFF}')]);
3435 assert_eq!(expected, unegate(&cls));
3436
3437 let cls =
3438 uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
3439 let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
3440 assert_eq!(expected, unegate(&cls));
3441
3442 let cls = uclass(&[('\x00', '\u{D7FF}')]);
3443 let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3444 assert_eq!(expected, unegate(&cls));
3445
3446 let cls = uclass(&[('\x00', '\u{D7FE}')]);
3447 let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
3448 assert_eq!(expected, unegate(&cls));
3449
3450 let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3451 let expected = uclass(&[('\x00', '\u{D7FF}')]);
3452 assert_eq!(expected, unegate(&cls));
3453
3454 let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
3455 let expected = uclass(&[('\x00', '\u{E000}')]);
3456 assert_eq!(expected, unegate(&cls));
3457 }
3458
3459 #[test]
3460 fn class_negate_bytes() {
3461 let cls = bclass(&[(b'a', b'a')]);
3462 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
3463 assert_eq!(expected, bnegate(&cls));
3464
3465 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3466 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
3467 assert_eq!(expected, bnegate(&cls));
3468
3469 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3470 let expected = bclass(&[
3471 (b'\x00', b'\x60'),
3472 (b'\x64', b'\x77'),
3473 (b'\x7B', b'\xFF'),
3474 ]);
3475 assert_eq!(expected, bnegate(&cls));
3476
3477 let cls = bclass(&[(b'\x00', b'a')]);
3478 let expected = bclass(&[(b'\x62', b'\xFF')]);
3479 assert_eq!(expected, bnegate(&cls));
3480
3481 let cls = bclass(&[(b'a', b'\xFF')]);
3482 let expected = bclass(&[(b'\x00', b'\x60')]);
3483 assert_eq!(expected, bnegate(&cls));
3484
3485 let cls = bclass(&[(b'\x00', b'\xFF')]);
3486 let expected = bclass(&[]);
3487 assert_eq!(expected, bnegate(&cls));
3488
3489 let cls = bclass(&[]);
3490 let expected = bclass(&[(b'\x00', b'\xFF')]);
3491 assert_eq!(expected, bnegate(&cls));
3492
3493 let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
3494 let expected = bclass(&[(b'\xFE', b'\xFE')]);
3495 assert_eq!(expected, bnegate(&cls));
3496 }
3497
3498 #[test]
3499 fn class_union_unicode() {
3500 let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
3501 let cls2 = uclass(&[('a', 'z')]);
3502 let expected = uclass(&[('a', 'z'), ('A', 'C')]);
3503 assert_eq!(expected, uunion(&cls1, &cls2));
3504 }
3505
3506 #[test]
3507 fn class_union_bytes() {
3508 let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
3509 let cls2 = bclass(&[(b'a', b'z')]);
3510 let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
3511 assert_eq!(expected, bunion(&cls1, &cls2));
3512 }
3513
3514 #[test]
3515 fn class_intersect_unicode() {
3516 let cls1 = uclass(&[]);
3517 let cls2 = uclass(&[('a', 'a')]);
3518 let expected = uclass(&[]);
3519 assert_eq!(expected, uintersect(&cls1, &cls2));
3520
3521 let cls1 = uclass(&[('a', 'a')]);
3522 let cls2 = uclass(&[('a', 'a')]);
3523 let expected = uclass(&[('a', 'a')]);
3524 assert_eq!(expected, uintersect(&cls1, &cls2));
3525
3526 let cls1 = uclass(&[('a', 'a')]);
3527 let cls2 = uclass(&[('b', 'b')]);
3528 let expected = uclass(&[]);
3529 assert_eq!(expected, uintersect(&cls1, &cls2));
3530
3531 let cls1 = uclass(&[('a', 'a')]);
3532 let cls2 = uclass(&[('a', 'c')]);
3533 let expected = uclass(&[('a', 'a')]);
3534 assert_eq!(expected, uintersect(&cls1, &cls2));
3535
3536 let cls1 = uclass(&[('a', 'b')]);
3537 let cls2 = uclass(&[('a', 'c')]);
3538 let expected = uclass(&[('a', 'b')]);
3539 assert_eq!(expected, uintersect(&cls1, &cls2));
3540
3541 let cls1 = uclass(&[('a', 'b')]);
3542 let cls2 = uclass(&[('b', 'c')]);
3543 let expected = uclass(&[('b', 'b')]);
3544 assert_eq!(expected, uintersect(&cls1, &cls2));
3545
3546 let cls1 = uclass(&[('a', 'b')]);
3547 let cls2 = uclass(&[('c', 'd')]);
3548 let expected = uclass(&[]);
3549 assert_eq!(expected, uintersect(&cls1, &cls2));
3550
3551 let cls1 = uclass(&[('b', 'c')]);
3552 let cls2 = uclass(&[('a', 'd')]);
3553 let expected = uclass(&[('b', 'c')]);
3554 assert_eq!(expected, uintersect(&cls1, &cls2));
3555
3556 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3557 let cls2 = uclass(&[('a', 'h')]);
3558 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3559 assert_eq!(expected, uintersect(&cls1, &cls2));
3560
3561 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3562 let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3563 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3564 assert_eq!(expected, uintersect(&cls1, &cls2));
3565
3566 let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
3567 let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
3568 let expected = uclass(&[]);
3569 assert_eq!(expected, uintersect(&cls1, &cls2));
3570
3571 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3572 let cls2 = uclass(&[('h', 'h')]);
3573 let expected = uclass(&[('h', 'h')]);
3574 assert_eq!(expected, uintersect(&cls1, &cls2));
3575
3576 let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
3577 let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
3578 let expected = uclass(&[]);
3579 assert_eq!(expected, uintersect(&cls1, &cls2));
3580
3581 let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
3582 let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
3583 let expected = uclass(&[('b', 'f')]);
3584 assert_eq!(expected, uintersect(&cls1, &cls2));
3585 }
3586
3587 #[test]
3588 fn class_intersect_bytes() {
3589 let cls1 = bclass(&[]);
3590 let cls2 = bclass(&[(b'a', b'a')]);
3591 let expected = bclass(&[]);
3592 assert_eq!(expected, bintersect(&cls1, &cls2));
3593
3594 let cls1 = bclass(&[(b'a', b'a')]);
3595 let cls2 = bclass(&[(b'a', b'a')]);
3596 let expected = bclass(&[(b'a', b'a')]);
3597 assert_eq!(expected, bintersect(&cls1, &cls2));
3598
3599 let cls1 = bclass(&[(b'a', b'a')]);
3600 let cls2 = bclass(&[(b'b', b'b')]);
3601 let expected = bclass(&[]);
3602 assert_eq!(expected, bintersect(&cls1, &cls2));
3603
3604 let cls1 = bclass(&[(b'a', b'a')]);
3605 let cls2 = bclass(&[(b'a', b'c')]);
3606 let expected = bclass(&[(b'a', b'a')]);
3607 assert_eq!(expected, bintersect(&cls1, &cls2));
3608
3609 let cls1 = bclass(&[(b'a', b'b')]);
3610 let cls2 = bclass(&[(b'a', b'c')]);
3611 let expected = bclass(&[(b'a', b'b')]);
3612 assert_eq!(expected, bintersect(&cls1, &cls2));
3613
3614 let cls1 = bclass(&[(b'a', b'b')]);
3615 let cls2 = bclass(&[(b'b', b'c')]);
3616 let expected = bclass(&[(b'b', b'b')]);
3617 assert_eq!(expected, bintersect(&cls1, &cls2));
3618
3619 let cls1 = bclass(&[(b'a', b'b')]);
3620 let cls2 = bclass(&[(b'c', b'd')]);
3621 let expected = bclass(&[]);
3622 assert_eq!(expected, bintersect(&cls1, &cls2));
3623
3624 let cls1 = bclass(&[(b'b', b'c')]);
3625 let cls2 = bclass(&[(b'a', b'd')]);
3626 let expected = bclass(&[(b'b', b'c')]);
3627 assert_eq!(expected, bintersect(&cls1, &cls2));
3628
3629 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3630 let cls2 = bclass(&[(b'a', b'h')]);
3631 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3632 assert_eq!(expected, bintersect(&cls1, &cls2));
3633
3634 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3635 let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3636 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3637 assert_eq!(expected, bintersect(&cls1, &cls2));
3638
3639 let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
3640 let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
3641 let expected = bclass(&[]);
3642 assert_eq!(expected, bintersect(&cls1, &cls2));
3643
3644 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3645 let cls2 = bclass(&[(b'h', b'h')]);
3646 let expected = bclass(&[(b'h', b'h')]);
3647 assert_eq!(expected, bintersect(&cls1, &cls2));
3648
3649 let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
3650 let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
3651 let expected = bclass(&[]);
3652 assert_eq!(expected, bintersect(&cls1, &cls2));
3653
3654 let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
3655 let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
3656 let expected = bclass(&[(b'b', b'f')]);
3657 assert_eq!(expected, bintersect(&cls1, &cls2));
3658 }
3659
3660 #[test]
3661 fn class_difference_unicode() {
3662 let cls1 = uclass(&[('a', 'a')]);
3663 let cls2 = uclass(&[('a', 'a')]);
3664 let expected = uclass(&[]);
3665 assert_eq!(expected, udifference(&cls1, &cls2));
3666
3667 let cls1 = uclass(&[('a', 'a')]);
3668 let cls2 = uclass(&[]);
3669 let expected = uclass(&[('a', 'a')]);
3670 assert_eq!(expected, udifference(&cls1, &cls2));
3671
3672 let cls1 = uclass(&[]);
3673 let cls2 = uclass(&[('a', 'a')]);
3674 let expected = uclass(&[]);
3675 assert_eq!(expected, udifference(&cls1, &cls2));
3676
3677 let cls1 = uclass(&[('a', 'z')]);
3678 let cls2 = uclass(&[('a', 'a')]);
3679 let expected = uclass(&[('b', 'z')]);
3680 assert_eq!(expected, udifference(&cls1, &cls2));
3681
3682 let cls1 = uclass(&[('a', 'z')]);
3683 let cls2 = uclass(&[('z', 'z')]);
3684 let expected = uclass(&[('a', 'y')]);
3685 assert_eq!(expected, udifference(&cls1, &cls2));
3686
3687 let cls1 = uclass(&[('a', 'z')]);
3688 let cls2 = uclass(&[('m', 'm')]);
3689 let expected = uclass(&[('a', 'l'), ('n', 'z')]);
3690 assert_eq!(expected, udifference(&cls1, &cls2));
3691
3692 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3693 let cls2 = uclass(&[('a', 'z')]);
3694 let expected = uclass(&[]);
3695 assert_eq!(expected, udifference(&cls1, &cls2));
3696
3697 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3698 let cls2 = uclass(&[('d', 'v')]);
3699 let expected = uclass(&[('a', 'c')]);
3700 assert_eq!(expected, udifference(&cls1, &cls2));
3701
3702 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3703 let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
3704 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3705 assert_eq!(expected, udifference(&cls1, &cls2));
3706
3707 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3708 let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
3709 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3710 assert_eq!(expected, udifference(&cls1, &cls2));
3711
3712 let cls1 = uclass(&[('x', 'z')]);
3713 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3714 let expected = uclass(&[('x', 'z')]);
3715 assert_eq!(expected, udifference(&cls1, &cls2));
3716
3717 let cls1 = uclass(&[('a', 'z')]);
3718 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3719 let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
3720 assert_eq!(expected, udifference(&cls1, &cls2));
3721 }
3722
3723 #[test]
3724 fn class_difference_bytes() {
3725 let cls1 = bclass(&[(b'a', b'a')]);
3726 let cls2 = bclass(&[(b'a', b'a')]);
3727 let expected = bclass(&[]);
3728 assert_eq!(expected, bdifference(&cls1, &cls2));
3729
3730 let cls1 = bclass(&[(b'a', b'a')]);
3731 let cls2 = bclass(&[]);
3732 let expected = bclass(&[(b'a', b'a')]);
3733 assert_eq!(expected, bdifference(&cls1, &cls2));
3734
3735 let cls1 = bclass(&[]);
3736 let cls2 = bclass(&[(b'a', b'a')]);
3737 let expected = bclass(&[]);
3738 assert_eq!(expected, bdifference(&cls1, &cls2));
3739
3740 let cls1 = bclass(&[(b'a', b'z')]);
3741 let cls2 = bclass(&[(b'a', b'a')]);
3742 let expected = bclass(&[(b'b', b'z')]);
3743 assert_eq!(expected, bdifference(&cls1, &cls2));
3744
3745 let cls1 = bclass(&[(b'a', b'z')]);
3746 let cls2 = bclass(&[(b'z', b'z')]);
3747 let expected = bclass(&[(b'a', b'y')]);
3748 assert_eq!(expected, bdifference(&cls1, &cls2));
3749
3750 let cls1 = bclass(&[(b'a', b'z')]);
3751 let cls2 = bclass(&[(b'm', b'm')]);
3752 let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
3753 assert_eq!(expected, bdifference(&cls1, &cls2));
3754
3755 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3756 let cls2 = bclass(&[(b'a', b'z')]);
3757 let expected = bclass(&[]);
3758 assert_eq!(expected, bdifference(&cls1, &cls2));
3759
3760 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3761 let cls2 = bclass(&[(b'd', b'v')]);
3762 let expected = bclass(&[(b'a', b'c')]);
3763 assert_eq!(expected, bdifference(&cls1, &cls2));
3764
3765 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3766 let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
3767 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3768 assert_eq!(expected, bdifference(&cls1, &cls2));
3769
3770 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3771 let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
3772 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3773 assert_eq!(expected, bdifference(&cls1, &cls2));
3774
3775 let cls1 = bclass(&[(b'x', b'z')]);
3776 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3777 let expected = bclass(&[(b'x', b'z')]);
3778 assert_eq!(expected, bdifference(&cls1, &cls2));
3779
3780 let cls1 = bclass(&[(b'a', b'z')]);
3781 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3782 let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
3783 assert_eq!(expected, bdifference(&cls1, &cls2));
3784 }
3785
3786 #[test]
3787 fn class_symmetric_difference_unicode() {
3788 let cls1 = uclass(&[('a', 'm')]);
3789 let cls2 = uclass(&[('g', 't')]);
3790 let expected = uclass(&[('a', 'f'), ('n', 't')]);
3791 assert_eq!(expected, usymdifference(&cls1, &cls2));
3792 }
3793
3794 #[test]
3795 fn class_symmetric_difference_bytes() {
3796 let cls1 = bclass(&[(b'a', b'm')]);
3797 let cls2 = bclass(&[(b'g', b't')]);
3798 let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
3799 assert_eq!(expected, bsymdifference(&cls1, &cls2));
3800 }
3801
3802 // We use a thread with an explicit stack size to test that our destructor
3803 // for Hir can handle arbitrarily sized expressions in constant stack
3804 // space. In case we run on a platform without threads (WASM?), we limit
3805 // this test to Windows/Unix.
3806 #[test]
3807 #[cfg(any(unix, windows))]
3808 fn no_stack_overflow_on_drop() {
3809 use std::thread;
3810
3811 let run = || {
3812 let mut expr = Hir::empty();
3813 for _ in 0..100 {
3814 expr = Hir::capture(Capture {
3815 index: 1,
3816 name: None,
3817 sub: Box::new(expr),
3818 });
3819 expr = Hir::repetition(Repetition {
3820 min: 0,
3821 max: Some(1),
3822 greedy: true,
3823 sub: Box::new(expr),
3824 });
3825
3826 expr = Hir {
3827 kind: HirKind::Concat(vec![expr]),
3828 props: Properties::empty(),
3829 };
3830 expr = Hir {
3831 kind: HirKind::Alternation(vec![expr]),
3832 props: Properties::empty(),
3833 };
3834 }
3835 assert!(!matches!(*expr.kind(), HirKind::Empty));
3836 };
3837
3838 // We run our test on a thread with a small stack size so we can
3839 // force the issue more easily.
3840 //
3841 // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
3842 // for context on the specific stack size chosen here.
3843 thread::Builder::new()
3844 .stack_size(16 << 10)
3845 .spawn(run)
3846 .unwrap()
3847 .join()
3848 .unwrap();
3849 }
3850
3851 #[test]
3852 fn look_set_iter() {
3853 let set = LookSet::empty();
3854 assert_eq!(0, set.iter().count());
3855
3856 let set = LookSet::full();
3857 assert_eq!(18, set.iter().count());
3858
3859 let set =
3860 LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
3861 assert_eq!(2, set.iter().count());
3862
3863 let set = LookSet::empty().insert(Look::StartLF);
3864 assert_eq!(1, set.iter().count());
3865
3866 let set = LookSet::empty().insert(Look::WordAsciiNegate);
3867 assert_eq!(1, set.iter().count());
3868 }
3869
3870 #[test]
3871 fn look_set_debug() {
3872 let res = format!("{:?}", LookSet::empty());
3873 assert_eq!("∅", res);
3874 let res = format!("{:?}", LookSet::full());
3875 assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
3876 }
3877}