regex_syntax/hir/
mod.rs

Help
1/*!
2Defines a high-level intermediate (HIR) representation for regular expressions.
3
4The HIR is represented by the [`Hir`] type, and it principally constructed via
5[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
6may use the smart constructors defined on `Hir` to build their own by hand. The
7smart constructors simultaneously simplify and "optimize" the HIR, and are also
8the same routines used by translation.
9
10Most regex engines only have an HIR like this, and usually construct it
11directly from the concrete syntax. This crate however first parses the
12concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
13as mentioned above. It's done this way to facilitate better error reporting,
14and to have a structured representation of a regex that faithfully represents
15its concrete syntax. Namely, while an `Hir` value can be converted back to an
16equivalent regex pattern string, it is unlikely to look like the original due
17to its simplified structure.
18*/
19
20use core::{char, cmp};
21
22use alloc::{
23    boxed::Box,
24    format,
25    string::{String, ToString},
26    vec,
27    vec::Vec,
28};
29
30use crate::{
31    ast::Span,
32    hir::interval::{Interval, IntervalSet, IntervalSetIter},
33    unicode,
34};
35
36pub use crate::{
37    hir::visitor::{visit, Visitor},
38    unicode::CaseFoldError,
39};
40
41mod interval;
42pub mod literal;
43pub mod print;
44pub mod translate;
45mod visitor;
46
47/// An error that can occur while translating an `Ast` to a `Hir`.
48#[derive(Clone, Debug, Eq, PartialEq)]
49pub struct Error {
50    /// The kind of error.
51    kind: ErrorKind,
52    /// The original pattern that the translator's Ast was parsed from. Every
53    /// span in an error is a valid range into this string.
54    pattern: String,
55    /// The span of this error, derived from the Ast given to the translator.
56    span: Span,
57}
58
59impl Error {
60    /// Return the type of this error.
61    pub fn kind(&self) -> &ErrorKind {
62        &self.kind
63    }
64
65    /// The original pattern string in which this error occurred.
66    ///
67    /// Every span reported by this error is reported in terms of this string.
68    pub fn pattern(&self) -> &str {
69        &self.pattern
70    }
71
72    /// Return the span at which this error occurred.
73    pub fn span(&self) -> &Span {
74        &self.span
75    }
76}
77
78/// The type of an error that occurred while building an `Hir`.
79///
80/// This error type is marked as `non_exhaustive`. This means that adding a
81/// new variant is not considered a breaking change.
82#[non_exhaustive]
83#[derive(Clone, Debug, Eq, PartialEq)]
84pub enum ErrorKind {
85    /// This error occurs when a Unicode feature is used when Unicode
86    /// support is disabled. For example `(?-u:\pL)` would trigger this error.
87    UnicodeNotAllowed,
88    /// This error occurs when translating a pattern that could match a byte
89    /// sequence that isn't UTF-8 and `utf8` was enabled.
90    InvalidUtf8,
91    /// This error occurs when one uses a non-ASCII byte for a line terminator,
92    /// but where Unicode mode is enabled and UTF-8 mode is disabled.
93    InvalidLineTerminator,
94    /// This occurs when an unrecognized Unicode property name could not
95    /// be found.
96    UnicodePropertyNotFound,
97    /// This occurs when an unrecognized Unicode property value could not
98    /// be found.
99    UnicodePropertyValueNotFound,
100    /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
101    /// `\d`) could not be found. This can occur when the `unicode-perl`
102    /// crate feature is not enabled.
103    UnicodePerlClassNotFound,
104    /// This occurs when the Unicode simple case mapping tables are not
105    /// available, and the regular expression required Unicode aware case
106    /// insensitivity.
107    UnicodeCaseUnavailable,
108}
109
110#[cfg(feature = "std")]
111impl std::error::Error for Error {}
112
113impl core::fmt::Display for Error {
114    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
115        crate::error::Formatter::from(self).fmt(f)
116    }
117}
118
119impl core::fmt::Display for ErrorKind {
120    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
121        use self::ErrorKind::*;
122
123        let msg = match *self {
124            UnicodeNotAllowed => "Unicode not allowed here",
125            InvalidUtf8 => "pattern can match invalid UTF-8",
126            InvalidLineTerminator => "invalid line terminator, must be ASCII",
127            UnicodePropertyNotFound => "Unicode property not found",
128            UnicodePropertyValueNotFound => "Unicode property value not found",
129            UnicodePerlClassNotFound => {
130                "Unicode-aware Perl class not found \
131                 (make sure the unicode-perl feature is enabled)"
132            }
133            UnicodeCaseUnavailable => {
134                "Unicode-aware case insensitivity matching is not available \
135                 (make sure the unicode-case feature is enabled)"
136            }
137        };
138        f.write_str(msg)
139    }
140}
141
142/// A high-level intermediate representation (HIR) for a regular expression.
143///
144/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
145/// An `HirKind` indicates what kind of regular expression it is (a literal,
146/// a repetition, a look-around assertion, etc.), where as a `Properties`
147/// describes various facts about the regular expression. For example, whether
148/// it matches UTF-8 or if it matches the empty string.
149///
150/// The HIR of a regular expression represents an intermediate step between
151/// its abstract syntax (a structured description of the concrete syntax) and
152/// an actual regex matcher. The purpose of HIR is to make regular expressions
153/// easier to analyze. In particular, the AST is much more complex than the
154/// HIR. For example, while an AST supports arbitrarily nested character
155/// classes, the HIR will flatten all nested classes into a single set. The HIR
156/// will also "compile away" every flag present in the concrete syntax. For
157/// example, users of HIR expressions never need to worry about case folding;
158/// it is handled automatically by the translator (e.g., by translating
159/// `(?i:A)` to `[aA]`).
160///
161/// The specific type of an HIR expression can be accessed via its `kind`
162/// or `into_kind` methods. This extra level of indirection exists for two
163/// reasons:
164///
165/// 1. Construction of an HIR expression *must* use the constructor methods on
166/// this `Hir` type instead of building the `HirKind` values directly. This
167/// permits construction to enforce invariants like "concatenations always
168/// consist of two or more sub-expressions."
169/// 2. Every HIR expression contains attributes that are defined inductively,
170/// and can be computed cheaply during the construction process. For example,
171/// one such attribute is whether the expression must match at the beginning of
172/// the haystack.
173///
174/// In particular, if you have an `HirKind` value, then there is intentionally
175/// no way to build an `Hir` value from it. You instead need to do case
176/// analysis on the `HirKind` value and build the `Hir` value using its smart
177/// constructors.
178///
179/// # UTF-8
180///
181/// If the HIR was produced by a translator with
182/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
183/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
184/// matches.
185///
186/// For empty matches, those can occur at any position. It is the
187/// responsibility of the regex engine to determine whether empty matches are
188/// permitted between the code units of a single codepoint.
189///
190/// # Stack space
191///
192/// This type defines its own destructor that uses constant stack space and
193/// heap space proportional to the size of the HIR.
194///
195/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
196/// expression pattern string, and uses constant stack space and heap space
197/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
198/// be _semantically_ equivalent to the original concrete syntax, but it may
199/// look very different. (And potentially not practically readable by a human.)
200///
201/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
202/// stack space. The implementation will also suppress some details (such as
203/// the `Properties` inlined into every `Hir` value to make it less noisy).
204#[derive(Clone, Eq, PartialEq)]
205pub struct Hir {
206    /// The underlying HIR kind.
207    kind: HirKind,
208    /// Analysis info about this HIR, computed during construction.
209    props: Properties,
210}
211
212/// Methods for accessing the underlying `HirKind` and `Properties`.
213impl Hir {
214    /// Returns a reference to the underlying HIR kind.
215    pub fn kind(&self) -> &HirKind {
216        &self.kind
217    }
218
219    /// Consumes ownership of this HIR expression and returns its underlying
220    /// `HirKind`.
221    pub fn into_kind(mut self) -> HirKind {
222        core::mem::replace(&mut self.kind, HirKind::Empty)
223    }
224
225    /// Returns the properties computed for this `Hir`.
226    pub fn properties(&self) -> &Properties {
227        &self.props
228    }
229
230    /// Splits this HIR into its constituent parts.
231    ///
232    /// This is useful because `let Hir { kind, props } = hir;` does not work
233    /// because of `Hir`'s custom `Drop` implementation.
234    fn into_parts(mut self) -> (HirKind, Properties) {
235        (
236            core::mem::replace(&mut self.kind, HirKind::Empty),
237            core::mem::replace(&mut self.props, Properties::empty()),
238        )
239    }
240}
241
242/// Smart constructors for HIR values.
243///
244/// These constructors are called "smart" because they do inductive work or
245/// simplifications. For example, calling `Hir::repetition` with a repetition
246/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
247/// since it is equivalent to an empty regex. Another example is calling
248/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
249/// just get back the original `expr` since it's precisely equivalent.
250///
251/// Smart constructors enable maintaining invariants about the HIR data type
252/// while also simulanteously keeping the representation as simple as possible.
253impl Hir {
254    /// Returns an empty HIR expression.
255    ///
256    /// An empty HIR expression always matches, including the empty string.
257    #[inline]
258    pub fn empty() -> Hir {
259        let props = Properties::empty();
260        Hir { kind: HirKind::Empty, props }
261    }
262
263    /// Returns an HIR expression that can never match anything. That is,
264    /// the size of the set of strings in the language described by the HIR
265    /// returned is `0`.
266    ///
267    /// This is distinct from [`Hir::empty`] in that the empty string matches
268    /// the HIR returned by `Hir::empty`. That is, the set of strings in the
269    /// language describe described by `Hir::empty` is non-empty.
270    ///
271    /// Note that currently, the HIR returned uses an empty character class to
272    /// indicate that nothing can match. An equivalent expression that cannot
273    /// match is an empty alternation, but all such "fail" expressions are
274    /// normalized (via smart constructors) to empty character classes. This is
275    /// because empty character classes can be spelled in the concrete syntax
276    /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
277    /// empty alternations cannot.
278    #[inline]
279    pub fn fail() -> Hir {
280        let class = Class::Bytes(ClassBytes::empty());
281        let props = Properties::class(&class);
282        // We can't just call Hir::class here because it defers to Hir::fail
283        // in order to canonicalize the Hir value used to represent "cannot
284        // match."
285        Hir { kind: HirKind::Class(class), props }
286    }
287
288    /// Creates a literal HIR expression.
289    ///
290    /// This accepts anything that can be converted into a `Box<[u8]>`.
291    ///
292    /// Note that there is no mechanism for storing a `char` or a `Box<str>`
293    /// in an HIR. Everything is "just bytes." Whether a `Literal` (or
294    /// any HIR node) matches valid UTF-8 exclusively can be queried via
295    /// [`Properties::is_utf8`].
296    ///
297    /// # Example
298    ///
299    /// This example shows that concatenations of `Literal` HIR values will
300    /// automatically get flattened and combined together. So for example, even
301    /// if you concat multiple `Literal` values that are themselves not valid
302    /// UTF-8, they might add up to valid UTF-8. This also demonstrates just
303    /// how "smart" Hir's smart constructors are.
304    ///
305    /// ```
306    /// use regex_syntax::hir::{Hir, HirKind, Literal};
307    ///
308    /// let literals = vec![
309    ///     Hir::literal([0xE2]),
310    ///     Hir::literal([0x98]),
311    ///     Hir::literal([0x83]),
312    /// ];
313    /// // Each literal, on its own, is invalid UTF-8.
314    /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8()));
315    ///
316    /// let concat = Hir::concat(literals);
317    /// // But the concatenation is valid UTF-8!
318    /// assert!(concat.properties().is_utf8());
319    ///
320    /// // And also notice that the literals have been concatenated into a
321    /// // single `Literal`, to the point where there is no explicit `Concat`!
322    /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
323    /// assert_eq!(&expected, concat.kind());
324    /// ```
325    ///
326    /// # Example: building a literal from a `char`
327    ///
328    /// This example shows how to build a single `Hir` literal from a `char`
329    /// value. Since a [`Literal`] is just bytes, we just need to UTF-8
330    /// encode a `char` value:
331    ///
332    /// ```
333    /// use regex_syntax::hir::{Hir, HirKind, Literal};
334    ///
335    /// let ch = '☃';
336    /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes());
337    ///
338    /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
339    /// assert_eq!(&expected, got.kind());
340    /// ```
341    #[inline]
342    pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
343        let bytes = lit.into();
344        if bytes.is_empty() {
345            return Hir::empty();
346        }
347
348        let lit = Literal(bytes);
349        let props = Properties::literal(&lit);
350        Hir { kind: HirKind::Literal(lit), props }
351    }
352
353    /// Creates a class HIR expression. The class may either be defined over
354    /// ranges of Unicode codepoints or ranges of raw byte values.
355    ///
356    /// Note that an empty class is permitted. An empty class is equivalent to
357    /// `Hir::fail()`.
358    #[inline]
359    pub fn class(class: Class) -> Hir {
360        if class.is_empty() {
361            return Hir::fail();
362        } else if let Some(bytes) = class.literal() {
363            return Hir::literal(bytes);
364        }
365        let props = Properties::class(&class);
366        Hir { kind: HirKind::Class(class), props }
367    }
368
369    /// Creates a look-around assertion HIR expression.
370    #[inline]
371    pub fn look(look: Look) -> Hir {
372        let props = Properties::look(look);
373        Hir { kind: HirKind::Look(look), props }
374    }
375
376    /// Creates a repetition HIR expression.
377    #[inline]
378    pub fn repetition(mut rep: Repetition) -> Hir {
379        // If the sub-expression of a repetition can only match the empty
380        // string, then we force its maximum to be at most 1.
381        if rep.sub.properties().maximum_len() == Some(0) {
382            rep.min = cmp::min(rep.min, 1);
383            rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
384        }
385        // The regex 'a{0}' is always equivalent to the empty regex. This is
386        // true even when 'a' is an expression that never matches anything
387        // (like '\P{any}').
388        //
389        // Additionally, the regex 'a{1}' is always equivalent to 'a'.
390        if rep.min == 0 && rep.max == Some(0) {
391            return Hir::empty();
392        } else if rep.min == 1 && rep.max == Some(1) {
393            return *rep.sub;
394        }
395        let props = Properties::repetition(&rep);
396        Hir { kind: HirKind::Repetition(rep), props }
397    }
398
399    /// Creates a capture HIR expression.
400    ///
401    /// Note that there is no explicit HIR value for a non-capturing group.
402    /// Since a non-capturing group only exists to override precedence in the
403    /// concrete syntax and since an HIR already does its own grouping based on
404    /// what is parsed, there is no need to explicitly represent non-capturing
405    /// groups in the HIR.
406    #[inline]
407    pub fn capture(capture: Capture) -> Hir {
408        let props = Properties::capture(&capture);
409        Hir { kind: HirKind::Capture(capture), props }
410    }
411
412    /// Returns the concatenation of the given expressions.
413    ///
414    /// This attempts to flatten and simplify the concatenation as appropriate.
415    ///
416    /// # Example
417    ///
418    /// This shows a simple example of basic flattening of both concatenations
419    /// and literals.
420    ///
421    /// ```
422    /// use regex_syntax::hir::Hir;
423    ///
424    /// let hir = Hir::concat(vec![
425    ///     Hir::concat(vec![
426    ///         Hir::literal([b'a']),
427    ///         Hir::literal([b'b']),
428    ///         Hir::literal([b'c']),
429    ///     ]),
430    ///     Hir::concat(vec![
431    ///         Hir::literal([b'x']),
432    ///         Hir::literal([b'y']),
433    ///         Hir::literal([b'z']),
434    ///     ]),
435    /// ]);
436    /// let expected = Hir::literal("abcxyz".as_bytes());
437    /// assert_eq!(expected, hir);
438    /// ```
439    pub fn concat(subs: Vec<Hir>) -> Hir {
440        // We rebuild the concatenation by simplifying it. Would be nice to do
441        // it in place, but that seems a little tricky?
442        let mut new = vec![];
443        // This gobbles up any adjacent literals in a concatenation and smushes
444        // them together. Basically, when we see a literal, we add its bytes
445        // to 'prior_lit', and whenever we see anything else, we first take
446        // any bytes in 'prior_lit' and add it to the 'new' concatenation.
447        let mut prior_lit: Option<Vec<u8>> = None;
448        for sub in subs {
449            let (kind, props) = sub.into_parts();
450            match kind {
451                HirKind::Literal(Literal(bytes)) => {
452                    if let Some(ref mut prior_bytes) = prior_lit {
453                        prior_bytes.extend_from_slice(&bytes);
454                    } else {
455                        prior_lit = Some(bytes.to_vec());
456                    }
457                }
458                // We also flatten concats that are direct children of another
459                // concat. We only need to do this one level deep since
460                // Hir::concat is the only way to build concatenations, and so
461                // flattening happens inductively.
462                HirKind::Concat(subs2) => {
463                    for sub2 in subs2 {
464                        let (kind2, props2) = sub2.into_parts();
465                        match kind2 {
466                            HirKind::Literal(Literal(bytes)) => {
467                                if let Some(ref mut prior_bytes) = prior_lit {
468                                    prior_bytes.extend_from_slice(&bytes);
469                                } else {
470                                    prior_lit = Some(bytes.to_vec());
471                                }
472                            }
473                            kind2 => {
474                                if let Some(prior_bytes) = prior_lit.take() {
475                                    new.push(Hir::literal(prior_bytes));
476                                }
477                                new.push(Hir { kind: kind2, props: props2 });
478                            }
479                        }
480                    }
481                }
482                // We can just skip empty HIRs.
483                HirKind::Empty => {}
484                kind => {
485                    if let Some(prior_bytes) = prior_lit.take() {
486                        new.push(Hir::literal(prior_bytes));
487                    }
488                    new.push(Hir { kind, props });
489                }
490            }
491        }
492        if let Some(prior_bytes) = prior_lit.take() {
493            new.push(Hir::literal(prior_bytes));
494        }
495        if new.is_empty() {
496            return Hir::empty();
497        } else if new.len() == 1 {
498            return new.pop().unwrap();
499        }
500        let props = Properties::concat(&new);
501        Hir { kind: HirKind::Concat(new), props }
502    }
503
504    /// Returns the alternation of the given expressions.
505    ///
506    /// This flattens and simplifies the alternation as appropriate. This may
507    /// include factoring out common prefixes or even rewriting the alternation
508    /// as a character class.
509    ///
510    /// Note that an empty alternation is equivalent to `Hir::fail()`. (It
511    /// is not possible for one to write an empty alternation, or even an
512    /// alternation with a single sub-expression, in the concrete syntax of a
513    /// regex.)
514    ///
515    /// # Example
516    ///
517    /// This is a simple example showing how an alternation might get
518    /// simplified.
519    ///
520    /// ```
521    /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
522    ///
523    /// let hir = Hir::alternation(vec![
524    ///     Hir::literal([b'a']),
525    ///     Hir::literal([b'b']),
526    ///     Hir::literal([b'c']),
527    ///     Hir::literal([b'd']),
528    ///     Hir::literal([b'e']),
529    ///     Hir::literal([b'f']),
530    /// ]);
531    /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
532    ///     ClassUnicodeRange::new('a', 'f'),
533    /// ])));
534    /// assert_eq!(expected, hir);
535    /// ```
536    ///
537    /// And another example showing how common prefixes might get factored
538    /// out.
539    ///
540    /// ```
541    /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
542    ///
543    /// let hir = Hir::alternation(vec![
544    ///     Hir::concat(vec![
545    ///         Hir::literal("abc".as_bytes()),
546    ///         Hir::class(Class::Unicode(ClassUnicode::new([
547    ///             ClassUnicodeRange::new('A', 'Z'),
548    ///         ]))),
549    ///     ]),
550    ///     Hir::concat(vec![
551    ///         Hir::literal("abc".as_bytes()),
552    ///         Hir::class(Class::Unicode(ClassUnicode::new([
553    ///             ClassUnicodeRange::new('a', 'z'),
554    ///         ]))),
555    ///     ]),
556    /// ]);
557    /// let expected = Hir::concat(vec![
558    ///     Hir::literal("abc".as_bytes()),
559    ///     Hir::alternation(vec![
560    ///         Hir::class(Class::Unicode(ClassUnicode::new([
561    ///             ClassUnicodeRange::new('A', 'Z'),
562    ///         ]))),
563    ///         Hir::class(Class::Unicode(ClassUnicode::new([
564    ///             ClassUnicodeRange::new('a', 'z'),
565    ///         ]))),
566    ///     ]),
567    /// ]);
568    /// assert_eq!(expected, hir);
569    /// ```
570    ///
571    /// Note that these sorts of simplifications are not guaranteed.
572    pub fn alternation(subs: Vec<Hir>) -> Hir {
573        // We rebuild the alternation by simplifying it. We proceed similarly
574        // as the concatenation case. But in this case, there's no literal
575        // simplification happening. We're just flattening alternations.
576        let mut new = Vec::with_capacity(subs.len());
577        for sub in subs {
578            let (kind, props) = sub.into_parts();
579            match kind {
580                HirKind::Alternation(subs2) => {
581                    new.extend(subs2);
582                }
583                kind => {
584                    new.push(Hir { kind, props });
585                }
586            }
587        }
588        if new.is_empty() {
589            return Hir::fail();
590        } else if new.len() == 1 {
591            return new.pop().unwrap();
592        }
593        // Now that it's completely flattened, look for the special case of
594        // 'char1|char2|...|charN' and collapse that into a class. Note that
595        // we look for 'char' first and then bytes. The issue here is that if
596        // we find both non-ASCII codepoints and non-ASCII singleton bytes,
597        // then it isn't actually possible to smush them into a single class.
598        // (Because classes are either "all codepoints" or "all bytes." You
599        // can have a class that both matches non-ASCII but valid UTF-8 and
600        // invalid UTF-8.) So we look for all chars and then all bytes, and
601        // don't handle anything else.
602        if let Some(singletons) = singleton_chars(&new) {
603            let it = singletons
604                .into_iter()
605                .map(|ch| ClassUnicodeRange { start: ch, end: ch });
606            return Hir::class(Class::Unicode(ClassUnicode::new(it)));
607        }
608        if let Some(singletons) = singleton_bytes(&new) {
609            let it = singletons
610                .into_iter()
611                .map(|b| ClassBytesRange { start: b, end: b });
612            return Hir::class(Class::Bytes(ClassBytes::new(it)));
613        }
614        // Similar to singleton chars, we can also look for alternations of
615        // classes. Those can be smushed into a single class.
616        if let Some(cls) = class_chars(&new) {
617            return Hir::class(cls);
618        }
619        if let Some(cls) = class_bytes(&new) {
620            return Hir::class(cls);
621        }
622        // Factor out a common prefix if we can, which might potentially
623        // simplify the expression and unlock other optimizations downstream.
624        // It also might generally make NFA matching and DFA construction
625        // faster by reducing the scope of branching in the regex.
626        new = match lift_common_prefix(new) {
627            Ok(hir) => return hir,
628            Err(unchanged) => unchanged,
629        };
630        let props = Properties::alternation(&new);
631        Hir { kind: HirKind::Alternation(new), props }
632    }
633
634    /// Returns an HIR expression for `.`.
635    ///
636    /// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
637    /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
638    /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
639    /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
640    /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
641    /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
642    ///
643    /// # Example
644    ///
645    /// Note that this is a convenience routine for constructing the correct
646    /// character class based on the value of `Dot`. There is no explicit "dot"
647    /// HIR value. It is just an abbreviation for a common character class.
648    ///
649    /// ```
650    /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
651    ///
652    /// let hir = Hir::dot(Dot::AnyByte);
653    /// let expected = Hir::class(Class::Bytes(ClassBytes::new([
654    ///     ClassBytesRange::new(0x00, 0xFF),
655    /// ])));
656    /// assert_eq!(expected, hir);
657    /// ```
658    #[inline]
659    pub fn dot(dot: Dot) -> Hir {
660        match dot {
661            Dot::AnyChar => {
662                let mut cls = ClassUnicode::empty();
663                cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
664                Hir::class(Class::Unicode(cls))
665            }
666            Dot::AnyByte => {
667                let mut cls = ClassBytes::empty();
668                cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
669                Hir::class(Class::Bytes(cls))
670            }
671            Dot::AnyCharExcept(ch) => {
672                let mut cls =
673                    ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
674                cls.negate();
675                Hir::class(Class::Unicode(cls))
676            }
677            Dot::AnyCharExceptLF => {
678                let mut cls = ClassUnicode::empty();
679                cls.push(ClassUnicodeRange::new('\0', '\x09'));
680                cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
681                Hir::class(Class::Unicode(cls))
682            }
683            Dot::AnyCharExceptCRLF => {
684                let mut cls = ClassUnicode::empty();
685                cls.push(ClassUnicodeRange::new('\0', '\x09'));
686                cls.push(ClassUnicodeRange::new('\x0B', '\x0C'));
687                cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
688                Hir::class(Class::Unicode(cls))
689            }
690            Dot::AnyByteExcept(byte) => {
691                let mut cls =
692                    ClassBytes::new([ClassBytesRange::new(byte, byte)]);
693                cls.negate();
694                Hir::class(Class::Bytes(cls))
695            }
696            Dot::AnyByteExceptLF => {
697                let mut cls = ClassBytes::empty();
698                cls.push(ClassBytesRange::new(b'\0', b'\x09'));
699                cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
700                Hir::class(Class::Bytes(cls))
701            }
702            Dot::AnyByteExceptCRLF => {
703                let mut cls = ClassBytes::empty();
704                cls.push(ClassBytesRange::new(b'\0', b'\x09'));
705                cls.push(ClassBytesRange::new(b'\x0B', b'\x0C'));
706                cls.push(ClassBytesRange::new(b'\x0E', b'\xFF'));
707                Hir::class(Class::Bytes(cls))
708            }
709        }
710    }
711}
712
713/// The underlying kind of an arbitrary [`Hir`] expression.
714///
715/// An `HirKind` is principally useful for doing case analysis on the type
716/// of a regular expression. If you're looking to build new `Hir` values,
717/// then you _must_ use the smart constructors defined on `Hir`, like
718/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
719/// not expose any way of building an `Hir` directly from an `HirKind`.
720#[derive(Clone, Debug, Eq, PartialEq)]
721pub enum HirKind {
722    /// The empty regular expression, which matches everything, including the
723    /// empty string.
724    Empty,
725    /// A literalstring that matches exactly these bytes.
726    Literal(Literal),
727    /// A single character class that matches any of the characters in the
728    /// class. A class can either consist of Unicode scalar values as
729    /// characters, or it can use bytes.
730    ///
731    /// A class may be empty. In which case, it matches nothing.
732    Class(Class),
733    /// A look-around assertion. A look-around match always has zero length.
734    Look(Look),
735    /// A repetition operation applied to a sub-expression.
736    Repetition(Repetition),
737    /// A capturing group, which contains a sub-expression.
738    Capture(Capture),
739    /// A concatenation of expressions.
740    ///
741    /// A concatenation matches only if each of its sub-expressions match one
742    /// after the other.
743    ///
744    /// Concatenations are guaranteed by `Hir`'s smart constructors to always
745    /// have at least two sub-expressions.
746    Concat(Vec<Hir>),
747    /// An alternation of expressions.
748    ///
749    /// An alternation matches only if at least one of its sub-expressions
750    /// match. If multiple sub-expressions match, then the leftmost is
751    /// preferred.
752    ///
753    /// Alternations are guaranteed by `Hir`'s smart constructors to always
754    /// have at least two sub-expressions.
755    Alternation(Vec<Hir>),
756}
757
758impl HirKind {
759    /// Returns a slice of this kind's sub-expressions, if any.
760    pub fn subs(&self) -> &[Hir] {
761        use core::slice::from_ref;
762
763        match *self {
764            HirKind::Empty
765            | HirKind::Literal(_)
766            | HirKind::Class(_)
767            | HirKind::Look(_) => &[],
768            HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
769            HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
770            HirKind::Concat(ref subs) => subs,
771            HirKind::Alternation(ref subs) => subs,
772        }
773    }
774}
775
776impl core::fmt::Debug for Hir {
777    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
778        self.kind.fmt(f)
779    }
780}
781
782/// Print a display representation of this Hir.
783///
784/// The result of this is a valid regular expression pattern string.
785///
786/// This implementation uses constant stack space and heap space proportional
787/// to the size of the `Hir`.
788impl core::fmt::Display for Hir {
789    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
790        crate::hir::print::Printer::new().print(self, f)
791    }
792}
793
794/// The high-level intermediate representation of a literal.
795///
796/// A literal corresponds to `0` or more bytes that should be matched
797/// literally. The smart constructors defined on `Hir` will automatically
798/// concatenate adjacent literals into one literal, and will even automatically
799/// replace empty literals with `Hir::empty()`.
800///
801/// Note that despite a literal being represented by a sequence of bytes, its
802/// `Debug` implementation will attempt to print it as a normal string. (That
803/// is, not a sequence of decimal numbers.)
804#[derive(Clone, Eq, PartialEq)]
805pub struct Literal(pub Box<[u8]>);
806
807impl core::fmt::Debug for Literal {
808    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
809        crate::debug::Bytes(&self.0).fmt(f)
810    }
811}
812
813/// The high-level intermediate representation of a character class.
814///
815/// A character class corresponds to a set of characters. A character is either
816/// defined by a Unicode scalar value or a byte.
817///
818/// A character class, regardless of its character type, is represented by a
819/// sequence of non-overlapping non-adjacent ranges of characters.
820///
821/// There are no guarantees about which class variant is used. Generally
822/// speaking, the Unicode variat is used whenever a class needs to contain
823/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
824/// when Unicode mode is disabled. For example, at the time of writing, the
825/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
826/// `[a\u00A0]` due to optimizations.
827///
828/// Note that `Bytes` variant may be produced even when it exclusively matches
829/// valid UTF-8. This is because a `Bytes` variant represents an intention by
830/// the author of the regular expression to disable Unicode mode, which in turn
831/// impacts the semantics of case insensitive matching. For example, `(?i)k`
832/// and `(?i-u)k` will not match the same set of strings.
833#[derive(Clone, Eq, PartialEq)]
834pub enum Class {
835    /// A set of characters represented by Unicode scalar values.
836    Unicode(ClassUnicode),
837    /// A set of characters represented by arbitrary bytes (one byte per
838    /// character).
839    Bytes(ClassBytes),
840}
841
842impl Class {
843    /// Apply Unicode simple case folding to this character class, in place.
844    /// The character class will be expanded to include all simple case folded
845    /// character variants.
846    ///
847    /// If this is a byte oriented character class, then this will be limited
848    /// to the ASCII ranges `A-Z` and `a-z`.
849    ///
850    /// # Panics
851    ///
852    /// This routine panics when the case mapping data necessary for this
853    /// routine to complete is unavailable. This occurs when the `unicode-case`
854    /// feature is not enabled and the underlying class is Unicode oriented.
855    ///
856    /// Callers should prefer using `try_case_fold_simple` instead, which will
857    /// return an error instead of panicking.
858    pub fn case_fold_simple(&mut self) {
859        match *self {
860            Class::Unicode(ref mut x) => x.case_fold_simple(),
861            Class::Bytes(ref mut x) => x.case_fold_simple(),
862        }
863    }
864
865    /// Apply Unicode simple case folding to this character class, in place.
866    /// The character class will be expanded to include all simple case folded
867    /// character variants.
868    ///
869    /// If this is a byte oriented character class, then this will be limited
870    /// to the ASCII ranges `A-Z` and `a-z`.
871    ///
872    /// # Error
873    ///
874    /// This routine returns an error when the case mapping data necessary
875    /// for this routine to complete is unavailable. This occurs when the
876    /// `unicode-case` feature is not enabled and the underlying class is
877    /// Unicode oriented.
878    pub fn try_case_fold_simple(
879        &mut self,
880    ) -> core::result::Result<(), CaseFoldError> {
881        match *self {
882            Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
883            Class::Bytes(ref mut x) => x.case_fold_simple(),
884        }
885        Ok(())
886    }
887
888    /// Negate this character class in place.
889    ///
890    /// After completion, this character class will contain precisely the
891    /// characters that weren't previously in the class.
892    pub fn negate(&mut self) {
893        match *self {
894            Class::Unicode(ref mut x) => x.negate(),
895            Class::Bytes(ref mut x) => x.negate(),
896        }
897    }
898
899    /// Returns true if and only if this character class will only ever match
900    /// valid UTF-8.
901    ///
902    /// A character class can match invalid UTF-8 only when the following
903    /// conditions are met:
904    ///
905    /// 1. The translator was configured to permit generating an expression
906    ///    that can match invalid UTF-8. (By default, this is disabled.)
907    /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
908    ///    syntax or in the parser builder. By default, Unicode mode is
909    ///    enabled.
910    pub fn is_utf8(&self) -> bool {
911        match *self {
912            Class::Unicode(_) => true,
913            Class::Bytes(ref x) => x.is_ascii(),
914        }
915    }
916
917    /// Returns the length, in bytes, of the smallest string matched by this
918    /// character class.
919    ///
920    /// For non-empty byte oriented classes, this always returns `1`. For
921    /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
922    /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
923    /// be returned.
924    ///
925    /// # Example
926    ///
927    /// This example shows some examples of regexes and their corresponding
928    /// minimum length, if any.
929    ///
930    /// ```
931    /// use regex_syntax::{hir::Properties, parse};
932    ///
933    /// // The empty string has a min length of 0.
934    /// let hir = parse(r"")?;
935    /// assert_eq!(Some(0), hir.properties().minimum_len());
936    /// // As do other types of regexes that only match the empty string.
937    /// let hir = parse(r"^$\b\B")?;
938    /// assert_eq!(Some(0), hir.properties().minimum_len());
939    /// // A regex that can match the empty string but match more is still 0.
940    /// let hir = parse(r"a*")?;
941    /// assert_eq!(Some(0), hir.properties().minimum_len());
942    /// // A regex that matches nothing has no minimum defined.
943    /// let hir = parse(r"[a&&b]")?;
944    /// assert_eq!(None, hir.properties().minimum_len());
945    /// // Character classes usually have a minimum length of 1.
946    /// let hir = parse(r"\w")?;
947    /// assert_eq!(Some(1), hir.properties().minimum_len());
948    /// // But sometimes Unicode classes might be bigger!
949    /// let hir = parse(r"\p{Cyrillic}")?;
950    /// assert_eq!(Some(2), hir.properties().minimum_len());
951    ///
952    /// # Ok::<(), Box<dyn std::error::Error>>(())
953    /// ```
954    pub fn minimum_len(&self) -> Option<usize> {
955        match *self {
956            Class::Unicode(ref x) => x.minimum_len(),
957            Class::Bytes(ref x) => x.minimum_len(),
958        }
959    }
960
961    /// Returns the length, in bytes, of the longest string matched by this
962    /// character class.
963    ///
964    /// For non-empty byte oriented classes, this always returns `1`. For
965    /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
966    /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
967    /// be returned.
968    ///
969    /// # Example
970    ///
971    /// This example shows some examples of regexes and their corresponding
972    /// maximum length, if any.
973    ///
974    /// ```
975    /// use regex_syntax::{hir::Properties, parse};
976    ///
977    /// // The empty string has a max length of 0.
978    /// let hir = parse(r"")?;
979    /// assert_eq!(Some(0), hir.properties().maximum_len());
980    /// // As do other types of regexes that only match the empty string.
981    /// let hir = parse(r"^$\b\B")?;
982    /// assert_eq!(Some(0), hir.properties().maximum_len());
983    /// // A regex that matches nothing has no maximum defined.
984    /// let hir = parse(r"[a&&b]")?;
985    /// assert_eq!(None, hir.properties().maximum_len());
986    /// // Bounded repeats work as you expect.
987    /// let hir = parse(r"x{2,10}")?;
988    /// assert_eq!(Some(10), hir.properties().maximum_len());
989    /// // An unbounded repeat means there is no maximum.
990    /// let hir = parse(r"x{2,}")?;
991    /// assert_eq!(None, hir.properties().maximum_len());
992    /// // With Unicode enabled, \w can match up to 4 bytes!
993    /// let hir = parse(r"\w")?;
994    /// assert_eq!(Some(4), hir.properties().maximum_len());
995    /// // Without Unicode enabled, \w matches at most 1 byte.
996    /// let hir = parse(r"(?-u)\w")?;
997    /// assert_eq!(Some(1), hir.properties().maximum_len());
998    ///
999    /// # Ok::<(), Box<dyn std::error::Error>>(())
1000    /// ```
1001    pub fn maximum_len(&self) -> Option<usize> {
1002        match *self {
1003            Class::Unicode(ref x) => x.maximum_len(),
1004            Class::Bytes(ref x) => x.maximum_len(),
1005        }
1006    }
1007
1008    /// Returns true if and only if this character class is empty. That is,
1009    /// it has no elements.
1010    ///
1011    /// An empty character can never match anything, including an empty string.
1012    pub fn is_empty(&self) -> bool {
1013        match *self {
1014            Class::Unicode(ref x) => x.ranges().is_empty(),
1015            Class::Bytes(ref x) => x.ranges().is_empty(),
1016        }
1017    }
1018
1019    /// If this class consists of exactly one element (whether a codepoint or a
1020    /// byte), then return it as a literal byte string.
1021    ///
1022    /// If this class is empty or contains more than one element, then `None`
1023    /// is returned.
1024    pub fn literal(&self) -> Option<Vec<u8>> {
1025        match *self {
1026            Class::Unicode(ref x) => x.literal(),
1027            Class::Bytes(ref x) => x.literal(),
1028        }
1029    }
1030}
1031
1032impl core::fmt::Debug for Class {
1033    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1034        use crate::debug::Byte;
1035
1036        let mut fmter = f.debug_set();
1037        match *self {
1038            Class::Unicode(ref cls) => {
1039                for r in cls.ranges().iter() {
1040                    fmter.entry(&(r.start..=r.end));
1041                }
1042            }
1043            Class::Bytes(ref cls) => {
1044                for r in cls.ranges().iter() {
1045                    fmter.entry(&(Byte(r.start)..=Byte(r.end)));
1046                }
1047            }
1048        }
1049        fmter.finish()
1050    }
1051}
1052
1053/// A set of characters represented by Unicode scalar values.
1054#[derive(Clone, Debug, Eq, PartialEq)]
1055pub struct ClassUnicode {
1056    set: IntervalSet<ClassUnicodeRange>,
1057}
1058
1059impl ClassUnicode {
1060    /// Create a new class from a sequence of ranges.
1061    ///
1062    /// The given ranges do not need to be in any specific order, and ranges
1063    /// may overlap. Ranges will automatically be sorted into a canonical
1064    /// non-overlapping order.
1065    pub fn new<I>(ranges: I) -> ClassUnicode
1066    where
1067        I: IntoIterator<Item = ClassUnicodeRange>,
1068    {
1069        ClassUnicode { set: IntervalSet::new(ranges) }
1070    }
1071
1072    /// Create a new class with no ranges.
1073    ///
1074    /// An empty class matches nothing. That is, it is equivalent to
1075    /// [`Hir::fail`].
1076    pub fn empty() -> ClassUnicode {
1077        ClassUnicode::new(vec![])
1078    }
1079
1080    /// Add a new range to this set.
1081    pub fn push(&mut self, range: ClassUnicodeRange) {
1082        self.set.push(range);
1083    }
1084
1085    /// Return an iterator over all ranges in this class.
1086    ///
1087    /// The iterator yields ranges in ascending order.
1088    pub fn iter(&self) -> ClassUnicodeIter<'_> {
1089        ClassUnicodeIter(self.set.iter())
1090    }
1091
1092    /// Return the underlying ranges as a slice.
1093    pub fn ranges(&self) -> &[ClassUnicodeRange] {
1094        self.set.intervals()
1095    }
1096
1097    /// Expand this character class such that it contains all case folded
1098    /// characters, according to Unicode's "simple" mapping. For example, if
1099    /// this class consists of the range `a-z`, then applying case folding will
1100    /// result in the class containing both the ranges `a-z` and `A-Z`.
1101    ///
1102    /// # Panics
1103    ///
1104    /// This routine panics when the case mapping data necessary for this
1105    /// routine to complete is unavailable. This occurs when the `unicode-case`
1106    /// feature is not enabled.
1107    ///
1108    /// Callers should prefer using `try_case_fold_simple` instead, which will
1109    /// return an error instead of panicking.
1110    pub fn case_fold_simple(&mut self) {
1111        self.set
1112            .case_fold_simple()
1113            .expect("unicode-case feature must be enabled");
1114    }
1115
1116    /// Expand this character class such that it contains all case folded
1117    /// characters, according to Unicode's "simple" mapping. For example, if
1118    /// this class consists of the range `a-z`, then applying case folding will
1119    /// result in the class containing both the ranges `a-z` and `A-Z`.
1120    ///
1121    /// # Error
1122    ///
1123    /// This routine returns an error when the case mapping data necessary
1124    /// for this routine to complete is unavailable. This occurs when the
1125    /// `unicode-case` feature is not enabled.
1126    pub fn try_case_fold_simple(
1127        &mut self,
1128    ) -> core::result::Result<(), CaseFoldError> {
1129        self.set.case_fold_simple()
1130    }
1131
1132    /// Negate this character class.
1133    ///
1134    /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
1135    /// set, then it will not be in this set after negation.
1136    pub fn negate(&mut self) {
1137        self.set.negate();
1138    }
1139
1140    /// Union this character class with the given character class, in place.
1141    pub fn union(&mut self, other: &ClassUnicode) {
1142        self.set.union(&other.set);
1143    }
1144
1145    /// Intersect this character class with the given character class, in
1146    /// place.
1147    pub fn intersect(&mut self, other: &ClassUnicode) {
1148        self.set.intersect(&other.set);
1149    }
1150
1151    /// Subtract the given character class from this character class, in place.
1152    pub fn difference(&mut self, other: &ClassUnicode) {
1153        self.set.difference(&other.set);
1154    }
1155
1156    /// Compute the symmetric difference of the given character classes, in
1157    /// place.
1158    ///
1159    /// This computes the symmetric difference of two character classes. This
1160    /// removes all elements in this class that are also in the given class,
1161    /// but all adds all elements from the given class that aren't in this
1162    /// class. That is, the class will contain all elements in either class,
1163    /// but will not contain any elements that are in both classes.
1164    pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
1165        self.set.symmetric_difference(&other.set);
1166    }
1167
1168    /// Returns true if and only if this character class will either match
1169    /// nothing or only ASCII bytes. Stated differently, this returns false
1170    /// if and only if this class contains a non-ASCII codepoint.
1171    pub fn is_ascii(&self) -> bool {
1172        self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
1173    }
1174
1175    /// Returns the length, in bytes, of the smallest string matched by this
1176    /// character class.
1177    ///
1178    /// Returns `None` when the class is empty.
1179    pub fn minimum_len(&self) -> Option<usize> {
1180        let first = self.ranges().get(0)?;
1181        // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1182        Some(first.start.len_utf8())
1183    }
1184
1185    /// Returns the length, in bytes, of the longest string matched by this
1186    /// character class.
1187    ///
1188    /// Returns `None` when the class is empty.
1189    pub fn maximum_len(&self) -> Option<usize> {
1190        let last = self.ranges().last()?;
1191        // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1192        Some(last.end.len_utf8())
1193    }
1194
1195    /// If this class consists of exactly one codepoint, then return it as
1196    /// a literal byte string.
1197    ///
1198    /// If this class is empty or contains more than one codepoint, then `None`
1199    /// is returned.
1200    pub fn literal(&self) -> Option<Vec<u8>> {
1201        let rs = self.ranges();
1202        if rs.len() == 1 && rs[0].start == rs[0].end {
1203            Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes())
1204        } else {
1205            None
1206        }
1207    }
1208
1209    /// If this class consists of only ASCII ranges, then return its
1210    /// corresponding and equivalent byte class.
1211    pub fn to_byte_class(&self) -> Option<ClassBytes> {
1212        if !self.is_ascii() {
1213            return None;
1214        }
1215        Some(ClassBytes::new(self.ranges().iter().map(|r| {
1216            // Since we are guaranteed that our codepoint range is ASCII, the
1217            // 'u8::try_from' calls below are guaranteed to be correct.
1218            ClassBytesRange {
1219                start: u8::try_from(r.start).unwrap(),
1220                end: u8::try_from(r.end).unwrap(),
1221            }
1222        })))
1223    }
1224}
1225
1226/// An iterator over all ranges in a Unicode character class.
1227///
1228/// The lifetime `'a` refers to the lifetime of the underlying class.
1229#[derive(Debug)]
1230pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
1231
1232impl<'a> Iterator for ClassUnicodeIter<'a> {
1233    type Item = &'a ClassUnicodeRange;
1234
1235    fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
1236        self.0.next()
1237    }
1238}
1239
1240/// A single range of characters represented by Unicode scalar values.
1241///
1242/// The range is closed. That is, the start and end of the range are included
1243/// in the range.
1244#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1245pub struct ClassUnicodeRange {
1246    start: char,
1247    end: char,
1248}
1249
1250impl core::fmt::Debug for ClassUnicodeRange {
1251    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1252        let start = if !self.start.is_whitespace() && !self.start.is_control()
1253        {
1254            self.start.to_string()
1255        } else {
1256            format!("0x{:X}", u32::from(self.start))
1257        };
1258        let end = if !self.end.is_whitespace() && !self.end.is_control() {
1259            self.end.to_string()
1260        } else {
1261            format!("0x{:X}", u32::from(self.end))
1262        };
1263        f.debug_struct("ClassUnicodeRange")
1264            .field("start", &start)
1265            .field("end", &end)
1266            .finish()
1267    }
1268}
1269
1270impl Interval for ClassUnicodeRange {
1271    type Bound = char;
1272
1273    #[inline]
1274    fn lower(&self) -> char {
1275        self.start
1276    }
1277    #[inline]
1278    fn upper(&self) -> char {
1279        self.end
1280    }
1281    #[inline]
1282    fn set_lower(&mut self, bound: char) {
1283        self.start = bound;
1284    }
1285    #[inline]
1286    fn set_upper(&mut self, bound: char) {
1287        self.end = bound;
1288    }
1289
1290    /// Apply simple case folding to this Unicode scalar value range.
1291    ///
1292    /// Additional ranges are appended to the given vector. Canonical ordering
1293    /// is *not* maintained in the given vector.
1294    fn case_fold_simple(
1295        &self,
1296        ranges: &mut Vec<ClassUnicodeRange>,
1297    ) -> Result<(), unicode::CaseFoldError> {
1298        let mut folder = unicode::SimpleCaseFolder::new()?;
1299        if !folder.overlaps(self.start, self.end) {
1300            return Ok(());
1301        }
1302        let (start, end) = (u32::from(self.start), u32::from(self.end));
1303        for cp in (start..=end).filter_map(char::from_u32) {
1304            for &cp_folded in folder.mapping(cp) {
1305                ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1306            }
1307        }
1308        Ok(())
1309    }
1310}
1311
1312impl ClassUnicodeRange {
1313    /// Create a new Unicode scalar value range for a character class.
1314    ///
1315    /// The returned range is always in a canonical form. That is, the range
1316    /// returned always satisfies the invariant that `start <= end`.
1317    pub fn new(start: char, end: char) -> ClassUnicodeRange {
1318        ClassUnicodeRange::create(start, end)
1319    }
1320
1321    /// Return the start of this range.
1322    ///
1323    /// The start of a range is always less than or equal to the end of the
1324    /// range.
1325    pub fn start(&self) -> char {
1326        self.start
1327    }
1328
1329    /// Return the end of this range.
1330    ///
1331    /// The end of a range is always greater than or equal to the start of the
1332    /// range.
1333    pub fn end(&self) -> char {
1334        self.end
1335    }
1336
1337    /// Returns the number of codepoints in this range.
1338    pub fn len(&self) -> usize {
1339        let diff = 1 + u32::from(self.end) - u32::from(self.start);
1340        // This is likely to panic in 16-bit targets since a usize can only fit
1341        // 2^16. It's not clear what to do here, other than to return an error
1342        // when building a Unicode class that contains a range whose length
1343        // overflows usize. (Which, to be honest, is probably quite common on
1344        // 16-bit targets. For example, this would imply that '.' and '\p{any}'
1345        // would be impossible to build.)
1346        usize::try_from(diff).expect("char class len fits in usize")
1347    }
1348}
1349
1350/// A set of characters represented by arbitrary bytes.
1351///
1352/// Each byte corresponds to one character.
1353#[derive(Clone, Debug, Eq, PartialEq)]
1354pub struct ClassBytes {
1355    set: IntervalSet<ClassBytesRange>,
1356}
1357
1358impl ClassBytes {
1359    /// Create a new class from a sequence of ranges.
1360    ///
1361    /// The given ranges do not need to be in any specific order, and ranges
1362    /// may overlap. Ranges will automatically be sorted into a canonical
1363    /// non-overlapping order.
1364    pub fn new<I>(ranges: I) -> ClassBytes
1365    where
1366        I: IntoIterator<Item = ClassBytesRange>,
1367    {
1368        ClassBytes { set: IntervalSet::new(ranges) }
1369    }
1370
1371    /// Create a new class with no ranges.
1372    ///
1373    /// An empty class matches nothing. That is, it is equivalent to
1374    /// [`Hir::fail`].
1375    pub fn empty() -> ClassBytes {
1376        ClassBytes::new(vec![])
1377    }
1378
1379    /// Add a new range to this set.
1380    pub fn push(&mut self, range: ClassBytesRange) {
1381        self.set.push(range);
1382    }
1383
1384    /// Return an iterator over all ranges in this class.
1385    ///
1386    /// The iterator yields ranges in ascending order.
1387    pub fn iter(&self) -> ClassBytesIter<'_> {
1388        ClassBytesIter(self.set.iter())
1389    }
1390
1391    /// Return the underlying ranges as a slice.
1392    pub fn ranges(&self) -> &[ClassBytesRange] {
1393        self.set.intervals()
1394    }
1395
1396    /// Expand this character class such that it contains all case folded
1397    /// characters. For example, if this class consists of the range `a-z`,
1398    /// then applying case folding will result in the class containing both the
1399    /// ranges `a-z` and `A-Z`.
1400    ///
1401    /// Note that this only applies ASCII case folding, which is limited to the
1402    /// characters `a-z` and `A-Z`.
1403    pub fn case_fold_simple(&mut self) {
1404        self.set.case_fold_simple().expect("ASCII case folding never fails");
1405    }
1406
1407    /// Negate this byte class.
1408    ///
1409    /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1410    /// will not be in this set after negation.
1411    pub fn negate(&mut self) {
1412        self.set.negate();
1413    }
1414
1415    /// Union this byte class with the given byte class, in place.
1416    pub fn union(&mut self, other: &ClassBytes) {
1417        self.set.union(&other.set);
1418    }
1419
1420    /// Intersect this byte class with the given byte class, in place.
1421    pub fn intersect(&mut self, other: &ClassBytes) {
1422        self.set.intersect(&other.set);
1423    }
1424
1425    /// Subtract the given byte class from this byte class, in place.
1426    pub fn difference(&mut self, other: &ClassBytes) {
1427        self.set.difference(&other.set);
1428    }
1429
1430    /// Compute the symmetric difference of the given byte classes, in place.
1431    ///
1432    /// This computes the symmetric difference of two byte classes. This
1433    /// removes all elements in this class that are also in the given class,
1434    /// but all adds all elements from the given class that aren't in this
1435    /// class. That is, the class will contain all elements in either class,
1436    /// but will not contain any elements that are in both classes.
1437    pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1438        self.set.symmetric_difference(&other.set);
1439    }
1440
1441    /// Returns true if and only if this character class will either match
1442    /// nothing or only ASCII bytes. Stated differently, this returns false
1443    /// if and only if this class contains a non-ASCII byte.
1444    pub fn is_ascii(&self) -> bool {
1445        self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1446    }
1447
1448    /// Returns the length, in bytes, of the smallest string matched by this
1449    /// character class.
1450    ///
1451    /// Returns `None` when the class is empty.
1452    pub fn minimum_len(&self) -> Option<usize> {
1453        if self.ranges().is_empty() {
1454            None
1455        } else {
1456            Some(1)
1457        }
1458    }
1459
1460    /// Returns the length, in bytes, of the longest string matched by this
1461    /// character class.
1462    ///
1463    /// Returns `None` when the class is empty.
1464    pub fn maximum_len(&self) -> Option<usize> {
1465        if self.ranges().is_empty() {
1466            None
1467        } else {
1468            Some(1)
1469        }
1470    }
1471
1472    /// If this class consists of exactly one byte, then return it as
1473    /// a literal byte string.
1474    ///
1475    /// If this class is empty or contains more than one byte, then `None`
1476    /// is returned.
1477    pub fn literal(&self) -> Option<Vec<u8>> {
1478        let rs = self.ranges();
1479        if rs.len() == 1 && rs[0].start == rs[0].end {
1480            Some(vec![rs[0].start])
1481        } else {
1482            None
1483        }
1484    }
1485
1486    /// If this class consists of only ASCII ranges, then return its
1487    /// corresponding and equivalent Unicode class.
1488    pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1489        if !self.is_ascii() {
1490            return None;
1491        }
1492        Some(ClassUnicode::new(self.ranges().iter().map(|r| {
1493            // Since we are guaranteed that our byte range is ASCII, the
1494            // 'char::from' calls below are correct and will not erroneously
1495            // convert a raw byte value into its corresponding codepoint.
1496            ClassUnicodeRange {
1497                start: char::from(r.start),
1498                end: char::from(r.end),
1499            }
1500        })))
1501    }
1502}
1503
1504/// An iterator over all ranges in a byte character class.
1505///
1506/// The lifetime `'a` refers to the lifetime of the underlying class.
1507#[derive(Debug)]
1508pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1509
1510impl<'a> Iterator for ClassBytesIter<'a> {
1511    type Item = &'a ClassBytesRange;
1512
1513    fn next(&mut self) -> Option<&'a ClassBytesRange> {
1514        self.0.next()
1515    }
1516}
1517
1518/// A single range of characters represented by arbitrary bytes.
1519///
1520/// The range is closed. That is, the start and end of the range are included
1521/// in the range.
1522#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1523pub struct ClassBytesRange {
1524    start: u8,
1525    end: u8,
1526}
1527
1528impl Interval for ClassBytesRange {
1529    type Bound = u8;
1530
1531    #[inline]
1532    fn lower(&self) -> u8 {
1533        self.start
1534    }
1535    #[inline]
1536    fn upper(&self) -> u8 {
1537        self.end
1538    }
1539    #[inline]
1540    fn set_lower(&mut self, bound: u8) {
1541        self.start = bound;
1542    }
1543    #[inline]
1544    fn set_upper(&mut self, bound: u8) {
1545        self.end = bound;
1546    }
1547
1548    /// Apply simple case folding to this byte range. Only ASCII case mappings
1549    /// (for a-z) are applied.
1550    ///
1551    /// Additional ranges are appended to the given vector. Canonical ordering
1552    /// is *not* maintained in the given vector.
1553    fn case_fold_simple(
1554        &self,
1555        ranges: &mut Vec<ClassBytesRange>,
1556    ) -> Result<(), unicode::CaseFoldError> {
1557        if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1558            let lower = cmp::max(self.start, b'a');
1559            let upper = cmp::min(self.end, b'z');
1560            ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1561        }
1562        if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1563            let lower = cmp::max(self.start, b'A');
1564            let upper = cmp::min(self.end, b'Z');
1565            ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1566        }
1567        Ok(())
1568    }
1569}
1570
1571impl ClassBytesRange {
1572    /// Create a new byte range for a character class.
1573    ///
1574    /// The returned range is always in a canonical form. That is, the range
1575    /// returned always satisfies the invariant that `start <= end`.
1576    pub fn new(start: u8, end: u8) -> ClassBytesRange {
1577        ClassBytesRange::create(start, end)
1578    }
1579
1580    /// Return the start of this range.
1581    ///
1582    /// The start of a range is always less than or equal to the end of the
1583    /// range.
1584    pub fn start(&self) -> u8 {
1585        self.start
1586    }
1587
1588    /// Return the end of this range.
1589    ///
1590    /// The end of a range is always greater than or equal to the start of the
1591    /// range.
1592    pub fn end(&self) -> u8 {
1593        self.end
1594    }
1595
1596    /// Returns the number of bytes in this range.
1597    pub fn len(&self) -> usize {
1598        usize::from(self.end.checked_sub(self.start).unwrap())
1599            .checked_add(1)
1600            .unwrap()
1601    }
1602}
1603
1604impl core::fmt::Debug for ClassBytesRange {
1605    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1606        f.debug_struct("ClassBytesRange")
1607            .field("start", &crate::debug::Byte(self.start))
1608            .field("end", &crate::debug::Byte(self.end))
1609            .finish()
1610    }
1611}
1612
1613/// The high-level intermediate representation for a look-around assertion.
1614///
1615/// An assertion match is always zero-length. Also called an "empty match."
1616#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1617pub enum Look {
1618    /// Match the beginning of text. Specifically, this matches at the starting
1619    /// position of the input.
1620    Start = 1 << 0,
1621    /// Match the end of text. Specifically, this matches at the ending
1622    /// position of the input.
1623    End = 1 << 1,
1624    /// Match the beginning of a line or the beginning of text. Specifically,
1625    /// this matches at the starting position of the input, or at the position
1626    /// immediately following a `\n` character.
1627    StartLF = 1 << 2,
1628    /// Match the end of a line or the end of text. Specifically, this matches
1629    /// at the end position of the input, or at the position immediately
1630    /// preceding a `\n` character.
1631    EndLF = 1 << 3,
1632    /// Match the beginning of a line or the beginning of text. Specifically,
1633    /// this matches at the starting position of the input, or at the position
1634    /// immediately following either a `\r` or `\n` character, but never after
1635    /// a `\r` when a `\n` follows.
1636    StartCRLF = 1 << 4,
1637    /// Match the end of a line or the end of text. Specifically, this matches
1638    /// at the end position of the input, or at the position immediately
1639    /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1640    /// precedes it.
1641    EndCRLF = 1 << 5,
1642    /// Match an ASCII-only word boundary. That is, this matches a position
1643    /// where the left adjacent character and right adjacent character
1644    /// correspond to a word and non-word or a non-word and word character.
1645    WordAscii = 1 << 6,
1646    /// Match an ASCII-only negation of a word boundary.
1647    WordAsciiNegate = 1 << 7,
1648    /// Match a Unicode-aware word boundary. That is, this matches a position
1649    /// where the left adjacent character and right adjacent character
1650    /// correspond to a word and non-word or a non-word and word character.
1651    WordUnicode = 1 << 8,
1652    /// Match a Unicode-aware negation of a word boundary.
1653    WordUnicodeNegate = 1 << 9,
1654    /// Match the start of an ASCII-only word boundary. That is, this matches a
1655    /// position at either the beginning of the haystack or where the previous
1656    /// character is not a word character and the following character is a word
1657    /// character.
1658    WordStartAscii = 1 << 10,
1659    /// Match the end of an ASCII-only word boundary. That is, this matches
1660    /// a position at either the end of the haystack or where the previous
1661    /// character is a word character and the following character is not a word
1662    /// character.
1663    WordEndAscii = 1 << 11,
1664    /// Match the start of a Unicode word boundary. That is, this matches a
1665    /// position at either the beginning of the haystack or where the previous
1666    /// character is not a word character and the following character is a word
1667    /// character.
1668    WordStartUnicode = 1 << 12,
1669    /// Match the end of a Unicode word boundary. That is, this matches a
1670    /// position at either the end of the haystack or where the previous
1671    /// character is a word character and the following character is not a word
1672    /// character.
1673    WordEndUnicode = 1 << 13,
1674    /// Match the start half of an ASCII-only word boundary. That is, this
1675    /// matches a position at either the beginning of the haystack or where the
1676    /// previous character is not a word character.
1677    WordStartHalfAscii = 1 << 14,
1678    /// Match the end half of an ASCII-only word boundary. That is, this
1679    /// matches a position at either the end of the haystack or where the
1680    /// following character is not a word character.
1681    WordEndHalfAscii = 1 << 15,
1682    /// Match the start half of a Unicode word boundary. That is, this matches
1683    /// a position at either the beginning of the haystack or where the
1684    /// previous character is not a word character.
1685    WordStartHalfUnicode = 1 << 16,
1686    /// Match the end half of a Unicode word boundary. That is, this matches
1687    /// a position at either the end of the haystack or where the following
1688    /// character is not a word character.
1689    WordEndHalfUnicode = 1 << 17,
1690}
1691
1692impl Look {
1693    /// Flip the look-around assertion to its equivalent for reverse searches.
1694    /// For example, `StartLF` gets translated to `EndLF`.
1695    ///
1696    /// Some assertions, such as `WordUnicode`, remain the same since they
1697    /// match the same positions regardless of the direction of the search.
1698    #[inline]
1699    pub const fn reversed(self) -> Look {
1700        match self {
1701            Look::Start => Look::End,
1702            Look::End => Look::Start,
1703            Look::StartLF => Look::EndLF,
1704            Look::EndLF => Look::StartLF,
1705            Look::StartCRLF => Look::EndCRLF,
1706            Look::EndCRLF => Look::StartCRLF,
1707            Look::WordAscii => Look::WordAscii,
1708            Look::WordAsciiNegate => Look::WordAsciiNegate,
1709            Look::WordUnicode => Look::WordUnicode,
1710            Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1711            Look::WordStartAscii => Look::WordEndAscii,
1712            Look::WordEndAscii => Look::WordStartAscii,
1713            Look::WordStartUnicode => Look::WordEndUnicode,
1714            Look::WordEndUnicode => Look::WordStartUnicode,
1715            Look::WordStartHalfAscii => Look::WordEndHalfAscii,
1716            Look::WordEndHalfAscii => Look::WordStartHalfAscii,
1717            Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
1718            Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
1719        }
1720    }
1721
1722    /// Return the underlying representation of this look-around enumeration
1723    /// as an integer. Giving the return value to the [`Look::from_repr`]
1724    /// constructor is guaranteed to return the same look-around variant that
1725    /// one started with within a semver compatible release of this crate.
1726    #[inline]
1727    pub const fn as_repr(self) -> u32 {
1728        // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1729        // actual int.
1730        self as u32
1731    }
1732
1733    /// Given the underlying representation of a `Look` value, return the
1734    /// corresponding `Look` value if the representation is valid. Otherwise
1735    /// `None` is returned.
1736    #[inline]
1737    pub const fn from_repr(repr: u32) -> Option<Look> {
1738        match repr {
1739            0b00_0000_0000_0000_0001 => Some(Look::Start),
1740            0b00_0000_0000_0000_0010 => Some(Look::End),
1741            0b00_0000_0000_0000_0100 => Some(Look::StartLF),
1742            0b00_0000_0000_0000_1000 => Some(Look::EndLF),
1743            0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
1744            0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
1745            0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
1746            0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
1747            0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
1748            0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
1749            0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
1750            0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
1751            0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
1752            0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
1753            0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
1754            0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
1755            0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
1756            0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
1757            _ => None,
1758        }
1759    }
1760
1761    /// Returns a convenient single codepoint representation of this
1762    /// look-around assertion. Each assertion is guaranteed to be represented
1763    /// by a distinct character.
1764    ///
1765    /// This is useful for succinctly representing a look-around assertion in
1766    /// human friendly but succinct output intended for a programmer working on
1767    /// regex internals.
1768    #[inline]
1769    pub const fn as_char(self) -> char {
1770        match self {
1771            Look::Start => 'A',
1772            Look::End => 'z',
1773            Look::StartLF => '^',
1774            Look::EndLF => '$',
1775            Look::StartCRLF => 'r',
1776            Look::EndCRLF => 'R',
1777            Look::WordAscii => 'b',
1778            Look::WordAsciiNegate => 'B',
1779            Look::WordUnicode => '𝛃',
1780            Look::WordUnicodeNegate => '𝚩',
1781            Look::WordStartAscii => '<',
1782            Look::WordEndAscii => '>',
1783            Look::WordStartUnicode => '〈',
1784            Look::WordEndUnicode => '〉',
1785            Look::WordStartHalfAscii => '◁',
1786            Look::WordEndHalfAscii => '▷',
1787            Look::WordStartHalfUnicode => '◀',
1788            Look::WordEndHalfUnicode => '▶',
1789        }
1790    }
1791}
1792
1793/// The high-level intermediate representation for a capturing group.
1794///
1795/// A capturing group always has an index and a child expression. It may
1796/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1797/// necessary.
1798///
1799/// Note that there is no explicit representation of a non-capturing group
1800/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1801/// the recursive structure of the `Hir` itself.
1802#[derive(Clone, Debug, Eq, PartialEq)]
1803pub struct Capture {
1804    /// The capture index of the capture.
1805    pub index: u32,
1806    /// The name of the capture, if it exists.
1807    pub name: Option<Box<str>>,
1808    /// The expression inside the capturing group, which may be empty.
1809    pub sub: Box<Hir>,
1810}
1811
1812/// The high-level intermediate representation of a repetition operator.
1813///
1814/// A repetition operator permits the repetition of an arbitrary
1815/// sub-expression.
1816#[derive(Clone, Debug, Eq, PartialEq)]
1817pub struct Repetition {
1818    /// The minimum range of the repetition.
1819    ///
1820    /// Note that special cases like `?`, `+` and `*` all get translated into
1821    /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1822    ///
1823    /// When `min` is zero, this expression can match the empty string
1824    /// regardless of what its sub-expression is.
1825    pub min: u32,
1826    /// The maximum range of the repetition.
1827    ///
1828    /// Note that when `max` is `None`, `min` acts as a lower bound but where
1829    /// there is no upper bound. For something like `x{5}` where the min and
1830    /// max are equivalent, `min` will be set to `5` and `max` will be set to
1831    /// `Some(5)`.
1832    pub max: Option<u32>,
1833    /// Whether this repetition operator is greedy or not. A greedy operator
1834    /// will match as much as it can. A non-greedy operator will match as
1835    /// little as it can.
1836    ///
1837    /// Typically, operators are greedy by default and are only non-greedy when
1838    /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1839    /// not. However, this can be inverted via the `U` "ungreedy" flag.
1840    pub greedy: bool,
1841    /// The expression being repeated.
1842    pub sub: Box<Hir>,
1843}
1844
1845impl Repetition {
1846    /// Returns a new repetition with the same `min`, `max` and `greedy`
1847    /// values, but with its sub-expression replaced with the one given.
1848    pub fn with(&self, sub: Hir) -> Repetition {
1849        Repetition {
1850            min: self.min,
1851            max: self.max,
1852            greedy: self.greedy,
1853            sub: Box::new(sub),
1854        }
1855    }
1856}
1857
1858/// A type describing the different flavors of `.`.
1859///
1860/// This type is meant to be used with [`Hir::dot`], which is a convenience
1861/// routine for building HIR values derived from the `.` regex.
1862#[non_exhaustive]
1863#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1864pub enum Dot {
1865    /// Matches the UTF-8 encoding of any Unicode scalar value.
1866    ///
1867    /// This is equivalent to `(?su:.)` and also `\p{any}`.
1868    AnyChar,
1869    /// Matches any byte value.
1870    ///
1871    /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1872    AnyByte,
1873    /// Matches the UTF-8 encoding of any Unicode scalar value except for the
1874    /// `char` given.
1875    ///
1876    /// This is equivalent to using `(?u-s:.)` with the line terminator set
1877    /// to a particular ASCII byte. (Because of peculiarities in the regex
1878    /// engines, a line terminator must be a single byte. It follows that when
1879    /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1880    /// value. That is, ti must be ASCII.)
1881    ///
1882    /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
1883    /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
1884    AnyCharExcept(char),
1885    /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1886    ///
1887    /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1888    AnyCharExceptLF,
1889    /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1890    /// and `\n`.
1891    ///
1892    /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1893    AnyCharExceptCRLF,
1894    /// Matches any byte value except for the `u8` given.
1895    ///
1896    /// This is equivalent to using `(?-us:.)` with the line terminator set
1897    /// to a particular ASCII byte. (Because of peculiarities in the regex
1898    /// engines, a line terminator must be a single byte. It follows that when
1899    /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1900    /// value. That is, ti must be ASCII.)
1901    ///
1902    /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
1903    /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
1904    AnyByteExcept(u8),
1905    /// Matches any byte value except for `\n`.
1906    ///
1907    /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1908    AnyByteExceptLF,
1909    /// Matches any byte value except for `\r` and `\n`.
1910    ///
1911    /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1912    AnyByteExceptCRLF,
1913}
1914
1915/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1916/// space but heap space proportional to the depth of the total `Hir`.
1917impl Drop for Hir {
1918    fn drop(&mut self) {
1919        use core::mem;
1920
1921        match *self.kind() {
1922            HirKind::Empty
1923            | HirKind::Literal(_)
1924            | HirKind::Class(_)
1925            | HirKind::Look(_) => return,
1926            HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
1927            HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
1928                return
1929            }
1930            HirKind::Concat(ref x) if x.is_empty() => return,
1931            HirKind::Alternation(ref x) if x.is_empty() => return,
1932            _ => {}
1933        }
1934
1935        let mut stack = vec![mem::replace(self, Hir::empty())];
1936        while let Some(mut expr) = stack.pop() {
1937            match expr.kind {
1938                HirKind::Empty
1939                | HirKind::Literal(_)
1940                | HirKind::Class(_)
1941                | HirKind::Look(_) => {}
1942                HirKind::Capture(ref mut x) => {
1943                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
1944                }
1945                HirKind::Repetition(ref mut x) => {
1946                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
1947                }
1948                HirKind::Concat(ref mut x) => {
1949                    stack.extend(x.drain(..));
1950                }
1951                HirKind::Alternation(ref mut x) => {
1952                    stack.extend(x.drain(..));
1953                }
1954            }
1955        }
1956    }
1957}
1958
1959/// A type that collects various properties of an HIR value.
1960///
1961/// Properties are always scalar values and represent meta data that is
1962/// computed inductively on an HIR value. Properties are defined for all
1963/// HIR values.
1964///
1965/// All methods on a `Properties` value take constant time and are meant to
1966/// be cheap to call.
1967#[derive(Clone, Debug, Eq, PartialEq)]
1968pub struct Properties(Box<PropertiesI>);
1969
1970/// The property definition. It is split out so that we can box it, and
1971/// there by make `Properties` use less stack size. This is kind-of important
1972/// because every HIR value has a `Properties` attached to it.
1973///
1974/// This does have the unfortunate consequence that creating any HIR value
1975/// always leads to at least one alloc for properties, but this is generally
1976/// true anyway (for pretty much all HirKinds except for look-arounds).
1977#[derive(Clone, Debug, Eq, PartialEq)]
1978struct PropertiesI {
1979    minimum_len: Option<usize>,
1980    maximum_len: Option<usize>,
1981    look_set: LookSet,
1982    look_set_prefix: LookSet,
1983    look_set_suffix: LookSet,
1984    look_set_prefix_any: LookSet,
1985    look_set_suffix_any: LookSet,
1986    utf8: bool,
1987    explicit_captures_len: usize,
1988    static_explicit_captures_len: Option<usize>,
1989    literal: bool,
1990    alternation_literal: bool,
1991}
1992
1993impl Properties {
1994    /// Returns the length (in bytes) of the smallest string matched by this
1995    /// HIR.
1996    ///
1997    /// A return value of `0` is possible and occurs when the HIR can match an
1998    /// empty string.
1999    ///
2000    /// `None` is returned when there is no minimum length. This occurs in
2001    /// precisely the cases where the HIR matches nothing. i.e., The language
2002    /// the regex matches is empty. An example of such a regex is `\P{any}`.
2003    #[inline]
2004    pub fn minimum_len(&self) -> Option<usize> {
2005        self.0.minimum_len
2006    }
2007
2008    /// Returns the length (in bytes) of the longest string matched by this
2009    /// HIR.
2010    ///
2011    /// A return value of `0` is possible and occurs when nothing longer than
2012    /// the empty string is in the language described by this HIR.
2013    ///
2014    /// `None` is returned when there is no longest matching string. This
2015    /// occurs when the HIR matches nothing or when there is no upper bound on
2016    /// the length of matching strings. Example of such regexes are `\P{any}`
2017    /// (matches nothing) and `a+` (has no upper bound).
2018    #[inline]
2019    pub fn maximum_len(&self) -> Option<usize> {
2020        self.0.maximum_len
2021    }
2022
2023    /// Returns a set of all look-around assertions that appear at least once
2024    /// in this HIR value.
2025    #[inline]
2026    pub fn look_set(&self) -> LookSet {
2027        self.0.look_set
2028    }
2029
2030    /// Returns a set of all look-around assertions that appear as a prefix for
2031    /// this HIR value. That is, the set returned corresponds to the set of
2032    /// assertions that must be passed before matching any bytes in a haystack.
2033    ///
2034    /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
2035    /// if and only if the HIR is fully anchored at the start.
2036    #[inline]
2037    pub fn look_set_prefix(&self) -> LookSet {
2038        self.0.look_set_prefix
2039    }
2040
2041    /// Returns a set of all look-around assertions that appear as a _possible_
2042    /// prefix for this HIR value. That is, the set returned corresponds to the
2043    /// set of assertions that _may_ be passed before matching any bytes in a
2044    /// haystack.
2045    ///
2046    /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
2047    /// true if and only if it's possible for the regex to match through a
2048    /// anchored assertion before consuming any input.
2049    #[inline]
2050    pub fn look_set_prefix_any(&self) -> LookSet {
2051        self.0.look_set_prefix_any
2052    }
2053
2054    /// Returns a set of all look-around assertions that appear as a suffix for
2055    /// this HIR value. That is, the set returned corresponds to the set of
2056    /// assertions that must be passed in order to be considered a match after
2057    /// all other consuming HIR expressions.
2058    ///
2059    /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
2060    /// if and only if the HIR is fully anchored at the end.
2061    #[inline]
2062    pub fn look_set_suffix(&self) -> LookSet {
2063        self.0.look_set_suffix
2064    }
2065
2066    /// Returns a set of all look-around assertions that appear as a _possible_
2067    /// suffix for this HIR value. That is, the set returned corresponds to the
2068    /// set of assertions that _may_ be passed before matching any bytes in a
2069    /// haystack.
2070    ///
2071    /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
2072    /// true if and only if it's possible for the regex to match through a
2073    /// anchored assertion at the end of a match without consuming any input.
2074    #[inline]
2075    pub fn look_set_suffix_any(&self) -> LookSet {
2076        self.0.look_set_suffix_any
2077    }
2078
2079    /// Return true if and only if the corresponding HIR will always match
2080    /// valid UTF-8.
2081    ///
2082    /// When this returns false, then it is possible for this HIR expression to
2083    /// match invalid UTF-8, including by matching between the code units of
2084    /// a single UTF-8 encoded codepoint.
2085    ///
2086    /// Note that this returns true even when the corresponding HIR can match
2087    /// the empty string. Since an empty string can technically appear between
2088    /// UTF-8 code units, it is possible for a match to be reported that splits
2089    /// a codepoint which could in turn be considered matching invalid UTF-8.
2090    /// However, it is generally assumed that such empty matches are handled
2091    /// specially by the search routine if it is absolutely required that
2092    /// matches not split a codepoint.
2093    ///
2094    /// # Example
2095    ///
2096    /// This code example shows the UTF-8 property of a variety of patterns.
2097    ///
2098    /// ```
2099    /// use regex_syntax::{ParserBuilder, parse};
2100    ///
2101    /// // Examples of 'is_utf8() == true'.
2102    /// assert!(parse(r"a")?.properties().is_utf8());
2103    /// assert!(parse(r"[^a]")?.properties().is_utf8());
2104    /// assert!(parse(r".")?.properties().is_utf8());
2105    /// assert!(parse(r"\W")?.properties().is_utf8());
2106    /// assert!(parse(r"\b")?.properties().is_utf8());
2107    /// assert!(parse(r"\B")?.properties().is_utf8());
2108    /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
2109    /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
2110    /// // Unicode mode is enabled by default, and in
2111    /// // that mode, all \x hex escapes are treated as
2112    /// // codepoints. So this actually matches the UTF-8
2113    /// // encoding of U+00FF.
2114    /// assert!(parse(r"\xFF")?.properties().is_utf8());
2115    ///
2116    /// // Now we show examples of 'is_utf8() == false'.
2117    /// // The only way to do this is to force the parser
2118    /// // to permit invalid UTF-8, otherwise all of these
2119    /// // would fail to parse!
2120    /// let parse = |pattern| {
2121    ///     ParserBuilder::new().utf8(false).build().parse(pattern)
2122    /// };
2123    /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
2124    /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
2125    /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
2126    /// // Conversely to the equivalent example above,
2127    /// // when Unicode mode is disabled, \x hex escapes
2128    /// // are treated as their raw byte values.
2129    /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
2130    /// // Note that just because we disabled UTF-8 in the
2131    /// // parser doesn't mean we still can't use Unicode.
2132    /// // It is enabled by default, so \xFF is still
2133    /// // equivalent to matching the UTF-8 encoding of
2134    /// // U+00FF by default.
2135    /// assert!(parse(r"\xFF")?.properties().is_utf8());
2136    /// // Even though we use raw bytes that individually
2137    /// // are not valid UTF-8, when combined together, the
2138    /// // overall expression *does* match valid UTF-8!
2139    /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
2140    ///
2141    /// # Ok::<(), Box<dyn std::error::Error>>(())
2142    /// ```
2143    #[inline]
2144    pub fn is_utf8(&self) -> bool {
2145        self.0.utf8
2146    }
2147
2148    /// Returns the total number of explicit capturing groups in the
2149    /// corresponding HIR.
2150    ///
2151    /// Note that this does not include the implicit capturing group
2152    /// corresponding to the entire match that is typically included by regex
2153    /// engines.
2154    ///
2155    /// # Example
2156    ///
2157    /// This method will return `0` for `a` and `1` for `(a)`:
2158    ///
2159    /// ```
2160    /// use regex_syntax::parse;
2161    ///
2162    /// assert_eq!(0, parse("a")?.properties().explicit_captures_len());
2163    /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len());
2164    ///
2165    /// # Ok::<(), Box<dyn std::error::Error>>(())
2166    /// ```
2167    #[inline]
2168    pub fn explicit_captures_len(&self) -> usize {
2169        self.0.explicit_captures_len
2170    }
2171
2172    /// Returns the total number of explicit capturing groups that appear in
2173    /// every possible match.
2174    ///
2175    /// If the number of capture groups can vary depending on the match, then
2176    /// this returns `None`. That is, a value is only returned when the number
2177    /// of matching groups is invariant or "static."
2178    ///
2179    /// Note that this does not include the implicit capturing group
2180    /// corresponding to the entire match.
2181    ///
2182    /// # Example
2183    ///
2184    /// This shows a few cases where a static number of capture groups is
2185    /// available and a few cases where it is not.
2186    ///
2187    /// ```
2188    /// use regex_syntax::parse;
2189    ///
2190    /// let len = |pattern| {
2191    ///     parse(pattern).map(|h| {
2192    ///         h.properties().static_explicit_captures_len()
2193    ///     })
2194    /// };
2195    ///
2196    /// assert_eq!(Some(0), len("a")?);
2197    /// assert_eq!(Some(1), len("(a)")?);
2198    /// assert_eq!(Some(1), len("(a)|(b)")?);
2199    /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
2200    /// assert_eq!(None, len("(a)|b")?);
2201    /// assert_eq!(None, len("a|(b)")?);
2202    /// assert_eq!(None, len("(b)*")?);
2203    /// assert_eq!(Some(1), len("(b)+")?);
2204    ///
2205    /// # Ok::<(), Box<dyn std::error::Error>>(())
2206    /// ```
2207    #[inline]
2208    pub fn static_explicit_captures_len(&self) -> Option<usize> {
2209        self.0.static_explicit_captures_len
2210    }
2211
2212    /// Return true if and only if this HIR is a simple literal. This is
2213    /// only true when this HIR expression is either itself a `Literal` or a
2214    /// concatenation of only `Literal`s.
2215    ///
2216    /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
2217    /// the empty string are not (even though they contain sub-expressions that
2218    /// are literals).
2219    #[inline]
2220    pub fn is_literal(&self) -> bool {
2221        self.0.literal
2222    }
2223
2224    /// Return true if and only if this HIR is either a simple literal or an
2225    /// alternation of simple literals. This is only
2226    /// true when this HIR expression is either itself a `Literal` or a
2227    /// concatenation of only `Literal`s or an alternation of only `Literal`s.
2228    ///
2229    /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
2230    /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
2231    /// (even though that contain sub-expressions that are literals).
2232    #[inline]
2233    pub fn is_alternation_literal(&self) -> bool {
2234        self.0.alternation_literal
2235    }
2236
2237    /// Returns the total amount of heap memory usage, in bytes, used by this
2238    /// `Properties` value.
2239    #[inline]
2240    pub fn memory_usage(&self) -> usize {
2241        core::mem::size_of::<PropertiesI>()
2242    }
2243
2244    /// Returns a new set of properties that corresponds to the union of the
2245    /// iterator of properties given.
2246    ///
2247    /// This is useful when one has multiple `Hir` expressions and wants
2248    /// to combine them into a single alternation without constructing the
2249    /// corresponding `Hir`. This routine provides a way of combining the
2250    /// properties of each `Hir` expression into one set of properties
2251    /// representing the union of those expressions.
2252    ///
2253    /// # Example: union with HIRs that never match
2254    ///
2255    /// This example shows that unioning properties together with one that
2256    /// represents a regex that never matches will "poison" certain attributes,
2257    /// like the minimum and maximum lengths.
2258    ///
2259    /// ```
2260    /// use regex_syntax::{hir::Properties, parse};
2261    ///
2262    /// let hir1 = parse("ab?c?")?;
2263    /// assert_eq!(Some(1), hir1.properties().minimum_len());
2264    /// assert_eq!(Some(3), hir1.properties().maximum_len());
2265    ///
2266    /// let hir2 = parse(r"[a&&b]")?;
2267    /// assert_eq!(None, hir2.properties().minimum_len());
2268    /// assert_eq!(None, hir2.properties().maximum_len());
2269    ///
2270    /// let hir3 = parse(r"wxy?z?")?;
2271    /// assert_eq!(Some(2), hir3.properties().minimum_len());
2272    /// assert_eq!(Some(4), hir3.properties().maximum_len());
2273    ///
2274    /// let unioned = Properties::union([
2275    ///		hir1.properties(),
2276    ///		hir2.properties(),
2277    ///		hir3.properties(),
2278    ///	]);
2279    /// assert_eq!(None, unioned.minimum_len());
2280    /// assert_eq!(None, unioned.maximum_len());
2281    ///
2282    /// # Ok::<(), Box<dyn std::error::Error>>(())
2283    /// ```
2284    ///
2285    /// The maximum length can also be "poisoned" by a pattern that has no
2286    /// upper bound on the length of a match. The minimum length remains
2287    /// unaffected:
2288    ///
2289    /// ```
2290    /// use regex_syntax::{hir::Properties, parse};
2291    ///
2292    /// let hir1 = parse("ab?c?")?;
2293    /// assert_eq!(Some(1), hir1.properties().minimum_len());
2294    /// assert_eq!(Some(3), hir1.properties().maximum_len());
2295    ///
2296    /// let hir2 = parse(r"a+")?;
2297    /// assert_eq!(Some(1), hir2.properties().minimum_len());
2298    /// assert_eq!(None, hir2.properties().maximum_len());
2299    ///
2300    /// let hir3 = parse(r"wxy?z?")?;
2301    /// assert_eq!(Some(2), hir3.properties().minimum_len());
2302    /// assert_eq!(Some(4), hir3.properties().maximum_len());
2303    ///
2304    /// let unioned = Properties::union([
2305    ///		hir1.properties(),
2306    ///		hir2.properties(),
2307    ///		hir3.properties(),
2308    ///	]);
2309    /// assert_eq!(Some(1), unioned.minimum_len());
2310    /// assert_eq!(None, unioned.maximum_len());
2311    ///
2312    /// # Ok::<(), Box<dyn std::error::Error>>(())
2313    /// ```
2314    pub fn union<I, P>(props: I) -> Properties
2315    where
2316        I: IntoIterator<Item = P>,
2317        P: core::borrow::Borrow<Properties>,
2318    {
2319        let mut it = props.into_iter().peekable();
2320        // While empty alternations aren't possible, we still behave as if they
2321        // are. When we have an empty alternate, then clearly the look-around
2322        // prefix and suffix is empty. Otherwise, it is the intersection of all
2323        // prefixes and suffixes (respectively) of the branches.
2324        let fix = if it.peek().is_none() {
2325            LookSet::empty()
2326        } else {
2327            LookSet::full()
2328        };
2329        // And also, an empty alternate means we have 0 static capture groups,
2330        // but we otherwise start with the number corresponding to the first
2331        // alternate. If any subsequent alternate has a different number of
2332        // static capture groups, then we overall have a variation and not a
2333        // static number of groups.
2334        let static_explicit_captures_len =
2335            it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
2336        // The base case is an empty alternation, which matches nothing.
2337        // Note though that empty alternations aren't possible, because the
2338        // Hir::alternation smart constructor rewrites those as empty character
2339        // classes.
2340        let mut props = PropertiesI {
2341            minimum_len: None,
2342            maximum_len: None,
2343            look_set: LookSet::empty(),
2344            look_set_prefix: fix,
2345            look_set_suffix: fix,
2346            look_set_prefix_any: LookSet::empty(),
2347            look_set_suffix_any: LookSet::empty(),
2348            utf8: true,
2349            explicit_captures_len: 0,
2350            static_explicit_captures_len,
2351            literal: false,
2352            alternation_literal: true,
2353        };
2354        let (mut min_poisoned, mut max_poisoned) = (false, false);
2355        // Handle properties that need to visit every child hir.
2356        for prop in it {
2357            let p = prop.borrow();
2358            props.look_set.set_union(p.look_set());
2359            props.look_set_prefix.set_intersect(p.look_set_prefix());
2360            props.look_set_suffix.set_intersect(p.look_set_suffix());
2361            props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2362            props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2363            props.utf8 = props.utf8 && p.is_utf8();
2364            props.explicit_captures_len = props
2365                .explicit_captures_len
2366                .saturating_add(p.explicit_captures_len());
2367            if props.static_explicit_captures_len
2368                != p.static_explicit_captures_len()
2369            {
2370                props.static_explicit_captures_len = None;
2371            }
2372            props.alternation_literal =
2373                props.alternation_literal && p.is_literal();
2374            if !min_poisoned {
2375                if let Some(xmin) = p.minimum_len() {
2376                    if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
2377                        props.minimum_len = Some(xmin);
2378                    }
2379                } else {
2380                    props.minimum_len = None;
2381                    min_poisoned = true;
2382                }
2383            }
2384            if !max_poisoned {
2385                if let Some(xmax) = p.maximum_len() {
2386                    if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
2387                        props.maximum_len = Some(xmax);
2388                    }
2389                } else {
2390                    props.maximum_len = None;
2391                    max_poisoned = true;
2392                }
2393            }
2394        }
2395        Properties(Box::new(props))
2396    }
2397}
2398
2399impl Properties {
2400    /// Create a new set of HIR properties for an empty regex.
2401    fn empty() -> Properties {
2402        let inner = PropertiesI {
2403            minimum_len: Some(0),
2404            maximum_len: Some(0),
2405            look_set: LookSet::empty(),
2406            look_set_prefix: LookSet::empty(),
2407            look_set_suffix: LookSet::empty(),
2408            look_set_prefix_any: LookSet::empty(),
2409            look_set_suffix_any: LookSet::empty(),
2410            // It is debatable whether an empty regex always matches at valid
2411            // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
2412            // it is clearly false. There are, for example, many empty strings
2413            // between the bytes encoding a '☃'.
2414            //
2415            // However, when Unicode mode is enabled, the fundamental atom
2416            // of matching is really a codepoint. And in that scenario, an
2417            // empty regex is defined to only match at valid UTF-8 boundaries
2418            // and to never split a codepoint. It just so happens that this
2419            // enforcement is somewhat tricky to do for regexes that match
2420            // the empty string inside regex engines themselves. It usually
2421            // requires some layer above the regex engine to filter out such
2422            // matches.
2423            //
2424            // In any case, 'true' is really the only coherent option. If it
2425            // were false, for example, then 'a*' would also need to be false
2426            // since it too can match the empty string.
2427            utf8: true,
2428            explicit_captures_len: 0,
2429            static_explicit_captures_len: Some(0),
2430            literal: false,
2431            alternation_literal: false,
2432        };
2433        Properties(Box::new(inner))
2434    }
2435
2436    /// Create a new set of HIR properties for a literal regex.
2437    fn literal(lit: &Literal) -> Properties {
2438        let inner = PropertiesI {
2439            minimum_len: Some(lit.0.len()),
2440            maximum_len: Some(lit.0.len()),
2441            look_set: LookSet::empty(),
2442            look_set_prefix: LookSet::empty(),
2443            look_set_suffix: LookSet::empty(),
2444            look_set_prefix_any: LookSet::empty(),
2445            look_set_suffix_any: LookSet::empty(),
2446            utf8: core::str::from_utf8(&lit.0).is_ok(),
2447            explicit_captures_len: 0,
2448            static_explicit_captures_len: Some(0),
2449            literal: true,
2450            alternation_literal: true,
2451        };
2452        Properties(Box::new(inner))
2453    }
2454
2455    /// Create a new set of HIR properties for a character class.
2456    fn class(class: &Class) -> Properties {
2457        let inner = PropertiesI {
2458            minimum_len: class.minimum_len(),
2459            maximum_len: class.maximum_len(),
2460            look_set: LookSet::empty(),
2461            look_set_prefix: LookSet::empty(),
2462            look_set_suffix: LookSet::empty(),
2463            look_set_prefix_any: LookSet::empty(),
2464            look_set_suffix_any: LookSet::empty(),
2465            utf8: class.is_utf8(),
2466            explicit_captures_len: 0,
2467            static_explicit_captures_len: Some(0),
2468            literal: false,
2469            alternation_literal: false,
2470        };
2471        Properties(Box::new(inner))
2472    }
2473
2474    /// Create a new set of HIR properties for a look-around assertion.
2475    fn look(look: Look) -> Properties {
2476        let inner = PropertiesI {
2477            minimum_len: Some(0),
2478            maximum_len: Some(0),
2479            look_set: LookSet::singleton(look),
2480            look_set_prefix: LookSet::singleton(look),
2481            look_set_suffix: LookSet::singleton(look),
2482            look_set_prefix_any: LookSet::singleton(look),
2483            look_set_suffix_any: LookSet::singleton(look),
2484            // This requires a little explanation. Basically, we don't consider
2485            // matching an empty string to be equivalent to matching invalid
2486            // UTF-8, even though technically matching every empty string will
2487            // split the UTF-8 encoding of a single codepoint when treating a
2488            // UTF-8 encoded string as a sequence of bytes. Our defense here is
2489            // that in such a case, a codepoint should logically be treated as
2490            // the fundamental atom for matching, and thus the only valid match
2491            // points are between codepoints and not bytes.
2492            //
2493            // More practically, this is true here because it's also true
2494            // for 'Hir::empty()', otherwise something like 'a*' would be
2495            // considered to match invalid UTF-8. That in turn makes this
2496            // property borderline useless.
2497            utf8: true,
2498            explicit_captures_len: 0,
2499            static_explicit_captures_len: Some(0),
2500            literal: false,
2501            alternation_literal: false,
2502        };
2503        Properties(Box::new(inner))
2504    }
2505
2506    /// Create a new set of HIR properties for a repetition.
2507    fn repetition(rep: &Repetition) -> Properties {
2508        let p = rep.sub.properties();
2509        let minimum_len = p.minimum_len().map(|child_min| {
2510            let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
2511            child_min.saturating_mul(rep_min)
2512        });
2513        let maximum_len = rep.max.and_then(|rep_max| {
2514            let rep_max = usize::try_from(rep_max).ok()?;
2515            let child_max = p.maximum_len()?;
2516            child_max.checked_mul(rep_max)
2517        });
2518
2519        let mut inner = PropertiesI {
2520            minimum_len,
2521            maximum_len,
2522            look_set: p.look_set(),
2523            look_set_prefix: LookSet::empty(),
2524            look_set_suffix: LookSet::empty(),
2525            look_set_prefix_any: p.look_set_prefix_any(),
2526            look_set_suffix_any: p.look_set_suffix_any(),
2527            utf8: p.is_utf8(),
2528            explicit_captures_len: p.explicit_captures_len(),
2529            static_explicit_captures_len: p.static_explicit_captures_len(),
2530            literal: false,
2531            alternation_literal: false,
2532        };
2533        // If the repetition operator can match the empty string, then its
2534        // lookset prefix and suffixes themselves remain empty since they are
2535        // no longer required to match.
2536        if rep.min > 0 {
2537            inner.look_set_prefix = p.look_set_prefix();
2538            inner.look_set_suffix = p.look_set_suffix();
2539        }
2540        // If the static captures len of the sub-expression is not known or
2541        // is greater than zero, then it automatically propagates to the
2542        // repetition, regardless of the repetition. Otherwise, it might
2543        // change, but only when the repetition can match 0 times.
2544        if rep.min == 0
2545            && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
2546        {
2547            // If we require a match 0 times, then our captures len is
2548            // guaranteed to be zero. Otherwise, if we *can* match the empty
2549            // string, then it's impossible to know how many captures will be
2550            // in the resulting match.
2551            if rep.max == Some(0) {
2552                inner.static_explicit_captures_len = Some(0);
2553            } else {
2554                inner.static_explicit_captures_len = None;
2555            }
2556        }
2557        Properties(Box::new(inner))
2558    }
2559
2560    /// Create a new set of HIR properties for a capture.
2561    fn capture(capture: &Capture) -> Properties {
2562        let p = capture.sub.properties();
2563        Properties(Box::new(PropertiesI {
2564            explicit_captures_len: p.explicit_captures_len().saturating_add(1),
2565            static_explicit_captures_len: p
2566                .static_explicit_captures_len()
2567                .map(|len| len.saturating_add(1)),
2568            literal: false,
2569            alternation_literal: false,
2570            ..*p.0.clone()
2571        }))
2572    }
2573
2574    /// Create a new set of HIR properties for a concatenation.
2575    fn concat(concat: &[Hir]) -> Properties {
2576        // The base case is an empty concatenation, which matches the empty
2577        // string. Note though that empty concatenations aren't possible,
2578        // because the Hir::concat smart constructor rewrites those as
2579        // Hir::empty.
2580        let mut props = PropertiesI {
2581            minimum_len: Some(0),
2582            maximum_len: Some(0),
2583            look_set: LookSet::empty(),
2584            look_set_prefix: LookSet::empty(),
2585            look_set_suffix: LookSet::empty(),
2586            look_set_prefix_any: LookSet::empty(),
2587            look_set_suffix_any: LookSet::empty(),
2588            utf8: true,
2589            explicit_captures_len: 0,
2590            static_explicit_captures_len: Some(0),
2591            literal: true,
2592            alternation_literal: true,
2593        };
2594        // Handle properties that need to visit every child hir.
2595        for x in concat.iter() {
2596            let p = x.properties();
2597            props.look_set.set_union(p.look_set());
2598            props.utf8 = props.utf8 && p.is_utf8();
2599            props.explicit_captures_len = props
2600                .explicit_captures_len
2601                .saturating_add(p.explicit_captures_len());
2602            props.static_explicit_captures_len = p
2603                .static_explicit_captures_len()
2604                .and_then(|len1| {
2605                    Some((len1, props.static_explicit_captures_len?))
2606                })
2607                .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
2608            props.literal = props.literal && p.is_literal();
2609            props.alternation_literal =
2610                props.alternation_literal && p.is_alternation_literal();
2611            if let Some(minimum_len) = props.minimum_len {
2612                match p.minimum_len() {
2613                    None => props.minimum_len = None,
2614                    Some(len) => {
2615                        // We use saturating arithmetic here because the
2616                        // minimum is just a lower bound. We can't go any
2617                        // higher than what our number types permit.
2618                        props.minimum_len =
2619                            Some(minimum_len.saturating_add(len));
2620                    }
2621                }
2622            }
2623            if let Some(maximum_len) = props.maximum_len {
2624                match p.maximum_len() {
2625                    None => props.maximum_len = None,
2626                    Some(len) => {
2627                        props.maximum_len = maximum_len.checked_add(len)
2628                    }
2629                }
2630            }
2631        }
2632        // Handle the prefix properties, which only requires visiting
2633        // child exprs until one matches more than the empty string.
2634        let mut it = concat.iter();
2635        while let Some(x) = it.next() {
2636            props.look_set_prefix.set_union(x.properties().look_set_prefix());
2637            props
2638                .look_set_prefix_any
2639                .set_union(x.properties().look_set_prefix_any());
2640            if x.properties().maximum_len().map_or(true, |x| x > 0) {
2641                break;
2642            }
2643        }
2644        // Same thing for the suffix properties, but in reverse.
2645        let mut it = concat.iter().rev();
2646        while let Some(x) = it.next() {
2647            props.look_set_suffix.set_union(x.properties().look_set_suffix());
2648            props
2649                .look_set_suffix_any
2650                .set_union(x.properties().look_set_suffix_any());
2651            if x.properties().maximum_len().map_or(true, |x| x > 0) {
2652                break;
2653            }
2654        }
2655        Properties(Box::new(props))
2656    }
2657
2658    /// Create a new set of HIR properties for a concatenation.
2659    fn alternation(alts: &[Hir]) -> Properties {
2660        Properties::union(alts.iter().map(|hir| hir.properties()))
2661    }
2662}
2663
2664/// A set of look-around assertions.
2665///
2666/// This is useful for efficiently tracking look-around assertions. For
2667/// example, an [`Hir`] provides properties that return `LookSet`s.
2668#[derive(Clone, Copy, Default, Eq, PartialEq)]
2669pub struct LookSet {
2670    /// The underlying representation this set is exposed to make it possible
2671    /// to store it somewhere efficiently. The representation is that
2672    /// of a bitset, where each assertion occupies bit `i` where `i =
2673    /// Look::as_repr()`.
2674    ///
2675    /// Note that users of this internal representation must permit the full
2676    /// range of `u16` values to be represented. For example, even if the
2677    /// current implementation only makes use of the 10 least significant bits,
2678    /// it may use more bits in a future semver compatible release.
2679    pub bits: u32,
2680}
2681
2682impl LookSet {
2683    /// Create an empty set of look-around assertions.
2684    #[inline]
2685    pub fn empty() -> LookSet {
2686        LookSet { bits: 0 }
2687    }
2688
2689    /// Create a full set of look-around assertions.
2690    ///
2691    /// This set contains all possible look-around assertions.
2692    #[inline]
2693    pub fn full() -> LookSet {
2694        LookSet { bits: !0 }
2695    }
2696
2697    /// Create a look-around set containing the look-around assertion given.
2698    ///
2699    /// This is a convenience routine for creating an empty set and inserting
2700    /// one look-around assertions.
2701    #[inline]
2702    pub fn singleton(look: Look) -> LookSet {
2703        LookSet::empty().insert(look)
2704    }
2705
2706    /// Returns the total number of look-around assertions in this set.
2707    #[inline]
2708    pub fn len(self) -> usize {
2709        // OK because max value always fits in a u8, which in turn always
2710        // fits in a usize, regardless of target.
2711        usize::try_from(self.bits.count_ones()).unwrap()
2712    }
2713
2714    /// Returns true if and only if this set is empty.
2715    #[inline]
2716    pub fn is_empty(self) -> bool {
2717        self.len() == 0
2718    }
2719
2720    /// Returns true if and only if the given look-around assertion is in this
2721    /// set.
2722    #[inline]
2723    pub fn contains(self, look: Look) -> bool {
2724        self.bits & look.as_repr() != 0
2725    }
2726
2727    /// Returns true if and only if this set contains any anchor assertions.
2728    /// This includes both "start/end of haystack" and "start/end of line."
2729    #[inline]
2730    pub fn contains_anchor(&self) -> bool {
2731        self.contains_anchor_haystack() || self.contains_anchor_line()
2732    }
2733
2734    /// Returns true if and only if this set contains any "start/end of
2735    /// haystack" anchors. This doesn't include "start/end of line" anchors.
2736    #[inline]
2737    pub fn contains_anchor_haystack(&self) -> bool {
2738        self.contains(Look::Start) || self.contains(Look::End)
2739    }
2740
2741    /// Returns true if and only if this set contains any "start/end of line"
2742    /// anchors. This doesn't include "start/end of haystack" anchors. This
2743    /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
2744    #[inline]
2745    pub fn contains_anchor_line(&self) -> bool {
2746        self.contains(Look::StartLF)
2747            || self.contains(Look::EndLF)
2748            || self.contains(Look::StartCRLF)
2749            || self.contains(Look::EndCRLF)
2750    }
2751
2752    /// Returns true if and only if this set contains any "start/end of line"
2753    /// anchors that only treat `\n` as line terminators. This does not include
2754    /// haystack anchors or CRLF aware line anchors.
2755    #[inline]
2756    pub fn contains_anchor_lf(&self) -> bool {
2757        self.contains(Look::StartLF) || self.contains(Look::EndLF)
2758    }
2759
2760    /// Returns true if and only if this set contains any "start/end of line"
2761    /// anchors that are CRLF-aware. This doesn't include "start/end of
2762    /// haystack" or "start/end of line-feed" anchors.
2763    #[inline]
2764    pub fn contains_anchor_crlf(&self) -> bool {
2765        self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
2766    }
2767
2768    /// Returns true if and only if this set contains any word boundary or
2769    /// negated word boundary assertions. This include both Unicode and ASCII
2770    /// word boundaries.
2771    #[inline]
2772    pub fn contains_word(self) -> bool {
2773        self.contains_word_unicode() || self.contains_word_ascii()
2774    }
2775
2776    /// Returns true if and only if this set contains any Unicode word boundary
2777    /// or negated Unicode word boundary assertions.
2778    #[inline]
2779    pub fn contains_word_unicode(self) -> bool {
2780        self.contains(Look::WordUnicode)
2781            || self.contains(Look::WordUnicodeNegate)
2782            || self.contains(Look::WordStartUnicode)
2783            || self.contains(Look::WordEndUnicode)
2784            || self.contains(Look::WordStartHalfUnicode)
2785            || self.contains(Look::WordEndHalfUnicode)
2786    }
2787
2788    /// Returns true if and only if this set contains any ASCII word boundary
2789    /// or negated ASCII word boundary assertions.
2790    #[inline]
2791    pub fn contains_word_ascii(self) -> bool {
2792        self.contains(Look::WordAscii)
2793            || self.contains(Look::WordAsciiNegate)
2794            || self.contains(Look::WordStartAscii)
2795            || self.contains(Look::WordEndAscii)
2796            || self.contains(Look::WordStartHalfAscii)
2797            || self.contains(Look::WordEndHalfAscii)
2798    }
2799
2800    /// Returns an iterator over all of the look-around assertions in this set.
2801    #[inline]
2802    pub fn iter(self) -> LookSetIter {
2803        LookSetIter { set: self }
2804    }
2805
2806    /// Return a new set that is equivalent to the original, but with the given
2807    /// assertion added to it. If the assertion is already in the set, then the
2808    /// returned set is equivalent to the original.
2809    #[inline]
2810    pub fn insert(self, look: Look) -> LookSet {
2811        LookSet { bits: self.bits | look.as_repr() }
2812    }
2813
2814    /// Updates this set in place with the result of inserting the given
2815    /// assertion into this set.
2816    #[inline]
2817    pub fn set_insert(&mut self, look: Look) {
2818        *self = self.insert(look);
2819    }
2820
2821    /// Return a new set that is equivalent to the original, but with the given
2822    /// assertion removed from it. If the assertion is not in the set, then the
2823    /// returned set is equivalent to the original.
2824    #[inline]
2825    pub fn remove(self, look: Look) -> LookSet {
2826        LookSet { bits: self.bits & !look.as_repr() }
2827    }
2828
2829    /// Updates this set in place with the result of removing the given
2830    /// assertion from this set.
2831    #[inline]
2832    pub fn set_remove(&mut self, look: Look) {
2833        *self = self.remove(look);
2834    }
2835
2836    /// Returns a new set that is the result of subtracting the given set from
2837    /// this set.
2838    #[inline]
2839    pub fn subtract(self, other: LookSet) -> LookSet {
2840        LookSet { bits: self.bits & !other.bits }
2841    }
2842
2843    /// Updates this set in place with the result of subtracting the given set
2844    /// from this set.
2845    #[inline]
2846    pub fn set_subtract(&mut self, other: LookSet) {
2847        *self = self.subtract(other);
2848    }
2849
2850    /// Returns a new set that is the union of this and the one given.
2851    #[inline]
2852    pub fn union(self, other: LookSet) -> LookSet {
2853        LookSet { bits: self.bits | other.bits }
2854    }
2855
2856    /// Updates this set in place with the result of unioning it with the one
2857    /// given.
2858    #[inline]
2859    pub fn set_union(&mut self, other: LookSet) {
2860        *self = self.union(other);
2861    }
2862
2863    /// Returns a new set that is the intersection of this and the one given.
2864    #[inline]
2865    pub fn intersect(self, other: LookSet) -> LookSet {
2866        LookSet { bits: self.bits & other.bits }
2867    }
2868
2869    /// Updates this set in place with the result of intersecting it with the
2870    /// one given.
2871    #[inline]
2872    pub fn set_intersect(&mut self, other: LookSet) {
2873        *self = self.intersect(other);
2874    }
2875
2876    /// Return a `LookSet` from the slice given as a native endian 32-bit
2877    /// integer.
2878    ///
2879    /// # Panics
2880    ///
2881    /// This panics if `slice.len() < 4`.
2882    #[inline]
2883    pub fn read_repr(slice: &[u8]) -> LookSet {
2884        let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
2885        LookSet { bits }
2886    }
2887
2888    /// Write a `LookSet` as a native endian 32-bit integer to the beginning
2889    /// of the slice given.
2890    ///
2891    /// # Panics
2892    ///
2893    /// This panics if `slice.len() < 4`.
2894    #[inline]
2895    pub fn write_repr(self, slice: &mut [u8]) {
2896        let raw = self.bits.to_ne_bytes();
2897        slice[0] = raw[0];
2898        slice[1] = raw[1];
2899        slice[2] = raw[2];
2900        slice[3] = raw[3];
2901    }
2902}
2903
2904impl core::fmt::Debug for LookSet {
2905    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2906        if self.is_empty() {
2907            return write!(f, "∅");
2908        }
2909        for look in self.iter() {
2910            write!(f, "{}", look.as_char())?;
2911        }
2912        Ok(())
2913    }
2914}
2915
2916/// An iterator over all look-around assertions in a [`LookSet`].
2917///
2918/// This iterator is created by [`LookSet::iter`].
2919#[derive(Clone, Debug)]
2920pub struct LookSetIter {
2921    set: LookSet,
2922}
2923
2924impl Iterator for LookSetIter {
2925    type Item = Look;
2926
2927    #[inline]
2928    fn next(&mut self) -> Option<Look> {
2929        if self.set.is_empty() {
2930            return None;
2931        }
2932        // We'll never have more than u8::MAX distinct look-around assertions,
2933        // so 'bit' will always fit into a u16.
2934        let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2935        let look = Look::from_repr(1 << bit)?;
2936        self.set = self.set.remove(look);
2937        Some(look)
2938    }
2939}
2940
2941/// Given a sequence of HIR values where each value corresponds to a Unicode
2942/// class (or an all-ASCII byte class), return a single Unicode class
2943/// corresponding to the union of the classes found.
2944fn class_chars(hirs: &[Hir]) -> Option<Class> {
2945    let mut cls = ClassUnicode::new(vec![]);
2946    for hir in hirs.iter() {
2947        match *hir.kind() {
2948            HirKind::Class(Class::Unicode(ref cls2)) => {
2949                cls.union(cls2);
2950            }
2951            HirKind::Class(Class::Bytes(ref cls2)) => {
2952                cls.union(&cls2.to_unicode_class()?);
2953            }
2954            _ => return None,
2955        };
2956    }
2957    Some(Class::Unicode(cls))
2958}
2959
2960/// Given a sequence of HIR values where each value corresponds to a byte class
2961/// (or an all-ASCII Unicode class), return a single byte class corresponding
2962/// to the union of the classes found.
2963fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2964    let mut cls = ClassBytes::new(vec![]);
2965    for hir in hirs.iter() {
2966        match *hir.kind() {
2967            HirKind::Class(Class::Unicode(ref cls2)) => {
2968                cls.union(&cls2.to_byte_class()?);
2969            }
2970            HirKind::Class(Class::Bytes(ref cls2)) => {
2971                cls.union(cls2);
2972            }
2973            _ => return None,
2974        };
2975    }
2976    Some(Class::Bytes(cls))
2977}
2978
2979/// Given a sequence of HIR values where each value corresponds to a literal
2980/// that is a single `char`, return that sequence of `char`s. Otherwise return
2981/// None. No deduplication is done.
2982fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
2983    let mut singletons = vec![];
2984    for hir in hirs.iter() {
2985        let literal = match *hir.kind() {
2986            HirKind::Literal(Literal(ref bytes)) => bytes,
2987            _ => return None,
2988        };
2989        let ch = match crate::debug::utf8_decode(literal) {
2990            None => return None,
2991            Some(Err(_)) => return None,
2992            Some(Ok(ch)) => ch,
2993        };
2994        if literal.len() != ch.len_utf8() {
2995            return None;
2996        }
2997        singletons.push(ch);
2998    }
2999    Some(singletons)
3000}
3001
3002/// Given a sequence of HIR values where each value corresponds to a literal
3003/// that is a single byte, return that sequence of bytes. Otherwise return
3004/// None. No deduplication is done.
3005fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
3006    let mut singletons = vec![];
3007    for hir in hirs.iter() {
3008        let literal = match *hir.kind() {
3009            HirKind::Literal(Literal(ref bytes)) => bytes,
3010            _ => return None,
3011        };
3012        if literal.len() != 1 {
3013            return None;
3014        }
3015        singletons.push(literal[0]);
3016    }
3017    Some(singletons)
3018}
3019
3020/// Looks for a common prefix in the list of alternation branches given. If one
3021/// is found, then an equivalent but (hopefully) simplified Hir is returned.
3022/// Otherwise, the original given list of branches is returned unmodified.
3023///
3024/// This is not quite as good as it could be. Right now, it requires that
3025/// all branches are 'Concat' expressions. It also doesn't do well with
3026/// literals. For example, given 'foofoo|foobar', it will not refactor it to
3027/// 'foo(?:foo|bar)' because literals are flattened into their own special
3028/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
3029/// instead of a string of bytes because of this. Otherwise, handling the
3030/// current representation in this routine will be pretty gnarly. Sigh.)
3031fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
3032    if hirs.len() <= 1 {
3033        return Err(hirs);
3034    }
3035    let mut prefix = match hirs[0].kind() {
3036        HirKind::Concat(ref xs) => &**xs,
3037        _ => return Err(hirs),
3038    };
3039    if prefix.is_empty() {
3040        return Err(hirs);
3041    }
3042    for h in hirs.iter().skip(1) {
3043        let concat = match h.kind() {
3044            HirKind::Concat(ref xs) => xs,
3045            _ => return Err(hirs),
3046        };
3047        let common_len = prefix
3048            .iter()
3049            .zip(concat.iter())
3050            .take_while(|(x, y)| x == y)
3051            .count();
3052        prefix = &prefix[..common_len];
3053        if prefix.is_empty() {
3054            return Err(hirs);
3055        }
3056    }
3057    let len = prefix.len();
3058    assert_ne!(0, len);
3059    let mut prefix_concat = vec![];
3060    let mut suffix_alts = vec![];
3061    for h in hirs {
3062        let mut concat = match h.into_kind() {
3063            HirKind::Concat(xs) => xs,
3064            // We required all sub-expressions to be
3065            // concats above, so we're only here if we
3066            // have a concat.
3067            _ => unreachable!(),
3068        };
3069        suffix_alts.push(Hir::concat(concat.split_off(len)));
3070        if prefix_concat.is_empty() {
3071            prefix_concat = concat;
3072        }
3073    }
3074    let mut concat = prefix_concat;
3075    concat.push(Hir::alternation(suffix_alts));
3076    Ok(Hir::concat(concat))
3077}
3078
3079#[cfg(test)]
3080mod tests {
3081    use super::*;
3082
3083    fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
3084        let ranges: Vec<ClassUnicodeRange> = ranges
3085            .iter()
3086            .map(|&(s, e)| ClassUnicodeRange::new(s, e))
3087            .collect();
3088        ClassUnicode::new(ranges)
3089    }
3090
3091    fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
3092        let ranges: Vec<ClassBytesRange> =
3093            ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
3094        ClassBytes::new(ranges)
3095    }
3096
3097    fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
3098        cls.iter().map(|x| (x.start(), x.end())).collect()
3099    }
3100
3101    #[cfg(feature = "unicode-case")]
3102    fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
3103        let mut cls_ = cls.clone();
3104        cls_.case_fold_simple();
3105        cls_
3106    }
3107
3108    fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3109        let mut cls_ = cls1.clone();
3110        cls_.union(cls2);
3111        cls_
3112    }
3113
3114    fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3115        let mut cls_ = cls1.clone();
3116        cls_.intersect(cls2);
3117        cls_
3118    }
3119
3120    fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3121        let mut cls_ = cls1.clone();
3122        cls_.difference(cls2);
3123        cls_
3124    }
3125
3126    fn usymdifference(
3127        cls1: &ClassUnicode,
3128        cls2: &ClassUnicode,
3129    ) -> ClassUnicode {
3130        let mut cls_ = cls1.clone();
3131        cls_.symmetric_difference(cls2);
3132        cls_
3133    }
3134
3135    fn unegate(cls: &ClassUnicode) -> ClassUnicode {
3136        let mut cls_ = cls.clone();
3137        cls_.negate();
3138        cls_
3139    }
3140
3141    fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
3142        cls.iter().map(|x| (x.start(), x.end())).collect()
3143    }
3144
3145    fn bcasefold(cls: &ClassBytes) -> ClassBytes {
3146        let mut cls_ = cls.clone();
3147        cls_.case_fold_simple();
3148        cls_
3149    }
3150
3151    fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3152        let mut cls_ = cls1.clone();
3153        cls_.union(cls2);
3154        cls_
3155    }
3156
3157    fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3158        let mut cls_ = cls1.clone();
3159        cls_.intersect(cls2);
3160        cls_
3161    }
3162
3163    fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3164        let mut cls_ = cls1.clone();
3165        cls_.difference(cls2);
3166        cls_
3167    }
3168
3169    fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3170        let mut cls_ = cls1.clone();
3171        cls_.symmetric_difference(cls2);
3172        cls_
3173    }
3174
3175    fn bnegate(cls: &ClassBytes) -> ClassBytes {
3176        let mut cls_ = cls.clone();
3177        cls_.negate();
3178        cls_
3179    }
3180
3181    #[test]
3182    fn class_range_canonical_unicode() {
3183        let range = ClassUnicodeRange::new('\u{00FF}', '\0');
3184        assert_eq!('\0', range.start());
3185        assert_eq!('\u{00FF}', range.end());
3186    }
3187
3188    #[test]
3189    fn class_range_canonical_bytes() {
3190        let range = ClassBytesRange::new(b'\xFF', b'\0');
3191        assert_eq!(b'\0', range.start());
3192        assert_eq!(b'\xFF', range.end());
3193    }
3194
3195    #[test]
3196    fn class_canonicalize_unicode() {
3197        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3198        let expected = vec![('a', 'c'), ('x', 'z')];
3199        assert_eq!(expected, uranges(&cls));
3200
3201        let cls = uclass(&[('x', 'z'), ('a', 'c')]);
3202        let expected = vec![('a', 'c'), ('x', 'z')];
3203        assert_eq!(expected, uranges(&cls));
3204
3205        let cls = uclass(&[('x', 'z'), ('w', 'y')]);
3206        let expected = vec![('w', 'z')];
3207        assert_eq!(expected, uranges(&cls));
3208
3209        let cls = uclass(&[
3210            ('c', 'f'),
3211            ('a', 'g'),
3212            ('d', 'j'),
3213            ('a', 'c'),
3214            ('m', 'p'),
3215            ('l', 's'),
3216        ]);
3217        let expected = vec![('a', 'j'), ('l', 's')];
3218        assert_eq!(expected, uranges(&cls));
3219
3220        let cls = uclass(&[('x', 'z'), ('u', 'w')]);
3221        let expected = vec![('u', 'z')];
3222        assert_eq!(expected, uranges(&cls));
3223
3224        let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
3225        let expected = vec![('\x00', '\u{10FFFF}')];
3226        assert_eq!(expected, uranges(&cls));
3227
3228        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3229        let expected = vec![('a', 'b')];
3230        assert_eq!(expected, uranges(&cls));
3231    }
3232
3233    #[test]
3234    fn class_canonicalize_bytes() {
3235        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3236        let expected = vec![(b'a', b'c'), (b'x', b'z')];
3237        assert_eq!(expected, branges(&cls));
3238
3239        let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
3240        let expected = vec![(b'a', b'c'), (b'x', b'z')];
3241        assert_eq!(expected, branges(&cls));
3242
3243        let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
3244        let expected = vec![(b'w', b'z')];
3245        assert_eq!(expected, branges(&cls));
3246
3247        let cls = bclass(&[
3248            (b'c', b'f'),
3249            (b'a', b'g'),
3250            (b'd', b'j'),
3251            (b'a', b'c'),
3252            (b'm', b'p'),
3253            (b'l', b's'),
3254        ]);
3255        let expected = vec![(b'a', b'j'), (b'l', b's')];
3256        assert_eq!(expected, branges(&cls));
3257
3258        let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
3259        let expected = vec![(b'u', b'z')];
3260        assert_eq!(expected, branges(&cls));
3261
3262        let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
3263        let expected = vec![(b'\x00', b'\xFF')];
3264        assert_eq!(expected, branges(&cls));
3265
3266        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3267        let expected = vec![(b'a', b'b')];
3268        assert_eq!(expected, branges(&cls));
3269    }
3270
3271    #[test]
3272    #[cfg(feature = "unicode-case")]
3273    fn class_case_fold_unicode() {
3274        let cls = uclass(&[
3275            ('C', 'F'),
3276            ('A', 'G'),
3277            ('D', 'J'),
3278            ('A', 'C'),
3279            ('M', 'P'),
3280            ('L', 'S'),
3281            ('c', 'f'),
3282        ]);
3283        let expected = uclass(&[
3284            ('A', 'J'),
3285            ('L', 'S'),
3286            ('a', 'j'),
3287            ('l', 's'),
3288            ('\u{17F}', '\u{17F}'),
3289        ]);
3290        assert_eq!(expected, ucasefold(&cls));
3291
3292        let cls = uclass(&[('A', 'Z')]);
3293        let expected = uclass(&[
3294            ('A', 'Z'),
3295            ('a', 'z'),
3296            ('\u{17F}', '\u{17F}'),
3297            ('\u{212A}', '\u{212A}'),
3298        ]);
3299        assert_eq!(expected, ucasefold(&cls));
3300
3301        let cls = uclass(&[('a', 'z')]);
3302        let expected = uclass(&[
3303            ('A', 'Z'),
3304            ('a', 'z'),
3305            ('\u{17F}', '\u{17F}'),
3306            ('\u{212A}', '\u{212A}'),
3307        ]);
3308        assert_eq!(expected, ucasefold(&cls));
3309
3310        let cls = uclass(&[('A', 'A'), ('_', '_')]);
3311        let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
3312        assert_eq!(expected, ucasefold(&cls));
3313
3314        let cls = uclass(&[('A', 'A'), ('=', '=')]);
3315        let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
3316        assert_eq!(expected, ucasefold(&cls));
3317
3318        let cls = uclass(&[('\x00', '\x10')]);
3319        assert_eq!(cls, ucasefold(&cls));
3320
3321        let cls = uclass(&[('k', 'k')]);
3322        let expected =
3323            uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
3324        assert_eq!(expected, ucasefold(&cls));
3325
3326        let cls = uclass(&[('@', '@')]);
3327        assert_eq!(cls, ucasefold(&cls));
3328    }
3329
3330    #[test]
3331    #[cfg(not(feature = "unicode-case"))]
3332    fn class_case_fold_unicode_disabled() {
3333        let mut cls = uclass(&[
3334            ('C', 'F'),
3335            ('A', 'G'),
3336            ('D', 'J'),
3337            ('A', 'C'),
3338            ('M', 'P'),
3339            ('L', 'S'),
3340            ('c', 'f'),
3341        ]);
3342        assert!(cls.try_case_fold_simple().is_err());
3343    }
3344
3345    #[test]
3346    #[should_panic]
3347    #[cfg(not(feature = "unicode-case"))]
3348    fn class_case_fold_unicode_disabled_panics() {
3349        let mut cls = uclass(&[
3350            ('C', 'F'),
3351            ('A', 'G'),
3352            ('D', 'J'),
3353            ('A', 'C'),
3354            ('M', 'P'),
3355            ('L', 'S'),
3356            ('c', 'f'),
3357        ]);
3358        cls.case_fold_simple();
3359    }
3360
3361    #[test]
3362    fn class_case_fold_bytes() {
3363        let cls = bclass(&[
3364            (b'C', b'F'),
3365            (b'A', b'G'),
3366            (b'D', b'J'),
3367            (b'A', b'C'),
3368            (b'M', b'P'),
3369            (b'L', b'S'),
3370            (b'c', b'f'),
3371        ]);
3372        let expected =
3373            bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
3374        assert_eq!(expected, bcasefold(&cls));
3375
3376        let cls = bclass(&[(b'A', b'Z')]);
3377        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3378        assert_eq!(expected, bcasefold(&cls));
3379
3380        let cls = bclass(&[(b'a', b'z')]);
3381        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3382        assert_eq!(expected, bcasefold(&cls));
3383
3384        let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
3385        let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
3386        assert_eq!(expected, bcasefold(&cls));
3387
3388        let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
3389        let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
3390        assert_eq!(expected, bcasefold(&cls));
3391
3392        let cls = bclass(&[(b'\x00', b'\x10')]);
3393        assert_eq!(cls, bcasefold(&cls));
3394
3395        let cls = bclass(&[(b'k', b'k')]);
3396        let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
3397        assert_eq!(expected, bcasefold(&cls));
3398
3399        let cls = bclass(&[(b'@', b'@')]);
3400        assert_eq!(cls, bcasefold(&cls));
3401    }
3402
3403    #[test]
3404    fn class_negate_unicode() {
3405        let cls = uclass(&[('a', 'a')]);
3406        let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
3407        assert_eq!(expected, unegate(&cls));
3408
3409        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3410        let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
3411        assert_eq!(expected, unegate(&cls));
3412
3413        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3414        let expected = uclass(&[
3415            ('\x00', '\x60'),
3416            ('\x64', '\x77'),
3417            ('\x7B', '\u{10FFFF}'),
3418        ]);
3419        assert_eq!(expected, unegate(&cls));
3420
3421        let cls = uclass(&[('\x00', 'a')]);
3422        let expected = uclass(&[('\x62', '\u{10FFFF}')]);
3423        assert_eq!(expected, unegate(&cls));
3424
3425        let cls = uclass(&[('a', '\u{10FFFF}')]);
3426        let expected = uclass(&[('\x00', '\x60')]);
3427        assert_eq!(expected, unegate(&cls));
3428
3429        let cls = uclass(&[('\x00', '\u{10FFFF}')]);
3430        let expected = uclass(&[]);
3431        assert_eq!(expected, unegate(&cls));
3432
3433        let cls = uclass(&[]);
3434        let expected = uclass(&[('\x00', '\u{10FFFF}')]);
3435        assert_eq!(expected, unegate(&cls));
3436
3437        let cls =
3438            uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
3439        let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
3440        assert_eq!(expected, unegate(&cls));
3441
3442        let cls = uclass(&[('\x00', '\u{D7FF}')]);
3443        let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3444        assert_eq!(expected, unegate(&cls));
3445
3446        let cls = uclass(&[('\x00', '\u{D7FE}')]);
3447        let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
3448        assert_eq!(expected, unegate(&cls));
3449
3450        let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3451        let expected = uclass(&[('\x00', '\u{D7FF}')]);
3452        assert_eq!(expected, unegate(&cls));
3453
3454        let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
3455        let expected = uclass(&[('\x00', '\u{E000}')]);
3456        assert_eq!(expected, unegate(&cls));
3457    }
3458
3459    #[test]
3460    fn class_negate_bytes() {
3461        let cls = bclass(&[(b'a', b'a')]);
3462        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
3463        assert_eq!(expected, bnegate(&cls));
3464
3465        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3466        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
3467        assert_eq!(expected, bnegate(&cls));
3468
3469        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3470        let expected = bclass(&[
3471            (b'\x00', b'\x60'),
3472            (b'\x64', b'\x77'),
3473            (b'\x7B', b'\xFF'),
3474        ]);
3475        assert_eq!(expected, bnegate(&cls));
3476
3477        let cls = bclass(&[(b'\x00', b'a')]);
3478        let expected = bclass(&[(b'\x62', b'\xFF')]);
3479        assert_eq!(expected, bnegate(&cls));
3480
3481        let cls = bclass(&[(b'a', b'\xFF')]);
3482        let expected = bclass(&[(b'\x00', b'\x60')]);
3483        assert_eq!(expected, bnegate(&cls));
3484
3485        let cls = bclass(&[(b'\x00', b'\xFF')]);
3486        let expected = bclass(&[]);
3487        assert_eq!(expected, bnegate(&cls));
3488
3489        let cls = bclass(&[]);
3490        let expected = bclass(&[(b'\x00', b'\xFF')]);
3491        assert_eq!(expected, bnegate(&cls));
3492
3493        let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
3494        let expected = bclass(&[(b'\xFE', b'\xFE')]);
3495        assert_eq!(expected, bnegate(&cls));
3496    }
3497
3498    #[test]
3499    fn class_union_unicode() {
3500        let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
3501        let cls2 = uclass(&[('a', 'z')]);
3502        let expected = uclass(&[('a', 'z'), ('A', 'C')]);
3503        assert_eq!(expected, uunion(&cls1, &cls2));
3504    }
3505
3506    #[test]
3507    fn class_union_bytes() {
3508        let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
3509        let cls2 = bclass(&[(b'a', b'z')]);
3510        let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
3511        assert_eq!(expected, bunion(&cls1, &cls2));
3512    }
3513
3514    #[test]
3515    fn class_intersect_unicode() {
3516        let cls1 = uclass(&[]);
3517        let cls2 = uclass(&[('a', 'a')]);
3518        let expected = uclass(&[]);
3519        assert_eq!(expected, uintersect(&cls1, &cls2));
3520
3521        let cls1 = uclass(&[('a', 'a')]);
3522        let cls2 = uclass(&[('a', 'a')]);
3523        let expected = uclass(&[('a', 'a')]);
3524        assert_eq!(expected, uintersect(&cls1, &cls2));
3525
3526        let cls1 = uclass(&[('a', 'a')]);
3527        let cls2 = uclass(&[('b', 'b')]);
3528        let expected = uclass(&[]);
3529        assert_eq!(expected, uintersect(&cls1, &cls2));
3530
3531        let cls1 = uclass(&[('a', 'a')]);
3532        let cls2 = uclass(&[('a', 'c')]);
3533        let expected = uclass(&[('a', 'a')]);
3534        assert_eq!(expected, uintersect(&cls1, &cls2));
3535
3536        let cls1 = uclass(&[('a', 'b')]);
3537        let cls2 = uclass(&[('a', 'c')]);
3538        let expected = uclass(&[('a', 'b')]);
3539        assert_eq!(expected, uintersect(&cls1, &cls2));
3540
3541        let cls1 = uclass(&[('a', 'b')]);
3542        let cls2 = uclass(&[('b', 'c')]);
3543        let expected = uclass(&[('b', 'b')]);
3544        assert_eq!(expected, uintersect(&cls1, &cls2));
3545
3546        let cls1 = uclass(&[('a', 'b')]);
3547        let cls2 = uclass(&[('c', 'd')]);
3548        let expected = uclass(&[]);
3549        assert_eq!(expected, uintersect(&cls1, &cls2));
3550
3551        let cls1 = uclass(&[('b', 'c')]);
3552        let cls2 = uclass(&[('a', 'd')]);
3553        let expected = uclass(&[('b', 'c')]);
3554        assert_eq!(expected, uintersect(&cls1, &cls2));
3555
3556        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3557        let cls2 = uclass(&[('a', 'h')]);
3558        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3559        assert_eq!(expected, uintersect(&cls1, &cls2));
3560
3561        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3562        let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3563        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3564        assert_eq!(expected, uintersect(&cls1, &cls2));
3565
3566        let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
3567        let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
3568        let expected = uclass(&[]);
3569        assert_eq!(expected, uintersect(&cls1, &cls2));
3570
3571        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3572        let cls2 = uclass(&[('h', 'h')]);
3573        let expected = uclass(&[('h', 'h')]);
3574        assert_eq!(expected, uintersect(&cls1, &cls2));
3575
3576        let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
3577        let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
3578        let expected = uclass(&[]);
3579        assert_eq!(expected, uintersect(&cls1, &cls2));
3580
3581        let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
3582        let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
3583        let expected = uclass(&[('b', 'f')]);
3584        assert_eq!(expected, uintersect(&cls1, &cls2));
3585    }
3586
3587    #[test]
3588    fn class_intersect_bytes() {
3589        let cls1 = bclass(&[]);
3590        let cls2 = bclass(&[(b'a', b'a')]);
3591        let expected = bclass(&[]);
3592        assert_eq!(expected, bintersect(&cls1, &cls2));
3593
3594        let cls1 = bclass(&[(b'a', b'a')]);
3595        let cls2 = bclass(&[(b'a', b'a')]);
3596        let expected = bclass(&[(b'a', b'a')]);
3597        assert_eq!(expected, bintersect(&cls1, &cls2));
3598
3599        let cls1 = bclass(&[(b'a', b'a')]);
3600        let cls2 = bclass(&[(b'b', b'b')]);
3601        let expected = bclass(&[]);
3602        assert_eq!(expected, bintersect(&cls1, &cls2));
3603
3604        let cls1 = bclass(&[(b'a', b'a')]);
3605        let cls2 = bclass(&[(b'a', b'c')]);
3606        let expected = bclass(&[(b'a', b'a')]);
3607        assert_eq!(expected, bintersect(&cls1, &cls2));
3608
3609        let cls1 = bclass(&[(b'a', b'b')]);
3610        let cls2 = bclass(&[(b'a', b'c')]);
3611        let expected = bclass(&[(b'a', b'b')]);
3612        assert_eq!(expected, bintersect(&cls1, &cls2));
3613
3614        let cls1 = bclass(&[(b'a', b'b')]);
3615        let cls2 = bclass(&[(b'b', b'c')]);
3616        let expected = bclass(&[(b'b', b'b')]);
3617        assert_eq!(expected, bintersect(&cls1, &cls2));
3618
3619        let cls1 = bclass(&[(b'a', b'b')]);
3620        let cls2 = bclass(&[(b'c', b'd')]);
3621        let expected = bclass(&[]);
3622        assert_eq!(expected, bintersect(&cls1, &cls2));
3623
3624        let cls1 = bclass(&[(b'b', b'c')]);
3625        let cls2 = bclass(&[(b'a', b'd')]);
3626        let expected = bclass(&[(b'b', b'c')]);
3627        assert_eq!(expected, bintersect(&cls1, &cls2));
3628
3629        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3630        let cls2 = bclass(&[(b'a', b'h')]);
3631        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3632        assert_eq!(expected, bintersect(&cls1, &cls2));
3633
3634        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3635        let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3636        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3637        assert_eq!(expected, bintersect(&cls1, &cls2));
3638
3639        let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
3640        let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
3641        let expected = bclass(&[]);
3642        assert_eq!(expected, bintersect(&cls1, &cls2));
3643
3644        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3645        let cls2 = bclass(&[(b'h', b'h')]);
3646        let expected = bclass(&[(b'h', b'h')]);
3647        assert_eq!(expected, bintersect(&cls1, &cls2));
3648
3649        let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
3650        let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
3651        let expected = bclass(&[]);
3652        assert_eq!(expected, bintersect(&cls1, &cls2));
3653
3654        let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
3655        let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
3656        let expected = bclass(&[(b'b', b'f')]);
3657        assert_eq!(expected, bintersect(&cls1, &cls2));
3658    }
3659
3660    #[test]
3661    fn class_difference_unicode() {
3662        let cls1 = uclass(&[('a', 'a')]);
3663        let cls2 = uclass(&[('a', 'a')]);
3664        let expected = uclass(&[]);
3665        assert_eq!(expected, udifference(&cls1, &cls2));
3666
3667        let cls1 = uclass(&[('a', 'a')]);
3668        let cls2 = uclass(&[]);
3669        let expected = uclass(&[('a', 'a')]);
3670        assert_eq!(expected, udifference(&cls1, &cls2));
3671
3672        let cls1 = uclass(&[]);
3673        let cls2 = uclass(&[('a', 'a')]);
3674        let expected = uclass(&[]);
3675        assert_eq!(expected, udifference(&cls1, &cls2));
3676
3677        let cls1 = uclass(&[('a', 'z')]);
3678        let cls2 = uclass(&[('a', 'a')]);
3679        let expected = uclass(&[('b', 'z')]);
3680        assert_eq!(expected, udifference(&cls1, &cls2));
3681
3682        let cls1 = uclass(&[('a', 'z')]);
3683        let cls2 = uclass(&[('z', 'z')]);
3684        let expected = uclass(&[('a', 'y')]);
3685        assert_eq!(expected, udifference(&cls1, &cls2));
3686
3687        let cls1 = uclass(&[('a', 'z')]);
3688        let cls2 = uclass(&[('m', 'm')]);
3689        let expected = uclass(&[('a', 'l'), ('n', 'z')]);
3690        assert_eq!(expected, udifference(&cls1, &cls2));
3691
3692        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3693        let cls2 = uclass(&[('a', 'z')]);
3694        let expected = uclass(&[]);
3695        assert_eq!(expected, udifference(&cls1, &cls2));
3696
3697        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3698        let cls2 = uclass(&[('d', 'v')]);
3699        let expected = uclass(&[('a', 'c')]);
3700        assert_eq!(expected, udifference(&cls1, &cls2));
3701
3702        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3703        let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
3704        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3705        assert_eq!(expected, udifference(&cls1, &cls2));
3706
3707        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3708        let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
3709        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3710        assert_eq!(expected, udifference(&cls1, &cls2));
3711
3712        let cls1 = uclass(&[('x', 'z')]);
3713        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3714        let expected = uclass(&[('x', 'z')]);
3715        assert_eq!(expected, udifference(&cls1, &cls2));
3716
3717        let cls1 = uclass(&[('a', 'z')]);
3718        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3719        let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
3720        assert_eq!(expected, udifference(&cls1, &cls2));
3721    }
3722
3723    #[test]
3724    fn class_difference_bytes() {
3725        let cls1 = bclass(&[(b'a', b'a')]);
3726        let cls2 = bclass(&[(b'a', b'a')]);
3727        let expected = bclass(&[]);
3728        assert_eq!(expected, bdifference(&cls1, &cls2));
3729
3730        let cls1 = bclass(&[(b'a', b'a')]);
3731        let cls2 = bclass(&[]);
3732        let expected = bclass(&[(b'a', b'a')]);
3733        assert_eq!(expected, bdifference(&cls1, &cls2));
3734
3735        let cls1 = bclass(&[]);
3736        let cls2 = bclass(&[(b'a', b'a')]);
3737        let expected = bclass(&[]);
3738        assert_eq!(expected, bdifference(&cls1, &cls2));
3739
3740        let cls1 = bclass(&[(b'a', b'z')]);
3741        let cls2 = bclass(&[(b'a', b'a')]);
3742        let expected = bclass(&[(b'b', b'z')]);
3743        assert_eq!(expected, bdifference(&cls1, &cls2));
3744
3745        let cls1 = bclass(&[(b'a', b'z')]);
3746        let cls2 = bclass(&[(b'z', b'z')]);
3747        let expected = bclass(&[(b'a', b'y')]);
3748        assert_eq!(expected, bdifference(&cls1, &cls2));
3749
3750        let cls1 = bclass(&[(b'a', b'z')]);
3751        let cls2 = bclass(&[(b'm', b'm')]);
3752        let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
3753        assert_eq!(expected, bdifference(&cls1, &cls2));
3754
3755        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3756        let cls2 = bclass(&[(b'a', b'z')]);
3757        let expected = bclass(&[]);
3758        assert_eq!(expected, bdifference(&cls1, &cls2));
3759
3760        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3761        let cls2 = bclass(&[(b'd', b'v')]);
3762        let expected = bclass(&[(b'a', b'c')]);
3763        assert_eq!(expected, bdifference(&cls1, &cls2));
3764
3765        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3766        let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
3767        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3768        assert_eq!(expected, bdifference(&cls1, &cls2));
3769
3770        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3771        let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
3772        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3773        assert_eq!(expected, bdifference(&cls1, &cls2));
3774
3775        let cls1 = bclass(&[(b'x', b'z')]);
3776        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3777        let expected = bclass(&[(b'x', b'z')]);
3778        assert_eq!(expected, bdifference(&cls1, &cls2));
3779
3780        let cls1 = bclass(&[(b'a', b'z')]);
3781        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3782        let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
3783        assert_eq!(expected, bdifference(&cls1, &cls2));
3784    }
3785
3786    #[test]
3787    fn class_symmetric_difference_unicode() {
3788        let cls1 = uclass(&[('a', 'm')]);
3789        let cls2 = uclass(&[('g', 't')]);
3790        let expected = uclass(&[('a', 'f'), ('n', 't')]);
3791        assert_eq!(expected, usymdifference(&cls1, &cls2));
3792    }
3793
3794    #[test]
3795    fn class_symmetric_difference_bytes() {
3796        let cls1 = bclass(&[(b'a', b'm')]);
3797        let cls2 = bclass(&[(b'g', b't')]);
3798        let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
3799        assert_eq!(expected, bsymdifference(&cls1, &cls2));
3800    }
3801
3802    // We use a thread with an explicit stack size to test that our destructor
3803    // for Hir can handle arbitrarily sized expressions in constant stack
3804    // space. In case we run on a platform without threads (WASM?), we limit
3805    // this test to Windows/Unix.
3806    #[test]
3807    #[cfg(any(unix, windows))]
3808    fn no_stack_overflow_on_drop() {
3809        use std::thread;
3810
3811        let run = || {
3812            let mut expr = Hir::empty();
3813            for _ in 0..100 {
3814                expr = Hir::capture(Capture {
3815                    index: 1,
3816                    name: None,
3817                    sub: Box::new(expr),
3818                });
3819                expr = Hir::repetition(Repetition {
3820                    min: 0,
3821                    max: Some(1),
3822                    greedy: true,
3823                    sub: Box::new(expr),
3824                });
3825
3826                expr = Hir {
3827                    kind: HirKind::Concat(vec![expr]),
3828                    props: Properties::empty(),
3829                };
3830                expr = Hir {
3831                    kind: HirKind::Alternation(vec![expr]),
3832                    props: Properties::empty(),
3833                };
3834            }
3835            assert!(!matches!(*expr.kind(), HirKind::Empty));
3836        };
3837
3838        // We run our test on a thread with a small stack size so we can
3839        // force the issue more easily.
3840        //
3841        // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
3842        // for context on the specific stack size chosen here.
3843        thread::Builder::new()
3844            .stack_size(16 << 10)
3845            .spawn(run)
3846            .unwrap()
3847            .join()
3848            .unwrap();
3849    }
3850
3851    #[test]
3852    fn look_set_iter() {
3853        let set = LookSet::empty();
3854        assert_eq!(0, set.iter().count());
3855
3856        let set = LookSet::full();
3857        assert_eq!(18, set.iter().count());
3858
3859        let set =
3860            LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
3861        assert_eq!(2, set.iter().count());
3862
3863        let set = LookSet::empty().insert(Look::StartLF);
3864        assert_eq!(1, set.iter().count());
3865
3866        let set = LookSet::empty().insert(Look::WordAsciiNegate);
3867        assert_eq!(1, set.iter().count());
3868    }
3869
3870    #[test]
3871    fn look_set_debug() {
3872        let res = format!("{:?}", LookSet::empty());
3873        assert_eq!("∅", res);
3874        let res = format!("{:?}", LookSet::full());
3875        assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
3876    }
3877}
regex_syntax/hir/mod.rs

regex_syntax/hir/
mod.rs