regex/
re_builder.rs

/// The set of user configurable options for compiling zero or more regexes.
///
/// The builders in this file wrap one of these values and write its fields
/// directly from their setter methods.
#[derive(Debug, Clone)]
#[allow(missing_docs)]
pub struct RegexOptions {
    /// The pattern strings to compile.
    pub pats: Vec<String>,
    /// Approximate size limit of a single compiled program (roughly bytes).
    pub size_limit: usize,
    /// Approximate size of the cache used by the DFA (roughly bytes).
    pub dfa_size_limit: usize,
    /// Nesting limit handed to the parser.
    pub nest_limit: u32,
    /// Value of the case insensitive (`i`) flag.
    pub case_insensitive: bool,
    /// Value of the multi-line matching (`m`) flag.
    pub multi_line: bool,
    /// Value of the any character (`s`) flag.
    pub dot_matches_new_line: bool,
    /// Value of the greedy swap (`U`) flag.
    pub swap_greed: bool,
    /// Value of the ignore whitespace (`x`) flag.
    pub ignore_whitespace: bool,
    /// Value of the Unicode (`u`) flag.
    pub unicode: bool,
    /// Whether octal syntax is supported.
    pub octal: bool,
}
17
18impl Default for RegexOptions {
19    fn default() -> Self {
20        RegexOptions {
21            pats: vec![],
22            size_limit: 10 * (1 << 20),
23            dfa_size_limit: 2 * (1 << 20),
24            nest_limit: 250,
25            case_insensitive: false,
26            multi_line: false,
27            dot_matches_new_line: false,
28            swap_greed: false,
29            ignore_whitespace: false,
30            unicode: true,
31            octal: false,
32        }
33    }
34}
35
// Generates a public module named `$name` that contains a `RegexBuilder`
// whose `build` method produces `crate::$regex_mod::Regex`. `$only_utf8` is
// the value passed to `ExecBuilder::only_utf8` when compiling.
macro_rules! define_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use crate::error::Error;
            use crate::exec::ExecBuilder;

            use crate::$regex_mod::Regex;

            /// Builder for a single regular expression with adjustable settings.
            ///
            /// Use it to change the default flags (each of which can still be
            /// overridden inside the expression itself) or to adjust the various
            /// limits before compiling.
            #[derive(Debug)]
            pub struct RegexBuilder(RegexOptions);

            impl RegexBuilder {
                /// Start a builder for the given pattern with default options.
                ///
                /// The pattern is not validated here: if it is invalid, the error
                /// is reported when `build` is called.
                pub fn new(pattern: &str) -> RegexBuilder {
                    let mut options = RegexOptions::default();
                    options.pats.push(String::from(pattern));
                    RegexBuilder(options)
                }

                /// Compile the regular expression from the current settings.
                ///
                /// The builder is only borrowed; its options are cloned for the
                /// compilation.
                ///
                /// Note that calling `as_str` on the resulting `Regex` will produce
                /// the pattern given to `new` verbatim. Notably, it will not
                /// incorporate any of the flags set on this builder.
                pub fn build(&self) -> Result<Regex, Error> {
                    let options = self.0.clone();
                    let exec = ExecBuilder::new_options(options)
                        .only_utf8($only_utf8)
                        .build()?;
                    Ok(Regex::from(exec))
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the pattern match both their upper case
                /// and lower case variants.
                pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$` matches
                /// the end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(&mut self, yes: bool) -> &mut Self {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag.
                ///
                /// When `s` is set, `.` matches anything; when it is not set (the
                /// default), `.` matches anything except for new line.
                ///
                /// N.B. "matches anything" means "any byte" when Unicode is disabled
                /// and means "any valid UTF-8 encoding of any Unicode scalar value"
                /// when Unicode is enabled.
                pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find the
                /// shortest match) and `a*?` is greedy (tries to find the longest
                /// match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut Self {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces is ignored
                /// between expressions of the pattern, and `#` starts a comment
                /// that runs until the next new line.
                pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such as
                /// `\w` only match ASCII word characters instead of all Unicode
                /// word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut Self {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of writing Unicode codepoints
                /// in a regular expression. For example, `a`, `\x61`, `\u0061` and
                /// `\141` are all equivalent regular expressions, where the last
                /// example shows octal syntax.
                ///
                /// Supporting octal syntax is not a problem in itself, but it makes
                /// good error messages harder. In PCRE based regex engines, syntax
                /// like `\0` invokes a backreference, which is explicitly
                /// unsupported in Rust's regex engine even though many users expect
                /// it to work. Therefore, when octal support is disabled, the error
                /// message explicitly mentions that backreferences aren't
                /// supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut Self {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular
                /// expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by a
                /// single compiled program. If the program exceeds this number,
                /// then a compilation error is returned.
                pub fn size_limit(&mut self, limit: usize) -> &mut Self {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the DFA
                /// will use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to set a
                /// global limit. In particular, if a regex is used from multiple
                /// threads simultaneously, then each thread may use up to the
                /// number of bytes specified here.
                pub fn dfa_size_limit(&mut self, limit: usize) -> &mut Self {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax tree is
                /// allowed to be. If the AST exceeds the given limit (e.g., with
                /// too many nested groups), then an error is returned by the
                /// parser.
                ///
                /// The purpose of this limit is to act as a heuristic to prevent
                /// stack overflow for consumers that do structural induction on an
                /// `Ast` using explicit recursion. While this crate never does this
                /// (instead using constant stack space and moving the call stack to
                /// the heap), other crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed.
                /// Therefore, callers who want to bound the amount of heap space
                /// used should impose a limit on the length, in bytes, of the
                /// concrete pattern string. This is viable since this parser
                /// implementation limits itself to heap space proportional to the
                /// length of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit error for
                /// most patterns but not all. For example, a nest limit of `0`
                /// permits `a` but not `ab`, since `ab` requires a concatenation,
                /// which results in a nest depth of `1`. In general, a nest limit
                /// is not something that manifests in an obvious way in the
                /// concrete syntax, therefore, it should not be used in a granular
                /// way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut Self {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
230
// `bytes::RegexBuilder`: builds `crate::re_bytes::Regex`, compiling with
// `only_utf8(false)`.
define_builder!(bytes, re_bytes, false);
// `unicode::RegexBuilder`: builds `crate::re_unicode::Regex`, compiling with
// `only_utf8(true)`.
define_builder!(unicode, re_unicode, true);
233
// Generates a public module named `$name` that contains a `RegexSetBuilder`
// whose `build` method produces `crate::re_set::$regex_mod::RegexSet`.
// `$only_utf8` is the value passed to `ExecBuilder::only_utf8` when compiling.
macro_rules! define_set_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use crate::error::Error;
            use crate::exec::ExecBuilder;

            use crate::re_set::$regex_mod::RegexSet;

            /// A configurable builder for a set of regular expressions.
            ///
            /// A builder can be used to configure how the regexes are built, for
            /// example, by setting the default flags (which can be overridden in
            /// the expression itself) or setting various limits.
            #[derive(Debug)]
            pub struct RegexSetBuilder(RegexOptions);

            impl RegexSetBuilder {
                /// Create a new regular expression set builder with the given
                /// patterns.
                ///
                /// Each item of `patterns` is copied into an owned `String`. The
                /// patterns are not validated here: if a pattern is invalid, then
                /// an error will be returned when `build` is called.
                pub fn new<I, S>(patterns: I) -> RegexSetBuilder
                where
                    S: AsRef<str>,
                    I: IntoIterator<Item = S>,
                {
                    let mut builder = RegexSetBuilder(RegexOptions::default());
                    builder
                        .0
                        .pats
                        .extend(patterns.into_iter().map(|pat| pat.as_ref().to_owned()));
                    builder
                }

                /// Compile the regular expressions into a set.
                ///
                /// The builder is only borrowed; its options are cloned for the
                /// compilation.
                pub fn build(&self) -> Result<RegexSet, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(RegexSet::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the patterns will match both upper case
                /// and lower case variants.
                pub fn case_insensitive(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$` matches
                /// the end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag, where in `.`
                /// matches anything when `s` is set and matches anything except for
                /// new line when it is not set (the default).
                ///
                /// N.B. "matches anything" means "any byte" for
                /// `regex::bytes::RegexSet` expressions and means "any Unicode
                /// scalar value" for `regex::RegexSet` expressions.
                pub fn dot_matches_new_line(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find shortest
                /// match) and `a*?` is greedy (tries to find longest match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will be
                /// ignored between expressions of the pattern, and `#` can be used
                /// to start a comment until the next new line.
                pub fn ignore_whitespace(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// When disabled, character classes such as `\w` only match ASCII
                /// word characters instead of all Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode codepoints
                /// in a regular expression. For example, `a`, `\x61`, `\u0061` and
                /// `\141` are all equivalent regular expressions, where the last
                /// example shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a problem,
                /// it does make good error messages harder. That is, in PCRE based
                /// regex engines, syntax like `\0` invokes a backreference, which is
                /// explicitly unsupported in Rust's regex engine. However, many
                /// users expect it to be supported. Therefore, when octal support is
                /// disabled, the error message will explicitly mention that
                /// backreferences aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular
                /// expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by a
                /// single compiled program. If the program exceeds this number,
                /// then a compilation error is returned.
                pub fn size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexSetBuilder {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the DFA
                /// will use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to set a
                /// global limit. In particular, if a regex is used from multiple
                /// threads simultaneously, then each thread may use up to the
                /// number of bytes specified here.
                pub fn dfa_size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexSetBuilder {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax tree is
                /// allowed to be. If the AST exceeds the given limit (e.g., with
                /// too many nested groups), then an error is returned by the
                /// parser.
                ///
                /// The purpose of this limit is to act as a heuristic to prevent
                /// stack overflow for consumers that do structural induction on an
                /// `Ast` using explicit recursion. While this crate never does this
                /// (instead using constant stack space and moving the call stack to
                /// the heap), other crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed.
                /// Therefore, if callers want to put a limit on the amount of heap
                /// space used, then they should impose a limit on the length, in
                /// bytes, of the concrete pattern string. In particular, this is
                /// viable since this parser implementation will limit itself to
                /// heap space proportional to the length of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit error for
                /// most patterns but not all. For example, a nest limit of `0`
                /// permits `a` but not `ab`, since `ab` requires a concatenation,
                /// which results in a nest depth of `1`. In general, a nest limit
                /// is not something that manifests in an obvious way in the
                /// concrete syntax, therefore, it should not be used in a granular
                /// way.
                pub fn nest_limit(
                    &mut self,
                    limit: u32,
                ) -> &mut RegexSetBuilder {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
419
// `set_bytes::RegexSetBuilder`: builds `crate::re_set::bytes::RegexSet`,
// compiling with `only_utf8(false)`.
define_set_builder!(set_bytes, bytes, false);
// `set_unicode::RegexSetBuilder`: builds `crate::re_set::unicode::RegexSet`,
// compiling with `only_utf8(true)`.
define_set_builder!(set_unicode, unicode, true);