regex/
re_set.rs

1macro_rules! define_set {
2    ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
3     $(#[$doc_regexset_example:meta])* ) => {
4        pub mod $name {
5            use std::fmt;
6            use std::iter;
7            use std::slice;
8            use std::vec;
9
10            use crate::error::Error;
11            use crate::exec::Exec;
12            use crate::re_builder::$builder_mod::RegexSetBuilder;
13            use crate::re_trait::RegularExpression;
14
15/// Match multiple (possibly overlapping) regular expressions in a single scan.
16///
17/// A regex set corresponds to the union of two or more regular expressions.
18/// That is, a regex set will match text where at least one of its
19/// constituent regular expressions matches. A regex set as its formulated here
20/// provides a touch more power: it will also report *which* regular
21/// expressions in the set match. Indeed, this is the key difference between
22/// regex sets and a single `Regex` with many alternates, since only one
23/// alternate can match at a time.
24///
25/// For example, consider regular expressions to match email addresses and
26/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
27/// regex set is constructed from those regexes, then searching the text
28/// `foo@example.com` will report both regexes as matching. Of course, one
29/// could accomplish this by compiling each regex on its own and doing two
30/// searches over the text. The key advantage of using a regex set is that it
31/// will report the matching regexes using a *single pass through the text*.
32/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
33/// router for a complex web application or a user agent matcher), then a regex
34/// set can realize huge performance gains.
35///
36/// # Example
37///
38/// This shows how the above two regexes (for matching email addresses and
39/// domains) might work:
40///
41$(#[$doc_regexset_example])*
42///
43/// Note that it would be possible to adapt the above example to using `Regex`
44/// with an expression like:
45///
46/// ```text
47/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
48/// ```
49///
50/// After a match, one could then inspect the capture groups to figure out
51/// which alternates matched. The problem is that it is hard to make this
52/// approach scale when there are many regexes since the overlap between each
53/// alternate isn't always obvious to reason about.
54///
55/// # Limitations
56///
57/// Regex sets are limited to answering the following two questions:
58///
59/// 1. Does any regex in the set match?
60/// 2. If so, which regexes in the set match?
61///
62/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1)
63/// instead of (2) since the matching engines can stop after the first match
64/// is found.
65///
66/// You cannot directly extract [`Match`][crate::Match] or
67/// [`Captures`][crate::Captures] objects from a regex set. If you need these
68/// operations, the recommended approach is to compile each pattern in the set
69/// independently and scan the exact same input a second time with those
70/// independently compiled patterns:
71///
72/// ```rust
73/// use regex::{Regex, RegexSet};
74///
75/// let patterns = ["foo", "bar"];
76/// // Both patterns will match different ranges of this string.
77/// let text = "barfoo";
78///
79/// // Compile a set matching any of our patterns.
80/// let set = RegexSet::new(&patterns).unwrap();
81/// // Compile each pattern independently.
82/// let regexes: Vec<_> = set.patterns().iter()
83///     .map(|pat| Regex::new(pat).unwrap())
84///     .collect();
85///
86/// // Match against the whole set first and identify the individual
87/// // matching patterns.
88/// let matches: Vec<&str> = set.matches(text).into_iter()
89///     // Dereference the match index to get the corresponding
90///     // compiled pattern.
91///     .map(|match_idx| &regexes[match_idx])
92///     // To get match locations or any other info, we then have to search
93///     // the exact same text again, using our separately-compiled pattern.
94///     .map(|pat| pat.find(text).unwrap().as_str())
95///     .collect();
96///
97/// // Matches arrive in the order the constituent patterns were declared,
98/// // not the order they appear in the input.
99/// assert_eq!(vec!["foo", "bar"], matches);
100/// ```
101///
102/// # Performance
103///
104/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
105/// search takes `O(mn)` time, where `m` is proportional to the size of the
106/// regex set and `n` is proportional to the length of the search text.
107#[derive(Clone)]
108pub struct RegexSet(Exec);
109
110impl RegexSet {
111    /// Create a new regex set with the given regular expressions.
112    ///
113    /// This takes an iterator of `S`, where `S` is something that can produce
114    /// a `&str`. If any of the strings in the iterator are not valid regular
115    /// expressions, then an error is returned.
116    ///
117    /// # Example
118    ///
119    /// Create a new regex set from an iterator of strings:
120    ///
121    /// ```rust
122    /// # use regex::RegexSet;
123    /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
124    /// assert!(set.is_match("foo"));
125    /// ```
126    pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
127            where S: AsRef<str>, I: IntoIterator<Item=S> {
128        RegexSetBuilder::new(exprs).build()
129    }
130
131    /// Create a new empty regex set.
132    ///
133    /// # Example
134    ///
135    /// ```rust
136    /// # use regex::RegexSet;
137    /// let set = RegexSet::empty();
138    /// assert!(set.is_empty());
139    /// ```
140    pub fn empty() -> RegexSet {
141        RegexSetBuilder::new(&[""; 0]).build().unwrap()
142    }
143
144    /// Returns true if and only if one of the regexes in this set matches
145    /// the text given.
146    ///
147    /// This method should be preferred if you only need to test whether any
148    /// of the regexes in the set should match, but don't care about *which*
149    /// regexes matched. This is because the underlying matching engine will
150    /// quit immediately after seeing the first match instead of continuing to
151    /// find all matches.
152    ///
153    /// Note that as with searches using `Regex`, the expression is unanchored
154    /// by default. That is, if the regex does not start with `^` or `\A`, or
155    /// end with `$` or `\z`, then it is permitted to match anywhere in the
156    /// text.
157    ///
158    /// # Example
159    ///
160    /// Tests whether a set matches some text:
161    ///
162    /// ```rust
163    /// # use regex::RegexSet;
164    /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
165    /// assert!(set.is_match("foo"));
166    /// assert!(!set.is_match("☃"));
167    /// ```
168    pub fn is_match(&self, text: $text_ty) -> bool {
169        self.is_match_at(text, 0)
170    }
171
172    /// Returns the same as is_match, but starts the search at the given
173    /// offset.
174    ///
175    /// The significance of the starting point is that it takes the surrounding
176    /// context into consideration. For example, the `\A` anchor can only
177    /// match when `start == 0`.
178    #[doc(hidden)]
179    pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
180        self.0.searcher().is_match_at($as_bytes(text), start)
181    }
182
183    /// Returns the set of regular expressions that match in the given text.
184    ///
185    /// The set returned contains the index of each regular expression that
186    /// matches in the given text. The index is in correspondence with the
187    /// order of regular expressions given to `RegexSet`'s constructor.
188    ///
189    /// The set can also be used to iterate over the matched indices.
190    ///
191    /// Note that as with searches using `Regex`, the expression is unanchored
192    /// by default. That is, if the regex does not start with `^` or `\A`, or
193    /// end with `$` or `\z`, then it is permitted to match anywhere in the
194    /// text.
195    ///
196    /// # Example
197    ///
198    /// Tests which regular expressions match the given text:
199    ///
200    /// ```rust
201    /// # use regex::RegexSet;
202    /// let set = RegexSet::new(&[
203    ///     r"\w+",
204    ///     r"\d+",
205    ///     r"\pL+",
206    ///     r"foo",
207    ///     r"bar",
208    ///     r"barfoo",
209    ///     r"foobar",
210    /// ]).unwrap();
211    /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
212    /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
213    ///
214    /// // You can also test whether a particular regex matched:
215    /// let matches = set.matches("foobar");
216    /// assert!(!matches.matched(5));
217    /// assert!(matches.matched(6));
218    /// ```
219    pub fn matches(&self, text: $text_ty) -> SetMatches {
220        let mut matches = vec![false; self.0.regex_strings().len()];
221        let any = self.read_matches_at(&mut matches, text, 0);
222        SetMatches {
223            matched_any: any,
224            matches: matches,
225        }
226    }
227
228    /// Returns the same as matches, but starts the search at the given
229    /// offset and stores the matches into the slice given.
230    ///
231    /// The significance of the starting point is that it takes the surrounding
232    /// context into consideration. For example, the `\A` anchor can only
233    /// match when `start == 0`.
234    ///
235    /// `matches` must have a length that is at least the number of regexes
236    /// in this set.
237    ///
238    /// This method returns true if and only if at least one member of
239    /// `matches` is true after executing the set against `text`.
240    #[doc(hidden)]
241    pub fn read_matches_at(
242        &self,
243        matches: &mut [bool],
244        text: $text_ty,
245        start: usize,
246    ) -> bool {
247        self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
248    }
249
250    /// Returns the total number of regular expressions in this set.
251    pub fn len(&self) -> usize {
252        self.0.regex_strings().len()
253    }
254
255    /// Returns `true` if this set contains no regular expressions.
256    pub fn is_empty(&self) -> bool {
257        self.0.regex_strings().is_empty()
258    }
259
260    /// Returns the patterns that this set will match on.
261    ///
262    /// This function can be used to determine the pattern for a match. The
263    /// slice returned has exactly as many patterns givens to this regex set,
264    /// and the order of the slice is the same as the order of the patterns
265    /// provided to the set.
266    ///
267    /// # Example
268    ///
269    /// ```rust
270    /// # use regex::RegexSet;
271    /// let set = RegexSet::new(&[
272    ///     r"\w+",
273    ///     r"\d+",
274    ///     r"\pL+",
275    ///     r"foo",
276    ///     r"bar",
277    ///     r"barfoo",
278    ///     r"foobar",
279    /// ]).unwrap();
280    /// let matches: Vec<_> = set
281    ///     .matches("foobar")
282    ///     .into_iter()
283    ///     .map(|match_idx| &set.patterns()[match_idx])
284    ///     .collect();
285    /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
286    /// ```
287    pub fn patterns(&self) -> &[String] {
288        self.0.regex_strings()
289    }
290}
291
292/// A set of matches returned by a regex set.
293#[derive(Clone, Debug)]
294pub struct SetMatches {
295    matched_any: bool,
296    matches: Vec<bool>,
297}
298
299impl SetMatches {
300    /// Whether this set contains any matches.
301    pub fn matched_any(&self) -> bool {
302        self.matched_any
303    }
304
305    /// Whether the regex at the given index matched.
306    ///
307    /// The index for a regex is determined by its insertion order upon the
308    /// initial construction of a `RegexSet`, starting at `0`.
309    ///
310    /// # Panics
311    ///
312    /// If `regex_index` is greater than or equal to `self.len()`.
313    pub fn matched(&self, regex_index: usize) -> bool {
314        self.matches[regex_index]
315    }
316
317    /// The total number of regexes in the set that created these matches.
318    pub fn len(&self) -> usize {
319        self.matches.len()
320    }
321
322    /// Returns an iterator over indexes in the regex that matched.
323    ///
324    /// This will always produces matches in ascending order of index, where
325    /// the index corresponds to the index of the regex that matched with
326    /// respect to its position when initially building the set.
327    pub fn iter(&self) -> SetMatchesIter<'_> {
328        SetMatchesIter((&*self.matches).into_iter().enumerate())
329    }
330}
331
332impl IntoIterator for SetMatches {
333    type IntoIter = SetMatchesIntoIter;
334    type Item = usize;
335
336    fn into_iter(self) -> Self::IntoIter {
337        SetMatchesIntoIter(self.matches.into_iter().enumerate())
338    }
339}
340
341impl<'a> IntoIterator for &'a SetMatches {
342    type IntoIter = SetMatchesIter<'a>;
343    type Item = usize;
344
345    fn into_iter(self) -> Self::IntoIter {
346        self.iter()
347    }
348}
349
/// An owned iterator over the set of matches from a regex set.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Debug)]
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Returns the index of the next regex that matched, skipping over the
    /// regexes that did not, or `None` once every flag has been visited.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, Some(remaining))`, where `remaining` is the number of
    /// flags not yet visited.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // Unmatched entries are skipped, so any number of the remaining
        // flags (possibly none) may turn into items. Only the upper bound of
        // the underlying iterator carries over; forwarding its lower bound
        // would promise more items than this iterator may yield.
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    /// Returns the index of the last not-yet-visited regex that matched, or
    /// `None` once every flag has been visited.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, matched)| matched).map(|(index, _)| index)
    }
}

impl iter::FusedIterator for SetMatchesIntoIter {}
389
/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Clone, Debug)]
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    /// Returns the index of the next regex that matched, skipping over the
    /// regexes that did not, or `None` once every flag has been visited.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, &matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, Some(remaining))`, where `remaining` is the number of
    /// flags not yet visited.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // Unmatched entries are skipped, so any number of the remaining
        // flags (possibly none) may turn into items. Only the upper bound of
        // the underlying iterator carries over; forwarding its lower bound
        // would promise more items than this iterator may yield.
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Returns the index of the last not-yet-visited regex that matched, or
    /// `None` once every flag has been visited.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, &matched)| matched).map(|(index, _)| index)
    }
}

impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
431
432#[doc(hidden)]
433impl From<Exec> for RegexSet {
434    fn from(exec: Exec) -> Self {
435        RegexSet(exec)
436    }
437}
438
439impl fmt::Debug for RegexSet {
440    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
441        write!(f, "RegexSet({:?})", self.0.regex_strings())
442    }
443}
444
// Views a string slice as its underlying bytes. Each instantiation of the
// macro passes only one of the two `as_bytes_*` helpers as `$as_bytes`,
// hence the `dead_code` allowance.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    str::as_bytes(text)
}
// Identity conversion for haystacks that are already byte slices. Each
// instantiation of the macro passes only one of the two `as_bytes_*` helpers
// as `$as_bytes`, hence the `dead_code` allowance.
#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    &text[..]
}
447        }
448    }
449}
450
451define_set! {
452    unicode,
453    set_unicode,
454    &str,
455    as_bytes_str,
456/// ```rust
457/// # use regex::RegexSet;
458/// let set = RegexSet::new(&[
459///     r"[a-z]+@[a-z]+\.(com|org|net)",
460///     r"[a-z]+\.(com|org|net)",
461/// ]).unwrap();
462///
463/// // Ask whether any regexes in the set match.
464/// assert!(set.is_match("foo@example.com"));
465///
466/// // Identify which regexes in the set match.
467/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
468/// assert_eq!(vec![0, 1], matches);
469///
470/// // Try again, but with text that only matches one of the regexes.
471/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
472/// assert_eq!(vec![1], matches);
473///
474/// // Try again, but with text that doesn't match any regex in the set.
475/// let matches: Vec<_> = set.matches("example").into_iter().collect();
476/// assert!(matches.is_empty());
477/// ```
478}
479
480define_set! {
481    bytes,
482    set_bytes,
483    &[u8],
484    as_bytes_bytes,
485/// ```rust
486/// # use regex::bytes::RegexSet;
487/// let set = RegexSet::new(&[
488///     r"[a-z]+@[a-z]+\.(com|org|net)",
489///     r"[a-z]+\.(com|org|net)",
490/// ]).unwrap();
491///
492/// // Ask whether any regexes in the set match.
493/// assert!(set.is_match(b"foo@example.com"));
494///
495/// // Identify which regexes in the set match.
496/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
497/// assert_eq!(vec![0, 1], matches);
498///
499/// // Try again, but with text that only matches one of the regexes.
500/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
501/// assert_eq!(vec![1], matches);
502///
503/// // Try again, but with text that doesn't match any regex in the set.
504/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
505/// assert!(matches.is_empty());
506/// ```
507}