regex_automata/
dfa.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
use state_id::StateID;

/// A trait describing the interface of a deterministic finite automaton (DFA).
///
/// Every DFA has exactly one start state and at least one dead state (which
/// may be the same, as in the case of an empty DFA). In all cases, a state
/// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)`
/// always returns `true`.
///
/// Every DFA also has zero or more match states, such that
/// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to
/// a match state.
///
/// In general, users of this trait likely will only need to use the search
/// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other
/// methods are lower level and are used for walking the transitions of a DFA
/// manually. In particular, the aforementioned search routines are implemented
/// generically in terms of the lower level transition walking routines.
pub trait DFA {
    /// The representation used for state identifiers in this DFA.
    ///
    /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
    type ID: StateID;

    /// Return the identifier of this DFA's start state.
    fn start_state(&self) -> Self::ID;

    /// Returns true if and only if the given identifier corresponds to a match
    /// state.
    fn is_match_state(&self, id: Self::ID) -> bool;

    /// Returns true if and only if the given identifier corresponds to a dead
    /// state. When a DFA enters a dead state, it is impossible to leave and
    /// thus can never lead to a match.
    fn is_dead_state(&self, id: Self::ID) -> bool;

    /// Returns true if and only if the given identifier corresponds to either
    /// a dead state or a match state, such that one of `is_match_state(id)`
    /// or `is_dead_state(id)` must return true.
    ///
    /// Depending on the implementation of the DFA, this routine can be used
    /// to save a branch in the core matching loop. Nevertheless,
    /// `is_match_state(id) || is_dead_state(id)` is always a valid
    /// implementation.
    fn is_match_or_dead_state(&self, id: Self::ID) -> bool;

    /// Returns true if and only if this DFA is anchored.
    ///
    /// When a DFA is anchored, it is only allowed to report matches that
    /// start at index `0`.
    fn is_anchored(&self) -> bool;

    /// Given the current state that this DFA is in and the next input byte,
    /// this method returns the identifier of the next state. The identifier
    /// returned is always valid, but it may correspond to a dead state.
    fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;

    /// Like `next_state`, but its implementation may look up the next state
    /// without memory safety checks such as bounds checks. As such, callers
    /// must ensure that the given identifier corresponds to a valid DFA
    /// state. Implementors must, in turn, ensure that this routine is safe
    /// for all valid state identifiers and for all possible `u8` values.
    unsafe fn next_state_unchecked(
        &self,
        current: Self::ID,
        input: u8,
    ) -> Self::ID;

    /// Returns true if and only if the given bytes match this DFA.
    ///
    /// This routine may short circuit if it knows that scanning future input
    /// will never lead to a different result. In particular, if a DFA enters
    /// a match state or a dead state, then this routine will return `true` or
    /// `false`, respectively, without inspecting any future input.
    ///
    /// # Example
    ///
    /// This example shows how to use this method with a
    /// [`DenseDFA`](enum.DenseDFA.html).
    ///
    /// ```
    /// use regex_automata::{DFA, DenseDFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let dfa = DenseDFA::new("foo[0-9]+bar")?;
    /// assert_eq!(true, dfa.is_match(b"foo12345bar"));
    /// assert_eq!(false, dfa.is_match(b"foobar"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    #[inline]
    fn is_match(&self, bytes: &[u8]) -> bool {
        self.is_match_at(bytes, 0)
    }

    /// Returns the first position at which a match is found.
    ///
    /// This routine stops scanning input in precisely the same circumstances
    /// as `is_match`. The key difference is that this routine returns the
    /// position at which it stopped scanning input if and only if a match
    /// was found. If no match is found, then `None` is returned.
    ///
    /// # Example
    ///
    /// This example shows how to use this method with a
    /// [`DenseDFA`](enum.DenseDFA.html).
    ///
    /// ```
    /// use regex_automata::{DFA, DenseDFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let dfa = DenseDFA::new("foo[0-9]+")?;
    /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345"));
    ///
    /// // Normally, the end of the leftmost first match here would be 3,
    /// // but the shortest match semantics detect a match earlier.
    /// let dfa = DenseDFA::new("abc|a")?;
    /// assert_eq!(Some(1), dfa.shortest_match(b"abc"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    #[inline]
    fn shortest_match(&self, bytes: &[u8]) -> Option<usize> {
        self.shortest_match_at(bytes, 0)
    }

    /// Returns the end offset of the longest match. If no match exists,
    /// then `None` is returned.
    ///
    /// Implementors of this trait are not required to implement any particular
    /// match semantics (such as leftmost-first), which are instead manifest in
    /// the DFA's topology itself.
    ///
    /// In particular, this method must continue searching even after it
    /// enters a match state. The search should only terminate once it has
    /// reached the end of the input or when it has entered a dead state. Upon
    /// termination, the position of the last byte seen while still in a match
    /// state is returned.
    ///
    /// # Example
    ///
    /// This example shows how to use this method with a
    /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses
    /// "leftmost first" match semantics.
    ///
    /// Leftmost first match semantics corresponds to the match with the
    /// smallest starting offset, but where the end offset is determined by
    /// preferring earlier branches in the original regular expression. For
    /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
    /// will match `Samwise` in `Samwise`.
    ///
    /// Generally speaking, the "leftmost first" match is how most backtracking
    /// regular expressions tend to work. This is in contrast to POSIX-style
    /// regular expressions that yield "leftmost longest" matches. Namely,
    /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
    /// leftmost longest semantics.
    ///
    /// ```
    /// use regex_automata::{DFA, DenseDFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let dfa = DenseDFA::new("foo[0-9]+")?;
    /// assert_eq!(Some(8), dfa.find(b"foo12345"));
    ///
    /// // Even though a match is found after reading the first byte (`a`),
    /// // the leftmost first match semantics demand that we find the earliest
    /// // match that prefers earlier parts of the pattern over latter parts.
    /// let dfa = DenseDFA::new("abc|a")?;
    /// assert_eq!(Some(3), dfa.find(b"abc"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    #[inline]
    fn find(&self, bytes: &[u8]) -> Option<usize> {
        self.find_at(bytes, 0)
    }

    /// Returns the start offset of the longest match in reverse, by searching
    /// from the end of the input towards the start of the input. If no match
    /// exists, then `None` is returned. In other words, this has the same
    /// match semantics as `find`, but in reverse.
    ///
    /// # Example
    ///
    /// This example shows how to use this method with a
    /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine
    /// is principally useful when used in conjunction with the
    /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse)
    /// configuration knob. In general, it's unlikely to be correct to use both
    /// `find` and `rfind` with the same DFA since any particular DFA will only
    /// support searching in one direction.
    ///
    /// ```
    /// use regex_automata::{dense, DFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?;
    /// assert_eq!(Some(0), dfa.rfind(b"foo12345"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    #[inline]
    fn rfind(&self, bytes: &[u8]) -> Option<usize> {
        self.rfind_at(bytes, bytes.len())
    }

    /// Returns the same as `is_match`, but starts the search at the given
    /// offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == 0`.
    #[inline]
    fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
        if self.is_anchored() && start > 0 {
            return false;
        }

        let mut state = self.start_state();
        if self.is_match_or_dead_state(state) {
            return self.is_match_state(state);
        }
        for &b in bytes[start..].iter() {
            state = unsafe { self.next_state_unchecked(state, b) };
            if self.is_match_or_dead_state(state) {
                return self.is_match_state(state);
            }
        }
        false
    }

    /// Returns the same as `shortest_match`, but starts the search at the
    /// given offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == 0`.
    #[inline]
    fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
        if self.is_anchored() && start > 0 {
            return None;
        }

        let mut state = self.start_state();
        if self.is_match_or_dead_state(state) {
            return if self.is_dead_state(state) { None } else { Some(start) };
        }
        for (i, &b) in bytes[start..].iter().enumerate() {
            state = unsafe { self.next_state_unchecked(state, b) };
            if self.is_match_or_dead_state(state) {
                return if self.is_dead_state(state) {
                    None
                } else {
                    Some(start + i + 1)
                };
            }
        }
        None
    }

    /// Returns the same as `find`, but starts the search at the given
    /// offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == 0`.
    #[inline]
    fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
        if self.is_anchored() && start > 0 {
            return None;
        }

        let mut state = self.start_state();
        let mut last_match = if self.is_dead_state(state) {
            return None;
        } else if self.is_match_state(state) {
            Some(start)
        } else {
            None
        };
        for (i, &b) in bytes[start..].iter().enumerate() {
            state = unsafe { self.next_state_unchecked(state, b) };
            if self.is_match_or_dead_state(state) {
                if self.is_dead_state(state) {
                    return last_match;
                }
                last_match = Some(start + i + 1);
            }
        }
        last_match
    }

    /// Returns the same as `rfind`, but starts the search at the given
    /// offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == bytes.len()`.
    #[inline(never)]
    fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
        if self.is_anchored() && start < bytes.len() {
            return None;
        }

        let mut state = self.start_state();
        let mut last_match = if self.is_dead_state(state) {
            return None;
        } else if self.is_match_state(state) {
            Some(start)
        } else {
            None
        };
        for (i, &b) in bytes[..start].iter().enumerate().rev() {
            state = unsafe { self.next_state_unchecked(state, b) };
            if self.is_match_or_dead_state(state) {
                if self.is_dead_state(state) {
                    return last_match;
                }
                last_match = Some(i);
            }
        }
        last_match
    }
}

impl<'a, T: DFA> DFA for &'a T {
    type ID = T::ID;

    #[inline]
    fn start_state(&self) -> Self::ID {
        (**self).start_state()
    }

    #[inline]
    fn is_match_state(&self, id: Self::ID) -> bool {
        (**self).is_match_state(id)
    }

    #[inline]
    fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
        (**self).is_match_or_dead_state(id)
    }

    #[inline]
    fn is_dead_state(&self, id: Self::ID) -> bool {
        (**self).is_dead_state(id)
    }

    #[inline]
    fn is_anchored(&self) -> bool {
        (**self).is_anchored()
    }

    #[inline]
    fn next_state(&self, current: Self::ID, input: u8) -> Self::ID {
        (**self).next_state(current, input)
    }

    #[inline]
    unsafe fn next_state_unchecked(
        &self,
        current: Self::ID,
        input: u8,
    ) -> Self::ID {
        (**self).next_state_unchecked(current, input)
    }
}