regex/
input.rs

1use std::char;
2use std::cmp::Ordering;
3use std::fmt;
4use std::ops;
5use std::u32;
6
7use crate::literal::LiteralSearcher;
8use crate::prog::InstEmptyLook;
9use crate::utf8::{decode_last_utf8, decode_utf8};
10
11/// Represents a location in the input.
12#[derive(Clone, Copy, Debug)]
13pub struct InputAt {
14    pos: usize,
15    c: Char,
16    byte: Option<u8>,
17    len: usize,
18}
19
20impl InputAt {
21    /// Returns true iff this position is at the beginning of the input.
22    pub fn is_start(&self) -> bool {
23        self.pos == 0
24    }
25
26    /// Returns true iff this position is past the end of the input.
27    pub fn is_end(&self) -> bool {
28        self.c.is_none() && self.byte.is_none()
29    }
30
31    /// Returns the character at this position.
32    ///
33    /// If this position is just before or after the input, then an absent
34    /// character is returned.
35    pub fn char(&self) -> Char {
36        self.c
37    }
38
39    /// Returns the byte at this position.
40    pub fn byte(&self) -> Option<u8> {
41        self.byte
42    }
43
44    /// Returns the UTF-8 width of the character at this position.
45    pub fn len(&self) -> usize {
46        self.len
47    }
48
49    /// Returns whether the UTF-8 width of the character at this position
50    /// is zero.
51    pub fn is_empty(&self) -> bool {
52        self.len == 0
53    }
54
55    /// Returns the byte offset of this position.
56    pub fn pos(&self) -> usize {
57        self.pos
58    }
59
60    /// Returns the byte offset of the next position in the input.
61    pub fn next_pos(&self) -> usize {
62        self.pos + self.len
63    }
64}
65
66/// An abstraction over input used in the matching engines.
67pub trait Input: fmt::Debug {
68    /// Return an encoding of the position at byte offset `i`.
69    fn at(&self, i: usize) -> InputAt;
70
71    /// Return the Unicode character occurring next to `at`.
72    ///
73    /// If no such character could be decoded, then `Char` is absent.
74    fn next_char(&self, at: InputAt) -> Char;
75
76    /// Return the Unicode character occurring previous to `at`.
77    ///
78    /// If no such character could be decoded, then `Char` is absent.
79    fn previous_char(&self, at: InputAt) -> Char;
80
81    /// Return true if the given empty width instruction matches at the
82    /// input position given.
83    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
84
85    /// Scan the input for a matching prefix.
86    fn prefix_at(
87        &self,
88        prefixes: &LiteralSearcher,
89        at: InputAt,
90    ) -> Option<InputAt>;
91
92    /// The number of bytes in the input.
93    fn len(&self) -> usize;
94
95    /// Whether the input is empty.
96    fn is_empty(&self) -> bool {
97        self.len() == 0
98    }
99
100    /// Return the given input as a sequence of bytes.
101    fn as_bytes(&self) -> &[u8];
102}
103
104impl<'a, T: Input> Input for &'a T {
105    fn at(&self, i: usize) -> InputAt {
106        (**self).at(i)
107    }
108
109    fn next_char(&self, at: InputAt) -> Char {
110        (**self).next_char(at)
111    }
112
113    fn previous_char(&self, at: InputAt) -> Char {
114        (**self).previous_char(at)
115    }
116
117    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
118        (**self).is_empty_match(at, empty)
119    }
120
121    fn prefix_at(
122        &self,
123        prefixes: &LiteralSearcher,
124        at: InputAt,
125    ) -> Option<InputAt> {
126        (**self).prefix_at(prefixes, at)
127    }
128
129    fn len(&self) -> usize {
130        (**self).len()
131    }
132
133    fn as_bytes(&self) -> &[u8] {
134        (**self).as_bytes()
135    }
136}
137
138/// An input reader over characters.
139#[derive(Clone, Copy, Debug)]
140pub struct CharInput<'t>(&'t [u8]);
141
142impl<'t> CharInput<'t> {
143    /// Return a new character input reader for the given string.
144    pub fn new(s: &'t [u8]) -> CharInput<'t> {
145        CharInput(s)
146    }
147}
148
149impl<'t> ops::Deref for CharInput<'t> {
150    type Target = [u8];
151
152    fn deref(&self) -> &[u8] {
153        self.0
154    }
155}
156
157impl<'t> Input for CharInput<'t> {
158    fn at(&self, i: usize) -> InputAt {
159        if i >= self.len() {
160            InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
161        } else {
162            let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
163            InputAt { pos: i, c, byte: None, len: c.len_utf8() }
164        }
165    }
166
167    fn next_char(&self, at: InputAt) -> Char {
168        at.char()
169    }
170
171    fn previous_char(&self, at: InputAt) -> Char {
172        decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
173    }
174
175    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
176        use crate::prog::EmptyLook::*;
177        match empty.look {
178            StartLine => {
179                let c = self.previous_char(at);
180                at.pos() == 0 || c == '\n'
181            }
182            EndLine => {
183                let c = self.next_char(at);
184                at.pos() == self.len() || c == '\n'
185            }
186            StartText => at.pos() == 0,
187            EndText => at.pos() == self.len(),
188            WordBoundary => {
189                let (c1, c2) = (self.previous_char(at), self.next_char(at));
190                c1.is_word_char() != c2.is_word_char()
191            }
192            NotWordBoundary => {
193                let (c1, c2) = (self.previous_char(at), self.next_char(at));
194                c1.is_word_char() == c2.is_word_char()
195            }
196            WordBoundaryAscii => {
197                let (c1, c2) = (self.previous_char(at), self.next_char(at));
198                c1.is_word_byte() != c2.is_word_byte()
199            }
200            NotWordBoundaryAscii => {
201                let (c1, c2) = (self.previous_char(at), self.next_char(at));
202                c1.is_word_byte() == c2.is_word_byte()
203            }
204        }
205    }
206
207    fn prefix_at(
208        &self,
209        prefixes: &LiteralSearcher,
210        at: InputAt,
211    ) -> Option<InputAt> {
212        prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
213    }
214
215    fn len(&self) -> usize {
216        self.0.len()
217    }
218
219    fn as_bytes(&self) -> &[u8] {
220        self.0
221    }
222}
223
224/// An input reader over bytes.
225#[derive(Clone, Copy, Debug)]
226pub struct ByteInput<'t> {
227    text: &'t [u8],
228    only_utf8: bool,
229}
230
231impl<'t> ByteInput<'t> {
232    /// Return a new byte-based input reader for the given string.
233    pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
234        ByteInput { text, only_utf8 }
235    }
236}
237
238impl<'t> ops::Deref for ByteInput<'t> {
239    type Target = [u8];
240
241    fn deref(&self) -> &[u8] {
242        self.text
243    }
244}
245
246impl<'t> Input for ByteInput<'t> {
247    fn at(&self, i: usize) -> InputAt {
248        if i >= self.len() {
249            InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
250        } else {
251            InputAt {
252                pos: i,
253                c: None.into(),
254                byte: self.get(i).cloned(),
255                len: 1,
256            }
257        }
258    }
259
260    fn next_char(&self, at: InputAt) -> Char {
261        decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into()
262    }
263
264    fn previous_char(&self, at: InputAt) -> Char {
265        decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
266    }
267
268    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
269        use crate::prog::EmptyLook::*;
270        match empty.look {
271            StartLine => {
272                let c = self.previous_char(at);
273                at.pos() == 0 || c == '\n'
274            }
275            EndLine => {
276                let c = self.next_char(at);
277                at.pos() == self.len() || c == '\n'
278            }
279            StartText => at.pos() == 0,
280            EndText => at.pos() == self.len(),
281            WordBoundary => {
282                let (c1, c2) = (self.previous_char(at), self.next_char(at));
283                c1.is_word_char() != c2.is_word_char()
284            }
285            NotWordBoundary => {
286                let (c1, c2) = (self.previous_char(at), self.next_char(at));
287                c1.is_word_char() == c2.is_word_char()
288            }
289            WordBoundaryAscii => {
290                let (c1, c2) = (self.previous_char(at), self.next_char(at));
291                if self.only_utf8 {
292                    // If we must match UTF-8, then we can't match word
293                    // boundaries at invalid UTF-8.
294                    if c1.is_none() && !at.is_start() {
295                        return false;
296                    }
297                    if c2.is_none() && !at.is_end() {
298                        return false;
299                    }
300                }
301                c1.is_word_byte() != c2.is_word_byte()
302            }
303            NotWordBoundaryAscii => {
304                let (c1, c2) = (self.previous_char(at), self.next_char(at));
305                if self.only_utf8 {
306                    // If we must match UTF-8, then we can't match word
307                    // boundaries at invalid UTF-8.
308                    if c1.is_none() && !at.is_start() {
309                        return false;
310                    }
311                    if c2.is_none() && !at.is_end() {
312                        return false;
313                    }
314                }
315                c1.is_word_byte() == c2.is_word_byte()
316            }
317        }
318    }
319
320    fn prefix_at(
321        &self,
322        prefixes: &LiteralSearcher,
323        at: InputAt,
324    ) -> Option<InputAt> {
325        prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
326    }
327
328    fn len(&self) -> usize {
329        self.text.len()
330    }
331
332    fn as_bytes(&self) -> &[u8] {
333        self.text
334    }
335}
336
337/// An inline representation of `Option<char>`.
338///
339/// This eliminates the need to do case analysis on `Option<char>` to determine
340/// ordinality with other characters.
341///
342/// (The `Option<char>` is not related to encoding. Instead, it is used in the
343/// matching engines to represent the beginning and ending boundaries of the
344/// search text.)
345#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
346pub struct Char(u32);
347
348impl fmt::Debug for Char {
349    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
350        match char::from_u32(self.0) {
351            None => write!(f, "Empty"),
352            Some(c) => write!(f, "{:?}", c),
353        }
354    }
355}
356
357impl Char {
358    /// Returns true iff the character is absent.
359    #[inline]
360    pub fn is_none(self) -> bool {
361        self.0 == u32::MAX
362    }
363
364    /// Returns the length of the character's UTF-8 encoding.
365    ///
366    /// If the character is absent, then `1` is returned.
367    #[inline]
368    pub fn len_utf8(self) -> usize {
369        char::from_u32(self.0).map_or(1, |c| c.len_utf8())
370    }
371
372    /// Returns true iff the character is a word character.
373    ///
374    /// If the character is absent, then false is returned.
375    pub fn is_word_char(self) -> bool {
376        // is_word_character can panic if the Unicode data for \w isn't
377        // available. However, our compiler ensures that if a Unicode word
378        // boundary is used, then the data must also be available. If it isn't,
379        // then the compiler returns an error.
380        char::from_u32(self.0).map_or(false, regex_syntax::is_word_character)
381    }
382
383    /// Returns true iff the byte is a word byte.
384    ///
385    /// If the byte is absent, then false is returned.
386    pub fn is_word_byte(self) -> bool {
387        match char::from_u32(self.0) {
388            Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8),
389            None | Some(_) => false,
390        }
391    }
392}
393
394impl From<char> for Char {
395    fn from(c: char) -> Char {
396        Char(c as u32)
397    }
398}
399
400impl From<Option<char>> for Char {
401    fn from(c: Option<char>) -> Char {
402        c.map_or(Char(u32::MAX), |c| c.into())
403    }
404}
405
406impl PartialEq<char> for Char {
407    #[inline]
408    fn eq(&self, other: &char) -> bool {
409        self.0 == *other as u32
410    }
411}
412
413impl PartialEq<Char> for char {
414    #[inline]
415    fn eq(&self, other: &Char) -> bool {
416        *self as u32 == other.0
417    }
418}
419
420impl PartialOrd<char> for Char {
421    #[inline]
422    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
423        self.0.partial_cmp(&(*other as u32))
424    }
425}
426
427impl PartialOrd<Char> for char {
428    #[inline]
429    fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
430        (*self as u32).partial_cmp(&other.0)
431    }
432}