pest/
position.rs

1// pest. The Elegant Parser
2// Copyright (c) 2018 Dragoș Tiselice
3//
4// Licensed under the Apache License, Version 2.0
5// <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT
6// license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. All files in the project carrying such notice may not be copied,
8// modified, or distributed except according to those terms.
9
10use core::cmp::Ordering;
11use core::fmt;
12use core::hash::{Hash, Hasher};
13use core::ops::Range;
14use core::ptr;
15use core::str;
16
17use crate::span;
18
19/// A cursor position in a `&str` which provides useful methods to manually parse that string.
20#[derive(Clone, Copy)]
21pub struct Position<'i> {
22    input: &'i str,
23    /// # Safety:
24    ///
25    /// `input[pos..]` must be a valid codepoint boundary (should not panic when indexing thus).
26    pos: usize,
27}
28
29impl<'i> Position<'i> {
30    /// Create a new `Position` without checking invariants. (Checked with `debug_assertions`.)
31    ///
32    /// # Safety:
33    ///
34    /// `input[pos..]` must be a valid codepoint boundary (should not panic when indexing thus).
35    pub(crate) unsafe fn new_unchecked(input: &str, pos: usize) -> Position<'_> {
36        debug_assert!(input.get(pos..).is_some());
37        Position { input, pos }
38    }
39
40    /// Attempts to create a new `Position` at the given position. If the specified position is
41    /// an invalid index, or the specified position is not a valid UTF8 boundary, then None is
42    /// returned.
43    ///
44    /// # Examples
45    /// ```
46    /// # use pest::Position;
47    /// let cheart = '💖';
48    /// let heart = "💖";
49    /// assert_eq!(Position::new(heart, 1), None);
50    /// assert_ne!(Position::new(heart, cheart.len_utf8()), None);
51    /// ```
52    pub fn new(input: &str, pos: usize) -> Option<Position<'_>> {
53        input.get(pos..).map(|_| Position { input, pos })
54    }
55
56    /// Creates a `Position` at the start of a `&str`.
57    ///
58    /// # Examples
59    ///
60    /// ```
61    /// # use pest::Position;
62    /// let start = Position::from_start("");
63    /// assert_eq!(start.pos(), 0);
64    /// ```
65    #[inline]
66    pub fn from_start(input: &'i str) -> Position<'i> {
67        // Position 0 is always safe because it's always a valid UTF-8 border.
68        Position { input, pos: 0 }
69    }
70
71    /// Returns the byte position of this `Position` as a `usize`.
72    ///
73    /// # Examples
74    ///
75    /// ```
76    /// # use pest::Position;
77    /// let input = "ab";
78    /// let mut start = Position::from_start(input);
79    ///
80    /// assert_eq!(start.pos(), 0);
81    /// ```
82    #[inline]
83    pub fn pos(&self) -> usize {
84        self.pos
85    }
86
87    /// Creates a `Span` from two `Position`s.
88    ///
89    /// # Panics
90    ///
91    /// Panics if the positions come from different inputs.
92    ///
93    /// # Examples
94    ///
95    /// ```
96    /// # use pest::Position;
97    /// let input = "ab";
98    /// let start = Position::from_start(input);
99    /// let span = start.span(&start.clone());
100    ///
101    /// assert_eq!(span.start(), 0);
102    /// assert_eq!(span.end(), 0);
103    /// ```
104    #[inline]
105    pub fn span(&self, other: &Position<'i>) -> span::Span<'i> {
106        if ptr::eq(self.input, other.input)
107        /* && self.input.get(self.pos..other.pos).is_some() */
108        {
109            // This is safe because the pos field of a Position should always be a valid str index.
110            unsafe { span::Span::new_unchecked(self.input, self.pos, other.pos) }
111        } else {
112            // TODO: maybe a panic if self.pos < other.pos
113            panic!("span created from positions from different inputs")
114        }
115    }
116
117    /// Returns the line and column number of this `Position`.
118    ///
119    /// This is an O(n) operation, where n is the number of chars in the input.
120    /// You better use [`pair.line_col()`](struct.Pair.html#method.line_col) instead.
121    ///
122    /// # Examples
123    ///
124    /// ```
125    /// # use pest;
126    /// # #[allow(non_camel_case_types)]
127    /// # #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
128    /// enum Rule {}
129    ///
130    /// let input = "\na";
131    /// let mut state: Box<pest::ParserState<'_, Rule>> = pest::ParserState::new(input);
132    /// let mut result = state.match_string("\na");
133    /// assert!(result.is_ok());
134    /// assert_eq!(result.unwrap().position().line_col(), (2, 2));
135    /// ```
136    #[inline]
137    pub fn line_col(&self) -> (usize, usize) {
138        if self.pos > self.input.len() {
139            panic!("position out of bounds");
140        }
141        let mut pos = self.pos;
142        let slice = &self.input[..pos];
143        let mut chars = slice.chars().peekable();
144
145        let mut line_col = (1, 1);
146
147        while pos != 0 {
148            match chars.next() {
149                Some('\r') => {
150                    if let Some(&'\n') = chars.peek() {
151                        chars.next();
152
153                        if pos == 1 {
154                            pos -= 1;
155                        } else {
156                            pos -= 2;
157                        }
158
159                        line_col = (line_col.0 + 1, 1);
160                    } else {
161                        pos -= 1;
162                        line_col = (line_col.0, line_col.1 + 1);
163                    }
164                }
165                Some('\n') => {
166                    pos -= 1;
167                    line_col = (line_col.0 + 1, 1);
168                }
169                Some(c) => {
170                    pos -= c.len_utf8();
171                    line_col = (line_col.0, line_col.1 + 1);
172                }
173                None => unreachable!(),
174            }
175        }
176
177        line_col
178    }
179
180    /// Returns the entire line of the input that contains this `Position`.
181    ///
182    /// # Examples
183    ///
184    /// ```
185    /// # use pest;
186    /// # #[allow(non_camel_case_types)]
187    /// # #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
188    /// enum Rule {}
189    ///
190    /// let input = "\na";
191    /// let mut state: Box<pest::ParserState<'_, Rule>> = pest::ParserState::new(input);
192    /// let mut result = state.match_string("\na");
193    /// assert!(result.is_ok());
194    /// assert_eq!(result.unwrap().position().line_of(), "a");
195    /// ```
196    #[inline]
197    pub fn line_of(&self) -> &'i str {
198        if self.pos > self.input.len() {
199            panic!("position out of bounds");
200        };
201        // Safe since start and end can only be valid UTF-8 borders.
202        &self.input[self.find_line_start()..self.find_line_end()]
203    }
204
205    pub(crate) fn find_line_start(&self) -> usize {
206        if self.input.is_empty() {
207            return 0;
208        };
209        // Position's pos is always a UTF-8 border.
210        let start = self
211            .input
212            .char_indices()
213            .rev()
214            .skip_while(|&(i, _)| i >= self.pos)
215            .find(|&(_, c)| c == '\n');
216        match start {
217            Some((i, _)) => i + 1,
218            None => 0,
219        }
220    }
221
222    pub(crate) fn find_line_end(&self) -> usize {
223        if self.input.is_empty() {
224            0
225        } else if self.pos == self.input.len() - 1 {
226            self.input.len()
227        } else {
228            // Position's pos is always a UTF-8 border.
229            let end = self
230                .input
231                .char_indices()
232                .skip_while(|&(i, _)| i < self.pos)
233                .find(|&(_, c)| c == '\n');
234            match end {
235                Some((i, _)) => i + 1,
236                None => self.input.len(),
237            }
238        }
239    }
240
241    /// Returns `true` when the `Position` points to the start of the input `&str`.
242    #[inline]
243    pub(crate) fn at_start(&self) -> bool {
244        self.pos == 0
245    }
246
247    /// Returns `true` when the `Position` points to the end of the input `&str`.
248    #[inline]
249    pub(crate) fn at_end(&self) -> bool {
250        self.pos == self.input.len()
251    }
252
253    /// Skips `n` `char`s from the `Position` and returns `true` if the skip was possible or `false`
254    /// otherwise. If the return value is `false`, `pos` will not be updated.
255    #[inline]
256    pub(crate) fn skip(&mut self, n: usize) -> bool {
257        let skipped = {
258            let mut len = 0;
259            // Position's pos is always a UTF-8 border.
260            let mut chars = self.input[self.pos..].chars();
261            for _ in 0..n {
262                if let Some(c) = chars.next() {
263                    len += c.len_utf8();
264                } else {
265                    return false;
266                }
267            }
268            len
269        };
270
271        self.pos += skipped;
272        true
273    }
274
275    /// Goes back `n` `char`s from the `Position` and returns `true` if the skip was possible or `false`
276    /// otherwise. If the return value is `false`, `pos` will not be updated.
277    #[inline]
278    pub(crate) fn skip_back(&mut self, n: usize) -> bool {
279        let skipped = {
280            let mut len = 0;
281            // Position's pos is always a UTF-8 border.
282            let mut chars = self.input[..self.pos].chars().rev();
283            for _ in 0..n {
284                if let Some(c) = chars.next() {
285                    len += c.len_utf8();
286                } else {
287                    return false;
288                }
289            }
290            len
291        };
292
293        self.pos -= skipped;
294        true
295    }
296
297    /// Skips until one of the given `strings` is found. If none of the `strings` can be found,
298    /// this function will return `false` but its `pos` will *still* be updated.
299    #[inline]
300    pub(crate) fn skip_until(&mut self, strings: &[&str]) -> bool {
301        #[cfg(not(feature = "memchr"))]
302        {
303            self.skip_until_basic(strings)
304        }
305        #[cfg(feature = "memchr")]
306        {
307            match strings {
308                [] => (),
309                [s1] => {
310                    if let Some(from) =
311                        memchr::memmem::find(&self.input.as_bytes()[self.pos..], s1.as_bytes())
312                    {
313                        self.pos += from;
314                        return true;
315                    }
316                }
317                [s1, s2] if !s1.is_empty() && !s2.is_empty() => {
318                    let b1 = s1.as_bytes()[0];
319                    let b2 = s2.as_bytes()[0];
320                    let miter = memchr::memchr2_iter(b1, b2, &self.input.as_bytes()[self.pos..]);
321                    for from in miter {
322                        let start = &self.input[self.pos + from..];
323                        if start.starts_with(s1) || start.starts_with(s2) {
324                            self.pos += from;
325                            return true;
326                        }
327                    }
328                }
329                [s1, s2, s3] if !s1.is_empty() && !s2.is_empty() && s3.is_empty() => {
330                    let b1 = s1.as_bytes()[0];
331                    let b2 = s2.as_bytes()[0];
332                    let b3 = s2.as_bytes()[0];
333                    let miter =
334                        memchr::memchr3_iter(b1, b2, b3, &self.input.as_bytes()[self.pos..]);
335                    for from in miter {
336                        let start = &self.input[self.pos + from..];
337                        if start.starts_with(s1) || start.starts_with(s2) || start.starts_with(s3) {
338                            self.pos += from;
339                            return true;
340                        }
341                    }
342                }
343                _ => {
344                    return self.skip_until_basic(strings);
345                }
346            }
347            self.pos = self.input.len();
348            false
349        }
350    }
351
352    #[inline]
353    fn skip_until_basic(&mut self, strings: &[&str]) -> bool {
354        // TODO: optimize with Aho-Corasick, e.g. https://crates.io/crates/daachorse?
355        for from in self.pos..self.input.len() {
356            let bytes = if let Some(string) = self.input.get(from..) {
357                string.as_bytes()
358            } else {
359                continue;
360            };
361
362            for slice in strings.iter() {
363                let to = slice.len();
364                if Some(slice.as_bytes()) == bytes.get(0..to) {
365                    self.pos = from;
366                    return true;
367                }
368            }
369        }
370
371        self.pos = self.input.len();
372        false
373    }
374
375    /// Matches the char at the `Position` against a specified character and returns `true` if a match
376    /// was made. If no match was made, returns `false`.
377    /// `pos` will not be updated in either case.
378    #[inline]
379    pub(crate) fn match_char(&self, c: char) -> bool {
380        matches!(self.input[self.pos..].chars().next(), Some(cc) if c == cc)
381    }
382
383    /// Matches the char at the `Position` against a filter function and returns `true` if a match
384    /// was made. If no match was made, returns `false` and `pos` will not be updated.
385    #[inline]
386    pub(crate) fn match_char_by<F>(&mut self, f: F) -> bool
387    where
388        F: FnOnce(char) -> bool,
389    {
390        if let Some(c) = self.input[self.pos..].chars().next() {
391            if f(c) {
392                self.pos += c.len_utf8();
393                true
394            } else {
395                false
396            }
397        } else {
398            false
399        }
400    }
401
402    /// Matches `string` from the `Position` and returns `true` if a match was made or `false`
403    /// otherwise. If no match was made, `pos` will not be updated.
404    #[inline]
405    pub(crate) fn match_string(&mut self, string: &str) -> bool {
406        let to = self.pos + string.len();
407
408        if Some(string.as_bytes()) == self.input.as_bytes().get(self.pos..to) {
409            self.pos = to;
410            true
411        } else {
412            false
413        }
414    }
415
416    /// Case-insensitively matches `string` from the `Position` and returns `true` if a match was
417    /// made or `false` otherwise. If no match was made, `pos` will not be updated.
418    #[inline]
419    pub(crate) fn match_insensitive(&mut self, string: &str) -> bool {
420        let matched = {
421            let slice = &self.input[self.pos..];
422            if let Some(slice) = slice.get(0..string.len()) {
423                slice.eq_ignore_ascii_case(string)
424            } else {
425                false
426            }
427        };
428
429        if matched {
430            self.pos += string.len();
431            true
432        } else {
433            false
434        }
435    }
436
437    /// Matches `char` `range` from the `Position` and returns `true` if a match was made or `false`
438    /// otherwise. If no match was made, `pos` will not be updated.
439    #[inline]
440    pub(crate) fn match_range(&mut self, range: Range<char>) -> bool {
441        if let Some(c) = self.input[self.pos..].chars().next() {
442            if range.start <= c && c <= range.end {
443                self.pos += c.len_utf8();
444                return true;
445            }
446        }
447
448        false
449    }
450}
451
452impl<'i> fmt::Debug for Position<'i> {
453    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
454        f.debug_struct("Position").field("pos", &self.pos).finish()
455    }
456}
457
458impl<'i> PartialEq for Position<'i> {
459    fn eq(&self, other: &Position<'i>) -> bool {
460        ptr::eq(self.input, other.input) && self.pos == other.pos
461    }
462}
463
464impl<'i> Eq for Position<'i> {}
465
466#[allow(clippy::non_canonical_partial_ord_impl)]
467impl<'i> PartialOrd for Position<'i> {
468    fn partial_cmp(&self, other: &Position<'i>) -> Option<Ordering> {
469        if ptr::eq(self.input, other.input) {
470            self.pos.partial_cmp(&other.pos)
471        } else {
472            None
473        }
474    }
475}
476
477impl<'i> Ord for Position<'i> {
478    fn cmp(&self, other: &Position<'i>) -> Ordering {
479        self.partial_cmp(other)
480            .expect("cannot compare positions from different strs")
481    }
482}
483
484impl<'i> Hash for Position<'i> {
485    fn hash<H: Hasher>(&self, state: &mut H) {
486        (self.input as *const str).hash(state);
487        self.pos.hash(state);
488    }
489}
490
491#[cfg(test)]
492mod tests {
493    use super::*;
494
495    #[test]
496    fn empty() {
497        let input = "";
498        assert!(Position::new(input, 0).unwrap().match_string(""));
499        assert!(!Position::new(input, 0).unwrap().match_string("a"));
500    }
501
502    #[test]
503    fn parts() {
504        let input = "asdasdf";
505
506        assert!(Position::new(input, 0).unwrap().match_string("asd"));
507        assert!(Position::new(input, 3).unwrap().match_string("asdf"));
508    }
509
510    #[test]
511    fn line_col() {
512        let input = "a\rb\nc\r\nd嗨";
513
514        assert_eq!(Position::new(input, 0).unwrap().line_col(), (1, 1));
515        assert_eq!(Position::new(input, 1).unwrap().line_col(), (1, 2));
516        assert_eq!(Position::new(input, 2).unwrap().line_col(), (1, 3));
517        assert_eq!(Position::new(input, 3).unwrap().line_col(), (1, 4));
518        assert_eq!(Position::new(input, 4).unwrap().line_col(), (2, 1));
519        assert_eq!(Position::new(input, 5).unwrap().line_col(), (2, 2));
520        assert_eq!(Position::new(input, 6).unwrap().line_col(), (2, 3));
521        assert_eq!(Position::new(input, 7).unwrap().line_col(), (3, 1));
522        assert_eq!(Position::new(input, 8).unwrap().line_col(), (3, 2));
523        assert_eq!(Position::new(input, 11).unwrap().line_col(), (3, 3));
524        let input = "abcd嗨";
525        assert_eq!(Position::new(input, 7).unwrap().line_col(), (1, 6));
526    }
527
528    #[test]
529    fn line_of() {
530        let input = "a\rb\nc\r\nd嗨";
531
532        assert_eq!(Position::new(input, 0).unwrap().line_of(), "a\rb\n");
533        assert_eq!(Position::new(input, 1).unwrap().line_of(), "a\rb\n");
534        assert_eq!(Position::new(input, 2).unwrap().line_of(), "a\rb\n");
535        assert_eq!(Position::new(input, 3).unwrap().line_of(), "a\rb\n");
536        assert_eq!(Position::new(input, 4).unwrap().line_of(), "c\r\n");
537        assert_eq!(Position::new(input, 5).unwrap().line_of(), "c\r\n");
538        assert_eq!(Position::new(input, 6).unwrap().line_of(), "c\r\n");
539        assert_eq!(Position::new(input, 7).unwrap().line_of(), "d嗨");
540        assert_eq!(Position::new(input, 8).unwrap().line_of(), "d嗨");
541        assert_eq!(Position::new(input, 11).unwrap().line_of(), "d嗨");
542    }
543
544    #[test]
545    fn line_of_empty() {
546        let input = "";
547
548        assert_eq!(Position::new(input, 0).unwrap().line_of(), "");
549    }
550
551    #[test]
552    fn line_of_new_line() {
553        let input = "\n";
554
555        assert_eq!(Position::new(input, 0).unwrap().line_of(), "\n");
556    }
557
558    #[test]
559    fn line_of_between_new_line() {
560        let input = "\n\n";
561
562        assert_eq!(Position::new(input, 1).unwrap().line_of(), "\n");
563    }
564
565    fn measure_skip(input: &str, pos: usize, n: usize) -> Option<usize> {
566        let mut p = Position::new(input, pos).unwrap();
567        if p.skip(n) {
568            Some(p.pos - pos)
569        } else {
570            None
571        }
572    }
573
574    #[test]
575    fn skip_empty() {
576        let input = "";
577
578        assert_eq!(measure_skip(input, 0, 0), Some(0));
579        assert_eq!(measure_skip(input, 0, 1), None);
580    }
581
582    #[test]
583    fn skip() {
584        let input = "d嗨";
585
586        assert_eq!(measure_skip(input, 0, 0), Some(0));
587        assert_eq!(measure_skip(input, 0, 1), Some(1));
588        assert_eq!(measure_skip(input, 1, 1), Some(3));
589    }
590
591    #[test]
592    fn skip_until() {
593        let input = "ab ac";
594        let pos = Position::from_start(input);
595
596        let mut test_pos = pos;
597        test_pos.skip_until(&["a", "b"]);
598        assert_eq!(test_pos.pos(), 0);
599
600        test_pos = pos;
601        test_pos.skip_until(&["b"]);
602        assert_eq!(test_pos.pos(), 1);
603
604        test_pos = pos;
605        test_pos.skip_until(&["ab"]);
606        assert_eq!(test_pos.pos(), 0);
607
608        test_pos = pos;
609        test_pos.skip_until(&["ac", "z"]);
610        assert_eq!(test_pos.pos(), 3);
611
612        test_pos = pos;
613        assert!(!test_pos.skip_until(&["z"]));
614        assert_eq!(test_pos.pos(), 5);
615    }
616
617    #[test]
618    fn match_range() {
619        let input = "b";
620
621        assert!(Position::new(input, 0).unwrap().match_range('a'..'c'));
622        assert!(Position::new(input, 0).unwrap().match_range('b'..'b'));
623        assert!(!Position::new(input, 0).unwrap().match_range('a'..'a'));
624        assert!(!Position::new(input, 0).unwrap().match_range('c'..'c'));
625        assert!(Position::new(input, 0).unwrap().match_range('a'..'嗨'));
626    }
627
628    #[test]
629    fn match_insensitive() {
630        let input = "AsdASdF";
631
632        assert!(Position::new(input, 0).unwrap().match_insensitive("asd"));
633        assert!(Position::new(input, 3).unwrap().match_insensitive("asdf"));
634    }
635
636    #[test]
637    fn cmp() {
638        let input = "a";
639        let start = Position::from_start(input);
640        let mut end = start;
641
642        assert!(end.skip(1));
643        let result = start.cmp(&end);
644
645        assert_eq!(result, Ordering::Less);
646    }
647
648    #[test]
649    #[should_panic]
650    fn cmp_panic() {
651        let input1 = "a";
652        let input2 = "b";
653        let pos1 = Position::from_start(input1);
654        let pos2 = Position::from_start(input2);
655
656        let _ = pos1.cmp(&pos2);
657    }
658
659    #[test]
660    #[cfg(feature = "std")]
661    fn hash() {
662        use std::collections::HashSet;
663
664        let input = "a";
665        let start = Position::from_start(input);
666        let mut positions = HashSet::new();
667
668        positions.insert(start);
669    }
670}