toml/
tokens.rs

1use std::borrow::Cow;
2use std::char;
3use std::str;
4use std::string;
5use std::string::String as StdString;
6
7use self::Token::*;
8
/// Half-open byte range `start..end` locating a token within the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Byte offset of the first byte covered by the span.
    pub start: usize,
    /// Byte offset one past the last byte covered by the span (exclusive).
    pub end: usize,
}
17
18impl From<Span> for (usize, usize) {
19    fn from(Span { start, end }: Span) -> (usize, usize) {
20        (start, end)
21    }
22}
23
/// A single lexical token borrowed from the tokenizer's input.
#[derive(Debug, PartialEq, Eq)]
pub enum Token<'a> {
    /// A run of spaces and/or tabs; carries the matched text.
    Whitespace(&'a str),
    /// A line ending (`\n`, or `\r\n` folded into one newline).
    Newline,
    /// A comment; the text starts at the `#`.
    Comment(&'a str),

    /// `=`
    Equals,
    /// `.`
    Period,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,

    /// A run of ASCII letters, digits, `-` and `_`.
    Keylike(&'a str),
    /// A quoted string, either literal (`'`) or basic (`"`).
    String {
        /// The raw source text of the string, delimiters included.
        src: &'a str,
        /// The string's contents; borrowed from the input when possible.
        val: Cow<'a, str>,
        /// Whether the string used the triple-delimiter (multi-line) form.
        multiline: bool,
    },
}
47
/// Errors produced while tokenizing. Every `usize` is a byte offset into the
/// input.
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    /// A character that is not allowed inside a string, and its offset.
    InvalidCharInString(usize, char),
    /// An unrecognized character following a backslash, and its offset.
    InvalidEscape(usize, char),
    /// A non-hexadecimal character inside a `\u`/`\U` escape, and its offset.
    InvalidHexEscape(usize, char),
    /// A `\u`/`\U` escape whose value is not a valid `char`; the offset is
    /// that of the `u`/`U`.
    InvalidEscapeValue(usize, u32),
    /// A newline inside a single-line string, at the given offset.
    NewlineInString(usize),
    /// A character that cannot start any token, and its offset.
    Unexpected(usize, char),
    /// Input ended inside a string; the offset is where the string started.
    UnterminatedString(usize),
    /// A string used as a table key contains a newline at the given offset.
    NewlineInTableKey(usize),
    /// A multi-line string was used as a table key; offset of the string.
    MultilineStringKey(usize),
    /// An empty string was used as a table key; offset of the string.
    EmptyTableKey(usize),
    /// A specific token was required but something else was found.
    Wanted {
        /// Offset at which the expected token should have started.
        at: usize,
        /// Description of what was expected.
        expected: &'static str,
        /// Description of what was found instead.
        found: &'static str,
    },
}
66
67#[derive(Clone)]
68pub struct Tokenizer<'a> {
69    input: &'a str,
70    chars: CrlfFold<'a>,
71}
72
/// Character cursor wrapping [`str::CharIndices`]; its `Iterator`
/// implementation reports a `\r\n` pair as a single `\n`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte offset, char)` iterator over the input.
    chars: str::CharIndices<'a>,
}
77
/// Accumulator for a string's contents that avoids allocating while the
/// contents can still be expressed as a slice of the input.
#[derive(Debug)]
enum MaybeString {
    /// Nothing has required a copy yet; holds the byte offset at which the
    /// contents begin in the input.
    NotEscaped(usize),
    /// The contents have been copied into an owned buffer.
    Owned(string::String),
}
83
84impl<'a> Tokenizer<'a> {
85    pub fn new(input: &'a str) -> Tokenizer<'a> {
86        let mut t = Tokenizer {
87            input,
88            chars: CrlfFold {
89                chars: input.char_indices(),
90            },
91        };
92        // Eat utf-8 BOM
93        t.eatc('\u{feff}');
94        t
95    }
96
97    pub fn next(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
98        let (start, token) = match self.one() {
99            Some((start, '\n')) => (start, Newline),
100            Some((start, ' ')) => (start, self.whitespace_token(start)),
101            Some((start, '\t')) => (start, self.whitespace_token(start)),
102            Some((start, '#')) => (start, self.comment_token(start)),
103            Some((start, '=')) => (start, Equals),
104            Some((start, '.')) => (start, Period),
105            Some((start, ',')) => (start, Comma),
106            Some((start, ':')) => (start, Colon),
107            Some((start, '+')) => (start, Plus),
108            Some((start, '{')) => (start, LeftBrace),
109            Some((start, '}')) => (start, RightBrace),
110            Some((start, '[')) => (start, LeftBracket),
111            Some((start, ']')) => (start, RightBracket),
112            Some((start, '\'')) => {
113                return self
114                    .literal_string(start)
115                    .map(|t| Some((self.step_span(start), t)))
116            }
117            Some((start, '"')) => {
118                return self
119                    .basic_string(start)
120                    .map(|t| Some((self.step_span(start), t)))
121            }
122            Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),
123
124            Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
125            None => return Ok(None),
126        };
127
128        let span = self.step_span(start);
129        Ok(Some((span, token)))
130    }
131
132    pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
133        self.clone().next()
134    }
135
136    pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
137        self.eat_spanned(expected).map(|s| s.is_some())
138    }
139
140    /// Eat a value, returning it's span if it was consumed.
141    pub fn eat_spanned(&mut self, expected: Token<'a>) -> Result<Option<Span>, Error> {
142        let span = match self.peek()? {
143            Some((span, ref found)) if expected == *found => span,
144            Some(_) => return Ok(None),
145            None => return Ok(None),
146        };
147
148        drop(self.next());
149        Ok(Some(span))
150    }
151
152    pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
153        // ignore span
154        let _ = self.expect_spanned(expected)?;
155        Ok(())
156    }
157
158    /// Expect the given token returning its span.
159    pub fn expect_spanned(&mut self, expected: Token<'a>) -> Result<Span, Error> {
160        let current = self.current();
161        match self.next()? {
162            Some((span, found)) => {
163                if expected == found {
164                    Ok(span)
165                } else {
166                    Err(Error::Wanted {
167                        at: current,
168                        expected: expected.describe(),
169                        found: found.describe(),
170                    })
171                }
172            }
173            None => Err(Error::Wanted {
174                at: self.input.len(),
175                expected: expected.describe(),
176                found: "eof",
177            }),
178        }
179    }
180
181    pub fn table_key(&mut self) -> Result<(Span, Cow<'a, str>), Error> {
182        let current = self.current();
183        match self.next()? {
184            Some((span, Token::Keylike(k))) => Ok((span, k.into())),
185            Some((
186                span,
187                Token::String {
188                    src,
189                    val,
190                    multiline,
191                },
192            )) => {
193                let offset = self.substr_offset(src);
194                if multiline {
195                    return Err(Error::MultilineStringKey(offset));
196                }
197                if val == "" {
198                    return Err(Error::EmptyTableKey(offset));
199                }
200                match src.find('\n') {
201                    None => Ok((span, val)),
202                    Some(i) => Err(Error::NewlineInTableKey(offset + i)),
203                }
204            }
205            Some((_, other)) => Err(Error::Wanted {
206                at: current,
207                expected: "a table key",
208                found: other.describe(),
209            }),
210            None => Err(Error::Wanted {
211                at: self.input.len(),
212                expected: "a table key",
213                found: "eof",
214            }),
215        }
216    }
217
218    pub fn eat_whitespace(&mut self) -> Result<(), Error> {
219        while self.eatc(' ') || self.eatc('\t') {
220            // ...
221        }
222        Ok(())
223    }
224
225    pub fn eat_comment(&mut self) -> Result<bool, Error> {
226        if !self.eatc('#') {
227            return Ok(false);
228        }
229        drop(self.comment_token(0));
230        self.eat_newline_or_eof().map(|()| true)
231    }
232
233    pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> {
234        let current = self.current();
235        match self.next()? {
236            None | Some((_, Token::Newline)) => Ok(()),
237            Some((_, other)) => Err(Error::Wanted {
238                at: current,
239                expected: "newline",
240                found: other.describe(),
241            }),
242        }
243    }
244
245    pub fn skip_to_newline(&mut self) {
246        loop {
247            match self.one() {
248                Some((_, '\n')) | None => break,
249                _ => {}
250            }
251        }
252    }
253
254    fn eatc(&mut self, ch: char) -> bool {
255        match self.chars.clone().next() {
256            Some((_, ch2)) if ch == ch2 => {
257                self.one();
258                true
259            }
260            _ => false,
261        }
262    }
263
264    pub fn current(&mut self) -> usize {
265        self.chars
266            .clone()
267            .next()
268            .map(|i| i.0)
269            .unwrap_or_else(|| self.input.len())
270    }
271
272    pub fn input(&self) -> &'a str {
273        self.input
274    }
275
276    fn whitespace_token(&mut self, start: usize) -> Token<'a> {
277        while self.eatc(' ') || self.eatc('\t') {
278            // ...
279        }
280        Whitespace(&self.input[start..self.current()])
281    }
282
283    fn comment_token(&mut self, start: usize) -> Token<'a> {
284        while let Some((_, ch)) = self.chars.clone().next() {
285            if ch != '\t' && (ch < '\u{20}' || ch > '\u{10ffff}') {
286                break;
287            }
288            self.one();
289        }
290        Comment(&self.input[start..self.current()])
291    }
292
293    fn read_string(
294        &mut self,
295        delim: char,
296        start: usize,
297        new_ch: &mut dyn FnMut(
298            &mut Tokenizer<'_>,
299            &mut MaybeString,
300            bool,
301            usize,
302            char,
303        ) -> Result<(), Error>,
304    ) -> Result<Token<'a>, Error> {
305        let mut multiline = false;
306        if self.eatc(delim) {
307            if self.eatc(delim) {
308                multiline = true;
309            } else {
310                return Ok(String {
311                    src: &self.input[start..start + 2],
312                    val: Cow::Borrowed(""),
313                    multiline: false,
314                });
315            }
316        }
317        let mut val = MaybeString::NotEscaped(self.current());
318        let mut n = 0;
319        'outer: loop {
320            n += 1;
321            match self.one() {
322                Some((i, '\n')) => {
323                    if multiline {
324                        if self.input.as_bytes()[i] == b'\r' {
325                            val.to_owned(&self.input[..i]);
326                        }
327                        if n == 1 {
328                            val = MaybeString::NotEscaped(self.current());
329                        } else {
330                            val.push('\n');
331                        }
332                        continue;
333                    } else {
334                        return Err(Error::NewlineInString(i));
335                    }
336                }
337                Some((i, ch)) if ch == delim => {
338                    if multiline {
339                        if !self.eatc(delim) {
340                            val.push(delim);
341                            continue 'outer;
342                        }
343                        if !self.eatc(delim) {
344                            val.push(delim);
345                            val.push(delim);
346                            continue 'outer;
347                        }
348                    }
349                    return Ok(String {
350                        src: &self.input[start..self.current()],
351                        val: val.into_cow(&self.input[..i]),
352                        multiline,
353                    });
354                }
355                Some((i, c)) => new_ch(self, &mut val, multiline, i, c)?,
356                None => return Err(Error::UnterminatedString(start)),
357            }
358        }
359    }
360
361    fn literal_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
362        self.read_string('\'', start, &mut |_me, val, _multi, i, ch| {
363            if ch == '\u{09}' || ('\u{20}' <= ch && ch <= '\u{10ffff}' && ch != '\u{7f}') {
364                val.push(ch);
365                Ok(())
366            } else {
367                Err(Error::InvalidCharInString(i, ch))
368            }
369        })
370    }
371
372    fn basic_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
373        self.read_string('"', start, &mut |me, val, multi, i, ch| match ch {
374            '\\' => {
375                val.to_owned(&me.input[..i]);
376                match me.chars.next() {
377                    Some((_, '"')) => val.push('"'),
378                    Some((_, '\\')) => val.push('\\'),
379                    Some((_, 'b')) => val.push('\u{8}'),
380                    Some((_, 'f')) => val.push('\u{c}'),
381                    Some((_, 'n')) => val.push('\n'),
382                    Some((_, 'r')) => val.push('\r'),
383                    Some((_, 't')) => val.push('\t'),
384                    Some((i, c @ 'u')) | Some((i, c @ 'U')) => {
385                        let len = if c == 'u' { 4 } else { 8 };
386                        val.push(me.hex(start, i, len)?);
387                    }
388                    Some((i, c @ ' ')) | Some((i, c @ '\t')) | Some((i, c @ '\n')) if multi => {
389                        if c != '\n' {
390                            while let Some((_, ch)) = me.chars.clone().next() {
391                                match ch {
392                                    ' ' | '\t' => {
393                                        me.chars.next();
394                                        continue;
395                                    }
396                                    '\n' => {
397                                        me.chars.next();
398                                        break;
399                                    }
400                                    _ => return Err(Error::InvalidEscape(i, c)),
401                                }
402                            }
403                        }
404                        while let Some((_, ch)) = me.chars.clone().next() {
405                            match ch {
406                                ' ' | '\t' | '\n' => {
407                                    me.chars.next();
408                                }
409                                _ => break,
410                            }
411                        }
412                    }
413                    Some((i, c)) => return Err(Error::InvalidEscape(i, c)),
414                    None => return Err(Error::UnterminatedString(start)),
415                }
416                Ok(())
417            }
418            ch if ch == '\u{09}' || ('\u{20}' <= ch && ch <= '\u{10ffff}' && ch != '\u{7f}') => {
419                val.push(ch);
420                Ok(())
421            }
422            _ => Err(Error::InvalidCharInString(i, ch)),
423        })
424    }
425
426    fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> {
427        let mut buf = StdString::with_capacity(len);
428        for _ in 0..len {
429            match self.one() {
430                Some((_, ch)) if ch as u32 <= 0x7F && ch.is_digit(16) => buf.push(ch),
431                Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)),
432                None => return Err(Error::UnterminatedString(start)),
433            }
434        }
435        let val = u32::from_str_radix(&buf, 16).unwrap();
436        match char::from_u32(val) {
437            Some(ch) => Ok(ch),
438            None => Err(Error::InvalidEscapeValue(i, val)),
439        }
440    }
441
442    fn keylike(&mut self, start: usize) -> Token<'a> {
443        while let Some((_, ch)) = self.peek_one() {
444            if !is_keylike(ch) {
445                break;
446            }
447            self.one();
448        }
449        Keylike(&self.input[start..self.current()])
450    }
451
452    pub fn substr_offset(&self, s: &'a str) -> usize {
453        assert!(s.len() <= self.input.len());
454        let a = self.input.as_ptr() as usize;
455        let b = s.as_ptr() as usize;
456        assert!(a <= b);
457        b - a
458    }
459
460    /// Calculate the span of a single character.
461    fn step_span(&mut self, start: usize) -> Span {
462        let end = self
463            .peek_one()
464            .map(|t| t.0)
465            .unwrap_or_else(|| self.input.len());
466        Span { start, end }
467    }
468
469    /// Peek one char without consuming it.
470    fn peek_one(&mut self) -> Option<(usize, char)> {
471        self.chars.clone().next()
472    }
473
474    /// Take one char.
475    pub fn one(&mut self) -> Option<(usize, char)> {
476        self.chars.next()
477    }
478}
479
480impl<'a> Iterator for CrlfFold<'a> {
481    type Item = (usize, char);
482
483    fn next(&mut self) -> Option<(usize, char)> {
484        self.chars.next().map(|(i, c)| {
485            if c == '\r' {
486                let mut attempt = self.chars.clone();
487                if let Some((_, '\n')) = attempt.next() {
488                    self.chars = attempt;
489                    return (i, '\n');
490                }
491            }
492            (i, c)
493        })
494    }
495}
496
497impl MaybeString {
498    fn push(&mut self, ch: char) {
499        match *self {
500            MaybeString::NotEscaped(..) => {}
501            MaybeString::Owned(ref mut s) => s.push(ch),
502        }
503    }
504
505    fn to_owned(&mut self, input: &str) {
506        match *self {
507            MaybeString::NotEscaped(start) => {
508                *self = MaybeString::Owned(input[start..].to_owned());
509            }
510            MaybeString::Owned(..) => {}
511        }
512    }
513
514    fn into_cow(self, input: &str) -> Cow<'_, str> {
515        match self {
516            MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]),
517            MaybeString::Owned(s) => Cow::Owned(s),
518        }
519    }
520}
521
/// Returns `true` if `ch` may appear in a keylike token: an ASCII letter, an
/// ASCII digit, `-` or `_`.
fn is_keylike(ch: char) -> bool {
    match ch {
        'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_' => true,
        _ => false,
    }
}
529
530impl<'a> Token<'a> {
531    pub fn describe(&self) -> &'static str {
532        match *self {
533            Token::Keylike(_) => "an identifier",
534            Token::Equals => "an equals",
535            Token::Period => "a period",
536            Token::Comment(_) => "a comment",
537            Token::Newline => "a newline",
538            Token::Whitespace(_) => "whitespace",
539            Token::Comma => "a comma",
540            Token::RightBrace => "a right brace",
541            Token::LeftBrace => "a left brace",
542            Token::RightBracket => "a right bracket",
543            Token::LeftBracket => "a left bracket",
544            Token::String { multiline, .. } => {
545                if multiline {
546                    "a multiline string"
547                } else {
548                    "a string"
549                }
550            }
551            Token::Colon => "a colon",
552            Token::Plus => "a plus",
553        }
554    }
555}
556
#[cfg(test)]
mod tests {
    use super::{Error, Token, Tokenizer};
    use std::borrow::Cow;

    /// Asserts that the first token of `input` fails with `err` and that the
    /// tokenizer reports end of input afterwards.
    fn err(input: &str, err: Error) {
        let mut tokenizer = Tokenizer::new(input);
        assert_eq!(tokenizer.next().unwrap_err(), err);
        assert!(tokenizer.next().unwrap().is_none());
    }

    /// Asserts that `input` is exactly one string token with contents `val`.
    fn single_string(input: &str, val: &str, multiline: bool) {
        let mut tokenizer = Tokenizer::new(input);
        let (_, token) = tokenizer.next().unwrap().unwrap();
        assert_eq!(
            token,
            Token::String {
                src: input,
                val: Cow::Borrowed(val),
                multiline: multiline,
            }
        );
        assert!(tokenizer.next().unwrap().is_none());
    }

    #[test]
    fn literal_strings() {
        let cases = [
            ("''", "", false),
            ("''''''", "", true),
            ("'''\n'''", "", true),
            ("'a'", "a", false),
            ("'\"a'", "\"a", false),
            ("''''a'''", "'a", true),
            ("'''\n'a\n'''", "'a\n", true),
            ("'''a\n'a\r\n'''", "a\n'a\n", true),
        ];
        for &(input, val, multiline) in cases.iter() {
            single_string(input, val, multiline);
        }
    }

    #[test]
    fn basic_strings() {
        let cases = [
            (r#""""#, "", false),
            (r#""""""""#, "", true),
            (r#""a""#, "a", false),
            (r#""""a""""#, "a", true),
            (r#""\t""#, "\t", false),
            (r#""\u0000""#, "\0", false),
            (r#""\U00000000""#, "\0", false),
            (r#""\U000A0000""#, "\u{A0000}", false),
            (r#""\\t""#, "\\t", false),
            ("\"\t\"", "\t", false),
            ("\"\"\"\n\t\"\"\"", "\t", true),
            ("\"\"\"\\\n\"\"\"", "", true),
            (
                "\"\"\"\\\n     \t   \t  \\\r\n  \t \n  \t \r\n\"\"\"",
                "",
                true,
            ),
            (r#""\r""#, "\r", false),
            (r#""\n""#, "\n", false),
            (r#""\b""#, "\u{8}", false),
            (r#""a\fa""#, "a\u{c}a", false),
            (r#""\"a""#, "\"a", false),
            ("\"\"\"\na\"\"\"", "a", true),
            ("\"\"\"\n\"\"\"", "", true),
            (r#""""a\"""b""""#, "a\"\"\"b", true),
        ];
        for &(input, val, multiline) in cases.iter() {
            single_string(input, val, multiline);
        }

        err(r#""\a"#, Error::InvalidEscape(2, 'a'));
        err("\"\\\n", Error::InvalidEscape(2, '\n'));
        err("\"\\\r\n", Error::InvalidEscape(2, '\n'));
        err("\"\\", Error::UnterminatedString(0));
        err("\"\u{0}", Error::InvalidCharInString(1, '\u{0}'));
        err(r#""\U00""#, Error::InvalidHexEscape(5, '"'));
        err(r#""\U00"#, Error::UnterminatedString(0));
        err(r#""\uD800"#, Error::InvalidEscapeValue(2, 0xd800));
        err(r#""\UFFFFFFFF"#, Error::InvalidEscapeValue(2, 0xffff_ffff));
    }

    #[test]
    fn keylike() {
        let inputs = ["foo", "0bar", "bar0", "1234", "a-b", "a_B", "-_-", "___"];
        for &input in inputs.iter() {
            let mut tokenizer = Tokenizer::new(input);
            let (_, token) = tokenizer.next().unwrap().unwrap();
            assert_eq!(token, Token::Keylike(input));
            assert!(tokenizer.next().unwrap().is_none());
        }
    }

    #[test]
    fn all() {
        /// Tokenizes all of `input` and compares every `(span, token, text)`
        /// triple against `expected`.
        fn check(input: &str, expected: &[((usize, usize), Token<'_>, &str)]) {
            let mut tokenizer = Tokenizer::new(input);
            let mut actual: Vec<((usize, usize), Token<'_>, &str)> = Vec::new();
            while let Some((span, token)) = tokenizer.next().unwrap() {
                let text = &input[span.start..span.end];
                actual.push((span.into(), token, text));
            }
            for (got, want) in actual.iter().zip(expected) {
                assert_eq!(got, want);
            }
            assert_eq!(actual.len(), expected.len());
        }

        check(
            " a ",
            &[
                ((0, 1), Token::Whitespace(" "), " "),
                ((1, 2), Token::Keylike("a"), "a"),
                ((2, 3), Token::Whitespace(" "), " "),
            ],
        );

        check(
            " a\t [[]] \t [] {} , . =\n# foo \r\n#foo \n ",
            &[
                ((0, 1), Token::Whitespace(" "), " "),
                ((1, 2), Token::Keylike("a"), "a"),
                ((2, 4), Token::Whitespace("\t "), "\t "),
                ((4, 5), Token::LeftBracket, "["),
                ((5, 6), Token::LeftBracket, "["),
                ((6, 7), Token::RightBracket, "]"),
                ((7, 8), Token::RightBracket, "]"),
                ((8, 11), Token::Whitespace(" \t "), " \t "),
                ((11, 12), Token::LeftBracket, "["),
                ((12, 13), Token::RightBracket, "]"),
                ((13, 14), Token::Whitespace(" "), " "),
                ((14, 15), Token::LeftBrace, "{"),
                ((15, 16), Token::RightBrace, "}"),
                ((16, 17), Token::Whitespace(" "), " "),
                ((17, 18), Token::Comma, ","),
                ((18, 19), Token::Whitespace(" "), " "),
                ((19, 20), Token::Period, "."),
                ((20, 21), Token::Whitespace(" "), " "),
                ((21, 22), Token::Equals, "="),
                ((22, 23), Token::Newline, "\n"),
                ((23, 29), Token::Comment("# foo "), "# foo "),
                ((29, 31), Token::Newline, "\r\n"),
                ((31, 36), Token::Comment("#foo "), "#foo "),
                ((36, 37), Token::Newline, "\n"),
                ((37, 38), Token::Whitespace(" "), " "),
            ],
        );
    }

    #[test]
    fn bare_cr_bad() {
        err("\r", Error::Unexpected(0, '\r'));
        err("'\n", Error::NewlineInString(1));
        err("'\u{0}", Error::InvalidCharInString(1, '\u{0}'));
        err("'", Error::UnterminatedString(0));
        err("\u{0}", Error::Unexpected(0, '\u{0}'));
    }

    #[test]
    fn bad_comment() {
        // The comment stops in front of the NUL, which is then rejected as
        // a token of its own.
        let mut tokenizer = Tokenizer::new("#\u{0}");
        tokenizer.next().unwrap().unwrap();
        assert_eq!(tokenizer.next(), Err(Error::Unexpected(1, '\u{0}')));
        assert!(tokenizer.next().unwrap().is_none());
    }
}