use std::fmt;
use std::collections::VecDeque;
use std::io::Read;
use std::result;
use std::borrow::Cow;
use common::{Position, TextPosition, is_whitespace_char, is_name_char};
use reader::Error;
use util;
/// A single low-level lexeme produced by the XML `Lexer`.
///
/// Multi-character markup delimiters are recognized as single tokens; all
/// other input comes through as `Character`, `Whitespace` or `Chunk`.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// A fixed run of characters emitted as-is (e.g. a partially matched
    /// delimiter flushed when error reporting is disabled).
    Chunk(&'static str),
    /// Any single non-whitespace character with no special meaning.
    Character(char),
    /// A single whitespace character.
    Whitespace(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
}
impl fmt::Display for Token {
    /// Writes the literal text this token stands for.
    ///
    /// `Chunk`, `Character` and `Whitespace` carry their own payload; every
    /// other variant maps to a fixed delimiter string. The match is
    /// exhaustive, so no catch-all `unreachable!()` arm is needed.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let text = match *self {
            Token::Chunk(s) => return write!(f, "{}", s),
            Token::Character(c) | Token::Whitespace(c) => return write!(f, "{}", c),
            Token::OpeningTagStart => "<",
            Token::ProcessingInstructionStart => "<?",
            Token::DoctypeStart => "<!DOCTYPE",
            Token::ClosingTagStart => "</",
            Token::CommentStart => "<!--",
            Token::CDataStart => "<![CDATA[",
            Token::TagEnd => ">",
            Token::EmptyTagEnd => "/>",
            Token::ProcessingInstructionEnd => "?>",
            Token::CommentEnd => "-->",
            Token::CDataEnd => "]]>",
            Token::ReferenceStart => "&",
            Token::ReferenceEnd => ";",
            Token::EqualsSign => "=",
            Token::SingleQuote => "'",
            Token::DoubleQuote => "\"",
        };
        f.write_str(text)
    }
}
impl Token {
    /// Returns the fixed string form of this token, if it has one.
    ///
    /// `Chunk` returns its payload; `Character` and `Whitespace` have no
    /// static representation and yield `None`.
    pub fn as_static_str(&self) -> Option<&'static str> {
        let s = match *self {
            Token::OpeningTagStart => "<",
            Token::ProcessingInstructionStart => "<?",
            Token::DoctypeStart => "<!DOCTYPE",
            Token::ClosingTagStart => "</",
            Token::CommentStart => "<!--",
            Token::CDataStart => "<![CDATA[",
            Token::TagEnd => ">",
            Token::EmptyTagEnd => "/>",
            Token::ProcessingInstructionEnd => "?>",
            Token::CommentEnd => "-->",
            Token::CDataEnd => "]]>",
            Token::ReferenceStart => "&",
            Token::ReferenceEnd => ";",
            Token::EqualsSign => "=",
            Token::SingleQuote => "'",
            Token::DoubleQuote => "\"",
            Token::Chunk(s) => s,
            _ => return None,
        };
        Some(s)
    }

    /// Appends this token's text to `target` without allocating an
    /// intermediate string.
    pub fn push_to_string(&self, target: &mut String) {
        if let Some(s) = self.as_static_str() {
            target.push_str(s);
        } else {
            // Only the character-carrying variants lack a static form.
            match *self {
                Token::Character(c) | Token::Whitespace(c) => target.push(c),
                _ => unreachable!()
            }
        }
    }

    /// Returns `true` for tokens that may legitimately appear inside
    /// character data (text content between tags).
    #[inline]
    pub fn contains_char_data(&self) -> bool {
        match *self {
            Token::Whitespace(_) |
            Token::Chunk(_) |
            Token::Character(_) |
            Token::CommentEnd |
            Token::TagEnd |
            Token::EqualsSign |
            Token::DoubleQuote |
            Token::SingleQuote => true,
            _ => false
        }
    }

    /// Returns `true` if this token is a single whitespace character.
    #[inline]
    pub fn is_whitespace(&self) -> bool {
        if let Token::Whitespace(_) = *self { true } else { false }
    }
}
/// Internal state of the lexer: which multi-character token, if any, is
/// currently in progress.
enum State {
    /// `<` was read; deciding between tag start, PI, comment, CDATA, doctype.
    TagStarted,
    /// `<!` was read; deciding between `<!--`, `<![CDATA[` and `<!DOCTYPE`.
    CommentOrCDataOrDoctypeStarted,
    /// `<!-` was read; expecting the second `-` of `<!--`.
    CommentStarted,
    /// Inside the `<!DOCTYPE` keyword; the substate tracks how far we got.
    DoctypeStarted(DoctypeStartedSubstate),
    /// Inside the `<![CDATA[` prefix; the substate tracks how far we got.
    CDataStarted(CDataStartedSubstate),
    /// `?` was read; a following `>` completes `?>`.
    ProcessingInstructionClosing,
    /// `/` was read; a following `>` completes `/>`.
    EmptyTagClosing,
    /// One or two `-` were read; a following `>` completes `-->`.
    CommentClosing(ClosingSubstate),
    /// One or two `]` were read; a following `>` completes `]]>`.
    CDataClosing(ClosingSubstate),
    /// Default state: no multi-character token in progress.
    Normal
}

/// How many characters of a two-character closing prefix (`--` or `]]`)
/// have been seen so far.
#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}

/// Progress through the `<!DOCTYPE` keyword (letters matched so far).
#[derive(Copy, Clone)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}

/// Progress through the `<![CDATA[` prefix (characters matched so far).
#[derive(Copy, Clone)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}

/// Lexing outcome: `Ok(Some(token))` — a token was produced;
/// `Ok(None)` — end of stream (or more input needed internally);
/// `Err` — a lexing error.
pub type Result = result::Result<Option<Token>, Error>;
// Generates a `match` over a "progress" enum (used for the `<!DOCTYPE` and
// `<![CDATA[` prefixes). For every intermediate substate `$st`, reading its
// expected character `$stc` advances to `$next_st` via `move_to`; any other
// character is reported through `handle_error` along with `$chunk`, the text
// matched so far. The final substate `$end_st` evaluates `$e` when its
// expected character `$end_c` arrives.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            $st => match $c {
                $stc => $_self.move_to($is($next_st)),
                _ => $_self.handle_error($chunk, $c)
            },
            )+
            $end_st => match $c {
                $end_c => $e,
                _ => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
/// Pull-based XML lexer: feeds characters from a `Read` source through a
/// state machine and produces `Token`s one at a time.
pub struct Lexer {
    /// Start position of the token most recently returned (or in progress).
    pos: TextPosition,
    /// Position of the read head — one past the last consumed character.
    head_pos: TextPosition,
    /// Pushback buffer: characters to re-process before reading new input.
    char_queue: VecDeque<char>,
    /// Current state of the token state machine.
    st: State,
    /// When true, malformed sequences are emitted as `Chunk` tokens
    /// instead of producing errors.
    skip_errors: bool,
    /// When true, malformed sequences are tolerated like `skip_errors`,
    /// except `--`, which stays an error (comments must not contain `--`).
    inside_comment: bool,
    /// True while a multi-character token is being accumulated; keeps
    /// `pos` fixed at the token's first character.
    inside_token: bool,
    /// True once end-of-stream has been reported, so subsequent calls
    /// return `Ok(None)` without touching the source.
    eof_handled: bool
}
impl Position for Lexer {
    /// Returns the start position of the last token returned (or of the
    /// token currently in progress).
    #[inline]
    fn position(&self) -> TextPosition { self.pos }
}
impl Lexer {
    /// Creates a new lexer at the start of a stream, with error reporting
    /// enabled and the state machine in its default state.
    pub fn new() -> Lexer {
        Lexer {
            pos: TextPosition::new(),
            head_pos: TextPosition::new(),
            char_queue: VecDeque::with_capacity(4),
            st: State::Normal,
            skip_errors: false,
            inside_comment: false,
            inside_token: false,
            eof_handled: false
        }
    }

    /// Makes the lexer report malformed sequences as errors again.
    #[inline]
    pub fn enable_errors(&mut self) { self.skip_errors = false; }

    /// Makes the lexer emit malformed sequences as `Chunk` tokens instead
    /// of returning errors.
    #[inline]
    pub fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Tells the lexer it is inside a comment, where `--` is always an
    /// error even when errors are otherwise disabled.
    #[inline]
    pub fn inside_comment(&mut self) { self.inside_comment = true; }

    /// Tells the lexer it has left a comment.
    #[inline]
    pub fn outside_comment(&mut self) { self.inside_comment = false; }

    /// Allows end-of-stream processing to run again after it has already
    /// been reported once.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }

    /// Returns the next token from `b`, `Ok(None)` at end of stream, or an
    /// error.
    ///
    /// Pushed-back characters are re-processed before new input is read.
    /// At end of stream, a partially recognized closing sequence (`?`, `/`,
    /// `-`, `]`, `]]`) is flushed as a token, while an unfinished opening
    /// sequence (`<`, `<!`, `<![CDA`, ...) is an error.
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
        // Once EOF has been reported, keep returning `None`.
        if self.eof_handled {
            return Ok(None);
        }

        // Pin `pos` to the first character of the token being built; it
        // stays there across calls until the token is emitted.
        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Drain the pushback queue first.
        while let Some(c) = self.char_queue.pop_front() {
            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {}
            }
        }

        // Then read fresh characters from the source until a token
        // completes or the input runs out.
        loop {
            let c = match try!(util::next_char_from(b)) {
                Some(c) => c,
                None => break,
            };
            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {}
            }
        }

        // End of stream: flush (or reject) whatever state the machine
        // stopped in.
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_) | State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) =>
                Err(self.error("Unexpected end of stream")),
            State::ProcessingInstructionClosing =>
                Ok(Some(Token::Character('?'))),
            State::EmptyTagClosing =>
                Ok(Some(Token::Character('/'))),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character('-'))),
            State::CDataClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character(']'))),
            State::CDataClosing(ClosingSubstate::Second) =>
                Ok(Some(Token::Chunk("]]"))),
            State::Normal =>
                Ok(None)
        }
    }

    /// Builds an error value carrying the lexer's current position.
    #[inline]
    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error {
        (self, msg).into()
    }

    /// Feeds one character to the state machine and advances `head_pos`.
    /// The position is only advanced while the pushback queue is empty, so
    /// re-processed characters are not counted twice.
    #[inline]
    fn read_next_token(&mut self, c: char) -> Result {
        let res = self.dispatch_char(c);
        if self.char_queue.is_empty() {
            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }
        }
        res
    }

    /// Routes a character to the handler for the current state.
    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal => self.normal(c),
            State::TagStarted => self.tag_opened(c),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::CommentStarted => self.comment_started(c),
            State::CDataStarted(s) => self.cdata_started(c, s),
            State::DoctypeStarted(s) => self.doctype_started(c, s),
            State::ProcessingInstructionClosing => self.processing_instruction_closing(c),
            State::EmptyTagClosing => self.empty_element_closing(c),
            State::CommentClosing(s) => self.comment_closing(c, s),
            State::CDataClosing(s) => self.cdata_closing(c, s)
        }
    }

    /// Stores the new state and reports "no token yet".
    #[inline]
    fn move_to(&mut self, st: State) -> Result {
        self.st = st;
        Ok(None)
    }

    /// Stores the new state and emits `token`.
    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Result {
        self.st = st;
        Ok(Some(token))
    }

    /// Pushes `cs` back for re-processing, then stores the new state and
    /// emits `token`.
    #[inline]
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
        self.char_queue.extend(cs.iter().cloned());
        self.move_to_with(st, token)
    }

    /// Handles an unexpected character `c` after a partially matched
    /// `chunk`: either flushes the chunk as-is (errors disabled, or inside
    /// a comment for anything but `--`) or produces a lexing error. `c` is
    /// pushed back so it is re-processed either way.
    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        self.char_queue.push_back(c);
        if self.skip_errors || (self.inside_comment && chunk != "--") {
            self.move_to_with(State::Normal, Token::Chunk(chunk))
        } else {
            Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c)))
        }
    }

    /// Default state: single-character tokens, or the first character of a
    /// longer delimiter.
    fn normal(&mut self, c: char) -> Result {
        match c {
            '<' => self.move_to(State::TagStarted),
            '>' => Ok(Some(Token::TagEnd)),
            '/' => self.move_to(State::EmptyTagClosing),
            '=' => Ok(Some(Token::EqualsSign)),
            '"' => Ok(Some(Token::DoubleQuote)),
            '\'' => Ok(Some(Token::SingleQuote)),
            '?' => self.move_to(State::ProcessingInstructionClosing),
            '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)),
            ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)),
            '&' => Ok(Some(Token::ReferenceStart)),
            ';' => Ok(Some(Token::ReferenceEnd)),
            _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))),
            _ => Ok(Some(Token::Character(c)))
        }
    }

    /// After `<`: decides between `<?`, `</`, `<!...` and a plain opening
    /// tag start.
    fn tag_opened(&mut self, c: char) -> Result {
        match c {
            '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart),
            '/' => self.move_to_with(State::Normal, Token::ClosingTagStart),
            '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted),
            // Name or whitespace character: emit `<` and re-process `c`.
            _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ => self.handle_error("<", c)
        }
    }

    /// After `<!`: decides between `<!--`, `<![CDATA[` and `<!DOCTYPE`.
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to(State::CommentStarted),
            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
            _ => self.handle_error("<!", c)
        }
    }

    /// After `<!-`: expects the second `-` of `<!--`.
    fn comment_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to_with(State::Normal, Token::CommentStart),
            _ => self.handle_error("<!-", c)
        }
    }

    /// Advances through the `<![CDATA[` prefix one character at a time.
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart)
        )
    }

    /// Advances through the `<!DOCTYPE` keyword one character at a time.
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::Normal, Token::DoctypeStart)
        )
    }

    /// After `?`: a `>` completes `?>`; anything else yields a plain `?`
    /// and re-processes the character.
    fn processing_instruction_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
            _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')),
        }
    }

    /// After `/`: a `>` completes `/>`; anything else yields a plain `/`
    /// and re-processes the character.
    fn empty_element_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
            _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
        }
    }

    /// After one or two `-`: recognizes `-->`, otherwise falls back to the
    /// characters read so far (`--` inside a comment is an error).
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CommentEnd),
                // Comments must not contain `--`.
                _ if self.inside_comment => self.handle_error("--", c),
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--"))
            }
        }
    }

    /// After one or two `]`: recognizes `]]>`, otherwise falls back to the
    /// characters read so far.
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CDataEnd),
                _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use common::{Position};
    use std::io::{BufReader, Cursor};
    use super::{Lexer, Token};

    /// Asserts that the next tokens produced by `$lex` are exactly `$e...`,
    /// in order.
    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
            assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
            )+
        })
    );

    /// Asserts that the next call to the lexer fails at the given
    /// row/column with the given message.
    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
            assert_eq!($s, err.msg());
        })
    );

    /// Asserts that the lexer reports end of stream.
    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(None), $lex.next_token(&mut $buf));
        )
    );

    /// Builds a lexer and a buffered reader over the given input string.
    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }

    #[test]
    fn simple_lexer_test() {
        // Fixed input: the assertions below require a tab between `d` and
        // `</b>` (`Whitespace('\t')`) and a trailing `&nbsp;` reference;
        // both had been lost from the raw-string literal. An escaped
        // string literal makes the tab explicit.
        let (mut lex, mut buf) = make_lex_and_buf(
            "<a p='q'> x<b z=\"y\">d\t</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('p')
            Token::EqualsSign
            Token::SingleQuote
            Token::Character('q')
            Token::SingleQuote
            Token::TagEnd
            Token::Whitespace(' ')
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('b')
            Token::Whitespace(' ')
            Token::Character('z')
            Token::EqualsSign
            Token::DoubleQuote
            Token::Character('y')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character('d')
            Token::Whitespace('\t')
            Token::ClosingTagStart
            Token::Character('b')
            Token::TagEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
            Token::OpeningTagStart
            Token::Character('p')
            Token::EmptyTagEnd
            Token::Whitespace(' ')
            Token::ProcessingInstructionStart
            Token::Character('n')
            Token::Character('m')
            Token::Whitespace(' ')
            Token::ProcessingInstructionEnd
            Token::Whitespace(' ')
            Token::CommentStart
            Token::Whitespace(' ')
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('c')
            Token::Whitespace(' ')
            Token::CommentEnd
            Token::Whitespace(' ')
            Token::ReferenceStart
            Token::Character('n')
            Token::Character('b')
            Token::Character('s')
            Token::Character('p')
            Token::ReferenceEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn special_chars_test() {
        // Characters that start multi-char tokens (`?`, `/`, `-`, `]`)
        // fall back to plain tokens when the sequence is not completed.
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"?x!+ // -| ]z]]"#
        );
        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Whitespace(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Whitespace(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Whitespace(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Chunk("]]")
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><![CDATA[x y ?]]> </a>"#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Whitespace(' ')
            Token::Character('y')
            Token::Whitespace(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Whitespace(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab xx z> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Whitespace(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Whitespace(' ')
            Token::Character('x')
            Token::Character('x')
            Token::Whitespace(' ')
            Token::Character('z')
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }

    #[test]
    fn end_of_stream_handling_ok() {
        // A partially recognized closing sequence at EOF is flushed as a
        // token rather than an error.
        macro_rules! eof_check(
            ($data:expr ; $token:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_oks!(for lex and buf ; $token);
                assert_none!(for lex and buf);
            })
        );
        eof_check!("?" ; Token::Character('?'));
        eof_check!("/" ; Token::Character('/'));
        eof_check!("-" ; Token::Character('-'));
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]]" ; Token::Chunk("]]"));
    }

    #[test]
    fn end_of_stream_handling_error() {
        // An unfinished opening sequence at EOF is an error.
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<" ; 0, 1);
        eof_check!("<!" ; 0, 2);
        eof_check!("<!-" ; 0, 3);
        eof_check!("<![" ; 0, 3);
        eof_check!("<![C" ; 0, 4);
        eof_check!("<![CD" ; 0, 5);
        eof_check!("<![CDA" ; 0, 6);
        eof_check!("<![CDAT" ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
        eof_check!("--" ; 0, 2);
    }

    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        // With errors disabled the same input yields the chunk followed by
        // the re-processed character.
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!")
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!-")
            Token::Whitespace('\t')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        // `--` is an error only while inside a comment.
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        lex.inside_comment();
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Chunk("--")
            Token::Character('x')
        );
    }

    /// Checks that `$data` errors as `$s`, and, with errors disabled,
    /// lexes as `Chunk($chunk)` followed by `Character($app)`.
    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            assert_oks!(for lex and buf ;
                Token::Chunk($chunk)
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );

    #[test]
    fn error_in_cdata_started() {
        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }

    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }

    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        // A lone `]` inside CDATA must not be swallowed by the `]]>`
        // recognizer.
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[Foo [Bar]]]>"#
        );
        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Whitespace(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
}