xml/reader/parser/
mod.rs

1//! Contains an implementation of pull-based XML parser.
2
3use std::mem;
4use std::borrow::Cow;
5use std::io::prelude::*;
6
7use common::{
8    self,
9    XmlVersion, Position, TextPosition,
10    is_name_start_char, is_name_char,
11};
12use name::OwnedName;
13use attribute::OwnedAttribute;
14use namespace::NamespaceStack;
15
16use reader::events::XmlEvent;
17use reader::config::ParserConfig;
18use reader::lexer::{Lexer, Token};
19
// Generates "take" accessors on `MarkupData`.
//
// Each `field -> method, Type, default` entry (entries are separated by `;`)
// expands to a method that moves the current value out of `self.field` and
// leaves `default` in its place.
macro_rules! gen_takes {
    ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => {
        impl MarkupData {
            $(
                #[inline]
                fn $method(&mut self) -> $t {
                    mem::replace(&mut self.$field, $def)
                }
            )+
        }
    };
}
32
33gen_takes!(
34    name         -> take_name, String, String::new();
35    ref_data     -> take_ref_data, String, String::new();
36
37    version      -> take_version, Option<common::XmlVersion>, None;
38    encoding     -> take_encoding, Option<String>, None;
39    standalone   -> take_standalone, Option<bool>, None;
40
41    element_name -> take_element_name, Option<OwnedName>, None;
42
43    attr_name    -> take_attr_name, Option<OwnedName>, None;
44    attributes   -> take_attributes, Vec<OwnedAttribute>, vec!()
45);
46
// Builds an error result through `$this.error(..)`.
//
// The first form passes the message through unchanged; the second form
// formats the message with `format!` before passing it on.
macro_rules! self_error {
    ($this:ident; $msg:expr) => {
        $this.error($msg)
    };
    ($this:ident; $fmt:expr, $($arg:expr),+) => {
        $this.error(format!($fmt, $($arg),+))
    };
}
51
52mod outside_tag;
53mod inside_processing_instruction;
54mod inside_declaration;
55mod inside_doctype;
56mod inside_opening_tag;
57mod inside_closing_tag_name;
58mod inside_comment;
59mod inside_cdata;
60mod inside_reference;
61
62static DEFAULT_VERSION: XmlVersion      = XmlVersion::Version10;
63static DEFAULT_ENCODING: &'static str   = "UTF-8";
64static DEFAULT_STANDALONE: Option<bool> = None;
65
66type ElementStack = Vec<OwnedName>;
67pub type Result = super::Result<XmlEvent>;
68
69/// Pull-based XML parser.
70pub struct PullParser {
71    config: ParserConfig,
72    lexer: Lexer,
73    st: State,
74    buf: String,
75    nst: NamespaceStack,
76
77    data: MarkupData,
78    final_result: Option<Result>,
79    next_event: Option<Result>,
80    est: ElementStack,
81    pos: Vec<TextPosition>,
82
83    encountered_element: bool,
84    parsed_declaration: bool,
85    inside_whitespace: bool,
86    read_prefix_separator: bool,
87    pop_namespace: bool
88}
89
90impl PullParser {
91    /// Returns a new parser using the given config.
92    pub fn new(config: ParserConfig) -> PullParser {
93        PullParser {
94            config: config,
95            lexer: Lexer::new(),
96            st: State::OutsideTag,
97            buf: String::new(),
98            nst: NamespaceStack::default(),
99
100            data: MarkupData {
101                name: String::new(),
102                version: None,
103                encoding: None,
104                standalone: None,
105                ref_data: String::new(),
106                element_name: None,
107                quote: None,
108                attr_name: None,
109                attributes: Vec::new()
110            },
111            final_result: None,
112            next_event: None,
113            est: Vec::new(),
114            pos: vec![TextPosition::new()],
115
116            encountered_element: false,
117            parsed_declaration: false,
118            inside_whitespace: true,
119            read_prefix_separator: false,
120            pop_namespace: false
121        }
122    }
123
124    /// Checks if this parser ignores the end of stream errors.
125    pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream }
126}
127
128impl Position for PullParser {
129    /// Returns the position of the last event produced by the parser
130    #[inline]
131    fn position(&self) -> TextPosition {
132        self.pos[0]
133    }
134}
135
136#[derive(Clone, PartialEq)]
137pub enum State {
138    OutsideTag,
139    InsideOpeningTag(OpeningTagSubstate),
140    InsideClosingTag(ClosingTagSubstate),
141    InsideProcessingInstruction(ProcessingInstructionSubstate),
142    InsideComment,
143    InsideCData,
144    InsideDeclaration(DeclarationSubstate),
145    InsideDoctype,
146    InsideReference(Box<State>)
147}
148
/// Substates carried by `State::InsideOpeningTag`.
///
/// The transitions live in the `inside_opening_tag` submodule, which is not
/// part of this file; the variant notes below are inferred from the names.
#[derive(Clone, PartialEq)]
pub enum OpeningTagSubstate {
    /// Reading the element name.
    InsideName,

    /// Between the element name / attributes and the end of the tag.
    InsideTag,

    /// Reading an attribute name.
    InsideAttributeName,
    /// After an attribute name, before its value.
    AfterAttributeName,

    /// Reading an attribute value.
    InsideAttributeValue,
}
160
/// Substates carried by `State::InsideClosingTag`.
///
/// Variant notes are inferred from the names; the transitions live in the
/// `inside_closing_tag_name` submodule, which is not part of this file.
#[derive(Clone, PartialEq)]
pub enum ClosingTagSubstate {
    /// Reading the name of the closing tag.
    CTInsideName,
    /// After the name, before the end of the tag.
    CTAfterName,
}
166
/// Substates carried by `State::InsideProcessingInstruction`.
///
/// Variant notes are inferred from the names; the transitions live in the
/// `inside_processing_instruction` submodule, which is not part of this file.
#[derive(Clone, PartialEq)]
pub enum ProcessingInstructionSubstate {
    /// Reading the name of the processing instruction.
    PIInsideName,
    /// Reading the data of the processing instruction.
    PIInsideData,
}
172
/// Substates carried by `State::InsideDeclaration`.
///
/// The variants are grouped by the declaration parameter they belong to
/// (version, encoding, standalone). The transitions live in the
/// `inside_declaration` submodule, which is not part of this file.
#[derive(Clone, PartialEq)]
pub enum DeclarationSubstate {
    // `version` parameter name
    BeforeVersion,
    InsideVersion,
    AfterVersion,

    // `version` parameter value
    InsideVersionValue,
    AfterVersionValue,

    // `encoding` parameter name
    InsideEncoding,
    AfterEncoding,

    // `encoding` parameter value
    InsideEncodingValue,

    // `standalone` parameter name
    BeforeStandaloneDecl,
    InsideStandaloneDecl,
    AfterStandaloneDecl,

    // `standalone` parameter value
    InsideStandaloneDeclValue,
    AfterStandaloneDeclValue,
}
194
/// Tells `read_qualified_name` what kind of name is being read, which in turn
/// decides which tokens are allowed to terminate the name.
#[derive(PartialEq)]
enum QualifiedNameTarget {
    /// An attribute name; may be terminated by `=`.
    AttributeNameTarget,
    /// An opening tag name; may be terminated by the end of a tag or of an empty tag.
    OpeningTagNameTarget,
    /// A closing tag name; may be terminated by the end of a tag.
    ClosingTagNameTarget,
}
201
/// The kind of quote that opened an attribute value.
#[derive(Clone, Copy, PartialEq, Eq)]
enum QuoteToken {
    /// The value was opened with a single quote.
    SingleQuoteToken,
    /// The value was opened with a double quote.
    DoubleQuoteToken,
}
207
208impl QuoteToken {
209    fn from_token(t: &Token) -> QuoteToken {
210        match *t {
211            Token::SingleQuote => QuoteToken::SingleQuoteToken,
212            Token::DoubleQuote => QuoteToken::DoubleQuoteToken,
213            _ => panic!("Unexpected token: {}", t)
214        }
215    }
216
217    fn as_token(self) -> Token {
218        match self {
219            QuoteToken::SingleQuoteToken => Token::SingleQuote,
220            QuoteToken::DoubleQuoteToken => Token::DoubleQuote
221        }
222    }
223}
224
225struct MarkupData {
226    name: String,     // used for processing instruction name
227    ref_data: String,  // used for reference content
228
229    version: Option<common::XmlVersion>,  // used for XML declaration version
230    encoding: Option<String>,  // used for XML declaration encoding
231    standalone: Option<bool>,  // used for XML declaration standalone parameter
232
233    element_name: Option<OwnedName>,  // used for element name
234
235    quote: Option<QuoteToken>,  // used to hold opening quote for attribute value
236    attr_name: Option<OwnedName>,  // used to hold attribute name
237    attributes: Vec<OwnedAttribute>   // used to hold all accumulated attributes
238}
239
240impl PullParser {
241    /// Returns next event read from the given buffer.
242    ///
243    /// This method should be always called with the same buffer. If you call it
244    /// providing different buffers each time, the result will be undefined.
245    pub fn next<R: Read>(&mut self, r: &mut R) -> Result {
246        if let Some(ref ev) = self.final_result {
247            return ev.clone();
248        }
249
250        if let Some(ev) = self.next_event.take() {
251            return ev;
252        }
253
254        if self.pop_namespace {
255            self.pop_namespace = false;
256            self.nst.pop();
257        }
258
259        loop {
260            // While lexer gives us Ok(maybe_token) -- we loop.
261            // Upon having a complete XML-event -- we return from the whole function.
262            match self.lexer.next_token(r) {
263                Ok(maybe_token) =>
264                    match maybe_token {
265                        None => break,
266                        Some(token) =>
267                            match self.dispatch_token(token) {
268                                None => {} // continue
269                                Some(Ok(XmlEvent::EndDocument)) =>
270                                    return {
271                                        self.next_pos();
272                                        self.set_final_result(Ok(XmlEvent::EndDocument))
273                                    },
274                                Some(Ok(xml_event)) =>
275                                    return {
276                                        self.next_pos();
277                                        Ok(xml_event)
278                                    },
279                                Some(Err(xml_error)) =>
280                                    return {
281                                        self.next_pos();
282                                        self.set_final_result(Err(xml_error))
283                                    },
284                            }
285                    },
286                Err(lexer_error) =>
287                    return self.set_final_result(Err(lexer_error)),
288            }
289        }
290
291        // Handle end of stream
292        // Forward pos to the lexer head
293        self.next_pos();
294        let ev = if self.depth() == 0 {
295            if self.encountered_element && self.st == State::OutsideTag {  // all is ok
296                Ok(XmlEvent::EndDocument)
297            } else if !self.encountered_element {
298                self_error!(self; "Unexpected end of stream: no root element found")
299            } else {  // self.st != State::OutsideTag
300                self_error!(self; "Unexpected end of stream")  // TODO: add expected hint?
301            }
302        } else {
303            if self.config.ignore_end_of_stream {
304                self.final_result = None;
305                self.lexer.reset_eof_handled();
306                return self_error!(self; "Unexpected end of stream: still inside the root element");
307            } else {
308                self_error!(self; "Unexpected end of stream: still inside the root element")
309            }
310        };
311        self.set_final_result(ev)
312    }
313
314    // This function is to be called when a terminal event is reached.
315    // The function sets up the `self.final_result` into `Some(result)` and return `result`.
316    fn set_final_result(&mut self, result: Result) -> Result {
317        self.final_result = Some(result.clone());
318        result
319    }
320
321    #[inline]
322    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Result {
323        Err((&self.lexer, msg).into())
324    }
325
326    #[inline]
327    fn next_pos(&mut self) {
328        if self.pos.len() > 1 {
329            self.pos.remove(0);
330        } else {
331            self.pos[0] = self.lexer.position();
332        }
333    }
334
335    #[inline]
336    fn push_pos(&mut self) {
337        self.pos.push(self.lexer.position());
338    }
339
340    fn dispatch_token(&mut self, t: Token) -> Option<Result> {
341        match self.st.clone() {
342            State::OutsideTag                     => self.outside_tag(t),
343            State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s),
344            State::InsideDeclaration(s)           => self.inside_declaration(t, s),
345            State::InsideDoctype                  => self.inside_doctype(t),
346            State::InsideOpeningTag(s)            => self.inside_opening_tag(t, s),
347            State::InsideClosingTag(s)            => self.inside_closing_tag_name(t, s),
348            State::InsideComment                  => self.inside_comment(t),
349            State::InsideCData                    => self.inside_cdata(t),
350            State::InsideReference(s)             => self.inside_reference(t, *s)
351        }
352    }
353
354    #[inline]
355    fn depth(&self) -> usize {
356        self.est.len()
357    }
358
359    #[inline]
360    fn buf_has_data(&self) -> bool {
361        self.buf.len() > 0
362    }
363
364    #[inline]
365    fn take_buf(&mut self) -> String {
366        mem::replace(&mut self.buf, String::new())
367    }
368
369    #[inline]
370    fn append_char_continue(&mut self, c: char) -> Option<Result> {
371        self.buf.push(c);
372        None
373    }
374
375    #[inline]
376    fn into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result> {
377        self.st = st;
378        ev
379    }
380
381    #[inline]
382    fn into_state_continue(&mut self, st: State) -> Option<Result> {
383        self.into_state(st, None)
384    }
385
386    #[inline]
387    fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> {
388        self.into_state(st, Some(ev))
389    }
390
391    /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed,
392    /// an error is returned.
393    ///
394    /// # Parameters
395    /// * `t`       --- next token;
396    /// * `on_name` --- a callback which is executed when whitespace is encountered.
397    fn read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result>
398      where F: Fn(&mut PullParser, Token, OwnedName) -> Option<Result> {
399        // We can get here for the first time only when self.data.name contains zero or one character,
400        // but first character cannot be a colon anyway
401        if self.buf.len() <= 1 {
402            self.read_prefix_separator = false;
403        }
404
405        let invoke_callback = |this: &mut PullParser, t| {
406            let name = this.take_buf();
407            match name.parse() {
408                Ok(name) => on_name(this, t, name),
409                Err(_) => Some(self_error!(this; "Qualified name is invalid: {}", name))
410            }
411        };
412
413        match t {
414            // There can be only one colon, and not as the first character
415            Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => {
416                self.buf.push(':');
417                self.read_prefix_separator = true;
418                None
419            }
420
421            Token::Character(c) if c != ':' && (!self.buf_has_data() && is_name_start_char(c) ||
422                                          self.buf_has_data() && is_name_char(c)) =>
423                self.append_char_continue(c),
424
425            Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t),
426
427            Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t),
428
429            Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget ||
430                      target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t),
431
432            Token::Whitespace(_) => invoke_callback(self, t),
433
434            _ => Some(self_error!(self; "Unexpected token inside qualified name: {}", t))
435        }
436    }
437
438    /// Dispatches tokens in order to process attribute value.
439    ///
440    /// # Parameters
441    /// * `t`        --- next token;
442    /// * `on_value` --- a callback which is called when terminating quote is encountered.
443    fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result>
444      where F: Fn(&mut PullParser, String) -> Option<Result> {
445        match t {
446            Token::Whitespace(_) if self.data.quote.is_none() => None,  // skip leading whitespace
447
448            Token::DoubleQuote | Token::SingleQuote => match self.data.quote {
449                None => {  // Entered attribute value
450                    self.data.quote = Some(QuoteToken::from_token(&t));
451                    None
452                }
453                Some(q) if q.as_token() == t => {
454                    self.data.quote = None;
455                    let value = self.take_buf();
456                    on_value(self, value)
457                }
458                _ => {
459                    t.push_to_string(&mut self.buf);
460                    None
461                }
462            },
463
464            Token::ReferenceStart => {
465                let st = Box::new(self.st.clone());
466                self.into_state_continue(State::InsideReference(st))
467            }
468
469            Token::OpeningTagStart =>
470                Some(self_error!(self; "Unexpected token inside attribute value: <")),
471
472            // Every character except " and ' and < is okay
473            _  => {
474                t.push_to_string(&mut self.buf);
475                None
476            }
477        }
478    }
479
480    fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> {
481        let mut name = self.data.take_element_name().unwrap();
482        let mut attributes = self.data.take_attributes();
483
484        // check whether the name prefix is bound and fix its namespace
485        match self.nst.get(name.borrow().prefix_repr()) {
486            Some("") => name.namespace = None,  // default namespace
487            Some(ns) => name.namespace = Some(ns.into()),
488            None => return Some(self_error!(self; "Element {} prefix is unbound", name))
489        }
490
491        // check and fix accumulated attributes prefixes
492        for attr in attributes.iter_mut() {
493            if let Some(ref pfx) = attr.name.prefix {
494                let new_ns = match self.nst.get(pfx) {
495                    Some("") => None,  // default namespace
496                    Some(ns) => Some(ns.into()),
497                    None => return Some(self_error!(self; "Attribute {} prefix is unbound", attr.name))
498                };
499                attr.name.namespace = new_ns;
500            }
501        }
502
503        if emit_end_element {
504            self.pop_namespace = true;
505            self.next_event = Some(Ok(XmlEvent::EndElement {
506                name: name.clone()
507            }));
508        } else {
509            self.est.push(name.clone());
510        }
511        let namespace = self.nst.squash();
512        self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement {
513            name: name,
514            attributes: attributes,
515            namespace: namespace
516        }))
517    }
518
519    fn emit_end_element(&mut self) -> Option<Result> {
520        let mut name = self.data.take_element_name().unwrap();
521
522        // check whether the name prefix is bound and fix its namespace
523        match self.nst.get(name.borrow().prefix_repr()) {
524            Some("") => name.namespace = None,  // default namespace
525            Some(ns) => name.namespace = Some(ns.into()),
526            None => return Some(self_error!(self; "Element {} prefix is unbound", name))
527        }
528
529        let op_name = self.est.pop().unwrap();
530
531        if name == op_name {
532            self.pop_namespace = true;
533            self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name: name }))
534        } else {
535            Some(self_error!(self; "Unexpected closing tag: {}, expected {}", name, op_name))
536        }
537    }
538
539}
540
#[cfg(test)]
mod tests {
    use std::io::BufReader;

    use common::{Position, TextPosition};
    use name::OwnedName;
    use attribute::OwnedAttribute;
    use reader::parser::PullParser;
    use reader::ParserConfig;
    use reader::events::XmlEvent;

    // Builds a parser with the default configuration.
    fn new_parser() -> PullParser {
        let config = ParserConfig::new();
        PullParser::new(config)
    }

    // Pulls the next event from parser `$p` reading from `$r` and panics unless
    // it matches pattern `$t` (and, in the second form, condition `$c` holds).
    macro_rules! expect_event {
        ($r:expr, $p:expr, $t:pat) => {
            match $p.next(&mut $r) {
                $t => {}
                e => panic!("Unexpected event: {:?}", e),
            }
        };
        ($r:expr, $p:expr, $t:pat => $c:expr ) => {
            match $p.next(&mut $r) {
                $t if $c => {}
                e => panic!("Unexpected event: {:?}", e),
            }
        };
    }

    // Produces a `(reader, parser)` pair over the given string literal.
    macro_rules! test_data {
        ($d:expr) => {{
            static DATA: &'static str = $d;
            (BufReader::new(DATA.as_bytes()), new_parser())
        }};
    }

    #[test]
    fn issue_3_semicolon_in_attribute_value() {
        let (mut reader, mut parser) = test_data!(r#"
            <a attr="zzz;zzz" />
        "#);

        expect_event!(reader, parser, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(reader, parser, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) =>
            *name == OwnedName::local("a") &&
             attributes.len() == 1 &&
             attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") &&
             namespace.is_essentially_empty()
        );
        expect_event!(reader, parser, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a"));
        expect_event!(reader, parser, Ok(XmlEvent::EndDocument));
    }

    #[test]
    fn issue_140_entity_reference_inside_tag() {
        let (mut reader, mut parser) = test_data!(r#"
            <bla>&#9835;</bla>
        "#);

        expect_event!(reader, parser, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(reader, parser, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla"));
        expect_event!(reader, parser, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}");
        expect_event!(reader, parser, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla"));
        expect_event!(reader, parser, Ok(XmlEvent::EndDocument));
    }

    #[test]
    fn opening_tag_in_attribute_value() {
        // The fixture's indentation matters: the error column below depends on it.
        let (mut reader, mut parser) = test_data!(r#"
            <a attr="zzz<zzz" />
        "#);

        expect_event!(reader, parser, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(reader, parser, Err(ref e) =>
            e.msg() == "Unexpected token inside attribute value: <" &&
            e.position() == TextPosition { row: 1, column: 24 }
        );
    }
}