1use std::mem;
4use std::borrow::Cow;
5use std::io::prelude::*;
6
7use common::{
8 self,
9 XmlVersion, Position, TextPosition,
10 is_name_start_char, is_name_char,
11};
12use name::OwnedName;
13use attribute::OwnedAttribute;
14use namespace::NamespaceStack;
15
16use reader::events::XmlEvent;
17use reader::config::ParserConfig;
18use reader::lexer::{Lexer, Token};
19
// Generates "take" methods on `MarkupData`.
//
// Each `field -> method, Type, default` entry expands to a method that moves the
// current value out of `field` and leaves `default` in its place.
macro_rules! gen_takes {
    ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => {
        impl MarkupData {
            $(
                #[inline]
                fn $method(&mut self) -> $t {
                    mem::replace(&mut self.$field, $def)
                }
            )+
        }
    };
}
32
33gen_takes!(
34 name -> take_name, String, String::new();
35 ref_data -> take_ref_data, String, String::new();
36
37 version -> take_version, Option<common::XmlVersion>, None;
38 encoding -> take_encoding, Option<String>, None;
39 standalone -> take_standalone, Option<bool>, None;
40
41 element_name -> take_element_name, Option<OwnedName>, None;
42
43 attr_name -> take_attr_name, Option<OwnedName>, None;
44 attributes -> take_attributes, Vec<OwnedAttribute>, vec!()
45);
46
// Shorthand for building an error through the receiver's `error()` method.
//
// `self_error!(p; "text")` forwards the message as is;
// `self_error!(p; "fmt {}", a, ...)` formats the message with `format!` first.
macro_rules! self_error {
    ($parser:ident; $text:expr) => {
        $parser.error($text)
    };
    ($parser:ident; $template:expr, $($value:expr),+) => {
        $parser.error(format!($template, $($value),+))
    };
}
51
// One submodule per parser state. `PullParser::dispatch_token` calls a method with
// the same name as each module (e.g. `outside_tag`, `inside_comment`); presumably
// each module provides that method in its own `impl PullParser` block.
mod outside_tag;
mod inside_processing_instruction;
mod inside_declaration;
mod inside_doctype;
mod inside_opening_tag;
mod inside_closing_tag_name;
mod inside_comment;
mod inside_cdata;
mod inside_reference;
61
// Default values for the parts of an XML declaration. They are not referenced in
// this part of the file; presumably the state submodules use them when a
// declaration is missing or incomplete (confirm against the submodules).
static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10;
static DEFAULT_ENCODING: &'static str = "UTF-8";
static DEFAULT_STANDALONE: Option<bool> = None;
65
/// Stack of names of the elements that are currently open.
type ElementStack = Vec<OwnedName>;
/// Result of a single parsing step: an XML event or a parse error.
pub type Result = super::Result<XmlEvent>;
68
/// Pull-based XML parser.
///
/// Events are obtained one at a time by calling `next()` with a reader.
pub struct PullParser {
    /// Parser configuration; `ignore_end_of_stream` is consulted when the input runs out.
    config: ParserConfig,
    /// Source of tokens consumed by `next()`.
    lexer: Lexer,
    /// Current parser state; selects the handler used by `dispatch_token()`.
    st: State,
    /// Accumulates the characters of the name or value currently being read.
    buf: String,
    /// Namespace scopes consulted when resolving element and attribute prefixes.
    nst: NamespaceStack,

    /// Pieces of the markup construct currently being parsed.
    data: MarkupData,
    /// Terminal result; once set, `next()` keeps returning a clone of it.
    final_result: Option<Result>,
    /// Event queued for the following `next()` call
    /// (used for the `EndElement` of an empty element).
    next_event: Option<Result>,
    /// Names of the currently open elements; its length is the current depth.
    est: ElementStack,
    /// Queue of positions; the front entry is what `position()` reports.
    pos: Vec<TextPosition>,

    /// Read at end of input to tell "no root element" apart from other failures.
    /// NOTE(review): written by code outside this part of the file.
    encountered_element: bool,
    /// NOTE(review): not used in this part of the file; presumably records that the
    /// XML declaration has already been handled.
    parsed_declaration: bool,
    /// NOTE(review): not used in this part of the file; presumably tracks whether
    /// only whitespace has been seen in the current text run.
    inside_whitespace: bool,
    /// Whether a `:` prefix separator was already consumed for the qualified name being read.
    read_prefix_separator: bool,
    /// When set, the next `next()` call pops one namespace scope before reading tokens.
    pop_namespace: bool
}
89
90impl PullParser {
91 pub fn new(config: ParserConfig) -> PullParser {
93 PullParser {
94 config: config,
95 lexer: Lexer::new(),
96 st: State::OutsideTag,
97 buf: String::new(),
98 nst: NamespaceStack::default(),
99
100 data: MarkupData {
101 name: String::new(),
102 version: None,
103 encoding: None,
104 standalone: None,
105 ref_data: String::new(),
106 element_name: None,
107 quote: None,
108 attr_name: None,
109 attributes: Vec::new()
110 },
111 final_result: None,
112 next_event: None,
113 est: Vec::new(),
114 pos: vec![TextPosition::new()],
115
116 encountered_element: false,
117 parsed_declaration: false,
118 inside_whitespace: true,
119 read_prefix_separator: false,
120 pop_namespace: false
121 }
122 }
123
124 pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream }
126}
127
impl Position for PullParser {
    /// Returns the position at the front of the parser's position queue.
    #[inline]
    fn position(&self) -> TextPosition {
        // `pos` starts with one entry and `next_pos()` never removes the last one.
        self.pos[0]
    }
}
135
/// Top-level parser state; `dispatch_token()` picks a handler method based on it.
#[derive(Clone, PartialEq)]
pub enum State {
    /// Not inside any markup construct; also the state entered after an element event is emitted.
    OutsideTag,
    /// Inside an opening tag, refined by the substate.
    InsideOpeningTag(OpeningTagSubstate),
    /// Inside a closing tag, refined by the substate.
    InsideClosingTag(ClosingTagSubstate),
    /// Inside a processing instruction, refined by the substate.
    InsideProcessingInstruction(ProcessingInstructionSubstate),
    /// Inside a comment.
    InsideComment,
    /// Inside a CDATA section.
    InsideCData,
    /// Inside the XML declaration, refined by the substate.
    InsideDeclaration(DeclarationSubstate),
    /// Inside a doctype declaration.
    InsideDoctype,
    /// Inside a reference. The boxed value is the state that was active when the
    /// reference started; presumably it is restored once the reference ends.
    InsideReference(Box<State>)
}
148
/// Substates of `State::InsideOpeningTag`.
///
/// NOTE(review): the transitions live in the `inside_opening_tag` submodule, which is
/// not visible here; the variant names are the only description available.
#[derive(Clone, PartialEq)]
pub enum OpeningTagSubstate {
    InsideName,

    InsideTag,

    InsideAttributeName,
    AfterAttributeName,

    InsideAttributeValue,
}
160
/// Substates of `State::InsideClosingTag`.
///
/// NOTE(review): handled by `inside_closing_tag_name`, which is not visible here.
#[derive(Clone, PartialEq)]
pub enum ClosingTagSubstate {
    CTInsideName,
    CTAfterName
}
166
/// Substates of `State::InsideProcessingInstruction`.
///
/// NOTE(review): handled by `inside_processing_instruction`, which is not visible here.
#[derive(Clone, PartialEq)]
pub enum ProcessingInstructionSubstate {
    PIInsideName,
    PIInsideData
}
172
/// Substates of `State::InsideDeclaration`.
///
/// The variants are grouped by the declaration part they refer to: version,
/// encoding and standalone declaration.
/// NOTE(review): handled by `inside_declaration`, which is not visible here.
#[derive(Clone, PartialEq)]
pub enum DeclarationSubstate {
    BeforeVersion,
    InsideVersion,
    AfterVersion,

    InsideVersionValue,
    AfterVersionValue,

    InsideEncoding,
    AfterEncoding,

    InsideEncodingValue,

    BeforeStandaloneDecl,
    InsideStandaloneDecl,
    AfterStandaloneDecl,

    InsideStandaloneDeclValue,
    AfterStandaloneDeclValue
}
194
/// Kind of qualified name being read by `read_qualified_name()`.
///
/// It determines which tokens may terminate the name.
#[derive(PartialEq)]
enum QualifiedNameTarget {
    /// Attribute name; may be terminated by an equals sign.
    AttributeNameTarget,
    /// Opening tag name; may be terminated by a tag end or an empty-tag end.
    OpeningTagNameTarget,
    /// Closing tag name; may be terminated by a tag end.
    ClosingTagNameTarget
}
201
/// Kind of quote that opened the attribute value currently being read.
#[derive(Copy, Clone, PartialEq, Eq)]
enum QuoteToken {
    /// Corresponds to `Token::SingleQuote`.
    SingleQuoteToken,
    /// Corresponds to `Token::DoubleQuote`.
    DoubleQuoteToken
}
207
208impl QuoteToken {
209 fn from_token(t: &Token) -> QuoteToken {
210 match *t {
211 Token::SingleQuote => QuoteToken::SingleQuoteToken,
212 Token::DoubleQuote => QuoteToken::DoubleQuoteToken,
213 _ => panic!("Unexpected token: {}", t)
214 }
215 }
216
217 fn as_token(self) -> Token {
218 match self {
219 QuoteToken::SingleQuoteToken => Token::SingleQuote,
220 QuoteToken::DoubleQuoteToken => Token::DoubleQuote
221 }
222 }
223}
224
/// Scratch storage for the parts of the markup construct currently being parsed.
///
/// Most fields have a generated `take_*` method (see `gen_takes!`) that moves the
/// value out and resets the field.
struct MarkupData {
    /// NOTE(review): not used in this part of the file; presumably a processing
    /// instruction or declaration name.
    name: String,
    /// NOTE(review): not used in this part of the file; presumably the text of the
    /// reference currently being read.
    ref_data: String,
    /// Version read from the XML declaration, if any.
    version: Option<common::XmlVersion>,
    /// Encoding read from the XML declaration, if any.
    encoding: Option<String>,
    /// Standalone flag read from the XML declaration, if any.
    standalone: Option<bool>,
    /// Name of the element whose tag is being parsed; taken when the element event is emitted.
    element_name: Option<OwnedName>,
    /// Quote that opened the attribute value being read; `None` outside of a quoted value.
    quote: Option<QuoteToken>,
    /// NOTE(review): not used in this part of the file; presumably the name of the
    /// attribute whose value is being read.
    attr_name: Option<OwnedName>,
    /// Attributes collected for the current opening tag.
    attributes: Vec<OwnedAttribute>
}
239
240impl PullParser {
241 pub fn next<R: Read>(&mut self, r: &mut R) -> Result {
246 if let Some(ref ev) = self.final_result {
247 return ev.clone();
248 }
249
250 if let Some(ev) = self.next_event.take() {
251 return ev;
252 }
253
254 if self.pop_namespace {
255 self.pop_namespace = false;
256 self.nst.pop();
257 }
258
259 loop {
260 match self.lexer.next_token(r) {
263 Ok(maybe_token) =>
264 match maybe_token {
265 None => break,
266 Some(token) =>
267 match self.dispatch_token(token) {
268 None => {} Some(Ok(XmlEvent::EndDocument)) =>
270 return {
271 self.next_pos();
272 self.set_final_result(Ok(XmlEvent::EndDocument))
273 },
274 Some(Ok(xml_event)) =>
275 return {
276 self.next_pos();
277 Ok(xml_event)
278 },
279 Some(Err(xml_error)) =>
280 return {
281 self.next_pos();
282 self.set_final_result(Err(xml_error))
283 },
284 }
285 },
286 Err(lexer_error) =>
287 return self.set_final_result(Err(lexer_error)),
288 }
289 }
290
291 self.next_pos();
294 let ev = if self.depth() == 0 {
295 if self.encountered_element && self.st == State::OutsideTag { Ok(XmlEvent::EndDocument)
297 } else if !self.encountered_element {
298 self_error!(self; "Unexpected end of stream: no root element found")
299 } else { self_error!(self; "Unexpected end of stream") }
302 } else {
303 if self.config.ignore_end_of_stream {
304 self.final_result = None;
305 self.lexer.reset_eof_handled();
306 return self_error!(self; "Unexpected end of stream: still inside the root element");
307 } else {
308 self_error!(self; "Unexpected end of stream: still inside the root element")
309 }
310 };
311 self.set_final_result(ev)
312 }
313
314 fn set_final_result(&mut self, result: Result) -> Result {
317 self.final_result = Some(result.clone());
318 result
319 }
320
321 #[inline]
322 fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Result {
323 Err((&self.lexer, msg).into())
324 }
325
326 #[inline]
327 fn next_pos(&mut self) {
328 if self.pos.len() > 1 {
329 self.pos.remove(0);
330 } else {
331 self.pos[0] = self.lexer.position();
332 }
333 }
334
335 #[inline]
336 fn push_pos(&mut self) {
337 self.pos.push(self.lexer.position());
338 }
339
340 fn dispatch_token(&mut self, t: Token) -> Option<Result> {
341 match self.st.clone() {
342 State::OutsideTag => self.outside_tag(t),
343 State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s),
344 State::InsideDeclaration(s) => self.inside_declaration(t, s),
345 State::InsideDoctype => self.inside_doctype(t),
346 State::InsideOpeningTag(s) => self.inside_opening_tag(t, s),
347 State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s),
348 State::InsideComment => self.inside_comment(t),
349 State::InsideCData => self.inside_cdata(t),
350 State::InsideReference(s) => self.inside_reference(t, *s)
351 }
352 }
353
    /// Returns the number of currently open elements.
    #[inline]
    fn depth(&self) -> usize {
        self.est.len()
    }
358
359 #[inline]
360 fn buf_has_data(&self) -> bool {
361 self.buf.len() > 0
362 }
363
364 #[inline]
365 fn take_buf(&mut self) -> String {
366 mem::replace(&mut self.buf, String::new())
367 }
368
    /// Appends `c` to the accumulation buffer and signals that no event is ready yet.
    #[inline]
    fn append_char_continue(&mut self, c: char) -> Option<Result> {
        self.buf.push(c);
        None
    }
374
    /// Switches the parser to state `st` and returns `ev` unchanged.
    #[inline]
    fn into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result> {
        self.st = st;
        ev
    }
380
381 #[inline]
382 fn into_state_continue(&mut self, st: State) -> Option<Result> {
383 self.into_state(st, None)
384 }
385
386 #[inline]
387 fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> {
388 self.into_state(st, Some(ev))
389 }
390
391 fn read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result>
398 where F: Fn(&mut PullParser, Token, OwnedName) -> Option<Result> {
399 if self.buf.len() <= 1 {
402 self.read_prefix_separator = false;
403 }
404
405 let invoke_callback = |this: &mut PullParser, t| {
406 let name = this.take_buf();
407 match name.parse() {
408 Ok(name) => on_name(this, t, name),
409 Err(_) => Some(self_error!(this; "Qualified name is invalid: {}", name))
410 }
411 };
412
413 match t {
414 Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => {
416 self.buf.push(':');
417 self.read_prefix_separator = true;
418 None
419 }
420
421 Token::Character(c) if c != ':' && (!self.buf_has_data() && is_name_start_char(c) ||
422 self.buf_has_data() && is_name_char(c)) =>
423 self.append_char_continue(c),
424
425 Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t),
426
427 Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t),
428
429 Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget ||
430 target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t),
431
432 Token::Whitespace(_) => invoke_callback(self, t),
433
434 _ => Some(self_error!(self; "Unexpected token inside qualified name: {}", t))
435 }
436 }
437
438 fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result>
444 where F: Fn(&mut PullParser, String) -> Option<Result> {
445 match t {
446 Token::Whitespace(_) if self.data.quote.is_none() => None, Token::DoubleQuote | Token::SingleQuote => match self.data.quote {
449 None => { self.data.quote = Some(QuoteToken::from_token(&t));
451 None
452 }
453 Some(q) if q.as_token() == t => {
454 self.data.quote = None;
455 let value = self.take_buf();
456 on_value(self, value)
457 }
458 _ => {
459 t.push_to_string(&mut self.buf);
460 None
461 }
462 },
463
464 Token::ReferenceStart => {
465 let st = Box::new(self.st.clone());
466 self.into_state_continue(State::InsideReference(st))
467 }
468
469 Token::OpeningTagStart =>
470 Some(self_error!(self; "Unexpected token inside attribute value: <")),
471
472 _ => {
474 t.push_to_string(&mut self.buf);
475 None
476 }
477 }
478 }
479
480 fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> {
481 let mut name = self.data.take_element_name().unwrap();
482 let mut attributes = self.data.take_attributes();
483
484 match self.nst.get(name.borrow().prefix_repr()) {
486 Some("") => name.namespace = None, Some(ns) => name.namespace = Some(ns.into()),
488 None => return Some(self_error!(self; "Element {} prefix is unbound", name))
489 }
490
491 for attr in attributes.iter_mut() {
493 if let Some(ref pfx) = attr.name.prefix {
494 let new_ns = match self.nst.get(pfx) {
495 Some("") => None, Some(ns) => Some(ns.into()),
497 None => return Some(self_error!(self; "Attribute {} prefix is unbound", attr.name))
498 };
499 attr.name.namespace = new_ns;
500 }
501 }
502
503 if emit_end_element {
504 self.pop_namespace = true;
505 self.next_event = Some(Ok(XmlEvent::EndElement {
506 name: name.clone()
507 }));
508 } else {
509 self.est.push(name.clone());
510 }
511 let namespace = self.nst.squash();
512 self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement {
513 name: name,
514 attributes: attributes,
515 namespace: namespace
516 }))
517 }
518
519 fn emit_end_element(&mut self) -> Option<Result> {
520 let mut name = self.data.take_element_name().unwrap();
521
522 match self.nst.get(name.borrow().prefix_repr()) {
524 Some("") => name.namespace = None, Some(ns) => name.namespace = Some(ns.into()),
526 None => return Some(self_error!(self; "Element {} prefix is unbound", name))
527 }
528
529 let op_name = self.est.pop().unwrap();
530
531 if name == op_name {
532 self.pop_namespace = true;
533 self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name: name }))
534 } else {
535 Some(self_error!(self; "Unexpected closing tag: {}, expected {}", name, op_name))
536 }
537 }
538
539}
540
#[cfg(test)]
mod tests {
    use std::io::BufReader;

    use common::{Position, TextPosition};
    use name::OwnedName;
    use attribute::OwnedAttribute;
    use reader::parser::PullParser;
    use reader::ParserConfig;
    use reader::events::XmlEvent;

    /// Creates a parser with the default configuration.
    fn new_parser() -> PullParser {
        PullParser::new(ParserConfig::new())
    }

    // Pulls the next event from parser `$p` reading from `$r` and panics unless it
    // matches pattern `$t` (and, in the second form, satisfies condition `$c`).
    macro_rules! expect_event(
        ($r:expr, $p:expr, $t:pat) => (
            match $p.next(&mut $r) {
                $t => {}
                e => panic!("Unexpected event: {:?}", e)
            }
        );
        ($r:expr, $p:expr, $t:pat => $c:expr ) => (
            match $p.next(&mut $r) {
                $t if $c => {}
                e => panic!("Unexpected event: {:?}", e)
            }
        )
    );

    // Builds a `(reader, parser)` pair for the given document text.
    macro_rules! test_data(
        ($d:expr) => ({
            static DATA: &'static str = $d;
            let r = BufReader::new(DATA.as_bytes());
            let p = new_parser();
            (r, p)
        })
    );

    #[test]
    fn issue_3_semicolon_in_attribute_value() {
        let (mut r, mut p) = test_data!(r#"
            <a attr="zzz;zzz" />
        "#);

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) =>
            *name == OwnedName::local("a") &&
             attributes.len() == 1 &&
             attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") &&
             namespace.is_essentially_empty()
        );
        expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a"));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));
    }

    #[test]
    fn issue_140_entity_reference_inside_tag() {
        // The document must contain a character reference, not the literal character,
        // so that reference parsing inside element content is exercised.
        let (mut r, mut p) = test_data!(r#"
            <bla>&#9835;</bla>
        "#);

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla"));
        expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}");
        expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla"));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));
    }

    #[test]
    fn opening_tag_in_attribute_value() {
        // The element is indented by exactly 12 spaces: the expected error column
        // below depends on it.
        let (mut r, mut p) = test_data!(r#"
            <a attr="zzz<zzz" />
        "#);

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Err(ref e) =>
            e.msg() == "Unexpected token inside attribute value: <" &&
            e.position() == TextPosition { row: 1, column: 24 }
        );
    }
}