Import xml-rs 0.8.4

author Woohyun Jung <wh0705.jung@samsung.com>

Fri, 17 Mar 2023 03:45:47 +0000 (12:45 +0900)

committer Woohyun Jung <wh0705.jung@samsung.com>

Fri, 17 Mar 2023 03:45:47 +0000 (12:45 +0900)
author Woohyun Jung <wh0705.jung@samsung.com>
Fri, 17 Mar 2023 03:45:47 +0000 (12:45 +0900)
committer Woohyun Jung <wh0705.jung@samsung.com>
Fri, 17 Mar 2023 03:45:47 +0000 (12:45 +0900)
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json

new file mode 100644 (file)

index 0000000..6e0c55d
--- /dev/null
+++ b/.cargo_vcs_info.json
@@ -0,0 +1,5 @@
+{
+  "git": {
+    "sha1": "7cd06954fd6e22b7dbf9ea02ff4e22f9ff6309fd"
+  }
+}
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml

new file mode 100644 (file)

index 0000000..daca69f
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,31 @@
+name: CI
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        rust: [stable, beta, nightly]
+
+    steps:
+    - uses: actions/checkout@v2
+      
+    - uses: actions-rs/toolchain@v1
+      with:
+        profile: minimal
+        toolchain: ${{ matrix.rust }}
+        override: true
+
+    - uses: actions-rs/cargo@v1
+      with:
+        command: build
+    
+    - uses: actions-rs/cargo@v1
+      with:
+        command: test
diff --git a/.gitignore b/.gitignore

new file mode 100644 (file)

index 0000000..60b0232
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+*.swp
+*.swo
+/doc
+*~
+/target/
+/Cargo.lock
+.idea/
+*.iml
+\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml

new file mode 100644 (file)

index 0000000..e704337
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,36 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+name = "xml-rs"
+version = "0.8.4"
+authors = ["Vladimir Matveev <vmatveev@citrine.cc>"]
+description = "An XML library in pure Rust"
+documentation = "http://docs.rs/xml-rs/"
+readme = "Readme.md"
+keywords = ["xml", "parsing", "parser"]
+categories = ["parsing"]
+license = "MIT"
+repository = "https://github.com/netvl/xml-rs"
+
+[lib]
+name = "xml"
+path = "src/lib.rs"
+
+[[bin]]
+name = "xml-analyze"
+path = "src/analyze.rs"
+[dev-dependencies.doc-comment]
+version = "0.3"
+
+[dev-dependencies.lazy_static]
+version = "1.2.0"
diff --git a/Cargo.toml.orig b/Cargo.toml.orig

new file mode 100644 (file)

index 0000000..c8df8e6
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,23 @@
+[package]
+name = "xml-rs"
+version = "0.8.4"
+authors = ["Vladimir Matveev <vmatveev@citrine.cc>"]
+license = "MIT"
+description = "An XML library in pure Rust"
+repository = "https://github.com/netvl/xml-rs"
+documentation = "http://docs.rs/xml-rs/"
+readme = "Readme.md"
+keywords = ["xml", "parsing", "parser"]
+categories = ["parsing"]
+
+[lib]
+name = "xml"
+path = "src/lib.rs"
+
+[[bin]]
+name = "xml-analyze"
+path = "src/analyze.rs"
+
+[dev-dependencies]
+doc-comment = "0.3"
+lazy_static = "1.2.0"
diff --git a/Changelog.md b/Changelog.md

new file mode 100644 (file)

index 0000000..3cca8b8
--- /dev/null
+++ b/Changelog.md
@@ -0,0 +1,126 @@
+## Version 0.8.4
+
+* Fixed recognition of `?>`, `]]>` and `/>` tokens as characters.
+* Fixed writer output operations to use `write_all` to ensure that the data
+  is written fully.
+* The document declaration is now written before any characters automatically.
+
+## Version 0.8.3
+
+* Added a new parser option, `ignore_root_level_whitespace`, which makes the parser
+  skip emitting whitespace events outside of the root element when set to `true`.
+  This helps with certain tasks like canonicalization.
+
+## Version 0.8.2
+
+* Added a new parser option, `replace_unknown_entity_references`, which makes the parser ignore
+  invalid Unicode code points and replace them with the Unicode "replacement character"
+  during parsing. This can be helpful when dealing with e.g. UTF-16 surrogate pairs.
+* Added a new emitter option, `pad_self_closing`, which determines the style of the self-closing
+  elements when they are emitted: `<a />` (`true`) vs `<a/>` (`false`).
+
+## Version 0.8.1
+
+* Fixed various issues with tests introduced by updates in Rust.
+* Adjusted the lexer to ignore contents of the `<!DOCTYPE>` tag.
+* Removed unnecessary unsafety in tests.
+* Added tests for doc comments in the readme file.
+* Switched to GitHub Actions from Travis CI.
+
+## Version 0.8.0
+
+* Same as 0.7.1, with 0.7.1 being yanked because of the incorrect semver bump.
+
+## Version 0.7.1
+
+* Removed dependency on bitflags.
+* Added the `XmlWriter::inner_mut()` method.
+* Fixed some rustdoc warnings.
+
+## Version 0.7.0
+
+* Same as 0.6.2, with 0.6.2 being yanked because of the incompatible bump of minimum required version of rustc.
+
+## Version 0.6.2
+
+* Bumped `bitflags` to 1.0.
+
+## Version 0.6.1
+
+* Fixed the writer to escape some special characters when writing attribute values.
+
+## Version 0.6.0
+
+* Changed the target type of extra entities from `char` to `String`. This is an incompatible
+  change.
+
+## Version 0.5.0
+
+* Added support for ignoring EOF errors in order to read documents from streams incrementally.
+* Bumped `bitflags` to 0.9.
+
+## Version 0.4.1
+
+* Added missing `Debug` implementation to `xml::writer::XmlEvent`.
+
+## Version 0.4.0
+
+* Bumped version number, since changes introduced in 0.3.7 break backwards compatibility.
+
+## Version 0.3.8
+
+* Fixed a problem introduced in 0.3.7 with entities in attributes causing parsing errors.
+
+## Version 0.3.7
+
+* Fixed the problem with parsing non-whitespace character entities as whitespace (issue #140).
+* Added support for configuring custom entities in the parser configuration.
+
+## Version 0.3.6
+
+* Added an `Error` implementation for `EmitterError`.
+* Fixed escaping of strings with multi-byte code points.
+
+## Version 0.3.5
+
+* Added `Debug` implementation for `XmlVersion`.
+* Fixed some failing tests.
+
+## Version 0.3.3
+
+* Updated `bitflags` to 0.7.
+
+## Version 0.3.2
+
+* Added `From<io::Error>` for `xml::reader::Error`, which improves usability of working with parsing errors.
+
+## Version 0.3.1
+
+* Bumped `bitflags` dependency to 0.4, some internal warning fixes.
+
+## Version 0.3.0
+
+* Changed error handling in `EventReader` - now I/O errors are properly bubbled up from the lexer.
+
+## Version 0.2.4
+
+* Fixed #112 - incorrect handling of namespace redefinitions when writing a document.
+
+## Version 0.2.3
+
+* Added `into_inner()` methods to `EventReader` and `EventWriter`.
+
+## Version 0.2.2
+
+* Using `join` instead of the deprecated `connect`.
+* Added a simple XML analyzer program which demonstrates library usage and can be used to check XML documents for well-formedness.
+* Fixed incorrect handling of unqualified attribute names (#107).
+* Added this changelog.
+
+## Version 0.2.1
+
+* Fixed #105 - incorrect handling of double dashes.
+
+## Version 0.2.0
+
+* Major update, includes proper document writing support and significant architecture changes.
diff --git a/LICENSE b/LICENSE

new file mode 100644 (file)

index 0000000..6caa1d3
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Vladimir Matveev
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Readme.md b/Readme.md

new file mode 100644 (file)

index 0000000..5ab88f8
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,236 @@
+xml-rs, an XML library for Rust
+===============================
+
+[![Build Status][build-status-img]](https://github.com/netvl/xml-rs/actions?query=workflow%3ACI)
+[![crates.io][crates-io-img]](https://crates.io/crates/xml-rs)
+[![docs][docs-img]](https://docs.rs/xml-rs/)
+
+[Documentation](https://docs.rs/xml-rs/)
+
+  [build-status-img]: https://img.shields.io/github/workflow/status/netvl/xml-rs/CI/master?style=flat-square
+  [crates-io-img]: https://img.shields.io/crates/v/xml-rs.svg?style=flat-square
+  [docs-img]: https://img.shields.io/badge/docs-latest%20release-6495ed.svg?style=flat-square
+
+xml-rs is an XML library for the [Rust](http://www.rust-lang.org/) programming language.
+It is heavily inspired by the Java [Streaming API for XML (StAX)][stax].
+
+  [stax]: https://en.wikipedia.org/wiki/StAX
+
+This library currently contains a pull parser much like the [StAX event reader][stax-reader].
+It provides an iterator API, so you can leverage Rust's existing iterator library features.
+
+  [stax-reader]: http://docs.oracle.com/javase/8/docs/api/javax/xml/stream/XMLEventReader.html
+
+It also provides a streaming document writer much like [StAX event writer][stax-writer].
+This writer consumes its own set of events, but reader events can be converted to
+writer events easily, and so it is possible to write XML transformation chains in a pretty
+clean manner.
+
+  [stax-writer]: http://docs.oracle.com/javase/8/docs/api/javax/xml/stream/XMLEventWriter.html
+
+This parser is mostly full-featured; however, there are limitations:
+* no encodings other than UTF-8 are supported yet, because no stream-based encoding library
+  is available now; when (or if) one becomes available, I'll try to make use of it;
+* DTD validation is not supported, `<!DOCTYPE>` declarations are completely ignored; thus there is no
+  support for custom entities either; internal DTD declarations are likely to cause parsing errors;
+* attribute value normalization is not performed, and end-of-line characters are not normalized either.
+
+Other than that the parser tries to be mostly XML-1.0-compliant.
+
+Writer is also mostly full-featured with the following limitations:
+* no support for encodings other than UTF-8, for the same reason as above;
+* no support for emitting `<!DOCTYPE>` declarations;
+* more validations of input are needed, for example, checking that namespace prefixes are bound
+  or that comments are well-formed.
+
+What is planned (highest priority first, approximately):
+
+0. missing features required by XML standard (e.g. aforementioned normalization and
+   proper DTD parsing);
+1. miscellaneous features of the writer;
+2. parsing into a DOM tree and its serialization back to XML text;
+3. SAX-like callback-based parser (fairly easy to implement over pull parser);
+4. DTD validation;
+5. (let's dream a bit) XML Schema validation.
+
+Building and using
+------------------
+
+xml-rs uses [Cargo](http://crates.io), so just add a dependency section in your project's manifest:
+
+```toml
+[dependencies]
+xml-rs = "0.8"
+```
+
+The package exposes a single crate called `xml`:
+
+```rust
+extern crate xml;
+```
+
+Reading XML documents
+---------------------
+
+`xml::reader::EventReader` requires a `Read` instance to read from. When a proper stream-based encoding
+library is available, it is likely that xml-rs will be switched to use whatever character stream structure
+this library would provide, but currently it is a `Read`.
+
+Using `EventReader` is very straightforward. Just provide a `Read` instance to obtain an iterator
+over events:
+
+```rust,no_run
+extern crate xml;
+
+use std::fs::File;
+use std::io::BufReader;
+
+use xml::reader::{EventReader, XmlEvent};
+
+fn indent(size: usize) -> String {
+    const INDENT: &'static str = "    ";
+    (0..size).map(|_| INDENT)
+             .fold(String::with_capacity(size*INDENT.len()), |r, s| r + s)
+}
+
+fn main() {
+    let file = File::open("file.xml").unwrap();
+    let file = BufReader::new(file);
+
+    let parser = EventReader::new(file);
+    let mut depth = 0;
+    for e in parser {
+        match e {
+            Ok(XmlEvent::StartElement { name, .. }) => {
+                println!("{}+{}", indent(depth), name);
+                depth += 1;
+            }
+            Ok(XmlEvent::EndElement { name }) => {
+                depth -= 1;
+                println!("{}-{}", indent(depth), name);
+            }
+            Err(e) => {
+                println!("Error: {}", e);
+                break;
+            }
+            _ => {}
+        }
+    }
+}
+```
+
+`EventReader` implements the `IntoIterator` trait, so you can just use it in a `for` loop directly.
+Document parsing can end normally or with an error. Regardless of the exact cause, the parsing
+process will be stopped, and the iterator will terminate normally.
+
+You can also have finer control over when to pull the next event from the parser using its own
+`next()` method:
+
+```rust,ignore
+match parser.next() {
+    ...
+}
+```
+
+Upon the end of the document or an error the parser will remember that last event and will always
+return it in the result of the `next()` call afterwards. If the iterator is used, then it will yield
+the error or end-of-document event once and will produce `None` afterwards.
+
+It is also possible to tweak parsing process a little using `xml::reader::ParserConfig` structure.
+See its documentation for more information and examples.
+
+You can find a more extensive example of using `EventReader` in `src/analyze.rs`, which is a
+small program (BTW, it is built with `cargo build` and can be run after that) which shows various
+statistics about specified XML document. It can also be used to check for well-formedness of
+XML documents - if a document is not well-formed, this program will exit with an error.
+
+Writing XML documents
+---------------------
+
+xml-rs also provides a streaming writer much like StAX event writer. With it you can write an
+XML document to any `Write` implementor.
+
+```rust,no_run
+extern crate xml;
+
+use std::fs::File;
+use std::io::{self, Write};
+
+use xml::writer::{EventWriter, EmitterConfig, XmlEvent, Result};
+
+fn handle_event<W: Write>(w: &mut EventWriter<W>, line: String) -> Result<()> {
+    let line = line.trim();
+    let event: XmlEvent = if line.starts_with("+") && line.len() > 1 {
+        XmlEvent::start_element(&line[1..]).into()
+    } else if line.starts_with("-") {
+        XmlEvent::end_element().into()
+    } else {
+        XmlEvent::characters(&line).into()
+    };
+    w.write(event)
+}
+
+fn main() {
+    let mut file = File::create("output.xml").unwrap();
+
+    let mut input = io::stdin();
+    let mut output = io::stdout();
+    let mut writer = EmitterConfig::new().perform_indent(true).create_writer(&mut file);
+    loop {
+        print!("> "); output.flush().unwrap();
+        let mut line = String::new();
+        match input.read_line(&mut line) {
+            Ok(0) => break,
+            Ok(_) => match handle_event(&mut writer, line) {
+                Ok(_) => {}
+                Err(e) => panic!("Write error: {}", e)
+            },
+            Err(e) => panic!("Input error: {}", e)
+        }
+    }
+}
+```
+
+The code example above also demonstrates how to create a writer out of its configuration.
+A similar approach also works with `EventReader`.
+
+The library provides an XML event building DSL which helps to construct complex events,
+e.g. ones having namespace definitions. Some examples:
+
+```rust,ignore
+// <a:hello a:param="value" xmlns:a="urn:some:document">
+XmlEvent::start_element("a:hello").attr("a:param", "value").ns("a", "urn:some:document")
+
+// <hello b:config="value" xmlns="urn:default:uri">
+XmlEvent::start_element("hello").attr("b:config", "value").default_ns("urn:default:uri")
+
+// <![CDATA[some unescaped text]]>
+XmlEvent::cdata("some unescaped text")
+```
+
+Of course, one can create `XmlEvent` enum variants directly instead of using the builder DSL.
+There are more examples in `xml::writer::XmlEvent` documentation.
+
+The writer has multiple configuration options; see `EmitterConfig` documentation for more
+information.
+
+Other things
+------------
+
+No performance tests or measurements have been done. The implementation is rather naive, and no specific
+optimizations are made. Hopefully the library is sufficiently fast to process documents of common size.
+I intend to add benchmarks in the future, but not until more important features are added.
+
+Known issues
+------------
+
+All known issues are listed on the GitHub issue tracker: <http://github.com/netvl/xml-rs/issues>.
+Feel free to report any problems you find there.
+
+License
+-------
+
+This library is licensed under the MIT license.
+
+---
+Copyright (C) Vladimir Matveev, 2014-2020
diff --git a/design.md b/design.md

new file mode 100644 (file)

index 0000000..da67c7b
--- /dev/null
+++ b/design.md
@@ -0,0 +1,37 @@
+# Reader
+
+Basic features:
+ * [x] Parsing XML 1.0 documents and returning a stream of events
+   - [ ] Support reading embedded DTD schemas
+   - [ ] Support for embedded entities
+ * [x] Support for namespaces and emitting namespace information in events
+ * [ ] \[maybe\] push-based wrapper
+ * Missing XML features
+   - [ ] Support for different encodings
+   - [ ] Attribute values normalization
+   - [ ] EOL characters normalization
+
+Advanced features:
+ * [ ] DTD schema validation
+ * [ ] XSD schema validation
+
+# Writer
+
+Basic features:
+  * [x] Writing basic XML 1.0 documents in UTF-8
+  * [x] Writing XML 1.0 documents with namespace support
+  * [x] Support for writing elements with empty body as empty elements
+  * [x] Pretty-printed and compact output
+  * [ ] Writing XML document with embedded DTDs and DTD references
+  * Misc features:
+    - [ ] Support for different encodings
+    - [x] Support for writing CDATA as characters
+    - [ ] Checking events for invalid characters (e.g. `--` in comments)
+    - [ ] Check for namespaces more correctly, i.e. check both for prefix and namespace URI
+    - [ ] Support checking namespace prefix presence in the current namespace for events with prefix but without namespace
+    - [ ] Support checking namespace prefix for events with both prefix and namespace URI
+
+# Other
+
+DOM-based API:
+ * [ ] Basic support for DOM-based API
diff --git a/src/analyze.rs b/src/analyze.rs

new file mode 100644 (file)

index 0000000..d369d2f
--- /dev/null
+++ b/src/analyze.rs
@@ -0,0 +1,99 @@
+#![forbid(unsafe_code)]
+
+extern crate xml;
+
+use std::cmp;
+use std::env;
+use std::io::{self, Read, Write, BufReader};
+use std::fs::File;
+use std::collections::HashSet;
+
+use xml::ParserConfig;
+use xml::reader::XmlEvent;
+
+macro_rules! abort {  // terminates the process with the given exit code, optionally printing a message first
+    ($code:expr) => {::std::process::exit($code)};  // bare form: exit immediately with `$code`
+    ($code:expr, $($args:tt)+) => {{  // message form: `$args` are forwarded verbatim to `writeln!`
+        writeln!(&mut ::std::io::stderr(), $($args)+).unwrap();  // the message goes to stderr; a failed write panics
+        ::std::process::exit($code);
+    }}
+}
+
+fn main() {  // parses the XML document named by the first CLI argument (or stdin) and prints statistics about it
+    let mut file;
+    let mut stdin;
+    let source: &mut dyn Read = match env::args().nth(1) {  // `file`/`stdin` are declared above so the borrow outlives the match
+        Some(file_name) => {
+            file = File::open(file_name)
+                .unwrap_or_else(|e| abort!(1, "Cannot open input file: {}", e));
+            &mut file
+        }
+        None => {
+            stdin = io::stdin();
+            &mut stdin
+        }
+    };
+
+    let reader = ParserConfig::new()
+        .whitespace_to_characters(true)
+        .ignore_comments(false)
+        .create_reader(BufReader::new(source));
+
+    let mut processing_instructions = 0;
+    let mut elements = 0;
+    let mut character_blocks = 0;
+    let mut cdata_blocks = 0;
+    let mut characters = 0;
+    let mut comment_blocks = 0;
+    let mut comment_characters = 0;
+    let mut namespaces = HashSet::new();
+    let mut depth = 0;
+    let mut max_depth = 0;
+
+    for e in reader {
+        match e {
+            Ok(e) => match e {
+                XmlEvent::StartDocument { version, encoding, standalone } =>
+                    println!(
+                        "XML document version {}, encoded in {}, {}standalone",
+                        version, encoding, if standalone.unwrap_or(false) { "" } else { "not " }
+                    ),
+                XmlEvent::EndDocument => println!("Document finished"),
+                XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1,
+                XmlEvent::Whitespace(_) => {}  // can't happen due to configuration
+                XmlEvent::Characters(s) => {
+                    character_blocks += 1;
+                    characters += s.chars().count();  // count characters, not UTF-8 bytes
+                }
+                XmlEvent::CData(s) => {
+                    cdata_blocks += 1;
+                    characters += s.chars().count();  // CDATA text contributes to the same character total
+                }
+                XmlEvent::Comment(s) => {
+                    comment_blocks += 1;
+                    comment_characters += s.chars().count();  // count characters, not UTF-8 bytes
+                }
+                XmlEvent::StartElement { namespace, .. } => {
+                    depth += 1;
+                    max_depth = cmp::max(max_depth, depth);
+                    elements += 1;
+                    namespaces.extend(namespace.0.into_iter().map(|(_, ns_uri)| ns_uri));  // keep only the URIs; the set de-duplicates them
+                }
+                XmlEvent::EndElement { .. } => {
+                    depth -= 1;
+                }
+            },
+            Err(e) => abort!(1, "Error parsing XML document: {}", e)
+        }
+    }
+    namespaces.remove(xml::namespace::NS_EMPTY_URI);
+    namespaces.remove(xml::namespace::NS_XMLNS_URI);
+    namespaces.remove(xml::namespace::NS_XML_URI);
+
+    println!("Elements: {}, maximum depth: {}", elements, max_depth);
+    println!("Namespaces (excluding built-in): {}", namespaces.len());
+    println!("Characters: {}, characters blocks: {}, CDATA blocks: {}",
+             characters, character_blocks, cdata_blocks);
+    println!("Comment blocks: {}, comment characters: {}", comment_blocks, comment_characters);
+    println!("Processing instructions (excluding built-in): {}", processing_instructions);
+}
diff --git a/src/attribute.rs b/src/attribute.rs

new file mode 100644 (file)

index 0000000..8728f49
--- /dev/null
+++ b/src/attribute.rs
@@ -0,0 +1,99 @@
+//! Contains XML attributes manipulation types and functions.
+//!
+
+use std::fmt;
+
+use name::{Name, OwnedName};
+use escape::escape_str_attribute;
+
+/// A borrowed version of an XML attribute; `to_owned()` converts it into an `OwnedAttribute`.
+///
+/// Consists of a borrowed qualified name and a borrowed string value, both tied to lifetime `'a`.
+#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
+pub struct Attribute<'a> {
+    /// Attribute name.
+    pub name: Name<'a>,
+
+    /// Attribute value; `Display` passes it through `escape_str_attribute` when formatting.
+    pub value: &'a str
+}
+
+impl<'a> fmt::Display for Attribute<'a> {  // formats the attribute as `name="value"`
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}=\"{}\"", self.name, escape_str_attribute(self.value))  // the value is escaped before being wrapped in double quotes
+    }
+}
+
+impl<'a> Attribute<'a> {
+    /// Creates an owned attribute out of this borrowed one, converting both the name and the value with `into()`.
+    #[inline]
+    pub fn to_owned(&self) -> OwnedAttribute {
+        OwnedAttribute {
+            name: self.name.into(),
+            value: self.value.into(),
+        }
+    }
+
+    /// Creates a borrowed attribute using the provided borrowed name and a borrowed string value; both are stored as given.
+    #[inline]
+    pub fn new(name: Name<'a>, value: &'a str) -> Attribute<'a> {
+        Attribute { name, value, }
+    }
+}
+
+/// An owned version of an XML attribute; `borrow()` yields the matching borrowed `Attribute`.
+///
+/// Consists of an owned qualified name and an owned string value.
+#[derive(Clone, Eq, PartialEq, Hash, Debug)]
+pub struct OwnedAttribute {
+    /// Attribute name.
+    pub name: OwnedName,
+
+    /// Attribute value; `Display` passes it through `escape_str_attribute` when formatting.
+    pub value: String
+}
+
+impl OwnedAttribute {
+    /// Returns a borrowed `Attribute` out of this owned one; the result borrows from `self`.
+    pub fn borrow(&self) -> Attribute {
+        Attribute {
+            name: self.name.borrow(),
+            value: &*self.value,
+        }
+    }
+
+    /// Creates a new owned attribute using the provided owned name and any value convertible into a `String`.
+    #[inline]
+    pub fn new<S: Into<String>>(name: OwnedName, value: S) -> OwnedAttribute {
+        OwnedAttribute {
+            name,
+            value: value.into(),
+        }
+    }
+}
+
+impl fmt::Display for OwnedAttribute {  // formats the attribute as `name="value"`, same layout as `Attribute`
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}=\"{}\"", self.name, escape_str_attribute(&*self.value))  // the value is escaped before being wrapped in double quotes
+    }
+}
+
+#[cfg(test)]
+mod tests {  // unit tests for attribute formatting
+    use super::{Attribute};
+
+    use name::Name;
+
+    #[test]
+    fn attribute_display() {  // `Display` must escape `>`, `&`, `"`, `'` and `<` in the value
+        let attr = Attribute::new(
+            Name::qualified("attribute", "urn:namespace", Some("n")),
+            "its value with > & \" ' < weird symbols"
+        );
+
+        assert_eq!(
+            &*attr.to_string(),
+            "{urn:namespace}n:attribute=\"its value with &gt; &amp; &quot; &apos; &lt; weird symbols\""  // the name part renders as `{namespace}prefix:local`
+        )
+    }
+}
diff --git a/src/common.rs b/src/common.rs

new file mode 100644 (file)

index 0000000..029e851
--- /dev/null
+++ b/src/common.rs
@@ -0,0 +1,142 @@
+//! Contains common types and functions used throughout the library.
+
+use std::fmt;
+
+/// Represents a position inside some textual document; stored zero-based, printed one-based by `Debug`/`Display`.
+#[derive(Copy, Clone, PartialEq, Eq)]
+pub struct TextPosition {
+    /// Row, counting from 0
+    pub row: u64,
+    /// Column, counting from 0
+    pub column: u64,
+}
+
+impl TextPosition {
+    /// Creates a new position initialized to the beginning of the document
+    #[inline]
+    pub fn new() -> TextPosition {
+        TextPosition { row: 0, column: 0 }
+    }
+
+    /// Advances the position in a line by `count` columns
+    #[inline]
+    pub fn advance(&mut self, count: u8) {
+        self.column += u64::from(count);
+    }
+
+    /// Advances the position in a line to the next tab position; a `width` of 0 leaves the column unchanged
+    #[inline]
+    pub fn advance_to_tab(&mut self, width: u8) {
+        let width = u64::from(width);
+        if width != 0 { self.column += width - self.column % width }  // guard: `% 0` would panic
+    }
+
+    /// Advances the position to the beginning of the next line
+    #[inline]
+    pub fn new_line(&mut self) {
+        self.column = 0;
+        self.row += 1;
+    }
+}
+
+impl fmt::Debug for TextPosition {  // debug output uses the same `row:column` form as `Display`
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}:{}", self.row + 1, self.column + 1)  // stored values are zero-based; printed values are one-based
+    }
+}
+
+impl fmt::Display for TextPosition {  // renders the position as `row:column`
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}:{}", self.row + 1, self.column + 1)  // stored values are zero-based; printed values are one-based
+    }
+}
+
+/// Gives access to the position in the document corresponding to an object.
+///
+/// This trait is implemented by parsers, lexers and errors, and by `TextPosition` itself.
+pub trait Position {
+    /// Returns the current position or a position corresponding to the object.
+    fn position(&self) -> TextPosition;
+}
+
+impl Position for TextPosition {  // a position is trivially its own position
+    #[inline]
+    fn position(&self) -> TextPosition {
+        *self  // `TextPosition` is `Copy`, so this returns a copy
+    }
+}
+
+/// XML version enumeration; `Display` and `Debug` both render it as `1.0` or `1.1`.
+#[derive(Copy, Clone, PartialEq, Eq)]
+pub enum XmlVersion {
+    /// XML version 1.0.
+    Version10,
+
+    /// XML version 1.1.
+    Version11
+}
+
+impl fmt::Display for XmlVersion {  // renders the version number as text
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            XmlVersion::Version10 => write!(f, "1.0"),
+            XmlVersion::Version11 => write!(f, "1.1")
+        }
+    }
+}
+
+impl fmt::Debug for XmlVersion {  // debug output is identical to the `Display` output
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        fmt::Display::fmt(self, f)  // delegate so the two formats cannot drift apart
+    }
+}
+
+/// Checks whether the given character is a white space character (`S`)
+/// as is defined by XML 1.1 specification, [section 2.3][1]: space, tab, carriage return or line feed.
+///
+/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn
+pub fn is_whitespace_char(c: char) -> bool {
+    match c {
+        '\x20' | '\x09' | '\x0d' | '\x0a' => true,
+        _ => false
+    }
+}
+
+/// Checks whether the given string consists only of white space
+/// characters (`S`), using `is_whitespace_char` to check every
+/// character of the string; an empty string yields `true`.
+pub fn is_whitespace_str(s: &str) -> bool {
+    s.chars().all(is_whitespace_char)
+}
+
+/// Checks whether the given character is a name start character (`NameStartChar`)
+/// as is defined by XML 1.1 specification, [section 2.3][1].
+///
+/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn
+pub fn is_name_start_char(c: char) -> bool {
+    match c {
+        ':' | 'A'..='Z' | '_' | 'a'..='z' |
+        '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' |
+        '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' |
+        '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' |
+        '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' |
+        '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' |
+        '\u{10000}'..='\u{EFFFF}' => true,
+        _ => false
+    }
+}
+
+/// Checks whether the given character is a name character (`NameChar`)
+/// as is defined by XML 1.1 specification, [section 2.3][1]: any name start character plus the extras below.
+///
+/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn
+pub fn is_name_char(c: char) -> bool {
+    match c {
+        _ if is_name_start_char(c) => true,
+        '-' | '.' | '0'..='9' | '\u{B7}' |
+        '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}' => true,
+        _ => false
+    }
+}
diff --git a/src/escape.rs b/src/escape.rs

new file mode 100644 (file)

index 0000000..18298b9
--- /dev/null
+++ b/src/escape.rs
@@ -0,0 +1,126 @@
+//! Contains functions for performing XML special characters escaping.
+
+use std::borrow::Cow;
+
+/// The outcome of escaping a single input character.
+enum Value {
+    /// The character needs no escaping and is passed through as is.
+    Char(char),
+    /// The character is replaced with this entity or character reference.
+    Str(&'static str)
+}
+
+impl Value {
+    /// Selects the replacement for `c` when it occurs inside an attribute value.
+    ///
+    /// Markup characters, both kinds of quotes and line breaks are turned into
+    /// references; any other character is passed through unchanged.
+    fn dispatch_for_attribute(c: char) -> Value {
+        match c {
+            '&'  => Value::Str("&amp;"),
+            '<'  => Value::Str("&lt;"),
+            '>'  => Value::Str("&gt;"),
+            '\'' => Value::Str("&apos;"),
+            '"'  => Value::Str("&quot;"),
+            '\r' => Value::Str("&#xD;"),
+            '\n' => Value::Str("&#xA;"),
+            other => Value::Char(other)
+        }
+    }
+
+    /// Selects the replacement for `c` when it occurs inside PCDATA.
+    ///
+    /// Only `&` and `<` are turned into entities; any other character is passed
+    /// through unchanged.
+    fn dispatch_for_pcdata(c: char) -> Value {
+        match c {
+            '&' => Value::Str("&amp;"),
+            '<' => Value::Str("&lt;"),
+            other => Value::Char(other)
+        }
+    }
+}
+
+/// Accumulator for the escaped output which switches to an owned buffer only
+/// when the first replacement has to be made.
+enum Process<'a> {
+    /// No replacement has been made so far; holds the untouched input.
+    Borrowed(&'a str),
+    /// At least one replacement has been made; holds the output built so far.
+    Owned(String)
+}
+
+impl<'a> Process<'a> {
+    /// Consumes the next escaping step.
+    ///
+    /// `i` is the byte offset of the current character in the borrowed input (it is
+    /// used to slice that input) and `next` is the escaping outcome for the character.
+    fn process(&mut self, (i, next): (usize, Value)) {
+        match next {
+            Value::Str(s) => match *self {
+                // Already building an owned string: just append the replacement.
+                Process::Owned(ref mut o) => o.push_str(s),
+                // First replacement: copy everything before the current character,
+                // append the replacement and switch to the owned representation.
+                Process::Borrowed(b) => {
+                    let mut r = String::with_capacity(b.len() + s.len());
+                    r.push_str(&b[..i]);
+                    r.push_str(s);
+                    *self = Process::Owned(r);
+                }
+            },
+            Value::Char(c) => match *self {
+                // Nothing has been replaced yet, the borrowed input is still exact.
+                Process::Borrowed(_) => {}
+                Process::Owned(ref mut o) => o.push(c)
+            }
+        }
+    }
+
+    /// Converts the accumulator into a `Cow`, borrowed if no replacement was made
+    /// and owned otherwise.
+    fn into_result(self) -> Cow<'a, str> {
+        match self {
+            Process::Borrowed(b) => Cow::Borrowed(b),
+            Process::Owned(o) => Cow::Owned(o)
+        }
+    }
+}
+
+impl<'a> Extend<(usize, Value)> for Process<'a> {
+    /// Feeds every `(offset, value)` pair of `it` to `process()`, in iteration order.
+    fn extend<I: IntoIterator<Item=(usize, Value)>>(&mut self, it: I) {
+        it.into_iter().for_each(|step| self.process(step));
+    }
+}
+
+/// Escapes `s`, asking `dispatch` what to do with each of its characters.
+///
+/// The input is returned borrowed when `dispatch` never requests a replacement.
+fn escape_str(s: &str, dispatch: fn(char) -> Value) -> Cow<str> {
+    let mut state = Process::Borrowed(s);
+    for (offset, ch) in s.char_indices() {
+        state.process((offset, dispatch(ch)));
+    }
+    state.into_result()
+}
+
+/// Performs escaping of common XML characters inside an attribute value.
+///
+/// This function replaces several important markup characters with their
+/// entity equivalents:
+///
+/// * `<` → `&lt;`
+/// * `>` → `&gt;`
+/// * `"` → `&quot;`
+/// * `'` → `&apos;`
+/// * `&` → `&amp;`
+///
+/// Line breaks are replaced with character references as well:
+///
+/// * line feed (`\n`) → `&#xA;`
+/// * carriage return (`\r`) → `&#xD;`
+///
+/// The resulting string is safe to use inside XML attribute values or in PCDATA sections.
+///
+/// Does not perform allocations if the given string does not contain escapable characters.
+#[inline]
+pub fn escape_str_attribute(s: &str) -> Cow<str> {
+    escape_str(s, Value::dispatch_for_attribute)
+}
+
+/// Performs escaping of common XML characters inside PCDATA.
+///
+/// This function replaces several important markup characters with their
+/// entity equivalents:
+///
+/// * `<` → `&lt;`
+/// * `&` → `&amp;`
+///
+/// The resulting string is safe to use inside PCDATA sections but NOT inside attribute values.
+///
+/// NOTE(review): `>` is left as is, so a `]]>` sequence in the input is emitted
+/// unchanged, while the XML specification requires `>` to be escaped in that
+/// position --- confirm whether callers have to handle this themselves.
+///
+/// Does not perform allocations if the given string does not contain escapable characters.
+#[inline]
+pub fn escape_str_pcdata(s: &str) -> Cow<str> {
+    escape_str(s, Value::dispatch_for_pcdata)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{escape_str_pcdata, escape_str_attribute};
+
+    // TODO: add more tests
+
+    // The escaped character follows a multi-byte code point, so the byte offset
+    // at which the input is split differs from the character index.
+    #[test]
+    fn test_escape_multibyte_code_points() {
+        assert_eq!(escape_str_attribute("☃<"), "☃&lt;");
+        assert_eq!(escape_str_pcdata("☃<"), "☃&lt;");
+    }
+}
+
diff --git a/src/lib.rs b/src/lib.rs

new file mode 100644 (file)

index 0000000..fb672ef
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,29 @@
+//#![warn(missing_doc)]
+#![allow(dead_code)]
+#![allow(unused_variables)]
+#![forbid(non_camel_case_types)]
+#![forbid(unsafe_code)]
+
+//! This crate currently provides an almost XML 1.0/1.1-compliant pull parser.
+
+#[cfg(doctest)]
+#[macro_use]
+extern crate doc_comment;
+
+#[cfg(doctest)]
+doctest!("../Readme.md");
+
+pub use reader::EventReader;
+pub use reader::ParserConfig;
+pub use writer::EventWriter;
+pub use writer::EmitterConfig;
+
+pub mod macros;
+pub mod name;
+pub mod attribute;
+pub mod common;
+pub mod escape;
+pub mod namespace;
+pub mod reader;
+pub mod writer;
+mod util;
diff --git a/src/macros.rs b/src/macros.rs

new file mode 100644 (file)

index 0000000..1cce3d6
--- /dev/null
+++ b/src/macros.rs
@@ -0,0 +1,30 @@
+#![macro_use]
+
+//! Contains several macros used in this crate.
+
+/// Generates a consuming, builder-style setter named after `$field` on `$target`.
+///
+/// Two forms are accepted:
+///
+/// * `$field : into $t` --- the setter takes any `T: Into<$t>` and converts it;
+/// * `$field : val $t` --- the setter takes a `$t` value directly.
+macro_rules! gen_setter {
+    ($target:ty, $field:ident : into $t:ty) => {
+        impl $target {
+            /// Sets the field to the provided value and returns updated config object.
+            pub fn $field<T: Into<$t>>(mut self, value: T) -> $target {
+                self.$field = value.into();
+                self
+            }
+        }
+    };
+    ($target:ty, $field:ident : val $t:ty) => {
+        impl $target {
+            /// Sets the field to the provided value and returns updated config object.
+            pub fn $field(mut self, value: $t) -> $target {
+                self.$field = value;
+                self
+            }
+        }
+    }
+}
+
+/// Invokes `gen_setter!` once for every `field : kind Type` item of a
+/// comma-separated list; at least one item is required.
+macro_rules! gen_setters {
+    ($target:ty, $($field:ident : $k:tt $tpe:ty),+) => ($(
+        gen_setter! { $target, $field : $k $tpe }
+    )+)
+}
diff --git a/src/name.rs b/src/name.rs

new file mode 100644 (file)

index 0000000..a20eae2
--- /dev/null
+++ b/src/name.rs
@@ -0,0 +1,301 @@
+//! Contains XML qualified names manipulation types and functions.
+//!
+
+use std::fmt;
+use std::str::FromStr;
+
+use namespace::NS_NO_PREFIX;
+
+/// Represents a qualified XML name.
+///
+/// A qualified name always consists at least of a local name. It can optionally contain
+/// a prefix; when reading an XML document, if it contains a prefix, it must also contain a
+/// namespace URI, but this is not enforced statically; see below. The name can contain a
+/// namespace without a prefix; in that case a default, empty prefix is assumed.
+///
+/// When writing XML documents, it is possible to omit the namespace URI, leaving only
+/// the prefix. In this case the writer will check that the specified prefix is bound to some
+/// URI in the current namespace context. If both prefix and namespace URI are specified,
+/// it is checked that the current namespace context contains this exact correspondence
+/// between prefix and namespace URI.
+///
+/// # Prefixes and URIs
+///
+/// A qualified name with a prefix must always contain a proper namespace URI --- names with
+/// a prefix but without a namespace associated with that prefix are meaningless. However,
+/// it is impossible to obtain proper namespace URI by a prefix without a context, and such
+/// context is only available when parsing a document (or it can be constructed manually
+/// when writing a document). Tying a name to a context statically seems impractical. This
+/// may change in future, though.
+///
+/// # Conversions
+///
+/// `Name` implements some `From` instances for conversion from strings and tuples. For example:
+///
+/// ```rust
+/// # use xml::name::Name;
+/// let n1: Name = "p:some-name".into();
+/// let n2: Name = ("p", "some-name").into();
+///
+/// assert_eq!(n1, n2);
+/// assert_eq!(n1.local_name, "some-name");
+/// assert_eq!(n1.prefix, Some("p"));
+/// assert!(n1.namespace.is_none());
+/// ```
+///
+/// This is added to support easy specification of XML elements when writing XML documents.
+#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
+pub struct Name<'a> {
+    /// A local name, e.g. `string` in `xsi:string`.
+    pub local_name: &'a str,
+
+    /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`.
+    pub namespace: Option<&'a str>,
+
+    /// A name prefix, e.g. `xsi` in `xsi:string`.
+    pub prefix: Option<&'a str>
+}
+
+impl<'a> From<&'a str> for Name<'a> {
+    /// Splits `s` at its first colon into a prefix and a local name; a string
+    /// without a colon becomes a plain local name.
+    fn from(s: &'a str) -> Name<'a> {
+        match s.find(':') {
+            Some(colon) => Name::prefixed(&s[colon + 1..], &s[..colon]),
+            None => Name::local(s)
+        }
+    }
+}
+
+impl<'a> From<(&'a str, &'a str)> for Name<'a> {
+    /// Builds a prefixed name from a `(prefix, local name)` pair.
+    fn from(pair: (&'a str, &'a str)) -> Name<'a> {
+        let (prefix, local_name) = pair;
+        Name::prefixed(local_name, prefix)
+    }
+}
+
+impl<'a> fmt::Display for Name<'a> {
+    /// Writes the name as `{namespace}prefix:local_name`, leaving out the parts
+    /// which are absent.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self.namespace {
+            Some(uri) => write!(f, "{{{}}}", uri)?,
+            None => {}
+        }
+
+        match self.prefix {
+            Some(prefix) => write!(f, "{}:", prefix)?,
+            None => {}
+        }
+
+        write!(f, "{}", self.local_name)
+    }
+}
+
+impl<'a> Name<'a> {
+    /// Copies all parts of this name into a freshly allocated `OwnedName`.
+    pub fn to_owned(&self) -> OwnedName {
+        OwnedName {
+            local_name: String::from(self.local_name),
+            namespace: self.namespace.map(String::from),
+            prefix: self.prefix.map(String::from)
+        }
+    }
+
+    /// Creates a `Name` which has only a local part, without a prefix or a namespace.
+    #[inline]
+    pub fn local(local_name: &str) -> Name {
+        Name {
+            namespace: None,
+            prefix: None,
+            local_name
+        }
+    }
+
+    /// Creates a `Name` from a local name and a prefix, without a namespace.
+    #[inline]
+    pub fn prefixed(local_name: &'a str, prefix: &'a str) -> Name<'a> {
+        Name {
+            namespace: None,
+            prefix: Some(prefix),
+            local_name
+        }
+    }
+
+    /// Creates a `Name` with a namespace URI and, optionally, a prefix.
+    #[inline]
+    pub fn qualified(local_name: &'a str, namespace: &'a str, prefix: Option<&'a str>) -> Name<'a> {
+        Name {
+            namespace: Some(namespace),
+            prefix,
+            local_name,
+        }
+    }
+
+    /// Returns the XML representation of this name: the prefix, if any, followed
+    /// by a colon and the local name.
+    ///
+    /// Unlike the `to_string()` provided through `Display`, the result does not
+    /// include the namespace URI.
+    pub fn to_repr(&self) -> String {
+        format!("{}", self.repr_display())
+    }
+
+    /// Returns a wrapper whose `Display` implementation prints the prefix and
+    /// the local name of this name.
+    ///
+    /// Formatting the wrapper directly avoids the intermediate `String` which
+    /// `to_repr()` allocates.
+    #[inline]
+    pub fn repr_display(&self) -> ReprDisplay {
+        ReprDisplay(self)
+    }
+
+    /// Returns the prefix of this name, or the `namespace::NS_NO_PREFIX` constant
+    /// if there is none.
+    #[inline]
+    pub fn prefix_repr(&self) -> &str {
+        match self.prefix {
+            Some(prefix) => prefix,
+            None => NS_NO_PREFIX
+        }
+    }
+}
+
+/// A wrapper around `Name` whose `Display` implementation prints the wrapped name as it is
+/// displayed in an XML document.
+///
+/// Values of this type are returned by `Name::repr_display()`.
+pub struct ReprDisplay<'a, 'b:'a>(&'a Name<'b>);
+
+impl<'a, 'b:'a> fmt::Display for ReprDisplay<'a, 'b> {
+    /// Writes `prefix:local_name`, or just `local_name` when there is no prefix.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let name = self.0;
+        if let Some(prefix) = name.prefix {
+            write!(f, "{}:{}", prefix, name.local_name)
+        } else {
+            write!(f, "{}", name.local_name)
+        }
+    }
+}
+
+/// An owned variant of `Name`.
+///
+/// Everything about `Name` applies to this structure as well. `Name::to_owned()` and
+/// `OwnedName::borrow()` convert between the two representations.
+#[derive(Clone, PartialEq, Eq, Hash, Debug)]
+pub struct OwnedName {
+    /// A local name, e.g. `string` in `xsi:string`.
+    pub local_name: String,
+
+    /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`.
+    pub namespace: Option<String>,
+
+    /// A name prefix, e.g. `xsi` in `xsi:string`.
+    pub prefix: Option<String>,
+}
+
+impl fmt::Display for OwnedName {
+    /// Formats this name exactly as the borrowed `Name` built from it is formatted.
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let borrowed = self.borrow();
+        fmt::Display::fmt(&borrowed, f)
+    }
+}
+
+impl OwnedName {
+    /// Builds a `Name` whose parts borrow from this owned name.
+    pub fn borrow(&self) -> Name {
+        Name {
+            local_name: self.local_name.as_str(),
+            namespace: self.namespace_ref(),
+            prefix: self.prefix_ref(),
+        }
+    }
+
+    /// Creates an `OwnedName` which has only a local part, without a prefix or
+    /// a namespace.
+    #[inline]
+    pub fn local<S>(local_name: S) -> OwnedName where S: Into<String> {
+        OwnedName {
+            prefix: None,
+            namespace: None,
+            local_name: local_name.into(),
+        }
+    }
+
+    /// Creates an `OwnedName` with a namespace URI and, optionally, a prefix.
+    #[inline]
+    pub fn qualified<S1, S2, S3>(local_name: S1, namespace: S2, prefix: Option<S3>) -> OwnedName
+        where S1: Into<String>, S2: Into<String>, S3: Into<String>
+    {
+        let local_name = local_name.into();
+        let namespace = Some(namespace.into());
+        let prefix = prefix.map(|p| p.into());
+        OwnedName { local_name, namespace, prefix }
+    }
+
+    /// Returns an optional prefix by reference, equivalent to `self.borrow().prefix`
+    /// but avoids extra work.
+    #[inline]
+    pub fn prefix_ref(&self) -> Option<&str> {
+        match self.prefix {
+            Some(ref prefix) => Some(prefix.as_str()),
+            None => None
+        }
+    }
+
+    /// Returns an optional namespace by reference, equivalent to `self.borrow().namespace`
+    /// but avoids extra work.
+    #[inline]
+    pub fn namespace_ref(&self) -> Option<&str> {
+        match self.namespace {
+            Some(ref namespace) => Some(namespace.as_str()),
+            None => None
+        }
+    }
+}
+
+impl<'a> From<Name<'a>> for OwnedName {
+    /// Converts a borrowed name into an owned one via `Name::to_owned()`.
+    #[inline]
+    fn from(n: Name<'a>) -> OwnedName {
+        Name::to_owned(&n)
+    }
+}
+
+impl FromStr for OwnedName {
+    type Err = ();
+
+    /// Parses the given string slice into a qualified name.
+    ///
+    /// When this function finishes successfully, it always returns a qualified
+    /// name without a namespace (`name.namespace == None`). It should be filled later
+    /// using proper `NamespaceStack`.
+    ///
+    /// It is supposed that all characters in the argument string are correct
+    /// as defined by the XML specification. The only checks performed are that the
+    /// string contains at most one colon and that neither the local name nor the
+    /// prefix (when a colon is present) is empty.
+    fn from_str(s: &str) -> Result<OwnedName, ()> {
+        let mut parts = s.split(':');
+
+        let (local_name, prefix) = match (parts.next(), parts.next(), parts.next()) {
+            (Some(local), None, None) if !local.is_empty() =>
+                (local, None),
+            (Some(prefix), Some(local), None) if !prefix.is_empty() &&
+                                                 !local.is_empty() =>
+                (local, Some(prefix)),
+            _ => return Err(())
+        };
+
+        Ok(OwnedName {
+            local_name: String::from(local_name),
+            namespace: None,
+            prefix: prefix.map(String::from)
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::OwnedName;
+
+    #[test]
+    fn test_owned_name_from_str() {
+        // A single colon separates the prefix from the local name.
+        assert_eq!("prefix:name".parse(), Ok(OwnedName {
+            local_name: "name".into(),
+            namespace: None,
+            prefix: Some("prefix".into())
+        }));
+
+        // Without a colon the whole string is the local name.
+        assert_eq!("name".parse(), Ok(OwnedName {
+            local_name: "name".into(),
+            namespace: None,
+            prefix: None
+        }));
+
+        // Empty parts and more than one colon are rejected.
+        assert_eq!("".parse(), Err::<OwnedName, ()>(()));
+        assert_eq!(":".parse(), Err::<OwnedName, ()>(()));
+        assert_eq!(":a".parse(), Err::<OwnedName, ()>(()));
+        assert_eq!("a:".parse(), Err::<OwnedName, ()>(()));
+        assert_eq!("a:b:c".parse(), Err::<OwnedName, ()>(()));
+    }
+}
diff --git a/src/namespace.rs b/src/namespace.rs

new file mode 100644 (file)

index 0000000..1ab4a5c
--- /dev/null
+++ b/src/namespace.rs
@@ -0,0 +1,485 @@
+//! Contains namespace manipulation types and functions.
+
+use std::iter::{Map, Rev};
+use std::collections::btree_map::{BTreeMap, Entry};
+use std::collections::btree_map::Iter as Entries;
+use std::collections::HashSet;
+use std::slice::Iter;
+
+/// Designates prefix for namespace definitions.
+///
+/// See [Namespaces in XML][namespace] spec for more information.
+///
+///   [namespace]: http://www.w3.org/TR/xml-names/#ns-decl
+pub const NS_XMLNS_PREFIX: &'static str = "xmlns";
+
+/// Designates the standard URI for `xmlns` prefix.
+///
+/// See [A Namespace Name for xmlns Attributes][1] for more information.
+///
+///   [1]: http://www.w3.org/2000/xmlns/
+pub const NS_XMLNS_URI: &'static str    = "http://www.w3.org/2000/xmlns/";
+
+/// Designates prefix for a namespace containing several special predefined attributes.
+///
+/// See [2.10 White Space handling][1],  [2.1 Language Identification][2],
+/// [XML Base specification][3] and [xml:id specification][4] for more information.
+///
+///   [1]: http://www.w3.org/TR/REC-xml/#sec-white-space
+///   [2]: http://www.w3.org/TR/REC-xml/#sec-lang-tag
+///   [3]: http://www.w3.org/TR/xmlbase/
+///   [4]: http://www.w3.org/TR/xml-id/
+pub const NS_XML_PREFIX: &'static str   = "xml";
+
+/// Designates the standard URI for `xml` prefix.
+///
+/// See `NS_XML_PREFIX` documentation for more information.
+pub const NS_XML_URI: &'static str      = "http://www.w3.org/XML/1998/namespace";
+
+/// Designates the absence of prefix in a qualified name.
+///
+/// This constant should be used to define or query default namespace which should be used
+/// for element or attribute names without prefix. For example, if a namespace mapping
+/// at a particular point in the document contains correspondence like
+///
+/// ```none
+///   NS_NO_PREFIX  -->  urn:some:namespace
+/// ```
+///
+/// then `urn:some:namespace` is assumed as the namespace URI for all names declared
+/// without an explicit prefix.
+///
+/// By default empty prefix corresponds to absence of namespace, but this can change either
+/// when writing an XML document (manually) or when reading an XML document (based on namespace
+/// declarations).
+pub const NS_NO_PREFIX: &'static str    = "";
+
+/// Designates an empty namespace URI, which is equivalent to absence of namespace.
+///
+/// This constant should not usually be used directly; it is used to designate that
+/// empty prefix corresponds to absent namespace in `NamespaceStack` instances created with
+/// `NamespaceStack::default()`. Therefore, it can be used to restore `NS_NO_PREFIX` mapping
+/// in a namespace back to its default value.
+pub const NS_EMPTY_URI: &'static str    = "";
+
+/// Namespace is a map from prefixes to namespace URIs.
+///
+/// No prefix (i.e. default namespace) is designated by `NS_NO_PREFIX` constant.
+///
+/// The wrapped map is keyed by prefix; its values are the namespace URIs.
+#[derive(PartialEq, Eq, Clone, Debug)]
+pub struct Namespace(pub BTreeMap<String, String>);
+
+impl Namespace {
+    /// Creates a namespace without any mappings.
+    #[inline]
+    pub fn empty() -> Namespace {
+        Namespace(BTreeMap::new())
+    }
+
+    /// Tells whether this namespace contains no mappings at all.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+
+    /// Tells whether this namespace is essentially empty, that is, whether every
+    /// mapping in it is one of the three default ones (empty prefix, `xmlns`, `xml`).
+    pub fn is_essentially_empty(&self) -> bool {
+        // More than three entries cannot all be default mappings, so the scan
+        // is skipped in that case.
+        self.0.len() <= 3 && self.0.iter().all(|(prefix, uri)| {
+            match (prefix.as_str(), uri.as_str()) {
+                (NS_NO_PREFIX,    NS_EMPTY_URI) => true,
+                (NS_XMLNS_PREFIX, NS_XMLNS_URI) => true,
+                (NS_XML_PREFIX,   NS_XML_URI)   => true,
+                _ => false
+            }
+        })
+    }
+
+    /// Tells whether this namespace has a mapping for the given prefix.
+    ///
+    /// # Parameters
+    /// * `prefix`  --- namespace prefix.
+    ///
+    /// # Return value
+    /// `true` if this namespace contains the given prefix, `false` otherwise.
+    #[inline]
+    pub fn contains<P: ?Sized+AsRef<str>>(&self, prefix: &P) -> bool {
+        let key: &str = prefix.as_ref();
+        self.0.contains_key(key)
+    }
+
+    /// Adds a mapping to this namespace unless the prefix is already mapped.
+    ///
+    /// An existing mapping for the prefix is never replaced by this method.
+    ///
+    /// # Parameters
+    /// * `prefix` --- namespace prefix;
+    /// * `uri`    --- namespace URI.
+    ///
+    /// # Return value
+    /// `true` if `prefix` has been inserted successfully; `false` if the `prefix`
+    /// was already present in the namespace.
+    pub fn put<P, U>(&mut self, prefix: P, uri: U) -> bool
+        where P: Into<String>, U: Into<String>
+    {
+        if let Entry::Vacant(slot) = self.0.entry(prefix.into()) {
+            slot.insert(uri.into());
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Adds a mapping to this namespace, replacing an existing one if necessary.
+    ///
+    /// This is the overriding counterpart of `put()`.
+    ///
+    /// # Parameters
+    /// * `prefix` --- namespace prefix;
+    /// * `uri`    --- namespace URI.
+    ///
+    /// # Return value
+    /// `Some(uri)` with `uri` being a previous URI assigned to the `prefix`, or
+    /// `None` if such prefix was not present in the namespace before.
+    pub fn force_put<P, U>(&mut self, prefix: P, uri: U) -> Option<String>
+        where P: Into<String>, U: Into<String>
+    {
+        let key = prefix.into();
+        let value = uri.into();
+        self.0.insert(key, value)
+    }
+
+    /// Looks up the URI which the given prefix is mapped to.
+    ///
+    /// # Parameters
+    /// * `prefix` --- namespace prefix.
+    ///
+    /// # Return value
+    /// Namespace URI corresponding to the given prefix, if it is present.
+    pub fn get<'a, P: ?Sized+AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> {
+        match self.0.get(prefix.as_ref()) {
+            Some(uri) => Some(uri.as_str()),
+            None => None
+        }
+    }
+}
+
+/// An alias for iterator type for namespace mappings contained in a namespace.
+pub type NamespaceMappings<'a> = Map<
+    Entries<'a, String, String>,
+    for<'b> fn((&'b String, &'b String)) -> UriMapping<'b>
+>;
+
+impl<'a> IntoIterator for &'a Namespace {
+    type Item = UriMapping<'a>;
+    type IntoIter = NamespaceMappings<'a>;
+
+    /// Iterates over the `(prefix, uri)` pairs of this namespace as string slices.
+    fn into_iter(self) -> Self::IntoIter {
+        // A named function is used rather than a closure because
+        // `NamespaceMappings` spells out a function pointer type.
+        fn as_str_pair<'b>(entry: (&'b String, &'b String)) -> UriMapping<'b> {
+            (entry.0.as_str(), entry.1.as_str())
+        }
+        self.0.iter().map(as_str_pair)
+    }
+}
+
+/// Namespace stack is a sequence of namespaces.
+///
+/// Namespace stack is used to represent cumulative namespace consisting of
+/// combined namespaces from nested elements.
+///
+/// The last element of the wrapped vector is the top of the stack.
+#[derive(Clone, Eq, PartialEq, Debug)]
+pub struct NamespaceStack(pub Vec<Namespace>);
+
+impl NamespaceStack {
+    /// Returns an empty namespace stack.
+    #[inline]
+    pub fn empty() -> NamespaceStack { NamespaceStack(Vec::with_capacity(2)) }
+
+    /// Returns a namespace stack with default items in it.
+    ///
+    /// Default items are the following:
+    ///
+    /// * `xml` → `http://www.w3.org/XML/1998/namespace`;
+    /// * `xmlns` → `http://www.w3.org/2000/xmlns/`;
+    /// * the empty prefix (`NS_NO_PREFIX`) → the empty URI (`NS_EMPTY_URI`).
+    #[inline]
+    pub fn default() -> NamespaceStack {
+        let mut nst = NamespaceStack::empty();
+        nst.push_empty();
+        // xml namespace
+        nst.put(NS_XML_PREFIX, NS_XML_URI);
+        // xmlns namespace
+        nst.put(NS_XMLNS_PREFIX, NS_XMLNS_URI);
+        // empty namespace
+        nst.put(NS_NO_PREFIX, NS_EMPTY_URI);
+        nst
+    }
+
+    /// Adds an empty namespace to the top of this stack.
+    #[inline]
+    pub fn push_empty(&mut self) -> &mut NamespaceStack {
+        self.0.push(Namespace::empty());
+        self
+    }
+
+    /// Removes the topmost namespace in this stack.
+    ///
+    /// Panics if the stack is empty.
+    #[inline]
+    pub fn pop(&mut self) -> Namespace {
+        self.0.pop().unwrap()
+    }
+
+    /// Removes the topmost namespace in this stack.
+    ///
+    /// Returns `Some(namespace)` if this stack is not empty and `None` otherwise.
+    #[inline]
+    pub fn try_pop(&mut self) -> Option<Namespace> {
+        self.0.pop()
+    }
+
+    /// Borrows the topmost namespace mutably, leaving the stack intact.
+    ///
+    /// Panics if the stack is empty.
+    #[inline]
+    pub fn peek_mut(&mut self) -> &mut Namespace {
+        self.0.last_mut().unwrap()
+    }
+
+    /// Borrows the topmost namespace immutably, leaving the stack intact.
+    ///
+    /// Panics if the stack is empty.
+    #[inline]
+    pub fn peek(&self) -> &Namespace {
+        self.0.last().unwrap()
+    }
+
+    /// Puts a mapping into the topmost namespace if this stack does not already contain one.
+    ///
+    /// Returns a boolean flag indicating whether the insertion has completed successfully.
+    /// Note that both key and value are matched and the insertion is attempted if either
+    /// namespace prefix is not already mapped, or if it is mapped, but to a different URI.
+    /// As with `put()`, a mapping which is already present in the topmost namespace is
+    /// never overridden.
+    ///
+    /// Panics if the stack is empty and the insertion is attempted.
+    ///
+    /// # Parameters
+    /// * `prefix` --- namespace prefix;
+    /// * `uri`    --- namespace URI.
+    ///
+    /// # Return value
+    /// `true` if the mapping has been inserted into the topmost namespace; `false` if
+    /// the exact mapping was already present somewhere in the stack, or if the topmost
+    /// namespace already maps `prefix` to another URI.
+    pub fn put_checked<P, U>(&mut self, prefix: P, uri: U) -> bool
+        where P: Into<String> + AsRef<str>,
+              U: Into<String> + AsRef<str>
+    {
+        if self.0.iter().any(|ns| ns.get(&prefix) == Some(uri.as_ref())) {
+            false
+        } else {
+            // Report what actually happened: `put()` refuses to override a prefix
+            // which the topmost namespace already maps to another URI.
+            self.put(prefix, uri)
+        }
+    }
+
+    /// Puts a mapping into the topmost namespace in this stack.
+    ///
+    /// This method does not override a mapping in the topmost namespace if it is
+    /// already present, however, it does not depend on other namespaces in the stack,
+    /// so it is possible to put a mapping which is present in lower namespaces.
+    ///
+    /// Returns a boolean flag indicating whether the insertion has completed successfully.
+    ///
+    /// Panics if the stack is empty.
+    ///
+    /// # Parameters
+    /// * `prefix` --- namespace prefix;
+    /// * `uri`    --- namespace URI.
+    ///
+    /// # Return value
+    /// `true` if `prefix` has been inserted successfully; `false` if the `prefix`
+    /// was already present in the namespace.
+    #[inline]
+    pub fn put<P, U>(&mut self, prefix: P, uri: U) -> bool
+        where P: Into<String>, U: Into<String>
+    {
+        self.0.last_mut().unwrap().put(prefix, uri)
+    }
+
+    /// Performs a search for the given prefix in the whole stack.
+    ///
+    /// This method walks the stack from top to bottom, querying each namespace
+    /// in order for the given prefix. If none of the namespaces contains the prefix,
+    /// `None` is returned.
+    ///
+    /// # Parameters
+    /// * `prefix` --- namespace prefix.
+    #[inline]
+    pub fn get<'a, P: ?Sized+AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> {
+        let prefix = prefix.as_ref();
+        // The first hit while walking from the top wins.
+        self.0.iter().rev().filter_map(|ns| ns.get(prefix)).next()
+    }
+
+    /// Combines this stack of namespaces into a single namespace.
+    ///
+    /// Namespaces are combined in left-to-right order, that is, rightmost namespace
+    /// elements take priority over leftmost ones.
+    pub fn squash(&self) -> Namespace {
+        let mut result = BTreeMap::new();
+        for ns in self.0.iter() {
+            // Later (upper) namespaces overwrite entries coming from earlier ones.
+            result.extend(ns.0.iter().map(|(k, v)| (k.clone(), v.clone())));
+        }
+        Namespace(result)
+    }
+
+    /// Returns an object which implements `Extend` using `put_checked()` instead of `put()`.
+    ///
+    /// See `CheckedTarget` for more information.
+    #[inline]
+    pub fn checked_target(&mut self) -> CheckedTarget {
+        CheckedTarget(self)
+    }
+
+    /// Returns an iterator over all mappings in this namespace stack.
+    #[inline]
+    pub fn iter(&self) -> NamespaceStackMappings {
+        self.into_iter()
+    }
+}
+
+/// An iterator over mappings from prefixes to URIs in a namespace stack.
+///
+/// Namespaces are visited from the top of the stack to the bottom, and a prefix
+/// which has already been yielded is not yielded again.
+///
+/// # Example
+/// ```
+/// # use xml::namespace::NamespaceStack;
+/// let mut nst = NamespaceStack::empty();
+/// nst.push_empty();
+/// nst.put("a", "urn:A");
+/// nst.put("b", "urn:B");
+/// nst.push_empty();
+/// nst.put("c", "urn:C");
+///
+/// assert_eq!(vec![("c", "urn:C"), ("a", "urn:A"), ("b", "urn:B")], nst.iter().collect::<Vec<_>>());
+/// ```
+pub struct NamespaceStackMappings<'a> {
+    // Namespaces which have not been visited yet, topmost first.
+    namespaces: Rev<Iter<'a, Namespace>>,
+    // Mappings of the namespace being visited; `None` before the first one is
+    // entered and after the last one is exhausted.
+    current_namespace: Option<NamespaceMappings<'a>>,
+    // Prefixes which have already been yielded.
+    used_keys: HashSet<&'a str>
+}
+
+impl<'a> NamespaceStackMappings<'a> {
+    /// Makes the next namespace of the stack the current one.
+    ///
+    /// Returns `false`, leaving no current namespace, when there is none left.
+    fn go_to_next_namespace(&mut self) -> bool {
+        match self.namespaces.next() {
+            Some(ns) => {
+                self.current_namespace = Some(ns.into_iter());
+                true
+            }
+            None => {
+                self.current_namespace = None;
+                false
+            }
+        }
+    }
+}
+
+impl<'a> Iterator for NamespaceStackMappings<'a> {
+    type Item = UriMapping<'a>;
+
+    /// Yields the next mapping whose prefix has not been yielded before.
+    ///
+    /// Shadowed prefixes and exhausted namespaces are skipped in a loop rather than
+    /// by recursion, so stack usage does not grow with the number of skipped entries.
+    fn next(&mut self) -> Option<UriMapping<'a>> {
+        loop {
+            // If there is no current namespace and no next namespace, we're finished
+            if self.current_namespace.is_none() && !self.go_to_next_namespace() {
+                return None;
+            }
+            let next_item = self.current_namespace.as_mut().unwrap().next();
+
+            match next_item {
+                // There is an element in the current namespace
+                Some((k, v)) => {
+                    // `insert()` returns `true` only for a key which has not been
+                    // yielded yet; an already used key is skipped
+                    if self.used_keys.insert(k) {
+                        return Some((k, v));
+                    }
+                }
+                // Current namespace is exhausted
+                None => {
+                    // If there is next namespace, continue from it; otherwise exit
+                    if !self.go_to_next_namespace() {
+                        return None;
+                    }
+                }
+            }
+        }
+    }
+}
+
+impl<'a> IntoIterator for &'a NamespaceStack {
+    type Item = UriMapping<'a>;
+    type IntoIter = NamespaceStackMappings<'a>;
+
+    /// Creates an iterator which starts at the top of the stack, with no
+    /// namespace entered and no prefix yielded yet.
+    fn into_iter(self) -> Self::IntoIter {
+        let namespaces = self.0.iter().rev();
+        NamespaceStackMappings {
+            used_keys: HashSet::new(),
+            current_namespace: None,
+            namespaces
+        }
+    }
+}
+
+/// A type alias for a pair of `(prefix, uri)` values returned by namespace iterators.
+pub type UriMapping<'a> = (&'a str, &'a str);
+
+impl<'a> Extend<UriMapping<'a>> for Namespace {
+    /// Adds every `(prefix, uri)` pair with `put()`, so existing mappings are kept.
+    fn extend<T>(&mut self, iterable: T) where T: IntoIterator<Item=UriMapping<'a>> {
+        iterable.into_iter().for_each(|(prefix, uri)| {
+            self.put(prefix, uri);
+        });
+    }
+}
+
+impl<'a> Extend<UriMapping<'a>> for NamespaceStack {
+    /// Adds every `(prefix, uri)` pair to the topmost namespace with `put()`.
+    fn extend<T>(&mut self, iterable: T) where T: IntoIterator<Item=UriMapping<'a>> {
+        iterable.into_iter().for_each(|(prefix, uri)| {
+            self.put(prefix, uri);
+        });
+    }
+}
+
+/// A wrapper around `NamespaceStack` which implements `Extend` using `put_checked()`.
+///
+/// Values of this type are returned by `NamespaceStack::checked_target()`.
+///
+/// # Example
+///
+/// In the example below `("b", "urn:B")` is already present in the lower namespace,
+/// so, unlike with plain `extend()`, it is not copied into the topmost one.
+///
+/// ```
+/// # use xml::namespace::NamespaceStack;
+///
+/// let mut nst = NamespaceStack::empty();
+/// nst.push_empty();
+/// nst.put("a", "urn:A");
+/// nst.put("b", "urn:B");
+/// nst.push_empty();
+/// nst.put("c", "urn:C");
+///
+/// nst.checked_target().extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]);
+/// assert_eq!(
+///     vec![("a", "urn:Z"), ("c", "urn:C"), ("d", "urn:D"), ("b", "urn:B")],
+///     nst.iter().collect::<Vec<_>>()
+/// );
+/// ```
+///
+/// Compare:
+///
+/// ```
+/// # use xml::namespace::NamespaceStack;
+/// # let mut nst = NamespaceStack::empty();
+/// # nst.push_empty();
+/// # nst.put("a", "urn:A");
+/// # nst.put("b", "urn:B");
+/// # nst.push_empty();
+/// # nst.put("c", "urn:C");
+///
+/// nst.extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:C"), ("d", "urn:D")]);
+/// assert_eq!(
+///     vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:C"), ("d", "urn:D")],
+///     nst.iter().collect::<Vec<_>>()
+/// );
+/// ```
+pub struct CheckedTarget<'a>(&'a mut NamespaceStack);
+
+impl<'a, 'b> Extend<UriMapping<'b>> for CheckedTarget<'a> {
+    /// Adds every `(prefix, uri)` pair to the wrapped stack with `put_checked()`.
+    fn extend<T>(&mut self, iterable: T) where T: IntoIterator<Item=UriMapping<'b>> {
+        iterable.into_iter().for_each(|(prefix, uri)| {
+            self.0.put_checked(prefix, uri);
+        });
+    }
+}
diff --git a/src/reader/config.rs b/src/reader/config.rs

new file mode 100644 (file)

index 0000000..0abb165
--- /dev/null
+++ b/src/reader/config.rs
@@ -0,0 +1,181 @@
+//! Contains parser configuration structure.
+use std::io::Read;
+use std::collections::HashMap;
+
+use reader::EventReader;
+
+/// Parser configuration structure.
+///
+/// This structure contains various configuration options which affect
+/// behavior of the parser.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct ParserConfig {
+    /// Whether or not whitespace in textual events should be removed. Default is false.
+    ///
+    /// When true, all standalone whitespace will be removed (this means no
+    /// `Whitespace` events will be emitted), and leading and trailing whitespace
+    /// from `Characters` events will be deleted. If, after trimming, a `Characters`
+    /// event is empty, it will also be omitted from the output stream. This is
+    /// possible, however, only if `whitespace_to_characters` or
+    /// `cdata_to_characters` options are set.
+    ///
+    /// This option does not affect CDATA events, unless `cdata_to_characters`
+    /// option is also set. In that case CDATA content will also be trimmed.
+    pub trim_whitespace: bool,
+
+    /// Whether or not whitespace should be converted to characters.
+    /// Default is false.
+    ///
+    /// If true, instead of `Whitespace` events `Characters` events with the
+    /// same content will be emitted. If `trim_whitespace` is also true, these
+    /// events will be trimmed to nothing and, consequently, not emitted.
+    pub whitespace_to_characters: bool,
+
+    /// Whether or not CDATA should be converted to characters.
+    /// Default is false.
+    ///
+    /// If true, instead of `CData` events `Characters` events with the same
+    /// content will be emitted. If `trim_whitespace` is also true, these events
+    /// will be trimmed. If corresponding CDATA contained nothing but whitespace,
+    /// this event will be omitted from the stream.
+    pub cdata_to_characters: bool,
+
+    /// Whether or not comments should be omitted. Default is true.
+    ///
+    /// If true, `Comment` events will not be emitted at all.
+    pub ignore_comments: bool,
+
+    /// Whether or not sequential `Characters` events should be merged.
+    /// Default is true.
+    ///
+    /// If true, multiple sequential `Characters` events will be merged into
+    /// a single event, that is, their data will be concatenated.
+    ///
+    /// Multiple sequential `Characters` events are only possible if either
+    /// `cdata_to_characters` or `ignore_comments` is set. Otherwise character
+    /// events will always be separated by other events.
+    pub coalesce_characters: bool,
+
+    /// A map of extra entities recognized by the parser. Default is an empty map.
+    ///
+    /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes,
+    /// however, it is convenient to make the parser recognize additional entities which
+    /// are also not available through the DTD definitions (especially given that at the moment
+    /// DTD parsing is not supported).
+    pub extra_entities: HashMap<String, String>,
+
+    /// Whether or not the parser should ignore the end of stream. Default is false.
+    ///
+    /// By default the parser will either error out when it encounters a premature end of
+    /// stream or complete normally if the end of stream was expected. If you want to continue
+    /// reading from a stream whose input is supplied progressively, you can set this option to true.
+    /// In this case the parser will allow you to invoke the next() method even if a supposed end
+    /// of stream has happened.
+    ///
+    /// Note that support for this functionality is incomplete; for example, the parser will fail if
+    /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk.
+    pub ignore_end_of_stream: bool,
+
+    /// Whether or not non-unicode entity references get replaced with the replacement character. Default is false.
+    ///
+    /// When true, any decimal or hexadecimal character reference that cannot be converted from a
+    /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html)
+    /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD).
+    pub replace_unknown_entity_references: bool,
+
+    /// Whether or not whitespace at the root level of the document is ignored. Default is true.
+    ///
+    /// By default any whitespace that is not enclosed within at least one level of elements will be
+    /// ignored. Setting this value to false will cause root level whitespace events to be emitted.
+    pub ignore_root_level_whitespace: bool,
+}
+
+impl ParserConfig {
+    /// Returns a new config with default values.
+    ///
+    /// You can tweak default values using a builder-like pattern:
+    ///
+    /// ```rust
+    /// use xml::reader::ParserConfig;
+    ///
+    /// let config = ParserConfig::new()
+    ///     .trim_whitespace(true)
+    ///     .ignore_comments(true)
+    ///     .coalesce_characters(false);
+    /// ```
+    pub fn new() -> ParserConfig {
+        ParserConfig {
+            trim_whitespace: false,
+            whitespace_to_characters: false,
+            cdata_to_characters: false,
+            ignore_comments: true,
+            coalesce_characters: true,
+            extra_entities: HashMap::new(),
+            ignore_end_of_stream: false,
+            replace_unknown_entity_references: false,
+            ignore_root_level_whitespace: true,
+        }
+    }
+
+    /// Creates an XML reader with this configuration.
+    ///
+    /// This is a convenience method for configuring and creating a reader at the same time:
+    ///
+    /// ```rust
+    /// use xml::reader::ParserConfig;
+    ///
+    /// let mut source: &[u8] = b"...";
+    ///
+    /// let reader = ParserConfig::new()
+    ///     .trim_whitespace(true)
+    ///     .ignore_comments(true)
+    ///     .coalesce_characters(false)
+    ///     .create_reader(&mut source);
+    /// ```
+    ///
+    /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
+    /// this configuration object.
+    #[inline]
+    pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
+        EventReader::new_with_config(source, self)
+    }
+
+    /// Adds a new entity mapping and returns an updated config object.
+    ///
+    /// This is a convenience method for adding entries to the `extra_entities` map of the XML parser.
+    /// An example:
+    ///
+    /// ```rust
+    /// use xml::reader::ParserConfig;
+    ///
+    /// let mut source: &[u8] = b"...";
+    ///
+    /// let reader = ParserConfig::new()
+    ///     .add_entity("nbsp", " ")
+    ///     .add_entity("copy", "©")
+    ///     .add_entity("reg", "®")
+    ///     .create_reader(&mut source);
+    /// ```
+    pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> ParserConfig {
+        self.extra_entities.insert(entity.into(), value.into());
+        self
+    }
+}
+
+impl Default for ParserConfig {
+    #[inline]
+    fn default() -> ParserConfig {
+        ParserConfig::new()
+    }
+}
+
+gen_setters! { ParserConfig,
+    trim_whitespace: val bool,
+    whitespace_to_characters: val bool,
+    cdata_to_characters: val bool,
+    ignore_comments: val bool,
+    coalesce_characters: val bool,
+    ignore_end_of_stream: val bool,
+    replace_unknown_entity_references: val bool,
+    ignore_root_level_whitespace: val bool
+}
diff --git a/src/reader/error.rs b/src/reader/error.rs

new file mode 100644 (file)

index 0000000..92378e6
--- /dev/null
+++ b/src/reader/error.rs
@@ -0,0 +1,121 @@
+
+use std::io;
+use std::borrow::Cow;
+use std::fmt;
+use std::error;
+use std::str;
+
+use util;
+use common::{Position, TextPosition};
+
+#[derive(Debug)]
+pub enum ErrorKind {
+    Syntax(Cow<'static, str>),  // syntax error; carries the message text returned by `Error::msg()`
+    Io(io::Error),              // wraps an underlying I/O error
+    Utf8(str::Utf8Error),       // wraps a UTF-8 decoding error
+    UnexpectedEof,              // reported by `Error::msg()` as "Unexpected EOF"
+}
+
+/// An XML parsing error.
+///
+/// Consists of a 2D position in a document and an `ErrorKind`; `msg()` gives its textual description.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct Error {
+    pos: TextPosition,
+    kind: ErrorKind,
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{} {}", self.pos, self.msg())  // position, a single space, then the message
+    }
+}
+
+impl Position for Error {
+    #[inline]
+    fn position(&self) -> TextPosition { self.pos }
+}
+
+impl Error {
+    /// Returns the textual message for this error, derived from its `ErrorKind`.
+    #[inline]
+    pub fn msg(&self) -> &str {
+        use self::ErrorKind::*;
+        match self.kind {
+            UnexpectedEof => &"Unexpected EOF",
+            Utf8(ref reason) => error_description(reason),    // text comes from `description()` of the wrapped error
+            Io(ref io_error) => error_description(io_error),  // likewise for I/O errors
+            Syntax(ref msg) => msg.as_ref(),                  // the stored message itself
+        }
+    }
+
+    pub fn kind(&self) -> &ErrorKind { &self.kind }  // borrows the kind of this error
+}
+
+impl error::Error for Error {
+    #[inline]
+    fn description(&self) -> &str { self.msg() }  // NOTE(review): `description()` is deprecated in current std; `Display` is the preferred message source
+}
+
+impl<'a, P, M> From<(&'a P, M)> for Error where P: Position, M: Into<Cow<'static, str>> {
+    fn from(orig: (&'a P, M)) -> Self {
+        Error{
+            pos: orig.0.position(),  // position is taken from the first tuple element
+            kind: ErrorKind::Syntax(orig.1.into())  // the message always becomes a `Syntax` error
+        }
+    }
+}
+
+impl From<util::CharReadError> for Error {
+    fn from(e: util::CharReadError) -> Self {
+        use util::CharReadError::*;
+        Error{
+            pos: TextPosition::new(),  // freshly constructed position, not derived from `e`
+            kind: match e {  // each read error maps to the like-named `ErrorKind` variant
+                UnexpectedEof => ErrorKind::UnexpectedEof,
+                Utf8(reason) => ErrorKind::Utf8(reason),
+                Io(io_error) => ErrorKind::Io(io_error),
+            }
+        }
+    }
+}
+
+impl From<io::Error> for Error {
+    fn from(e: io::Error) -> Self {
+        Error {
+            pos: TextPosition::new(),  // freshly constructed position, not derived from `e`
+            kind: ErrorKind::Io(e),
+        }
+    }
+}
+
+impl Clone for ErrorKind {  // hand-written: the `Io` payload is rebuilt rather than cloned (see the `Io` arm)
+    fn clone(&self) -> Self {
+        use self::ErrorKind::*;
+        match *self {
+            UnexpectedEof => UnexpectedEof,
+            Utf8(ref reason) => Utf8(reason.clone()),
+            Io(ref io_error) => Io(io::Error::new(io_error.kind(), error_description(io_error))),  // new io::Error from kind + description text
+            Syntax(ref msg) => Syntax(msg.clone()),
+        }
+    }
+}
+impl PartialEq for ErrorKind {
+    fn eq(&self, other: &ErrorKind) -> bool {
+        use self::ErrorKind::*;
+        match (self, other) {
+            (&UnexpectedEof, &UnexpectedEof) => true,
+            (&Utf8(ref left), &Utf8(ref right)) => left == right,
+            (&Io(ref left), &Io(ref right)) =>  // I/O errors compare by kind and description text
+                left.kind() == right.kind() &&
+                error_description(left) == error_description(right),
+            (&Syntax(ref left), &Syntax(ref right)) =>
+                left == right,
+
+            (_, _) => false,  // different variants are never equal
+        }
+    }
+}
+impl Eq for ErrorKind {}
+
+fn error_description(e: &error::Error) -> &str { e.description() }  // NOTE(review): bare trait object (pre-`dyn` syntax) calling the deprecated `description()`
diff --git a/src/reader/events.rs b/src/reader/events.rs

new file mode 100644 (file)

index 0000000..46d7621
--- /dev/null
+++ b/src/reader/events.rs
@@ -0,0 +1,219 @@
+//! Contains `XmlEvent` datatype, instances of which are emitted by the parser.
+
+use std::fmt;
+use std::borrow::Cow;
+
+use name::OwnedName;
+use attribute::OwnedAttribute;
+use common::XmlVersion;
+use namespace::Namespace;
+
+/// An element of an XML input stream.
+///
+/// Items of this enum are emitted by `reader::EventReader`. They correspond to different
+/// elements of an XML document.
+#[derive(PartialEq, Clone)]
+pub enum XmlEvent {
+    /// Corresponds to XML document declaration.
+    ///
+    /// This event is always emitted before any other event. It is emitted
+    /// even if the actual declaration is not present in the document.
+    StartDocument {
+        /// XML version.
+        ///
+        /// If XML declaration is not present, defaults to `Version10`.
+        version: XmlVersion,
+
+        /// XML document encoding.
+        ///
+        /// If XML declaration is not present or does not contain `encoding` attribute,
+        /// defaults to `"UTF-8"`. This field is currently used for no other purpose than
+        /// informational.
+        encoding: String,
+
+        /// XML standalone declaration.
+        ///
+        /// If XML declaration is not present or does not contain `standalone` attribute,
+        /// defaults to `None`. This field is currently used for no other purpose than
+        /// informational.
+        standalone: Option<bool>
+    },
+
+    /// Denotes the end of the document stream.
+    ///
+    /// This event is always emitted after any other event (except `Error`). After it
+    /// is emitted for the first time, it will always be emitted on next event pull attempts.
+    EndDocument,
+
+    /// Denotes an XML processing instruction.
+    ///
+    /// This event contains a processing instruction target (`name`) and opaque `data`. It
+    /// is up to the application to process them.
+    ProcessingInstruction {
+        /// Processing instruction target.
+        name: String,
+
+        /// Processing instruction content.
+        data: Option<String>
+    },
+
+    /// Denotes a beginning of an XML element.
+    ///
+    /// This event is emitted after parsing opening tags or after parsing bodiless tags. In the
+    /// latter case `EndElement` event immediately follows.
+    StartElement {
+        /// Qualified name of the element.
+        name: OwnedName,
+
+        /// A list of attributes associated with the element.
+        ///
+        /// Currently attributes are not checked for duplicates (TODO)
+        attributes: Vec<OwnedAttribute>,
+
+        /// Contents of the namespace mapping at this point of the document.
+        namespace: Namespace,
+    },
+
+    /// Denotes an end of an XML element.
+    ///
+    /// This event is emitted after parsing closing tags or after parsing bodiless tags. In the
+    /// latter case it is emitted immediately after corresponding `StartElement` event.
+    EndElement {
+        /// Qualified name of the element.
+        name: OwnedName
+    },
+
+    /// Denotes CDATA content.
+    ///
+    /// This event contains unparsed data. No unescaping will be performed.
+    ///
+    /// It is possible to configure a parser to emit `Characters` event instead of `CData`. See
+    /// `reader::ParserConfig` structure for more information.
+    CData(String),
+
+    /// Denotes a comment.
+    ///
+    /// It is possible to configure a parser to ignore comments, so this event will never be emitted.
+    /// See `reader::ParserConfig` structure for more information.
+    Comment(String),
+
+    /// Denotes character data outside of tags.
+    ///
+    /// Contents of this event will always be unescaped, so no entities like `&lt;` or `&amp;` or `&#123;`
+    /// will appear in it.
+    ///
+    /// It is possible to configure a parser to trim leading and trailing whitespace for this event.
+    /// See `reader::ParserConfig` structure for more information.
+    Characters(String),
+
+    /// Denotes a chunk of whitespace outside of tags.
+    ///
+    /// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`.
+    /// See `reader::ParserConfig` structure for more information. When combined with whitespace
+    /// trimming, it will eliminate standalone whitespace from the event stream completely.
+    Whitespace(String)
+}
+
+impl fmt::Debug for XmlEvent {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            XmlEvent::StartDocument { ref version, ref encoding, ref standalone } =>
+                write!(f, "StartDocument({}, {}, {:?})", version, *encoding, *standalone),
+            XmlEvent::EndDocument =>
+                write!(f, "EndDocument"),
+            XmlEvent::ProcessingInstruction { ref name, ref data } =>
+                write!(f, "ProcessingInstruction({}{})", *name, match *data {
+                    Some(ref data) => format!(", {}", data),
+                    None       => String::new()  // no data: nothing is appended after the name
+                }),
+            XmlEvent::StartElement { ref name, ref attributes, namespace: Namespace(ref namespace) } =>
+                write!(f, "StartElement({}, {:?}{})", name, namespace, if attributes.is_empty() {
+                    String::new()  // no attributes: the bracketed list is omitted
+                } else {
+                    let attributes: Vec<String> = attributes.iter().map(
+                        |a| format!("{} -> {}", a.name, a.value)
+                    ).collect();
+                    format!(", [{}]", attributes.join(", "))
+                }),
+            XmlEvent::EndElement { ref name } =>
+                write!(f, "EndElement({})", name),
+            XmlEvent::Comment(ref data) =>
+                write!(f, "Comment({})", data),
+            XmlEvent::CData(ref data) =>
+                write!(f, "CData({})", data),
+            XmlEvent::Characters(ref data) =>
+                write!(f, "Characters({})", data),
+            XmlEvent::Whitespace(ref data) =>
+                write!(f, "Whitespace({})", data)
+        }
+    }
+}
+
+impl XmlEvent {
+    /// Obtains a writer event from this reader event.
+    ///
+    /// This method is useful for streaming processing of XML documents where the output
+    /// is also an XML document. With this method it is possible to process some events
+    /// while passing other events through to the writer unchanged:
+    ///
+    /// ```rust
+    /// use std::str;
+    ///
+    /// use xml::{EventReader, EventWriter};
+    /// use xml::reader::XmlEvent as ReaderEvent;
+    /// use xml::writer::XmlEvent as WriterEvent;
+    ///
+    /// let mut input: &[u8] = b"<hello>world</hello>";
+    /// let mut output: Vec<u8> = Vec::new();
+    ///
+    /// {
+    ///     let mut reader = EventReader::new(&mut input);
+    ///     let mut writer = EventWriter::new(&mut output);
+    ///
+    ///     for e in reader {
+    ///         match e.unwrap() {
+    ///             ReaderEvent::Characters(s) =>
+    ///                 writer.write(WriterEvent::characters(&s.to_uppercase())).unwrap(),
+    ///             e => if let Some(e) = e.as_writer_event() {
+    ///                 writer.write(e).unwrap()
+    ///             }
+    ///         }
+    ///     }
+    /// }
+    ///
+    /// assert_eq!(
+    ///     str::from_utf8(&output).unwrap(),
+    ///     r#"<?xml version="1.0" encoding="UTF-8"?><hello>WORLD</hello>"#
+    /// );
+    /// ```
+    ///
+    /// Note that this API may change or get additions in future to improve its ergonomics.
+    pub fn as_writer_event<'a>(&'a self) -> Option<::writer::events::XmlEvent<'a>> {
+        match *self {
+            XmlEvent::StartDocument { version, ref encoding, standalone } =>
+                Some(::writer::events::XmlEvent::StartDocument {
+                    version: version,
+                    encoding: Some(encoding),
+                    standalone: standalone
+                }),
+            XmlEvent::ProcessingInstruction { ref name, ref data } =>
+                Some(::writer::events::XmlEvent::ProcessingInstruction {
+                    name: name,
+                    data: data.as_ref().map(|s| &s[..])
+                }),
+            XmlEvent::StartElement { ref name, ref attributes, ref namespace } =>
+                Some(::writer::events::XmlEvent::StartElement {
+                    name: name.borrow(),
+                    attributes: attributes.iter().map(|a| a.borrow()).collect(),
+                    namespace: Cow::Borrowed(namespace)
+                }),
+            XmlEvent::EndElement { ref name } =>
+                Some(::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }),
+            XmlEvent::Comment(ref data) => Some(::writer::events::XmlEvent::Comment(data)),
+            XmlEvent::CData(ref data) => Some(::writer::events::XmlEvent::CData(data)),
+            XmlEvent::Characters(ref data) => Some(::writer::events::XmlEvent::Characters(data)),
+            XmlEvent::Whitespace(ref data) => Some(::writer::events::XmlEvent::Characters(data)),  // whitespace is passed on as a writer `Characters` event
+            _ => None  // only `EndDocument` reaches this arm
+        }
+    }
+}
diff --git a/src/reader/lexer.rs b/src/reader/lexer.rs

new file mode 100644 (file)

index 0000000..c466db9
--- /dev/null
+++ b/src/reader/lexer.rs
@@ -0,0 +1,867 @@
+//! Contains simple lexer for XML documents.
+//!
+//! This module is for internal use. Use the `xml::reader` module to do parsing.
+
+use std::fmt;
+use std::collections::VecDeque;
+use std::io::Read;
+use std::result;
+use std::borrow::Cow;
+
+use common::{Position, TextPosition, is_whitespace_char, is_name_char};
+use reader::Error;
+use util;
+
+/// `Token` represents a single lexeme of an XML document. These lexemes
+/// are used to perform actual parsing.
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+pub enum Token {
+    /// `<?`
+    ProcessingInstructionStart,
+    /// `?>`
+    ProcessingInstructionEnd,
+    /// `<!DOCTYPE`
+    DoctypeStart,
+    /// `<`
+    OpeningTagStart,
+    /// `</`
+    ClosingTagStart,
+    /// `>`
+    TagEnd,
+    /// `/>`
+    EmptyTagEnd,
+    /// `<!--`
+    CommentStart,
+    /// `-->`
+    CommentEnd,
+    /// A chunk of characters, used for error recovery.
+    Chunk(&'static str),
+    /// Any non-special character except whitespace.
+    Character(char),
+    /// Whitespace character.
+    Whitespace(char),
+    /// `=`
+    EqualsSign,
+    /// `'`
+    SingleQuote,
+    /// `"`
+    DoubleQuote,
+    /// `<![CDATA[`
+    CDataStart,
+    /// `]]>`
+    CDataEnd,
+    /// `&`
+    ReferenceStart,
+    /// `;`
+    ReferenceEnd,
+}
+
+impl fmt::Display for Token {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            Token::Chunk(s)                            => write!(f, "{}", s),
+            Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c),
+            other => write!(f, "{}", match other {
+                Token::OpeningTagStart            => "<",
+                Token::ProcessingInstructionStart => "<?",
+                Token::DoctypeStart               => "<!DOCTYPE",
+                Token::ClosingTagStart            => "</",
+                Token::CommentStart               => "<!--",
+                Token::CDataStart                 => "<![CDATA[",
+                Token::TagEnd                     => ">",
+                Token::EmptyTagEnd                => "/>",
+                Token::ProcessingInstructionEnd   => "?>",
+                Token::CommentEnd                 => "-->",
+                Token::CDataEnd                   => "]]>",
+                Token::ReferenceStart             => "&",
+                Token::ReferenceEnd               => ";",
+                Token::EqualsSign                 => "=",
+                Token::SingleQuote                => "'",
+                Token::DoubleQuote                => "\"",
+                _                          => unreachable!()  // Chunk/Character/Whitespace were handled by the outer match
+            })
+        }
+    }
+}
+
+impl Token {
+    pub fn as_static_str(&self) -> Option<&'static str> {  // `None` for Character/Whitespace, which carry a runtime char
+        match *self {
+            Token::OpeningTagStart            => Some("<"),
+            Token::ProcessingInstructionStart => Some("<?"),
+            Token::DoctypeStart               => Some("<!DOCTYPE"),
+            Token::ClosingTagStart            => Some("</"),
+            Token::CommentStart               => Some("<!--"),
+            Token::CDataStart                 => Some("<![CDATA["),
+            Token::TagEnd                     => Some(">"),
+            Token::EmptyTagEnd                => Some("/>"),
+            Token::ProcessingInstructionEnd   => Some("?>"),
+            Token::CommentEnd                 => Some("-->"),
+            Token::CDataEnd                   => Some("]]>"),
+            Token::ReferenceStart             => Some("&"),
+            Token::ReferenceEnd               => Some(";"),
+            Token::EqualsSign                 => Some("="),
+            Token::SingleQuote                => Some("'"),
+            Token::DoubleQuote                => Some("\""),
+            Token::Chunk(s)                   => Some(s),
+            _                                 => None
+        }
+    }
+
+    // using String.push_str(token.to_string()) is simply way too slow
+    pub fn push_to_string(&self, target: &mut String) {
+        match self.as_static_str() {
+            Some(s) => { target.push_str(s); }
+            None => {
+                match *self {
+                    Token::Character(c) | Token::Whitespace(c) => target.push(c),
+                    _ => unreachable!()  // as_static_str() returns None only for Character/Whitespace
+                }
+            }
+        }
+    }
+
+    /// Returns `true` if this token contains data that can be interpreted
+    /// as a part of the text. Surprisingly, this also means '>', '=', '"', "'", '-->', ']]>', '?>' and '/>'.
+    #[inline]
+    pub fn contains_char_data(&self) -> bool {
+        match *self {
+            Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd |
+            Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd | 
+            Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true,
+            _ => false
+        }
+    }
+
+    /// Returns `true` if this token corresponds to a white space character.
+    #[inline]
+    pub fn is_whitespace(&self) -> bool {
+        match *self {
+            Token::Whitespace(_) => true,
+            _ => false
+        }
+    }
+}
+
+enum State {
+    /// Triggered on '<'
+    TagStarted,
+    /// Triggered on '<!'
+    CommentOrCDataOrDoctypeStarted,
+    /// Triggered on '<!-'
+    CommentStarted,
+    /// Triggered on '<!D' up to '<!DOCTYPE'
+    DoctypeStarted(DoctypeStartedSubstate),
+    /// Triggered after DoctypeStarted to handle sub elements
+    DoctypeFinishing(u8),
+    /// Triggered on '<![' up to '<![CDATA'
+    CDataStarted(CDataStartedSubstate),
+    /// Triggered on '?'
+    ProcessingInstructionClosing,
+    /// Triggered on '/'
+    EmptyTagClosing,
+    /// Triggered on '-' up to '--'
+    CommentClosing(ClosingSubstate),
+    /// Triggered on ']' up to ']]'
+    CDataClosing(ClosingSubstate),
+    /// Default state
+    Normal
+}
+
+#[derive(Copy, Clone)]
+enum ClosingSubstate {
+    First, Second
+}
+
+#[derive(Copy, Clone)]
+enum DoctypeStartedSubstate {
+    D, DO, DOC, DOCT, DOCTY, DOCTYP
+}
+
+#[derive(Copy, Clone)]
+enum CDataStartedSubstate {
+    E, C, CD, CDA, CDAT, CDATA
+}
+
+/// `Result` represents lexing result. It is either an optional token (`Ok(None)` means no token was produced) or an error.
+pub type Result = result::Result<Option<Token>, Error>;
+
+/// Helps to set up a dispatch table for lexing large unambiguous tokens like
+/// `<![CDATA[` or `<!DOCTYPE `.
+macro_rules! dispatch_on_enum_state(
+    ($_self:ident, $s:expr, $c:expr, $is:expr,
+     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
+     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
+        match $s {
+            $(
+            $st => match $c {
+                $stc => $_self.move_to($is($next_st)),
+                _  => $_self.handle_error($chunk, $c)
+            },
+            )+
+            $end_st => match $c {
+                $end_c => $e,
+                _      => $_self.handle_error($end_chunk, $c)
+            }
+        }
+    )
+);
+
+/// `Lexer` is a lexer for XML documents, which implements pull API.
+///
+/// Main method is `next_token` which accepts an `std::io::Read` instance and
+/// tries to read the next lexeme from it.
+///
+/// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s.
+/// When it is not set, errors will be reported as `Err` objects with a string message.
+/// By default this flag is not set. Use `enable_errors` and `disable_errors` methods
+/// to toggle the behavior.
+pub struct Lexer {
+    pos: TextPosition,            // value returned by `position()`; copied from `head_pos` when a token starts
+    head_pos: TextPosition,       // advanced per processed char in `read_next_token()`
+    char_queue: VecDeque<char>,   // saved chars that `next_token()` drains before reading more input
+    st: State,                    // current state, dispatched on in `dispatch_char()`
+    skip_errors: bool,            // toggled by `enable_errors()` / `disable_errors()`
+    inside_comment: bool,         // toggled by `inside_comment()` / `outside_comment()`
+    inside_token: bool,           // set when a token starts, cleared when one is returned
+    eof_handled: bool             // once set, `next_token()` returns `Ok(None)` immediately
+}
+
+impl Position for Lexer {
+    #[inline]
+    /// Returns the position of the last token produced by the lexer
+    fn position(&self) -> TextPosition { self.pos }
+}
+
+impl Lexer {
+    /// Returns a new lexer with default state.
+    pub fn new() -> Lexer {
+        Lexer {
+            pos: TextPosition::new(),
+            head_pos: TextPosition::new(),
+            char_queue: VecDeque::with_capacity(4),  // TODO: check size
+            st: State::Normal,
+            skip_errors: false,
+            inside_comment: false,
+            inside_token: false,
+            eof_handled: false
+        }
+    }
+
+    /// Enables error handling so `next_token` will return `Err(..)`
+    /// upon invalid lexeme.
+    #[inline]
+    pub fn enable_errors(&mut self) { self.skip_errors = false; }
+
+    /// Disables error handling so `next_token` will return `Ok(Some(Token::Chunk(..)))`
+    /// upon invalid lexeme with this lexeme content.
+    #[inline]
+    pub fn disable_errors(&mut self) { self.skip_errors = true; }
+
+    /// Enables special handling of some lexemes which should be done when we're parsing comment
+    /// internals.
+    #[inline]
+    pub fn inside_comment(&mut self) { self.inside_comment = true; }
+
+    /// Disables the effect of `inside_comment()` method.
+    #[inline]
+    pub fn outside_comment(&mut self) { self.inside_comment = false; }
+
+    /// Reset the eof handled flag of the lexer.
+    #[inline]
+    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }
+
+    /// Tries to read the next token from the buffer.
+    ///
+    /// It is possible to pass different instances of the `Read` source each time
+    /// this method is called, but the resulting behavior is undefined in this case.
+    ///
+    /// Return value:
+    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
+    /// * `Ok(None)` - when end of stream is reached;
+    /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream.
+    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
+        // Already reached end of buffer
+        if self.eof_handled {
+            return Ok(None);
+        }
+
+        if !self.inside_token {
+            self.pos = self.head_pos;
+            self.inside_token = true;
+        }
+
+        // Check if we have saved a char or two for ourselves
+        while let Some(c) = self.char_queue.pop_front() {
+            match try!(self.read_next_token(c)) {
+                Some(t) => {
+                    self.inside_token = false;
+                    return Ok(Some(t));
+                }
+                None => {}  // continue
+            }
+        }
+
+        loop {
+            // TODO: this should handle multiple encodings
+            let c = match try!(util::next_char_from(b)) {
+                Some(c) => c,   // got next char
+                None => break,  // nothing to read left
+            };
+
+            match try!(self.read_next_token(c)) {
+                Some(t) => {
+                    self.inside_token = false;
+                    return Ok(Some(t));
+                }
+                None => {
+                    // continue
+                }
+            }
+        }
+
+        // Handle end of stream
+        self.eof_handled = true;  // later calls return Ok(None) until `reset_eof_handled()` is called
+        self.pos = self.head_pos;
+        match self.st {  // a partially recognized lexeme is either an error or flushed as plain characters
+            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
+            State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
+            State::CommentClosing(ClosingSubstate::Second) |
+            State::DoctypeFinishing(_) =>
+                Err(self.error("Unexpected end of stream")),
+            State::ProcessingInstructionClosing =>
+                Ok(Some(Token::Character('?'))),
+            State::EmptyTagClosing =>
+                Ok(Some(Token::Character('/'))),
+            State::CommentClosing(ClosingSubstate::First) =>
+                Ok(Some(Token::Character('-'))),
+            State::CDataClosing(ClosingSubstate::First) =>
+                Ok(Some(Token::Character(']'))),
+            State::CDataClosing(ClosingSubstate::Second) =>
+                Ok(Some(Token::Chunk("]]"))),
+            State::Normal =>
+                Ok(None)
+        }
+    }
+
+    /// Builds an `Error` carrying `msg` by converting the `(self, msg)` pair with `Into`.
+    ///
+    /// NOTE(review): the conversion itself is defined outside this block; presumably it
+    /// records the lexer's current position next to the message - confirm in the error module.
+    #[inline]
+    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error {
+        (self, msg).into()
+    }
+
+    #[inline]
+    fn read_next_token(&mut self, c: char) -> Result {
+        let res = self.dispatch_char(c);
+        if self.char_queue.is_empty() {
+            if c == '\n' {
+                self.head_pos.new_line();
+            } else {
+                self.head_pos.advance(1);
+            }
+        }
+        res
+    }
+
+    fn dispatch_char(&mut self, c: char) -> Result {
+        match self.st {
+            State::Normal                         => self.normal(c),
+            State::TagStarted                     => self.tag_opened(c),
+            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
+            State::CommentStarted                 => self.comment_started(c),
+            State::CDataStarted(s)                => self.cdata_started(c, s),
+            State::DoctypeStarted(s)              => self.doctype_started(c, s),
+            State::DoctypeFinishing(d)            => self.doctype_finishing(c, d),
+            State::ProcessingInstructionClosing   => self.processing_instruction_closing(c),
+            State::EmptyTagClosing                => self.empty_element_closing(c),
+            State::CommentClosing(s)              => self.comment_closing(c, s),
+            State::CDataClosing(s)                => self.cdata_closing(c, s)
+        }
+    }
+
+    /// Switches the lexer to state `st` without producing a token (returns `Ok(None)`).
+    #[inline]
+    fn move_to(&mut self, st: State) -> Result {
+        self.st = st;
+        Ok(None)
+    }
+
+    /// Switches the lexer to state `st` and returns `token` as the completed token.
+    #[inline]
+    fn move_to_with(&mut self, st: State, token: Token) -> Result {
+        self.st = st;
+        Ok(Some(token))
+    }
+
+    #[inline]
+    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
+        self.char_queue.extend(cs.iter().cloned());
+        self.move_to_with(st, token)
+    }
+
+    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
+        self.char_queue.push_back(c);
+        if self.skip_errors || (self.inside_comment && chunk != "--") {  // FIXME: looks hacky
+            self.move_to_with(State::Normal, Token::Chunk(chunk))
+        } else {
+            Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c)))
+        }
+    }
+
+    /// Encountered a char
+    fn normal(&mut self, c: char) -> Result {
+        match c {
+            '<'                        => self.move_to(State::TagStarted),
+            '>'                        => Ok(Some(Token::TagEnd)),
+            '/'                        => self.move_to(State::EmptyTagClosing),
+            '='                        => Ok(Some(Token::EqualsSign)),
+            '"'                        => Ok(Some(Token::DoubleQuote)),
+            '\''                       => Ok(Some(Token::SingleQuote)),
+            '?'                        => self.move_to(State::ProcessingInstructionClosing),
+            '-'                        => self.move_to(State::CommentClosing(ClosingSubstate::First)),
+            ']'                        => self.move_to(State::CDataClosing(ClosingSubstate::First)),
+            '&'                        => Ok(Some(Token::ReferenceStart)),
+            ';'                        => Ok(Some(Token::ReferenceEnd)),
+            _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))),
+            _                          => Ok(Some(Token::Character(c)))
+        }
+    }
+
+    /// Encountered '<'
+    fn tag_opened(&mut self, c: char) -> Result {
+        match c {
+            '?'                        => self.move_to_with(State::Normal, Token::ProcessingInstructionStart),
+            '/'                        => self.move_to_with(State::Normal, Token::ClosingTagStart),
+            '!'                        => self.move_to(State::CommentOrCDataOrDoctypeStarted),
+            _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
+            _ if is_name_char(c)       => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
+            _                          => self.handle_error("<", c)
+        }
+    }
+
+    /// Encountered '<!'
+    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
+        match c {
+            '-' => self.move_to(State::CommentStarted),
+            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
+            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
+            _   => self.handle_error("<!", c)
+        }
+    }
+
+    /// Encountered '<!-'
+    fn comment_started(&mut self, c: char) -> Result {
+        match c {
+            '-' => self.move_to_with(State::Normal, Token::CommentStart),
+            _   => self.handle_error("<!-", c)
+        }
+    }
+
+    /// Encountered '<![' - matches the rest of the `<![CDATA[` opening one character at a
+    /// time, with `s` recording how much of it has been seen so far.
+    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
+        use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA};
+        // NOTE(review): `dispatch_on_enum_state!` is defined elsewhere. Each row appears to
+        // read `substate ; expected char ; next substate ; chunk reported on mismatch`, and
+        // the final row runs the given expression once the whole opening has matched.
+        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
+            E     ; 'C' ; C     ; "<![",
+            C     ; 'D' ; CD    ; "<![C",
+            CD    ; 'A' ; CDA   ; "<![CD",
+            CDA   ; 'T' ; CDAT  ; "<![CDA",
+            CDAT  ; 'A' ; CDATA ; "<![CDAT";
+            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart)
+        )
+    }
+
+    /// Encountered '<!D' - matches the rest of the `<!DOCTYPE` keyword one character at a
+    /// time, with `s` recording how much of it has been seen so far.
+    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
+        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
+        // NOTE(review): `dispatch_on_enum_state!` is defined elsewhere. Each row appears to
+        // read `substate ; expected char ; next substate ; chunk reported on mismatch`. The
+        // final row emits `DoctypeStart` and starts counting angle brackets at depth 1.
+        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
+            D      ; 'O' ; DO     ; "<!D",
+            DO     ; 'C' ; DOC    ; "<!DO",
+            DOC    ; 'T' ; DOCT   ; "<!DOC",
+            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
+            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
+            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart)
+        )
+    }
+
+    /// State used while awaiting the closing bracket for the <!DOCTYPE tag
+    fn doctype_finishing(&mut self, c: char, d: u8) -> Result {
+        match c {
+            '<' => self.move_to(State::DoctypeFinishing(d + 1)),
+            '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd),
+            '>' => self.move_to(State::DoctypeFinishing(d - 1)),
+            _ => Ok(None),
+        }
+    }
+
+    /// Encountered '?'
+    fn processing_instruction_closing(&mut self, c: char) -> Result {
+        match c {
+            '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
+            _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')),
+        }
+    }
+
+    /// Encountered '/'
+    fn empty_element_closing(&mut self, c: char) -> Result {
+        match c {
+            '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
+            _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
+        }
+    }
+
+    /// Encountered '-'
+    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
+        match s {
+            ClosingSubstate::First => match c {
+                '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
+                _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('-'))
+            },
+            ClosingSubstate::Second => match c {
+                '>'                      => self.move_to_with(State::Normal, Token::CommentEnd),
+                // double dash not followed by a greater-than is a hard error inside comment
+                _ if self.inside_comment => self.handle_error("--", c),
+                // nothing else except comment closing starts with a double dash, and comment
+                // closing can never be after another dash, and also we're outside of a comment,
+                // therefore it is safe to push only the last read character to the list of unread
+                // characters and pass the double dash directly to the output
+                _                        => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--"))
+            }
+        }
+    }
+
+    /// Encountered ']'
+    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
+        match s {
+            ClosingSubstate::First => match c {
+                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
+                _   => self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))
+            },
+            ClosingSubstate::Second => match c {
+                '>' => self.move_to_with(State::Normal, Token::CDataEnd),
+                _   => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use common::{Position};
+    use std::io::{BufReader, Cursor};
+
+    use super::{Lexer, Token};
+
+    // Asserts that the next tokens produced by `$lex` reading from `$buf` are exactly the
+    // listed expressions, in order, each wrapped in `Ok(Some(..))`.
+    macro_rules! assert_oks(
+        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
+            $(
+                assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
+             )+
+        })
+    );
+
+    // Asserts that the next `next_token` call fails and checks the error's row, column
+    // and message. Argument order: `expect row <row> ; <column>, <message>`.
+    macro_rules! assert_err(
+        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
+            let err = $lex.next_token(&mut $buf);
+            assert!(err.is_err());
+            let err = err.unwrap_err();
+            assert_eq!($r as u64, err.position().row);
+            assert_eq!($c as u64, err.position().column);
+            assert_eq!($s, err.msg());
+        })
+    );
+
+    // Asserts that the lexer reports the end of the stream (`Ok(None)`) on the next call.
+    macro_rules! assert_none(
+        (for $lex:ident and $buf:ident) => (
+            assert_eq!(Ok(None), $lex.next_token(&mut $buf));
+        )
+    );
+
+    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
+        (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
+    }
+
+    #[test]
+    fn simple_lexer_test() {
+        let (mut lex, mut buf) = make_lex_and_buf(
+            r#"<a p='q'> x<b z="y">d   </b></a><p/> <?nm ?> <!-- a c --> &nbsp;"#
+        );
+
+        assert_oks!(for lex and buf ;
+            Token::OpeningTagStart
+            Token::Character('a')
+            Token::Whitespace(' ')
+            Token::Character('p')
+            Token::EqualsSign
+            Token::SingleQuote
+            Token::Character('q')
+            Token::SingleQuote
+            Token::TagEnd
+            Token::Whitespace(' ')
+            Token::Character('x')
+            Token::OpeningTagStart
+            Token::Character('b')
+            Token::Whitespace(' ')
+            Token::Character('z')
+            Token::EqualsSign
+            Token::DoubleQuote
+            Token::Character('y')
+            Token::DoubleQuote
+            Token::TagEnd
+            Token::Character('d')
+            Token::Whitespace('\t')
+            Token::ClosingTagStart
+            Token::Character('b')
+            Token::TagEnd
+            Token::ClosingTagStart
+            Token::Character('a')
+            Token::TagEnd
+            Token::OpeningTagStart
+            Token::Character('p')
+            Token::EmptyTagEnd
+            Token::Whitespace(' ')
+            Token::ProcessingInstructionStart
+            Token::Character('n')
+            Token::Character('m')
+            Token::Whitespace(' ')
+            Token::ProcessingInstructionEnd
+            Token::Whitespace(' ')
+            Token::CommentStart
+            Token::Whitespace(' ')
+            Token::Character('a')
+            Token::Whitespace(' ')
+            Token::Character('c')
+            Token::Whitespace(' ')
+            Token::CommentEnd
+            Token::Whitespace(' ')
+            Token::ReferenceStart
+            Token::Character('n')
+            Token::Character('b')
+            Token::Character('s')
+            Token::Character('p')
+            Token::ReferenceEnd
+        );
+        assert_none!(for lex and buf);
+    }
+
+    // Characters that could begin a longer token (`?`, `/`, `-`, `]`) come out as plain
+    // characters when the sequence is not completed; a `]]` at the very end of the input
+    // is emitted as one chunk.
+    #[test]
+    fn special_chars_test() {
+        let (mut lex, mut buf) = make_lex_and_buf(
+            r#"?x!+ // -| ]z]]"#
+        );
+
+        assert_oks!(for lex and buf ;
+            Token::Character('?')
+            Token::Character('x')
+            Token::Character('!')
+            Token::Character('+')
+            Token::Whitespace(' ')
+            Token::Character('/')
+            Token::Character('/')
+            Token::Whitespace(' ')
+            Token::Character('-')
+            Token::Character('|')
+            Token::Whitespace(' ')
+            Token::Character(']')
+            Token::Character('z')
+            Token::Chunk("]]")
+        );
+        assert_none!(for lex and buf);
+    }
+
+    // A CDATA section is delimited by `CDataStart`/`CDataEnd`; its content is still
+    // delivered character by character.
+    #[test]
+    fn cdata_test() {
+        let (mut lex, mut buf) = make_lex_and_buf(
+            r#"<a><![CDATA[x y ?]]> </a>"#
+        );
+
+        assert_oks!(for lex and buf ;
+            Token::OpeningTagStart
+            Token::Character('a')
+            Token::TagEnd
+            Token::CDataStart
+            Token::Character('x')
+            Token::Whitespace(' ')
+            Token::Character('y')
+            Token::Whitespace(' ')
+            Token::Character('?')
+            Token::CDataEnd
+            Token::Whitespace(' ')
+            Token::ClosingTagStart
+            Token::Character('a')
+            Token::TagEnd
+        );
+        assert_none!(for lex and buf);
+    }
+
+    // Everything between `<!DOCTYPE` and its closing `>` is skipped: only `DoctypeStart`
+    // and `TagEnd` are produced for the declaration.
+    #[test]
+    fn doctype_test() {
+        let (mut lex, mut buf) = make_lex_and_buf(
+            r#"<a><!DOCTYPE ab xx z> "#
+        );
+        assert_oks!(for lex and buf ;
+            Token::OpeningTagStart
+            Token::Character('a')
+            Token::TagEnd
+            Token::DoctypeStart
+            Token::TagEnd
+            Token::Whitespace(' ')
+        );
+        assert_none!(for lex and buf)
+    }
+
+    // Nested angle brackets inside the DOCTYPE (here an `<!ELEMENT ...>` declaration) are
+    // skipped as well; the DOCTYPE ends only at the matching outer `>`.
+    #[test]
+    fn doctype_with_internal_subset_test() {
+        let (mut lex, mut buf) = make_lex_and_buf(
+            r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "#
+        );
+        assert_oks!(for lex and buf ;
+            Token::OpeningTagStart
+            Token::Character('a')
+            Token::TagEnd
+            Token::DoctypeStart
+            Token::TagEnd
+            Token::Whitespace(' ')
+        );
+        assert_none!(for lex and buf)
+    }
+
+    // Inputs that end in the middle of a *closing* sequence are not errors: the pending
+    // characters are flushed as a single token, followed by the end of the stream.
+    #[test]
+    fn end_of_stream_handling_ok() {
+        // `eof_check!(input ; token)`: lexing `input` yields exactly `token`, then `None`.
+        macro_rules! eof_check(
+            ($data:expr ; $token:expr) => ({
+                let (mut lex, mut buf) = make_lex_and_buf($data);
+                assert_oks!(for lex and buf ; $token);
+                assert_none!(for lex and buf);
+            })
+        );
+        eof_check!("?"  ; Token::Character('?'));
+        eof_check!("/"  ; Token::Character('/'));
+        eof_check!("-"  ; Token::Character('-'));
+        eof_check!("]"  ; Token::Character(']'));
+        eof_check!("]]" ; Token::Chunk("]]"));
+    }
+
+    // Inputs that end in the middle of an *opening* sequence (or right after `--`) fail
+    // with "Unexpected end of stream"; afterwards the lexer keeps returning `None`.
+    #[test]
+    fn end_of_stream_handling_error() {
+        // `eof_check!(input ; row, column)`: position expected in the reported error.
+        macro_rules! eof_check(
+            ($data:expr; $r:expr, $c:expr) => ({
+                let (mut lex, mut buf) = make_lex_and_buf($data);
+                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
+                assert_none!(for lex and buf);
+            })
+        );
+        eof_check!("<"        ; 0, 1);
+        eof_check!("<!"       ; 0, 2);
+        eof_check!("<!-"      ; 0, 3);
+        eof_check!("<!["      ; 0, 3);
+        eof_check!("<![C"     ; 0, 4);
+        eof_check!("<![CD"    ; 0, 5);
+        eof_check!("<![CDA"   ; 0, 6);
+        eof_check!("<![CDAT"  ; 0, 7);
+        eof_check!("<![CDATA" ; 0, 8);
+        eof_check!("--"       ; 0, 2);
+    }
+
+    // `<!` followed by an unexpected character: an error by default, a `Chunk("<!")`
+    // followed by the character itself once errors are disabled.
+    #[test]
+    fn error_in_comment_or_cdata_prefix() {
+        let (mut lex, mut buf) = make_lex_and_buf("<!x");
+        assert_err!(for lex and buf expect row 0 ; 0,
+            "Unexpected token '<!' before 'x'"
+        );
+
+        let (mut lex, mut buf) = make_lex_and_buf("<!x");
+        lex.disable_errors();
+        assert_oks!(for lex and buf ;
+            Token::Chunk("<!")
+            Token::Character('x')
+        );
+        assert_none!(for lex and buf);
+    }
+
+    // `<!-` followed by something other than a second dash: an error by default, a
+    // `Chunk("<!-")` followed by the offending character once errors are disabled.
+    #[test]
+    fn error_in_comment_started() {
+        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
+        assert_err!(for lex and buf expect row 0 ; 0,
+            "Unexpected token '<!-' before '\t'"
+        );
+
+        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
+        lex.disable_errors();
+        assert_oks!(for lex and buf ;
+            Token::Chunk("<!-")
+            Token::Whitespace('\t')
+        );
+        assert_none!(for lex and buf);
+    }
+
+    #[test]
+    fn error_in_comment_two_dashes_not_at_end() {
+        let (mut lex, mut buf) = make_lex_and_buf("--x");
+        lex.inside_comment();
+        assert_err!(for lex and buf expect row 0; 0,
+            "Unexpected token '--' before 'x'"
+        );
+
+        let (mut lex, mut buf) = make_lex_and_buf("--x");
+        assert_oks!(for lex and buf ;
+            Token::Chunk("--")
+            Token::Character('x')
+        );
+    }
+
+    // `check_case!(chunk, ch; input; row, column, message)` lexes `input` twice:
+    //   * with errors enabled, the first call must fail with `message` at `row`/`column`;
+    //   * with errors disabled, it must yield `Chunk(chunk)`, `Character(ch)`, then `None`.
+    macro_rules! check_case(
+        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
+            let (mut lex, mut buf) = make_lex_and_buf($data);
+            assert_err!(for lex and buf expect row $r ; $c, $s);
+
+            let (mut lex, mut buf) = make_lex_and_buf($data);
+            lex.disable_errors();
+            assert_oks!(for lex and buf ;
+                Token::Chunk($chunk)
+                Token::Character($app)
+            );
+            assert_none!(for lex and buf);
+        })
+    );
+
+    // Every proper prefix of `<![CDATA[` followed by a wrong character is rejected, and
+    // the error names the prefix that had been read so far.
+    #[test]
+    fn error_in_cdata_started() {
+        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
+        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
+        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
+        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
+        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
+        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
+    }
+
+    // Every proper prefix of `<!DOCTYPE` followed by a wrong character is rejected, and
+    // the error names the prefix that had been read so far.
+    #[test]
+    fn error_in_doctype_started() {
+        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
+        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
+        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
+        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
+        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
+        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
+    }
+
+
+
+    // Regression test: a CDATA section whose content ends with `]` (so the input contains
+    // `]]]>`) must yield that bracket as a character, followed by `CDataEnd`.
+    #[test]
+    fn issue_98_cdata_ending_with_right_bracket() {
+        let (mut lex, mut buf) = make_lex_and_buf(
+            r#"<![CDATA[Foo [Bar]]]>"#
+        );
+
+        assert_oks!(for lex and buf ;
+            Token::CDataStart
+            Token::Character('F')
+            Token::Character('o')
+            Token::Character('o')
+            Token::Whitespace(' ')
+            Token::Character('[')
+            Token::Character('B')
+            Token::Character('a')
+            Token::Character('r')
+            Token::Character(']')
+            Token::CDataEnd
+        );
+        assert_none!(for lex and buf);
+    }
+}
diff --git a/src/reader/mod.rs b/src/reader/mod.rs

new file mode 100644 (file)

index 0000000..90f5b52
--- /dev/null
+++ b/src/reader/mod.rs
@@ -0,0 +1,129 @@
+//! Contains high-level interface for a pull-based XML parser.
+//!
+//! The most important type in this module is `EventReader`, which provides an iterator
+//! view over the events of an XML document.
+
+use std::io::{Read};
+use std::result;
+
+use common::{Position, TextPosition};
+
+pub use self::config::ParserConfig;
+pub use self::events::XmlEvent;
+
+use self::parser::PullParser;
+
+mod lexer;
+mod parser;
+mod config;
+mod events;
+
+mod error;
+pub use self::error::{Error, ErrorKind};
+
+/// A result type yielded by `EventReader`.
+pub type Result<T> = result::Result<T, Error>;
+
+/// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing.
+pub struct EventReader<R: Read> {
+    // Underlying byte source; handed to the parser on every `next()` call.
+    source: R,
+    // Parser that produces the `XmlEvent`s from what it reads out of `source`.
+    parser: PullParser
+}
+
+impl<R: Read> EventReader<R> {
+    /// Creates a new reader, consuming the given stream.
+    ///
+    /// Uses the default configuration (`ParserConfig::new()`).
+    #[inline]
+    pub fn new(source: R) -> EventReader<R> {
+        EventReader::new_with_config(source, ParserConfig::new())
+    }
+
+    /// Creates a new reader with the provided configuration, consuming the given stream.
+    #[inline]
+    pub fn new_with_config(source: R, config: ParserConfig) -> EventReader<R> {
+        EventReader { source: source, parser: PullParser::new(config) }
+    }
+
+    /// Pulls and returns next XML event from the stream.
+    ///
+    /// If the result is an error or `XmlEvent::EndDocument`, then
+    /// further calls to this method will return it again.
+    #[inline]
+    pub fn next(&mut self) -> Result<XmlEvent> {
+        self.parser.next(&mut self.source)
+    }
+
+    /// Returns a shared reference to the underlying reader.
+    pub fn source(&self) -> &R { &self.source }
+    /// Returns a mutable reference to the underlying reader.
+    pub fn source_mut(&mut self) -> &mut R { &mut self.source }
+
+    /// Unwraps this `EventReader`, returning the underlying reader.
+    ///
+    /// Note that this operation is destructive; unwrapping the reader and wrapping it
+    /// again with `EventReader::new()` will create a fresh reader which will attempt
+    /// to parse an XML document from the beginning.
+    pub fn into_inner(self) -> R {
+        self.source
+    }
+}
+
+impl<B: Read> Position for EventReader<B> {
+    /// Returns the position of the last event produced by the reader.
+    #[inline]
+    fn position(&self) -> TextPosition {
+        self.parser.position()
+    }
+}
+
+impl<R: Read> IntoIterator for EventReader<R> {
+    type Item = Result<XmlEvent>;
+    type IntoIter = Events<R>;
+
+    fn into_iter(self) -> Events<R> {
+        Events { reader: self, finished: false }
+    }
+}
+
+/// An iterator over XML events created from some type implementing `Read`.
+///
+/// When the next item is an error or `XmlEvent::EndDocument`, then
+/// it will be returned by the iterator once, and then it will stop producing events
+/// (unless the parser reports that it is ignoring the end of the stream).
+pub struct Events<R: Read> {
+    // The reader the events are pulled from.
+    reader: EventReader<R>,
+    // Set once an error or `XmlEvent::EndDocument` has been returned.
+    finished: bool
+}
+
+impl<R: Read> Events<R> {
+    /// Unwraps the iterator, returning the internal `EventReader`.
+    #[inline]
+    pub fn into_inner(self) -> EventReader<R> {
+        self.reader
+    }
+
+    /// Returns a shared reference to the reader underlying the wrapped `EventReader`.
+    pub fn source(&self) -> &R { &self.reader.source }
+    /// Returns a mutable reference to the reader underlying the wrapped `EventReader`.
+    pub fn source_mut(&mut self) -> &mut R { &mut self.reader.source }
+
+}
+
+impl<R: Read> Iterator for Events<R> {
+    type Item = Result<XmlEvent>;
+
+    #[inline]
+    fn next(&mut self) -> Option<Result<XmlEvent>> {
+        if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { None }
+        else {
+            let ev = self.reader.next();
+            match ev {
+                Ok(XmlEvent::EndDocument) | Err(_) => self.finished = true,
+                _ => {}
+            }
+            Some(ev)
+        }
+    }
+}
+
+impl<'r> EventReader<&'r [u8]> {
+    /// A convenience method to create an `EventReader` from a string slice.
+    ///
+    /// The reader parses the bytes of `source` (`source.as_bytes()`).
+    #[inline]
+    pub fn from_str(source: &'r str) -> EventReader<&'r [u8]> {
+        EventReader::new(source.as_bytes())
+    }
+}
diff --git a/src/reader/parser/inside_cdata.rs b/src/reader/parser/inside_cdata.rs

new file mode 100644 (file)

index 0000000..3269fb4
--- /dev/null
+++ b/src/reader/parser/inside_cdata.rs
@@ -0,0 +1,32 @@
+use reader::events::XmlEvent;
+use reader::lexer::Token;
+
+use super::{Result, PullParser, State};
+
+impl PullParser {
+    pub fn inside_cdata(&mut self, t: Token) -> Option<Result> {
+        match t {
+            Token::CDataEnd => {
+                self.lexer.enable_errors();
+                let event = if self.config.cdata_to_characters {
+                    None
+                } else {
+                    let data = self.take_buf();
+                    Some(Ok(XmlEvent::CData(data)))
+                };
+                self.into_state(State::OutsideTag, event)
+            }
+
+            Token::Whitespace(_) => {
+                t.push_to_string(&mut self.buf);
+                None
+            }
+
+            _ => {
+                self.inside_whitespace = false;
+                t.push_to_string(&mut self.buf);
+                None
+            }
+        }
+    }
+}
diff --git a/src/reader/parser/inside_closing_tag_name.rs b/src/reader/parser/inside_closing_tag_name.rs

new file mode 100644 (file)

index 0000000..1d8074a
--- /dev/null
+++ b/src/reader/parser/inside_closing_tag_name.rs
@@ -0,0 +1,34 @@
+use namespace;
+
+use reader::lexer::Token;
+
+use super::{Result, PullParser, State, QualifiedNameTarget, ClosingTagSubstate};
+
+impl PullParser {
+    /// Processes token `t` while the parser is inside a closing tag.
+    ///
+    /// * `CTInsideName`: the token is passed to `read_qualified_name` together with a
+    ///   callback (NOTE(review): presumably invoked once the name is complete - confirm).
+    ///   The callback rejects names whose prefix equals `NS_XML_PREFIX` or
+    ///   `NS_XMLNS_PREFIX`, otherwise stores the name and then either waits for the end
+    ///   of the tag (after whitespace) or emits the end-element event (on `TagEnd`).
+    /// * `CTAfterName`: whitespace is skipped, `TagEnd` emits the end-element event and
+    ///   any other token is an error.
+    pub fn inside_closing_tag_name(&mut self, t: Token, s: ClosingTagSubstate) -> Option<Result> {
+        match s {
+            ClosingTagSubstate::CTInsideName => self.read_qualified_name(t, QualifiedNameTarget::ClosingTagNameTarget, |this, token, name| {
+                match name.prefix_ref() {
+                    Some(prefix) if prefix == namespace::NS_XML_PREFIX ||
+                                    prefix == namespace::NS_XMLNS_PREFIX =>
+                        // TODO: {:?} is bad, need something better
+                        Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)),
+                    _ => {
+                        this.data.element_name = Some(name.clone());
+                        match token {
+                            Token::Whitespace(_) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)),
+                            Token::TagEnd => this.emit_end_element(),
+                            _ => Some(self_error!(this; "Unexpected token inside closing tag: {}", token))
+                        }
+                    }
+                }
+            }),
+            ClosingTagSubstate::CTAfterName => match t {
+                Token::Whitespace(_) => None,  //  Skip whitespace
+                Token::TagEnd => self.emit_end_element(),
+                _ => Some(self_error!(self; "Unexpected token inside closing tag: {}", t))
+            }
+        }
+    }
+
+}
diff --git a/src/reader/parser/inside_comment.rs b/src/reader/parser/inside_comment.rs

new file mode 100644 (file)

index 0000000..fc98320
--- /dev/null
+++ b/src/reader/parser/inside_comment.rs
@@ -0,0 +1,32 @@
+use reader::events::XmlEvent;
+use reader::lexer::Token;
+
+use super::{Result, PullParser, State};
+
+impl PullParser {
+    pub fn inside_comment(&mut self, t: Token) -> Option<Result> {
+        match t {
+            // Double dash is illegal inside a comment
+            Token::Chunk(ref s) if &s[..] == "--" => Some(self_error!(self; "Unexpected token inside a comment: --")),
+
+            Token::CommentEnd if self.config.ignore_comments => {
+                self.lexer.outside_comment();
+                self.into_state_continue(State::OutsideTag)
+            }
+
+            Token::CommentEnd => {
+                self.lexer.outside_comment();
+                let data = self.take_buf();
+                self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data)))
+            }
+
+            _ if self.config.ignore_comments => None,  // Do not modify buffer if ignoring the comment
+
+            _ => {
+                t.push_to_string(&mut self.buf);
+                None
+            }
+        }
+    }
+
+}
diff --git a/src/reader/parser/inside_declaration.rs b/src/reader/parser/inside_declaration.rs

new file mode 100644 (file)

index 0000000..af39d10
--- /dev/null
+++ b/src/reader/parser/inside_declaration.rs
@@ -0,0 +1,151 @@
+
+use common::XmlVersion;
+
+use reader::events::XmlEvent;
+use reader::lexer::Token;
+
+use super::{
+    Result, PullParser, State, DeclarationSubstate, QualifiedNameTarget,
+    DEFAULT_VERSION, DEFAULT_ENCODING
+};
+
+impl PullParser {
+    // TODO: remove redundancy via macros or extra methods
+    pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option<Result> {
+        macro_rules! unexpected_token(
+            ($this:expr; $t:expr) => (Some($this.error(format!("Unexpected token inside XML declaration: {}", $t))));
+            ($t:expr) => (unexpected_token!(self; $t));
+        );
+
+        #[inline]
+        fn emit_start_document(this: &mut PullParser) -> Option<Result> {
+            this.parsed_declaration = true;
+            let version = this.data.take_version();
+            let encoding = this.data.take_encoding();
+            let standalone = this.data.take_standalone();
+            this.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument {
+                version: version.unwrap_or(DEFAULT_VERSION),
+                encoding: encoding.unwrap_or(DEFAULT_ENCODING.into()),
+                standalone: standalone
+            }))
+        }
+
+        match s {
+            DeclarationSubstate::BeforeVersion => match t {
+                Token::Whitespace(_) => None,  // continue
+                Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)),
+                _ => unexpected_token!(t)
+            },
+
+            DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
+                match &name.local_name[..] {
+                    "ersion" if name.namespace.is_none() =>
+                        this.into_state_continue(State::InsideDeclaration(
+                            if token == Token::EqualsSign {
+                                DeclarationSubstate::InsideVersionValue
+                            } else {
+                                DeclarationSubstate::AfterVersion
+                            }
+                        )),
+                    _ => unexpected_token!(this; name)
+                }
+            }),
+
+            DeclarationSubstate::AfterVersion => match t {
+                Token::Whitespace(_) => None,
+                Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)),
+                _ => unexpected_token!(t)
+            },
+
+            DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| {
+                this.data.version = match &value[..] {
+                    "1.0" => Some(XmlVersion::Version10),
+                    "1.1" => Some(XmlVersion::Version11),
+                    _     => None
+                };
+                if this.data.version.is_some() {
+                    this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue))
+                } else {
+                    Some(self_error!(this; "Unexpected XML version value: {}", value))
+                }
+            }),
+
+            DeclarationSubstate::AfterVersionValue => match t {
+                Token::Whitespace(_) => None,  // skip whitespace
+                Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)),
+                Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)),
+                Token::ProcessingInstructionEnd => emit_start_document(self),
+                _ => unexpected_token!(t)
+            },
+
+            DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
+                match &name.local_name[..] {
+                    "ncoding" if name.namespace.is_none() =>
+                        this.into_state_continue(State::InsideDeclaration(
+                            if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding }
+                        )),
+                    _ => unexpected_token!(this; name)
+                }
+            }),
+
+            DeclarationSubstate::AfterEncoding => match t {
+                Token::Whitespace(_) => None,
+                Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)),
+                _ => unexpected_token!(t)
+            },
+
+            DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| {
+                this.data.encoding = Some(value);
+                this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl))
+            }),
+
+            DeclarationSubstate::BeforeStandaloneDecl => match t {
+                Token::Whitespace(_) => None,  // skip whitespace
+                Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)),
+                Token::ProcessingInstructionEnd => emit_start_document(self),
+                _ => unexpected_token!(t)
+            },
+
+            DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
+                match &name.local_name[..] {
+                    "tandalone" if name.namespace.is_none() =>
+                        this.into_state_continue(State::InsideDeclaration(
+                            if token == Token::EqualsSign {
+                                DeclarationSubstate::InsideStandaloneDeclValue
+                            } else {
+                                DeclarationSubstate::AfterStandaloneDecl
+                            }
+                        )),
+                    _ => unexpected_token!(this; name)
+                }
+            }),
+
+            DeclarationSubstate::AfterStandaloneDecl => match t {
+                Token::Whitespace(_) => None,
+                Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)),
+                _ => unexpected_token!(t)
+            },
+
+            DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| {
+                let standalone = match &value[..] {
+                    "yes" => Some(true),
+                    "no"  => Some(false),
+                    _     => None
+                };
+                if standalone.is_some() {
+                    this.data.standalone = standalone;
+                    this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue))
+                } else {
+                    Some(self_error!(this; "Invalid standalone declaration value: {}", value))
+                }
+            }),
+
+            DeclarationSubstate::AfterStandaloneDeclValue => match t {
+                Token::Whitespace(_) => None,  // skip whitespace
+                Token::ProcessingInstructionEnd => emit_start_document(self),
+                _ => unexpected_token!(t)
+            }
+        }
+    }
+
+}
diff --git a/src/reader/parser/inside_doctype.rs b/src/reader/parser/inside_doctype.rs

new file mode 100644 (file)

index 0000000..8dcf367
--- /dev/null
+++ b/src/reader/parser/inside_doctype.rs
@@ -0,0 +1,16 @@
+use reader::lexer::Token;
+
+use super::{Result, PullParser, State};
+
+impl PullParser {
+    pub fn inside_doctype(&mut self, t: Token) -> Option<Result> {
+        match t {
+            Token::TagEnd => {
+                self.lexer.enable_errors();
+                self.into_state_continue(State::OutsideTag)
+            }
+
+            _ => None
+        }
+    }
+}
diff --git a/src/reader/parser/inside_opening_tag.rs b/src/reader/parser/inside_opening_tag.rs

new file mode 100644 (file)

index 0000000..533874f
--- /dev/null
+++ b/src/reader/parser/inside_opening_tag.rs
@@ -0,0 +1,108 @@
+use common::is_name_start_char;
+use attribute::OwnedAttribute;
+use namespace;
+
+use reader::lexer::Token;
+
+use super::{Result, PullParser, State, OpeningTagSubstate, QualifiedNameTarget};
+
+impl PullParser {
+    pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option<Result> {
+        macro_rules! unexpected_token(($t:expr) => (Some(self_error!(self; "Unexpected token inside opening tag: {}", $t))));
+        match s {
+            OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| {
+                match name.prefix_ref() {
+                    Some(prefix) if prefix == namespace::NS_XML_PREFIX ||
+                                    prefix == namespace::NS_XMLNS_PREFIX =>
+                        Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)),
+                    _ => {
+                        this.data.element_name = Some(name.clone());
+                        match token {
+                            Token::TagEnd => this.emit_start_element(false),
+                            Token::EmptyTagEnd => this.emit_start_element(true),
+                            Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)),
+                            _ => unreachable!()
+                        }
+                    }
+                }
+            }),
+
+            OpeningTagSubstate::InsideTag => match t {
+                Token::Whitespace(_) => None,  // skip whitespace
+                Token::Character(c) if is_name_start_char(c) => {
+                    self.buf.push(c);
+                    self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName))
+                }
+                Token::TagEnd => self.emit_start_element(false),
+                Token::EmptyTagEnd => self.emit_start_element(true),
+                _ => unexpected_token!(t)
+            },
+
+            OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
+                this.data.attr_name = Some(name);
+                match token {
+                    Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)),
+                    Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)),
+                    _ => unreachable!()
+                }
+            }),
+
+            OpeningTagSubstate::AfterAttributeName => match t {
+                Token::Whitespace(_) => None,
+                Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)),
+                _ => unexpected_token!(t)
+            },
+
+            OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| {
+                let name = this.data.take_attr_name().unwrap();  // unwrap() will always succeed here
+
+                // check that no attribute with such name is already present
+                // if there is one, XML is not well-formed
+                if this.data.attributes.iter().find(|a| a.name == name).is_some() {  // TODO: looks bad
+                    // TODO: ideally this error should point to the beginning of the attribute,
+                    // TODO: not the end of its value
+                    Some(self_error!(this; "Attribute '{}' is redefined", name))
+                } else {
+                    match name.prefix_ref() {
+                        // declaring a new prefix; it is sufficient to check prefix only
+                        // because "xmlns" prefix is reserved
+                        Some(namespace::NS_XMLNS_PREFIX) => {
+                            let ln = &name.local_name[..];
+                            if ln == namespace::NS_XMLNS_PREFIX {
+                                Some(self_error!(this; "Cannot redefine prefix '{}'", namespace::NS_XMLNS_PREFIX))
+                            } else if ln == namespace::NS_XML_PREFIX && &value[..] != namespace::NS_XML_URI {
+                                Some(self_error!(this; "Prefix '{}' cannot be rebound to another value", namespace::NS_XML_PREFIX))
+                            } else if value.is_empty() {
+                                Some(self_error!(this; "Cannot undefine prefix '{}'", ln))
+                            } else {
+                                this.nst.put(name.local_name.clone(), value);
+                                this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag))
+                            }
+                        }
+
+                        // declaring default namespace
+                        None if &name.local_name[..] == namespace::NS_XMLNS_PREFIX =>
+                            match &value[..] {
+                                namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX =>
+                                    Some(self_error!(this; "Namespace '{}' cannot be default", value)),
+                                _ => {
+                                    this.nst.put(namespace::NS_NO_PREFIX, value.clone());
+                                    this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag))
+                                }
+                            },
+
+                        // regular attribute
+                        _ => {
+                            this.data.attributes.push(OwnedAttribute {
+                                name: name.clone(),
+                                value: value
+                            });
+                            this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag))
+                        }
+                    }
+                }
+            })
+        }
+    }
+
+}
diff --git a/src/reader/parser/inside_processing_instruction.rs b/src/reader/parser/inside_processing_instruction.rs

new file mode 100644 (file)

index 0000000..8ddf6b8
--- /dev/null
+++ b/src/reader/parser/inside_processing_instruction.rs
@@ -0,0 +1,96 @@
+use common::{
+    is_name_start_char, is_name_char,
+};
+
+use reader::events::XmlEvent;
+use reader::lexer::Token;
+
+use super::{Result, PullParser, State, ProcessingInstructionSubstate, DeclarationSubstate};
+
+impl PullParser {
+    pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option<Result> {
+        match s {
+            ProcessingInstructionSubstate::PIInsideName => match t {
+                Token::Character(c) if !self.buf_has_data() && is_name_start_char(c) ||
+                                 self.buf_has_data() && is_name_char(c) => self.append_char_continue(c),
+
+                Token::ProcessingInstructionEnd => {
+                    // self.buf contains PI name
+                    let name = self.take_buf();
+
+                    // Don't need to check for declaration because it has mandatory attributes
+                    // but there is none
+                    match &name[..] {
+                        // Name is empty, it is an error
+                        "" => Some(self_error!(self; "Encountered processing instruction without name")),
+
+                        // Found <?xml-like PI not at the beginning of a document,
+                        // it is an error - see section 2.6 of XML 1.1 spec
+                        "xml"|"xmL"|"xMl"|"xML"|"Xml"|"XmL"|"XMl"|"XML" =>
+                            Some(self_error!(self; "Invalid processing instruction: <?{}", name)),
+
+                        // All is ok, emitting event
+                        _ => {
+                            self.into_state_emit(
+                                State::OutsideTag,
+                                Ok(XmlEvent::ProcessingInstruction {
+                                    name: name,
+                                    data: None
+                                })
+                            )
+                        }
+                    }
+                }
+
+                Token::Whitespace(_) => {
+                    // self.buf contains PI name
+                    let name = self.take_buf();
+
+                    match &name[..] {
+                        // We have not ever encountered an element and have not parsed XML declaration
+                        "xml" if !self.encountered_element && !self.parsed_declaration =>
+                            self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)),
+
+                        // Found <?xml-like PI after the beginning of a document,
+                        // it is an error - see section 2.6 of XML 1.1 spec
+                        "xml"|"xmL"|"xMl"|"xML"|"Xml"|"XmL"|"XMl"|"XML"
+                            if self.encountered_element || self.parsed_declaration =>
+                            Some(self_error!(self; "Invalid processing instruction: <?{}", name)),
+
+                        // All is ok, starting parsing PI data
+                        _ => {
+                            self.lexer.disable_errors();  // data is arbitrary, so disable errors
+                            self.data.name = name;
+                            self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideData))
+                        }
+
+                    }
+                }
+
+                _ => Some(self_error!(self; "Unexpected token: <?{}{}", self.buf, t))
+            },
+
+            ProcessingInstructionSubstate::PIInsideData => match t {
+                Token::ProcessingInstructionEnd => {
+                    self.lexer.enable_errors();
+                    let name = self.data.take_name();
+                    let data = self.take_buf();
+                    self.into_state_emit(
+                        State::OutsideTag,
+                        Ok(XmlEvent::ProcessingInstruction {
+                            name: name,
+                            data: Some(data)
+                        })
+                    )
+                },
+
+                // Any other token should be treated as plain characters
+                _ => {
+                    t.push_to_string(&mut self.buf);
+                    None
+                }
+            },
+        }
+    }
+
+}
diff --git a/src/reader/parser/inside_reference.rs b/src/reader/parser/inside_reference.rs

new file mode 100644 (file)

index 0000000..60026d5
--- /dev/null
+++ b/src/reader/parser/inside_reference.rs
@@ -0,0 +1,89 @@
+use std::char;
+
+use common::{is_name_start_char, is_name_char, is_whitespace_str};
+
+use reader::lexer::Token;
+
+use super::{Result, PullParser, State};
+
+impl PullParser {
+    pub fn inside_reference(&mut self, t: Token, prev_st: State) -> Option<Result> {
+        match t {
+            Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) ||
+                             self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => {
+                self.data.ref_data.push(c);
+                None
+            }
+
+            Token::ReferenceEnd => {
+                // TODO: check for unicode correctness
+                let name = self.data.take_ref_data();
+                let name_len = name.len();  // compute once
+                let c = match &name[..] {
+                    "lt"   => Ok('<'.to_string()),
+                    "gt"   => Ok('>'.to_string()),
+                    "amp"  => Ok('&'.to_string()),
+                    "apos" => Ok('\''.to_string()),
+                    "quot" => Ok('"'.to_string()),
+                    ""     => Err(self_error!(self; "Encountered empty entity")),
+                    _ if name_len > 2 && name.starts_with("#x") => {
+                        let num_str = &name[2..name_len];
+                        if num_str == "0" {
+                            Err(self_error!(self; "Null character entity is not allowed"))
+                        } else {
+                            if self.config.replace_unknown_entity_references {
+                                match u32::from_str_radix(num_str, 16).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) {
+                                    Some(c) => Ok(c.to_string()),
+                                    None    => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name))
+                                }
+                            } else {
+                                match u32::from_str_radix(num_str, 16).ok().and_then(char::from_u32) {
+                                    Some(c) => Ok(c.to_string()),
+                                    None    => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name))
+                                }
+                            }
+                        }
+                    }
+                    _ if name_len > 1 && name.starts_with('#') => {
+                        let num_str = &name[1..name_len];
+                        if num_str == "0" {
+                            Err(self_error!(self; "Null character entity is not allowed"))
+                        } else {
+                            if self.config.replace_unknown_entity_references {
+                                match u32::from_str_radix(num_str, 10).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) {
+                                    Some(c) => Ok(c.to_string()),
+                                    None    => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name))
+                                }
+                            }
+                            else {
+                                match u32::from_str_radix(num_str, 10).ok().and_then(char::from_u32) {
+                                    Some(c) => Ok(c.to_string()),
+                                    None    => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name))
+                                }
+                            }
+                        }
+                    },
+                    _ => {
+                        if let Some(v) = self.config.extra_entities.get(&name) {
+                            Ok(v.clone())
+                        } else {
+                            Err(self_error!(self; "Unexpected entity: {}", name))
+                        }
+                    }
+                };
+                match c {
+                    Ok(c) => {
+                        self.buf.push_str(&c);
+                        if prev_st == State::OutsideTag && !is_whitespace_str(&c) {
+                            self.inside_whitespace = false;
+                        }
+                        self.into_state_continue(prev_st)
+                    }
+                    Err(e) => Some(e)
+                }
+            }
+
+            _ => Some(self_error!(self; "Unexpected token inside an entity: {}", t))
+        }
+    }
+}
diff --git a/src/reader/parser/mod.rs b/src/reader/parser/mod.rs

new file mode 100644 (file)

index 0000000..58ca3a6
--- /dev/null
+++ b/src/reader/parser/mod.rs
@@ -0,0 +1,622 @@
+//! Contains an implementation of pull-based XML parser.
+
+use std::mem;
+use std::borrow::Cow;
+use std::io::prelude::*;
+
+use common::{
+    self,
+    XmlVersion, Position, TextPosition,
+    is_name_start_char, is_name_char,
+};
+use name::OwnedName;
+use attribute::OwnedAttribute;
+use namespace::NamespaceStack;
+
+use reader::events::XmlEvent;
+use reader::config::ParserConfig;
+use reader::lexer::{Lexer, Token};
+
+macro_rules! gen_takes(
+    ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => (
+        $(
+        impl MarkupData {
+            #[inline]
+            fn $method(&mut self) -> $t {
+                mem::replace(&mut self.$field, $def)
+            }
+        }
+        )+
+    )
+);
+
+gen_takes!(  // one `take_*` method per field: returns the current value, leaves the given default behind
+    name         -> take_name, String, String::new();
+    ref_data     -> take_ref_data, String, String::new();
+    // XML declaration parameters
+    version      -> take_version, Option<common::XmlVersion>, None;
+    encoding     -> take_encoding, Option<String>, None;
+    standalone   -> take_standalone, Option<bool>, None;
+    // name of the element whose tag is being read
+    element_name -> take_element_name, Option<OwnedName>, None;
+    // attribute name being read and the attributes accumulated so far
+    attr_name    -> take_attr_name, Option<OwnedName>, None;
+    attributes   -> take_attributes, Vec<OwnedAttribute>, vec!()
+);
+
+macro_rules! self_error(  // shorthand for `$this.error(..)`, with or without `format!` arguments
+    ($this:ident; $msg:expr) => ($this.error($msg));  // message passed through as is
+    ($this:ident; $fmt:expr, $($arg:expr),+) => ($this.error(format!($fmt, $($arg),+)))  // message built with `format!`
+);
+
+mod outside_tag;
+mod inside_processing_instruction;
+mod inside_declaration;
+mod inside_doctype;
+mod inside_opening_tag;
+mod inside_closing_tag_name;
+mod inside_comment;
+mod inside_cdata;
+mod inside_reference;
+
+static DEFAULT_VERSION: XmlVersion      = XmlVersion::Version10;  // reported in `StartDocument` when no version was parsed
+static DEFAULT_ENCODING: &'static str   = "UTF-8";  // reported in `StartDocument` when no encoding was declared
+static DEFAULT_STANDALONE: Option<bool> = None;  // NOTE(review): no use visible in this part of the file — confirm it is still needed
+
+type ElementStack = Vec<OwnedName>;  // type of `PullParser::est`
+pub type Result = super::Result<XmlEvent>;  // what the parser hands out: an `XmlEvent` or an error
+
+/// Pull-based XML parser.
+pub struct PullParser {
+    config: ParserConfig,  // settings consulted by the state handlers (e.g. `ignore_comments`)
+    lexer: Lexer,  // tokenizer that `next` pulls tokens from
+    st: State,  // current parser state; selects the handler in `dispatch_token`
+    buf: String,  // text accumulated for the construct being parsed (comment, PI name/data, ...)
+    nst: NamespaceStack,  // namespace bindings declared through `xmlns` attributes
+
+    data: MarkupData,  // scratch data for the markup being parsed
+    final_result: Option<Result>,  // once set, `next` returns a clone of it on every call
+    next_event: Option<Result>,  // when set, taken and returned by the following `next` call
+    est: ElementStack,  // stack of element names — presumably the currently open elements; TODO confirm
+    pos: Vec<TextPosition>,  // recorded positions; `position()` reports the first entry
+
+    encountered_element: bool,  // checked for `<?xml` handling and at end of stream; set outside the code visible here
+    parsed_declaration: bool,  // set once the XML declaration has been emitted
+    inside_whitespace: bool,  // starts as `true`; cleared when non-whitespace reference text is appended outside tags
+    read_prefix_separator: bool,  // NOTE(review): not used in the code visible here — presumably set while reading qualified names
+    pop_namespace: bool  // when set, `next` pops `nst` before reading further tokens
+}
+
+impl PullParser {
+    /// Returns a new parser using the given config.
+    pub fn new(config: ParserConfig) -> PullParser {
+        PullParser {
+            config: config,
+            lexer: Lexer::new(),
+            st: State::OutsideTag,
+            buf: String::new(),
+            nst: NamespaceStack::default(),
+
+            data: MarkupData {
+                name: String::new(),
+                version: None,
+                encoding: None,
+                standalone: None,
+                ref_data: String::new(),
+                element_name: None,
+                quote: None,
+                attr_name: None,
+                attributes: Vec::new()
+            },
+            final_result: None,
+            next_event: None,
+            est: Vec::new(),
+            pos: vec![TextPosition::new()],
+
+            encountered_element: false,
+            parsed_declaration: false,
+            inside_whitespace: true,
+            read_prefix_separator: false,
+            pop_namespace: false
+        }
+    }
+
+    /// Checks if this parser ignores the end of stream errors.
+    pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream }
+}
+
+impl Position for PullParser {
+    /// Returns the position of the last event produced by the parser
+    #[inline]
+    fn position(&self) -> TextPosition {
+        self.pos[0]  // `pos` is created with one entry and `next_pos` never removes the last one
+    }
+}
+
+#[derive(Clone, PartialEq)]
+pub enum State {  // parser state; `dispatch_token` picks the handler method from it
+    OutsideTag,  // initial state, and the state handlers return to after a complete construct
+    InsideOpeningTag(OpeningTagSubstate),
+    InsideClosingTag(ClosingTagSubstate),
+    InsideProcessingInstruction(ProcessingInstructionSubstate),
+    InsideComment,
+    InsideCData,
+    InsideDeclaration(DeclarationSubstate),  // inside `<?xml ... ?>`
+    InsideDoctype,
+    InsideReference(Box<State>)  // presumably carries the state `inside_reference` returns to — TODO confirm in `dispatch_token`
+}
+
+#[derive(Clone, PartialEq)]
+pub enum OpeningTagSubstate {  // positions inside an opening tag, handled by `inside_opening_tag`
+    InsideName,  // reading the element name
+
+    InsideTag,  // between attributes: whitespace, an attribute name start, `>` or `/>`
+
+    InsideAttributeName,  // reading an attribute name
+    AfterAttributeName,  // name read, skipping whitespace until '='
+
+    InsideAttributeValue,  // reading the attribute value
+}
+
+#[derive(Clone, PartialEq)]
+pub enum ClosingTagSubstate {  // positions inside a closing tag
+    CTInsideName,  // reading the element name
+    CTAfterName  // name read; only whitespace and the tag end are accepted
+}
+
+#[derive(Clone, PartialEq)]
+pub enum ProcessingInstructionSubstate {  // positions inside `<? ... ?>`, handled by `inside_processing_instruction`
+    PIInsideName,  // reading the PI name into the text buffer
+    PIInsideData  // name stored in `data.name`; every token up to `?>` is collected as PI data
+}
+
+#[derive(Clone, PartialEq)]
+pub enum DeclarationSubstate {  // positions inside `<?xml ... ?>`, handled by `inside_declaration`
+    BeforeVersion,  // skipping whitespace until the 'v' of `version`
+    InsideVersion,  // reading the rest of the `version` name
+    AfterVersion,  // name read, waiting for '='
+
+    InsideVersionValue,  // reading the version value ("1.0" or "1.1")
+    AfterVersionValue,  // `encoding`, `standalone` or `?>` may follow
+
+    InsideEncoding,  // reading the rest of the `encoding` name
+    AfterEncoding,  // name read, waiting for '='
+
+    InsideEncodingValue,  // reading the encoding value
+
+    BeforeStandaloneDecl,  // `standalone` or `?>` may follow
+    InsideStandaloneDecl,  // reading the rest of the `standalone` name
+    AfterStandaloneDecl,  // name read, waiting for '='
+
+    InsideStandaloneDeclValue,  // reading the standalone value ("yes" or "no")
+    AfterStandaloneDeclValue  // only whitespace and `?>` may follow
+}
+
+#[derive(PartialEq)]
+enum QualifiedNameTarget {  // passed to `read_qualified_name` to tell which kind of name is being read
+    AttributeNameTarget,  // attribute names, including those of the XML declaration
+    OpeningTagNameTarget,  // element name of an opening tag
+    ClosingTagNameTarget  // presumably the element name of a closing tag — its use is outside the code visible here
+}
+
+#[derive(Copy, Clone, PartialEq, Eq)]
+enum QuoteToken {  // kind of quote, convertible from and to the lexer's quote tokens
+    SingleQuoteToken,  // counterpart of `Token::SingleQuote`
+    DoubleQuoteToken  // counterpart of `Token::DoubleQuote`
+}
+
+impl QuoteToken {
+    fn from_token(t: &Token) -> QuoteToken {
+        match *t {
+            Token::SingleQuote => QuoteToken::SingleQuoteToken,
+            Token::DoubleQuote => QuoteToken::DoubleQuoteToken,
+            _ => panic!("Unexpected token: {}", t)
+        }
+    }
+
+    fn as_token(self) -> Token {
+        match self {
+            QuoteToken::SingleQuoteToken => Token::SingleQuote,
+            QuoteToken::DoubleQuoteToken => Token::DoubleQuote
+        }
+    }
+}
+
+struct MarkupData {  // scratch data for the markup being parsed; most fields are drained via the `take_*` methods from `gen_takes!`
+    name: String,     // used for processing instruction name
+    ref_data: String,  // used for reference content
+
+    version: Option<common::XmlVersion>,  // used for XML declaration version
+    encoding: Option<String>,  // used for XML declaration encoding
+    standalone: Option<bool>,  // used for XML declaration standalone parameter
+
+    element_name: Option<OwnedName>,  // used for element name
+
+    quote: Option<QuoteToken>,  // used to hold opening quote for attribute value
+    attr_name: Option<OwnedName>,  // used to hold attribute name
+    attributes: Vec<OwnedAttribute>   // used to hold all accumulated attributes
+}
+
+impl PullParser {
+    /// Returns next event read from the given buffer.
+    ///
+    /// This method should be always called with the same buffer. If you call it
+    /// providing different buffers each time, the result will be undefined.
+    pub fn next<R: Read>(&mut self, r: &mut R) -> Result {
+        if let Some(ref ev) = self.final_result {
+            return ev.clone();
+        }
+
+        if let Some(ev) = self.next_event.take() {
+            return ev;
+        }
+
+        if self.pop_namespace {
+            self.pop_namespace = false;
+            self.nst.pop();
+        }
+
+        loop {
+            // While lexer gives us Ok(maybe_token) -- we loop.
+            // Upon having a complete XML-event -- we return from the whole function.
+            match self.lexer.next_token(r) {
+                Ok(maybe_token) =>
+                    match maybe_token {
+                        None => break,
+                        Some(token) =>
+                            match self.dispatch_token(token) {
+                                None => {} // continue
+                                Some(Ok(XmlEvent::EndDocument)) =>
+                                    return {
+                                        self.next_pos();
+                                        self.set_final_result(Ok(XmlEvent::EndDocument))
+                                    },
+                                Some(Ok(xml_event)) =>
+                                    return {
+                                        self.next_pos();
+                                        Ok(xml_event)
+                                    },
+                                Some(Err(xml_error)) =>
+                                    return {
+                                        self.next_pos();
+                                        self.set_final_result(Err(xml_error))
+                                    },
+                            }
+                    },
+                Err(lexer_error) =>
+                    return self.set_final_result(Err(lexer_error)),
+            }
+        }
+
+        // Handle end of stream
+        // Forward pos to the lexer head
+        self.next_pos();
+        let ev = if self.depth() == 0 {
+            if self.encountered_element && self.st == State::OutsideTag {  // all is ok
+                Ok(XmlEvent::EndDocument)
+            } else if !self.encountered_element {
+                self_error!(self; "Unexpected end of stream: no root element found")
+            } else {  // self.st != State::OutsideTag
+                self_error!(self; "Unexpected end of stream")  // TODO: add expected hint?
+            }
+        } else {
+            if self.config.ignore_end_of_stream {
+                self.final_result = None;
+                self.lexer.reset_eof_handled();
+                return self_error!(self; "Unexpected end of stream: still inside the root element");
+            } else {
+                self_error!(self; "Unexpected end of stream: still inside the root element")
+            }
+        };
+        self.set_final_result(ev)
+    }
+
+    // This function is to be called when a terminal event is reached.
+    // The function sets up the `self.final_result` into `Some(result)` and return `result`.
+    fn set_final_result(&mut self, result: Result) -> Result {
+        self.final_result = Some(result.clone());
+        result
+    }
+
+    #[inline]
+    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Result {
+        Err((&self.lexer, msg).into())
+    }
+
+    #[inline]
+    fn next_pos(&mut self) {
+        if self.pos.len() > 1 {
+            self.pos.remove(0);
+        } else {
+            self.pos[0] = self.lexer.position();
+        }
+    }
+
+    /// Appends the lexer's current position to the end of the `self.pos` queue.
+    #[inline]
+    fn push_pos(&mut self) {
+        self.pos.push(self.lexer.position());
+    }
+
+    /// Routes the token `t` to the handler that corresponds to the current state `self.st`.
+    ///
+    /// The state is cloned before matching, so every handler receives its substate by value.
+    /// The handler's return value is passed through unchanged; `next()` treats `None` as
+    /// "no event yet, keep reading tokens".
+    fn dispatch_token(&mut self, t: Token) -> Option<Result> {
+        match self.st.clone() {
+            State::OutsideTag                     => self.outside_tag(t),
+            State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s),
+            State::InsideDeclaration(s)           => self.inside_declaration(t, s),
+            State::InsideDoctype                  => self.inside_doctype(t),
+            State::InsideOpeningTag(s)            => self.inside_opening_tag(t, s),
+            State::InsideClosingTag(s)            => self.inside_closing_tag_name(t, s),
+            State::InsideComment                  => self.inside_comment(t),
+            State::InsideCData                    => self.inside_cdata(t),
+            // `InsideReference` carries a boxed state; it is unboxed for the handler
+            State::InsideReference(s)             => self.inside_reference(t, *s)
+        }
+    }
+
+    /// Returns the number of names on the `self.est` stack, i.e. the number of
+    /// elements which have been opened but not closed yet.
+    #[inline]
+    fn depth(&self) -> usize {
+        self.est.len()
+    }
+
+    #[inline]
+    fn buf_has_data(&self) -> bool {
+        self.buf.len() > 0
+    }
+
+    #[inline]
+    fn take_buf(&mut self) -> String {
+        mem::replace(&mut self.buf, String::new())
+    }
+
+    /// Appends `c` to `self.buf` and returns `None`, i.e. no event is produced.
+    #[inline]
+    fn append_char_continue(&mut self, c: char) -> Option<Result> {
+        self.buf.push(c);
+        None
+    }
+
+    /// Switches the parser to the state `st` and returns `ev` unchanged.
+    #[inline]
+    fn into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result> {
+        self.st = st;
+        ev
+    }
+
+    /// Switches the parser to the state `st` without producing an event (returns `None`).
+    #[inline]
+    fn into_state_continue(&mut self, st: State) -> Option<Result> {
+        self.into_state(st, None)
+    }
+
+    /// Switches the parser to the state `st` and returns `ev` wrapped in `Some`.
+    #[inline]
+    fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> {
+        self.into_state(st, Some(ev))
+    }
+
+    /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed,
+    /// an error is returned.
+    ///
+    /// Characters of the name are accumulated in `self.buf`. At most one colon is accepted,
+    /// and never as the first character of the name.
+    ///
+    /// # Parameters
+    /// * `t`       --- next token;
+    /// * `target`  --- the kind of name being read; it decides which tokens may terminate it:
+    ///                 `Token::EqualsSign` for attribute names, `Token::EmptyTagEnd` for opening
+    ///                 tag names and `Token::TagEnd` for opening and closing tag names
+    ///                 (whitespace terminates a name for every target);
+    /// * `on_name` --- a callback which is executed with the terminating token and the parsed
+    ///                 name when the name is complete.
+    ///
+    /// # Returns
+    /// `None` while the name is still being accumulated, the result of `on_name` when the name
+    /// is terminated, or `Some(Err(..))` if the name does not parse or the token is not allowed.
+    fn read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result>
+      where F: Fn(&mut PullParser, Token, OwnedName) -> Option<Result> {
+        // We can get here for the first time only when self.buf contains zero or one character,
+        // but first character cannot be a colon anyway
+        if self.buf.len() <= 1 {
+            self.read_prefix_separator = false;
+        }
+
+        // Takes the accumulated text out of the buffer, parses it as a qualified name
+        // and passes it to the callback together with the terminating token
+        let invoke_callback = |this: &mut PullParser, t| {
+            let name = this.take_buf();
+            match name.parse() {
+                Ok(name) => on_name(this, t, name),
+                Err(_) => Some(self_error!(this; "Qualified name is invalid: {}", name))
+            }
+        };
+
+        match t {
+            // There can be only one colon, and not as the first character
+            Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => {
+                self.buf.push(':');
+                self.read_prefix_separator = true;
+                None
+            }
+
+            // The first character must be a name start character, the rest must be name characters;
+            // a colon which was not accepted by the arm above falls through to the error arm
+            Token::Character(c) if c != ':' && (!self.buf_has_data() && is_name_start_char(c) ||
+                                          self.buf_has_data() && is_name_char(c)) =>
+                self.append_char_continue(c),
+
+            Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t),
+
+            Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t),
+
+            Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget ||
+                      target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t),
+
+            Token::Whitespace(_) => invoke_callback(self, t),
+
+            _ => Some(self_error!(self; "Unexpected token inside qualified name: {}", t))
+        }
+    }
+
+    /// Dispatches tokens in order to process attribute value.
+    ///
+    /// The value is accumulated in `self.buf`; the kind of the opening quote is remembered in
+    /// `self.data.quote` so that only a quote of the same kind terminates the value.
+    ///
+    /// # Parameters
+    /// * `t`        --- next token;
+    /// * `on_value` --- a callback which is called when terminating quote is encountered.
+    ///
+    /// # Returns
+    /// `None` while the value is still being read, the result of `on_value` when the closing
+    /// quote is found, or `Some(Err(..))` if `<` is encountered.
+    fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result>
+      where F: Fn(&mut PullParser, String) -> Option<Result> {
+        match t {
+            Token::Whitespace(_) if self.data.quote.is_none() => None,  // skip leading whitespace
+
+            Token::DoubleQuote | Token::SingleQuote => match self.data.quote {
+                None => {  // Entered attribute value
+                    self.data.quote = Some(QuoteToken::from_token(&t));
+                    None
+                }
+                // The same kind of quote which opened the value: the value is complete
+                Some(q) if q.as_token() == t => {
+                    self.data.quote = None;
+                    let value = self.take_buf();
+                    on_value(self, value)
+                }
+                // The other kind of quote is a part of the value
+                _ => {
+                    t.push_to_string(&mut self.buf);
+                    None
+                }
+            },
+
+            // The current state is stored inside `InsideReference`
+            // (presumably to be restored when the reference ends)
+            Token::ReferenceStart => {
+                let st = Box::new(self.st.clone());
+                self.into_state_continue(State::InsideReference(st))
+            }
+
+            Token::OpeningTagStart =>
+                Some(self_error!(self; "Unexpected token inside attribute value: <")),
+
+            // Every character except " and ' and < is okay
+            _  => {
+                t.push_to_string(&mut self.buf);
+                None
+            }
+        }
+    }
+
+    /// Produces a `StartElement` event from the element name and attributes accumulated
+    /// in `self.data`, and switches the parser to `State::OutsideTag`.
+    ///
+    /// Prefixes of the element name and of the attribute names are resolved against the
+    /// namespace stack `self.nst`; an unbound prefix results in an error.
+    ///
+    /// # Parameters
+    /// * `emit_end_element` --- when `true`, a matching `EndElement` event is queued in
+    ///   `self.next_event` and a namespace pop is requested (presumably used for empty-element
+    ///   tags); when `false`, the element name is pushed onto the `self.est` stack instead.
+    ///
+    /// Panics if no element name has been stored in `self.data`.
+    fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> {
+        let mut name = self.data.take_element_name().unwrap();
+        let mut attributes = self.data.take_attributes();
+
+        // check whether the name prefix is bound and fix its namespace
+        match self.nst.get(name.borrow().prefix_repr()) {
+            Some("") => name.namespace = None,  // default namespace
+            Some(ns) => name.namespace = Some(ns.into()),
+            None => return Some(self_error!(self; "Element {} prefix is unbound", name))
+        }
+
+        // check and fix accumulated attributes prefixes
+        // (attributes without a prefix are left untouched)
+        for attr in attributes.iter_mut() {
+            if let Some(ref pfx) = attr.name.prefix {
+                let new_ns = match self.nst.get(pfx) {
+                    Some("") => None,  // default namespace
+                    Some(ns) => Some(ns.into()),
+                    None => return Some(self_error!(self; "Attribute {} prefix is unbound", attr.name))
+                };
+                attr.name.namespace = new_ns;
+            }
+        }
+
+        if emit_end_element {
+            // The end element event is returned by the next call to `next()`
+            self.pop_namespace = true;
+            self.next_event = Some(Ok(XmlEvent::EndElement {
+                name: name.clone()
+            }));
+        } else {
+            // Remember the name in order to check it against the closing tag later
+            self.est.push(name.clone());
+        }
+        let namespace = self.nst.squash();
+        self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement {
+            name: name,
+            attributes: attributes,
+            namespace: namespace
+        }))
+    }
+
+    /// Produces an `EndElement` event for the closing tag name stored in `self.data`.
+    ///
+    /// The name prefix is resolved against the namespace stack `self.nst`, then the name is
+    /// compared with the name popped from the `self.est` stack. If they are equal, a namespace
+    /// pop is requested and the parser switches to `State::OutsideTag`; otherwise an error
+    /// is returned.
+    ///
+    /// Panics if no element name has been stored in `self.data` or if `self.est` is empty.
+    fn emit_end_element(&mut self) -> Option<Result> {
+        let mut name = self.data.take_element_name().unwrap();
+
+        // check whether the name prefix is bound and fix its namespace
+        match self.nst.get(name.borrow().prefix_repr()) {
+            Some("") => name.namespace = None,  // default namespace
+            Some(ns) => name.namespace = Some(ns.into()),
+            None => return Some(self_error!(self; "Element {} prefix is unbound", name))
+        }
+
+        // the name of the innermost element which is still open
+        let op_name = self.est.pop().unwrap();
+
+        if name == op_name {
+            self.pop_namespace = true;
+            self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name: name }))
+        } else {
+            Some(self_error!(self; "Unexpected closing tag: {}, expected {}", name, op_name))
+        }
+    }
+
+}
+
+// Regression tests which drive `PullParser` directly over small in-memory documents.
+#[cfg(test)]
+mod tests {
+    use std::io::BufReader;
+
+    use common::{Position, TextPosition};
+    use name::OwnedName;
+    use attribute::OwnedAttribute;
+    use reader::parser::PullParser;
+    use reader::ParserConfig;
+    use reader::events::XmlEvent;
+
+    // Creates a parser with the default configuration.
+    fn new_parser() -> PullParser {
+        PullParser::new(ParserConfig::new())
+    }
+
+    // Pulls the next event from parser `$p` reading from `$r` and panics unless it matches
+    // the pattern `$t` (and, in the second form, unless the condition `$c` holds as well).
+    macro_rules! expect_event(
+        ($r:expr, $p:expr, $t:pat) => (
+            match $p.next(&mut $r) {
+                $t => {}
+                e => panic!("Unexpected event: {:?}", e)
+            }
+        );
+        ($r:expr, $p:expr, $t:pat => $c:expr ) => (
+            match $p.next(&mut $r) {
+                $t if $c => {}
+                e => panic!("Unexpected event: {:?}", e)
+            }
+        )
+    );
+
+    // Builds a `(reader, parser)` pair over the string literal `$d`.
+    macro_rules! test_data(
+        ($d:expr) => ({
+            static DATA: &'static str = $d;
+            let r = BufReader::new(DATA.as_bytes());
+            let p = new_parser();
+            (r, p)
+        })
+    );
+
+    // A semicolon inside an attribute value must be kept as a part of the value.
+    #[test]
+    fn issue_3_semicolon_in_attribute_value() {
+        let (mut r, mut p) = test_data!(r#"
+            <a attr="zzz;zzz" />
+        "#);
+
+        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
+        expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) =>
+            *name == OwnedName::local("a") &&
+             attributes.len() == 1 &&
+             attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") &&
+             namespace.is_essentially_empty()
+        );
+        expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a"));
+        expect_event!(r, p, Ok(XmlEvent::EndDocument));
+    }
+
+    // A numeric character reference in element content is decoded into a character.
+    #[test]
+    fn issue_140_entity_reference_inside_tag() {
+        let (mut r, mut p) = test_data!(r#"
+            <bla>&#9835;</bla>
+        "#);
+
+        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
+        expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla"));
+        expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}");
+        expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla"));
+        expect_event!(r, p, Ok(XmlEvent::EndDocument));
+    }
+
+    // `<` inside an attribute value is an error which is reported at the position of `<`.
+    #[test]
+    fn opening_tag_in_attribute_value() {
+        let (mut r, mut p) = test_data!(r#"
+            <a attr="zzz<zzz" />
+        "#);
+
+        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
+        expect_event!(r, p, Err(ref e) =>
+            e.msg() == "Unexpected token inside attribute value: <" &&
+            e.position() == TextPosition { row: 1, column: 24 }
+        );
+    }
+}
diff --git a/src/reader/parser/outside_tag.rs b/src/reader/parser/outside_tag.rs

new file mode 100644 (file)

index 0000000..d3f7598
--- /dev/null
+++ b/src/reader/parser/outside_tag.rs
@@ -0,0 +1,130 @@
+use common::is_whitespace_char;
+
+use reader::events::XmlEvent;
+use reader::lexer::Token;
+
+use super::{
+    Result, PullParser, State, ClosingTagSubstate, OpeningTagSubstate,
+    ProcessingInstructionSubstate, DEFAULT_VERSION, DEFAULT_ENCODING, DEFAULT_STANDALONE
+};
+
+impl PullParser {
+    /// Handles the token `t` while the parser is in `State::OutsideTag`, i.e. between markup.
+    ///
+    /// Character data is accumulated in `self.buf`. When a markup token arrives, the buffered
+    /// text (if any) is turned into a `Whitespace` or `Characters` event, subject to the parser
+    /// configuration, and the parser switches to the state which corresponds to the markup.
+    ///
+    /// Returns `None` when no event is produced by this token, `Some(Ok(..))` with an event,
+    /// or `Some(Err(..))` when the token is not allowed here.
+    pub fn outside_tag(&mut self, t: Token) -> Option<Result> {
+        match t {
+            // `State::OutsideTag` is stored inside `InsideReference`
+            // (presumably the state to come back to when the reference ends)
+            Token::ReferenceStart =>
+                self.into_state_continue(State::InsideReference(Box::new(State::OutsideTag))),
+
+            Token::Whitespace(_) if self.depth() == 0 && self.config.ignore_root_level_whitespace => None,  // skip whitespace outside of the root element
+
+            // With trimming enabled, whitespace which would start the text is dropped
+            Token::Whitespace(_) if self.config.trim_whitespace && !self.buf_has_data() => None,
+
+            Token::Whitespace(c) => {
+                // The first buffered character marks the position of the future event
+                if !self.buf_has_data() {
+                    self.push_pos();
+                }
+                self.append_char_continue(c)
+            }
+
+            _ if t.contains_char_data() && self.depth() == 0 =>
+                Some(self_error!(self; "Unexpected characters outside the root element: {}", t)),
+
+            _ if t.contains_char_data() => {  // Non-whitespace char data
+                if !self.buf_has_data() {
+                    self.push_pos();
+                }
+                self.inside_whitespace = false;
+                t.push_to_string(&mut self.buf);
+                None
+            }
+
+            // NOTE(review): unlike the character data arm above, this arm does not call
+            // `push_pos()` when the buffer is empty and does not check `depth()` --- confirm
+            // that this is intended.
+            Token::ReferenceEnd => { // Semi-colon in a text outside an entity
+                self.inside_whitespace = false;
+                Token::ReferenceEnd.push_to_string(&mut self.buf);
+                None
+            }
+
+            // The buffer is not flushed here, so the text before and after the ignored
+            // comment stays in the same buffer
+            Token::CommentStart if self.config.coalesce_characters && self.config.ignore_comments => {
+                // We need to switch the lexer into a comment mode inside comments
+                self.lexer.inside_comment();
+                self.into_state_continue(State::InsideComment)
+            }
+
+            // The buffer is not flushed here either (presumably the CDATA content is
+            // appended to the same buffer)
+            Token::CDataStart if self.config.coalesce_characters && self.config.cdata_to_characters => {
+                if !self.buf_has_data() {
+                    self.push_pos();
+                }
+                // We need to disable lexing errors inside CDATA
+                self.lexer.disable_errors();
+                self.into_state_continue(State::InsideCData)
+            }
+
+            _ => {
+                // Encountered some markup event, flush the buffer as characters
+                // or a whitespace
+                let mut next_event = if self.buf_has_data() {
+                    let buf = self.take_buf();
+                    if self.inside_whitespace && self.config.trim_whitespace {
+                        // whitespace-only text is dropped entirely when trimming
+                        None
+                    } else if self.inside_whitespace && !self.config.whitespace_to_characters {
+                        Some(Ok(XmlEvent::Whitespace(buf)))
+                    } else if self.config.trim_whitespace {
+                        Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into())))
+                    } else {
+                        Some(Ok(XmlEvent::Characters(buf)))
+                    }
+                } else { None };
+                self.inside_whitespace = true;  // Reset inside_whitespace flag
+                // Position of the markup which starts here
+                self.push_pos();
+                match t {
+                    Token::ProcessingInstructionStart =>
+                        self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event),
+
+                    // A doctype is accepted only before the first element
+                    Token::DoctypeStart if !self.encountered_element => {
+                        // We don't have a doctype event so skip this position
+                        // FIXME: update when we have a doctype event
+                        self.next_pos();
+                        self.lexer.disable_errors();
+                        self.into_state(State::InsideDoctype, next_event)
+                    }
+
+                    Token::OpeningTagStart => {
+                        // If declaration was not parsed and we have encountered an element,
+                        // emit this declaration as the next event.
+                        if !self.parsed_declaration {
+                            self.parsed_declaration = true;
+                            let sd_event = XmlEvent::StartDocument {
+                                version: DEFAULT_VERSION,
+                                encoding: DEFAULT_ENCODING.into(),
+                                standalone: DEFAULT_STANDALONE
+                            };
+                            // next_event is always none here because we're outside of
+                            // the root element
+                            next_event = Some(Ok(sd_event));
+                            self.push_pos();
+                        }
+                        self.encountered_element = true;
+                        // A new namespace scope for the element which is being opened
+                        self.nst.push_empty();
+                        self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event)
+                    }
+
+                    // A closing tag is allowed only when some element is open
+                    Token::ClosingTagStart if self.depth() > 0 =>
+                        self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event),
+
+                    Token::CommentStart => {
+                        // We need to switch the lexer into a comment mode inside comments
+                        self.lexer.inside_comment();
+                        self.into_state(State::InsideComment, next_event)
+                    }
+
+                    Token::CDataStart => {
+                        // We need to disable lexing errors inside CDATA
+                        self.lexer.disable_errors();
+                        self.into_state(State::InsideCData, next_event)
+                    }
+
+                    _ => Some(self_error!(self; "Unexpected token: {}", t))
+                }
+            }
+        }
+    }
+}
diff --git a/src/util.rs b/src/util.rs

new file mode 100644 (file)

index 0000000..23fee04
--- /dev/null
+++ b/src/util.rs
@@ -0,0 +1,107 @@
+use std::io::{self, Read};
+use std::str;
+use std::fmt;
+
+/// An error which may be returned by `next_char_from()`.
+#[derive(Debug)]
+pub enum CharReadError {
+    /// The source ended in the middle of a code point.
+    UnexpectedEof,
+    /// The bytes which were read are not valid UTF-8.
+    Utf8(str::Utf8Error),
+    /// The underlying reader returned an error.
+    Io(io::Error)
+}
+
+/// Wraps a UTF-8 decoding error into `CharReadError::Utf8`.
+impl From<str::Utf8Error> for CharReadError {
+    fn from(e: str::Utf8Error) -> CharReadError {
+        CharReadError::Utf8(e)
+    }
+}
+
+/// Wraps an I/O error into `CharReadError::Io`.
+impl From<io::Error> for CharReadError {
+    fn from(e: io::Error) -> CharReadError {
+        CharReadError::Io(e)
+    }
+}
+
+impl fmt::Display for CharReadError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::CharReadError::*;
+        match *self {
+            UnexpectedEof => write!(f, "unexpected end of stream"),
+            Utf8(ref e) => write!(f, "UTF-8 decoding error: {}", e),
+            Io(ref e) => write!(f, "I/O error: {}", e)
+        }
+    }
+}
+
+pub fn next_char_from<R: Read>(source: &mut R) -> Result<Option<char>, CharReadError> {
+    const MAX_CODEPOINT_LEN: usize = 4;
+
+    let mut bytes = source.bytes();
+    let mut buf = [0u8; MAX_CODEPOINT_LEN];
+    let mut pos = 0;
+
+    loop {
+        let next = match bytes.next() {
+            Some(Ok(b)) => b,
+            Some(Err(e)) => return Err(e.into()),
+            None if pos == 0 => return Ok(None),
+            None => return Err(CharReadError::UnexpectedEof)
+        };
+        buf[pos] = next;
+        pos += 1;
+
+        match str::from_utf8(&buf[..pos]) {
+            Ok(s) => return Ok(s.chars().next()),  // always Some(..)
+            Err(_) if pos < MAX_CODEPOINT_LEN => {},
+            Err(e) => return Err(e.into())
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    // Checks `next_char_from()` on valid input of various encoded lengths, on empty input
+    // and on each of the three error kinds.
+    #[test]
+    fn test_next_char_from() {
+        use std::io;
+        use std::error::Error;
+
+        let mut bytes: &[u8] = "correct".as_bytes();    // correct ASCII
+        assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('c'));
+
+        let mut bytes: &[u8] = "правильно".as_bytes();  // correct BMP
+        assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('п'));
+
+        let mut bytes: &[u8] = "😊".as_bytes();          // correct non-BMP
+        assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('😊'));
+
+        let mut bytes: &[u8] = b"";                     // empty
+        assert_eq!(super::next_char_from(&mut bytes).unwrap(), None);
+
+        let mut bytes: &[u8] = b"\xf0\x9f\x98";         // incomplete code point
+        match super::next_char_from(&mut bytes).unwrap_err() {
+            super::CharReadError::UnexpectedEof => {},
+            e => panic!("Unexpected result: {:?}", e)
+        };
+
+        let mut bytes: &[u8] = b"\xff\x9f\x98\x32";     // invalid code point
+        match super::next_char_from(&mut bytes).unwrap_err() {
+            super::CharReadError::Utf8(_) => {},
+            e => panic!("Unexpected result: {:?}", e)
+        };
+
+
+        // error during read
+        // (a reader whose every `read()` call fails with the same error)
+        struct ErrorReader;
+        impl io::Read for ErrorReader {
+            fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+                Err(io::Error::new(io::ErrorKind::Other, "test error"))
+            }
+        }
+
+        let mut r = ErrorReader;
+        match super::next_char_from(&mut r).unwrap_err() {
+            super::CharReadError::Io(ref e) if e.kind() == io::ErrorKind::Other &&
+                                               e.description() == "test error" => {},
+            e => panic!("Unexpected result: {:?}", e)
+        }
+    }
+}
diff --git a/src/writer/config.rs b/src/writer/config.rs

new file mode 100644 (file)

index 0000000..ebabf18
--- /dev/null
+++ b/src/writer/config.rs
@@ -0,0 +1,157 @@
+//! Contains emitter configuration structure.
+
+use std::io::Write;
+use std::borrow::Cow;
+
+use writer::EventWriter;
+
+/// Emitter configuration structure.
+///
+/// This structure contains various options which control XML document emitter behavior.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct EmitterConfig {
+    /// Line separator used to separate lines in formatted output. Default is `"\n"`.
+    pub line_separator: Cow<'static, str>,
+
+    /// A string which will be used for a single level of indentation. Default is `"  "`
+    /// (two spaces).
+    pub indent_string: Cow<'static, str>,
+
+    /// Whether or not the emitted document should be indented. Default is false.
+    ///
+    /// The emitter is capable of performing automatic indentation of the emitted XML document.
+    /// It is done in stream-like fashion and does not require the knowledge of the whole
+    /// document in advance.
+    ///
+    /// Sometimes, however, automatic indentation is undesirable, e.g. when you want to keep
+    /// existing layout when processing an existing XML document. Also the indentation algorithm
+    /// is not thoroughly tested. Hence by default it is disabled.
+    pub perform_indent: bool,
+
+    /// Whether or not characters in output events will be escaped. Default is true.
+    ///
+    /// The emitter can automatically escape characters which can't appear in PCDATA sections
+    /// or element attributes of an XML document, like `<` or `"` (in attributes). This may
+    /// introduce some overhead because then every corresponding piece of character data
+    /// should be scanned for invalid characters.
+    ///
+    /// If this option is disabled, the XML writer may produce non-well-formed documents, so
+    /// use `false` value for this option with care.
+    pub perform_escaping: bool,
+
+    /// Whether or not to write XML document declaration at the beginning of a document.
+    /// Default is true.
+    ///
+    /// This option controls whether the document declaration should be emitted automatically
+    /// before a root element is written if it was not emitted explicitly by the user.
+    pub write_document_declaration: bool,
+
+    /// Whether or not to convert elements with empty content to empty elements. Default is true.
+    ///
+    /// This option allows turning elements like `<a></a>` (an element with empty content)
+    /// into `<a />` (an empty element).
+    pub normalize_empty_elements: bool,
+
+    /// Whether or not to emit CDATA events as plain characters. Default is false.
+    ///
+    /// This option forces the emitter to convert CDATA events into regular character events,
+    /// performing all the necessary escaping beforehand. This may be occasionally useful
+    /// for feeding the document into incorrect parsers which do not support CDATA.
+    pub cdata_to_characters: bool,
+
+    /// Whether or not to keep element names to support `EndElement` events without explicit names.
+    /// Default is true.
+    ///
+    /// This option makes the emitter keep names of written elements in order to allow
+    /// omitting names when writing closing element tags. This could incur some memory overhead.
+    pub keep_element_names_stack: bool,
+
+    /// Whether or not to automatically insert leading and trailing spaces in emitted comments,
+    /// if necessary. Default is true.
+    ///
+    /// This is a convenience option in order for the user not to append spaces before and after
+    /// comments text in order to get more pretty comments: `<!-- something -->` instead of
+    /// `<!--something-->`.
+    pub autopad_comments: bool,
+
+    /// Whether or not to automatically insert spaces before the trailing `/>` in self-closing
+    /// elements. Default is true.
+    ///
+    /// This option is only meaningful if `normalize_empty_elements` is true. For example, the
+    /// element `<a></a>` would be unaffected. When `normalize_empty_elements` is true, then when
+    /// this option is also true, the same element would appear `<a />`. If this option is false,
+    /// then the same element would appear `<a/>`.
+    pub pad_self_closing: bool,
+}
+
+impl EmitterConfig {
+    /// Creates an emitter configuration with default values.
+    ///
+    /// The defaults are: `"\n"` as the line separator, two spaces as the indentation string,
+    /// indentation and CDATA-to-characters conversion disabled, every other option enabled.
+    ///
+    /// You can tweak default options with builder-like pattern:
+    ///
+    /// ```rust
+    /// use xml::writer::EmitterConfig;
+    ///
+    /// let config = EmitterConfig::new()
+    ///     .line_separator("\r\n")
+    ///     .perform_indent(true)
+    ///     .normalize_empty_elements(false);
+    /// ```
+    #[inline]
+    pub fn new() -> EmitterConfig {
+        EmitterConfig {
+            line_separator: "\n".into(),
+            indent_string: "  ".into(),  // two spaces
+            perform_indent: false,
+            perform_escaping: true,
+            write_document_declaration: true,
+            normalize_empty_elements: true,
+            cdata_to_characters: false,
+            keep_element_names_stack: true,
+            autopad_comments: true,
+            pad_self_closing: true
+        }
+    }
+
+    /// Creates an XML writer with this configuration.
+    ///
+    /// The configuration is consumed by this method; `sink` is the `Write` instance
+    /// which is handed over to the new writer.
+    ///
+    /// This is a convenience method for configuring and creating a writer at the same time:
+    ///
+    /// ```rust
+    /// use xml::writer::EmitterConfig;
+    ///
+    /// let mut target: Vec<u8> = Vec::new();
+    ///
+    /// let writer = EmitterConfig::new()
+    ///     .line_separator("\r\n")
+    ///     .perform_indent(true)
+    ///     .normalize_empty_elements(false)
+    ///     .create_writer(&mut target);
+    /// ```
+    ///
+    /// This method is exactly equivalent to calling `EventWriter::new_with_config()` with
+    /// this configuration object.
+    #[inline]
+    pub fn create_writer<W: Write>(self, sink: W) -> EventWriter<W> {
+        EventWriter::new_with_config(sink, self)
+    }
+}
+
+impl Default for EmitterConfig {
+    /// Returns the same configuration as `EmitterConfig::new()`.
+    #[inline]
+    fn default() -> EmitterConfig {
+        EmitterConfig::new()
+    }
+}
+
+gen_setters!(EmitterConfig,
+    line_separator: into Cow<'static, str>,
+    indent_string: into Cow<'static, str>,
+    perform_indent: val bool,
+    write_document_declaration: val bool,
+    normalize_empty_elements: val bool,
+    cdata_to_characters: val bool,
+    keep_element_names_stack: val bool,
+    autopad_comments: val bool,
+    pad_self_closing: val bool
+);
diff --git a/src/writer/emitter.rs b/src/writer/emitter.rs

new file mode 100644 (file)

index 0000000..ba80f66
--- /dev/null
+++ b/src/writer/emitter.rs
@@ -0,0 +1,447 @@
+use std::io;
+use std::io::prelude::*;
+use std::fmt;
+use std::result;
+use std::borrow::Cow;
+use std::error::Error;
+
+use common;
+use name::{Name, OwnedName};
+use attribute::Attribute;
+use escape::{escape_str_attribute, escape_str_pcdata};
+use common::XmlVersion;
+use namespace::{NamespaceStack, NS_NO_PREFIX, NS_EMPTY_URI, NS_XMLNS_PREFIX, NS_XML_PREFIX};
+
+use writer::config::EmitterConfig;
+
+/// An error which may be returned by `EventWriter` when writing XML events.
+#[derive(Debug)]
+pub enum EmitterError {
+    /// An I/O error occurred in the underlying `Write` instance.
+    Io(io::Error),
+
+    /// Document declaration has already been written to the output stream.
+    DocumentStartAlreadyEmitted,
+
+    /// The name of the last opening element is not available.
+    LastElementNameNotAvailable,
+
+    /// The name of the last opening element is not equal to the name of the provided
+    /// closing element.
+    EndElementNameIsNotEqualToLastStartElementName,
+
+    /// End element name is not specified when it is needed, for example, when automatic
+    /// closing is not enabled in configuration.
+    EndElementNameIsNotSpecified
+}
+
+/// Wraps an I/O error in `EmitterError::Io`, which lets the emitter use `?` directly
+/// on the results of write operations.
+impl From<io::Error> for EmitterError {
+    fn from(err: io::Error) -> EmitterError {
+        EmitterError::Io(err)
+    }
+}
+
+impl fmt::Display for EmitterError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+
+        write!(f, "emitter error: ")?;
+        match *self {
+            EmitterError::Io(ref e) =>
+                write!(f, "I/O error: {}", e),
+            ref other =>
+                write!(f, "{}", other.description()),
+        }
+    }
+}
+
+impl Error for EmitterError {
+    fn description(&self) -> &str {
+        match *self {
+            EmitterError::Io(_) =>
+                "I/O error",
+            EmitterError::DocumentStartAlreadyEmitted =>
+                "document start event has already been emitted",
+            EmitterError::LastElementNameNotAvailable =>
+                "last element name is not available",
+            EmitterError::EndElementNameIsNotEqualToLastStartElementName =>
+                "end element name is not equal to last start element name",
+            EmitterError::EndElementNameIsNotSpecified =>
+                "end element name is not specified and can't be inferred",
+        }
+    }
+}
+
+/// A result type yielded by the emitter and by `EventWriter`; the error type is always
+/// `EmitterError`.
+pub type Result<T> = result::Result<T, EmitterError>;
+
+// TODO: split into a low-level fast writer without any checks and formatting logic and a
+// high-level indenting validating writer
+/// Stateful XML emitter: turns individual "emit" calls into bytes written to a
+/// caller-supplied `Write` target, tracking just enough state to indent the output and
+/// to close elements correctly.
+pub struct Emitter {
+    // Options controlling escaping, indentation, empty-element normalization, etc.
+    config: EmitterConfig,
+
+    // Namespace stack; its top frame is written out as `xmlns` attributes of a start tag.
+    nst: NamespaceStack,
+
+    // Current element nesting depth; incremented after a start tag, decremented after
+    // an end tag.
+    indent_level: usize,
+    // One entry per open nesting level (plus one for the document level) recording what
+    // was written last at that level.
+    indent_stack: Vec<IndentFlags>,
+
+    // Names of the currently open elements; only maintained when
+    // `config.keep_element_names_stack` is set.
+    element_names: Vec<OwnedName>,
+
+    // Set once the XML declaration has been emitted (or its emission was attempted).
+    start_document_emitted: bool,
+    // True right after a start tag was written and before anything else follows; used
+    // to decide between `>` and a self-closing terminator.
+    just_wrote_start_element: bool
+}
+
+impl Emitter {
+    pub fn new(config: EmitterConfig) -> Emitter {
+        Emitter {
+            config,
+
+            nst: NamespaceStack::empty(),
+
+            indent_level: 0,
+            indent_stack: vec![IndentFlags::WroteNothing],
+
+            element_names: Vec::new(),
+
+            start_document_emitted: false,
+            just_wrote_start_element: false
+        }
+    }
+}
+
+/// What was written most recently at one nesting level; consulted when deciding
+/// whether a line break and indentation may be inserted.
+#[derive(Copy, Clone, Eq, PartialEq, Debug)]
+enum IndentFlags {
+    /// Nothing has been written at this level yet (the state of a freshly pushed level).
+    WroteNothing,
+    /// The last thing written at this level was markup.
+    WroteMarkup,
+    /// The last thing written at this level was text (characters or CDATA).
+    WroteText,
+}
+
+impl Emitter {
+    /// Returns a mutable reference to the emitter's namespace stack, so that the caller
+    /// can push or pop namespace frames around element events.
+    #[inline]
+    pub fn namespace_stack_mut(&mut self) -> &mut NamespaceStack {
+        &mut self.nst
+    }
+
+    #[inline]
+    fn wrote_text(&self) -> bool {
+        *self.indent_stack.last().unwrap() == IndentFlags::WroteText
+    }
+
+    #[inline]
+    fn wrote_markup(&self) -> bool {
+        *self.indent_stack.last().unwrap() == IndentFlags::WroteMarkup
+    }
+
+    #[inline]
+    fn set_wrote_text(&mut self) {
+        *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteText;
+    }
+
+    #[inline]
+    fn set_wrote_markup(&mut self) {
+        *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteMarkup;
+    }
+
+    #[inline]
+    fn reset_state(&mut self) {
+        *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteNothing;
+    }
+
+    fn write_newline<W: Write>(&mut self, target: &mut W, level: usize) -> Result<()> {
+        target.write_all(self.config.line_separator.as_bytes())?;
+        for _ in 0..level {
+            target.write_all(self.config.indent_string.as_bytes())?;
+        }
+        Ok(())
+    }
+
+    fn before_markup<W: Write>(&mut self, target: &mut W) -> Result<()> {
+        if self.config.perform_indent && !self.wrote_text() &&
+           (self.indent_level > 0 || self.wrote_markup()) {
+            let indent_level = self.indent_level;
+            self.write_newline(target, indent_level)?;
+            if self.indent_level > 0 && self.config.indent_string.len() > 0 {
+                self.after_markup();
+            }
+        }
+        Ok(())
+    }
+
+    /// Records that a markup item has just been written at the current nesting level.
+    fn after_markup(&mut self) {
+        self.set_wrote_markup();
+    }
+
+    /// Prepares for writing a start tag: applies the usual pre-markup indentation and
+    /// then opens a fresh indent-state entry for the element's content.
+    fn before_start_element<W: Write>(&mut self, target: &mut W) -> Result<()> {
+        self.before_markup(target)?;
+        // The new entry describes the *inside* of the element that is being opened.
+        self.indent_stack.push(IndentFlags::WroteNothing);
+        Ok(())
+    }
+
+    /// Finishes start-tag bookkeeping: marks markup as written and enters one more
+    /// nesting level.
+    fn after_start_element(&mut self) {
+        self.after_markup();
+        self.indent_level += 1;
+    }
+
+    fn before_end_element<W: Write>(&mut self, target: &mut W) -> Result<()> {
+        if self.config.perform_indent && self.indent_level > 0 && self.wrote_markup() &&
+           !self.wrote_text() {
+            let indent_level = self.indent_level;
+            self.write_newline(target, indent_level - 1)
+        } else {
+            Ok(())
+        }
+    }
+
+    fn after_end_element(&mut self) {
+        if self.indent_level > 0 {
+            self.indent_level -= 1;
+            self.indent_stack.pop();
+        }
+        self.set_wrote_markup();
+    }
+
+    /// Records that text has just been written at the current nesting level.
+    fn after_text(&mut self) {
+        self.set_wrote_text();
+    }
+
+    pub fn emit_start_document<W: Write>(&mut self, target: &mut W,
+                                         version: XmlVersion,
+                                         encoding: &str,
+                                         standalone: Option<bool>) -> Result<()> {
+        if self.start_document_emitted {
+            return Err(EmitterError::DocumentStartAlreadyEmitted);
+        }
+        self.start_document_emitted = true;
+
+        self.before_markup(target)?;
+        let result = {
+            let mut write = move || {
+                write!(target, "<?xml version=\"{}\" encoding=\"{}\"", version, encoding)?;
+
+                if let Some(standalone) = standalone {
+                    write!(target, " standalone=\"{}\"", if standalone { "yes" } else { "no" })?;
+                }
+
+                write!(target, "?>")?;
+
+                Ok(())
+            };
+            write()
+        };
+        self.after_markup();
+
+        result
+    }
+
+    fn check_document_started<W: Write>(&mut self, target: &mut W) -> Result<()> {
+        if !self.start_document_emitted && self.config.write_document_declaration {
+            self.emit_start_document(target, common::XmlVersion::Version10, "utf-8", None)
+        } else {
+            Ok(())
+        }
+    }
+
+    fn fix_non_empty_element<W: Write>(&mut self, target: &mut W) -> Result<()> {
+        if self.config.normalize_empty_elements && self.just_wrote_start_element {
+            self.just_wrote_start_element = false;
+            target.write_all(b">").map_err(From::from)
+        } else {
+            Ok(())
+        }
+    }
+
+    pub fn emit_processing_instruction<W: Write>(&mut self,
+                                                 target: &mut W,
+                                                 name: &str,
+                                                 data: Option<&str>) -> Result<()> {
+        self.check_document_started(target)?;
+        self.fix_non_empty_element(target)?;
+
+        self.before_markup(target)?;
+
+        let result = {
+            let mut write = || {
+                write!(target, "<?{}", name)?;
+
+                if let Some(data) = data {
+                    write!(target, " {}", data)?;
+                }
+
+                write!(target, "?>")?;
+
+                Ok(())
+            };
+            write()
+        };
+
+        self.after_markup();
+
+        result
+    }
+
+    /// Writes the beginning of a start tag: `<name`, the namespace declarations from
+    /// the top of the namespace stack and the attributes. The terminating `>` is not
+    /// written here; the caller decides how the tag is closed.
+    fn emit_start_element_initial<W>(&mut self, target: &mut W,
+                                     name: Name,
+                                     attributes: &[Attribute]) -> Result<()>
+        where W: Write
+    {
+        // Emit a default declaration first if the document has not been started.
+        self.check_document_started(target)?;
+        // Close the parent's start tag if it is still pending.
+        self.fix_non_empty_element(target)?;
+        self.before_start_element(target)?;
+        write!(target, "<{}", name.repr_display())?;
+        self.emit_current_namespace_attributes(target)?;
+        self.emit_attributes(target, attributes)?;
+        self.after_start_element();
+        Ok(())
+    }
+
+    pub fn emit_start_element<W>(&mut self, target: &mut W,
+                                 name: Name,
+                                 attributes: &[Attribute]) -> Result<()>
+        where W: Write
+    {
+        if self.config.keep_element_names_stack {
+            self.element_names.push(name.to_owned());
+        }
+
+        self.emit_start_element_initial(target, name, attributes)?;
+        self.just_wrote_start_element = true;
+
+        if !self.config.normalize_empty_elements {
+            write!(target, ">")?;
+        }
+
+        Ok(())
+    }
+
+    pub fn emit_current_namespace_attributes<W>(&mut self, target: &mut W) -> Result<()>
+        where W: Write
+    {
+        for (prefix, uri) in self.nst.peek() {
+            match prefix {
+                // internal namespaces are not emitted
+                NS_XMLNS_PREFIX | NS_XML_PREFIX => Ok(()),
+                //// there is already a namespace binding with this prefix in scope
+                //prefix if self.nst.get(prefix) == Some(uri) => Ok(()),
+                // emit xmlns only if it is overridden
+                NS_NO_PREFIX => if uri != NS_EMPTY_URI {
+                    write!(target, " xmlns=\"{}\"", uri)
+                } else { Ok(()) },
+                // everything else
+                prefix => write!(target, " xmlns:{}=\"{}\"", prefix, uri)
+            }?;
+        }
+        Ok(())
+    }
+
+    pub fn emit_attributes<W: Write>(&mut self, target: &mut W,
+                                      attributes: &[Attribute]) -> Result<()> {
+        for attr in attributes.iter() {
+            write!(
+                target, " {}=\"{}\"",
+                attr.name.repr_display(),
+                if self.config.perform_escaping { escape_str_attribute(attr.value) } else { Cow::Borrowed(attr.value) }
+            )?
+        }
+        Ok(())
+    }
+
+    pub fn emit_end_element<W: Write>(&mut self, target: &mut W,
+                                      name: Option<Name>) -> Result<()> {
+        let owned_name = if self.config.keep_element_names_stack {
+            Some(self.element_names.pop().ok_or(EmitterError::LastElementNameNotAvailable)?)
+        } else {
+            None
+        };
+
+        // Check that last started element name equals to the provided name, if there are both
+        if let Some(ref last_name) = owned_name {
+            if let Some(ref name) = name {
+                if last_name.borrow() != *name {
+                    return Err(EmitterError::EndElementNameIsNotEqualToLastStartElementName);
+                }
+            }
+        }
+
+        if let Some(name) = owned_name.as_ref().map(|n| n.borrow()).or(name) {
+            if self.config.normalize_empty_elements && self.just_wrote_start_element {
+                self.just_wrote_start_element = false;
+                let termination = if self.config.pad_self_closing { " />" } else { "/>" };
+                let result = target.write_all(termination.as_bytes()).map_err(From::from);
+                self.after_end_element();
+                result
+            } else {
+                self.just_wrote_start_element = false;
+
+                self.before_end_element(target)?;
+                let result = write!(target, "</{}>", name.repr_display()).map_err(From::from);
+                self.after_end_element();
+
+                result
+            }
+        } else {
+            Err(EmitterError::EndElementNameIsNotSpecified)
+        }
+    }
+
+    pub fn emit_cdata<W: Write>(&mut self, target: &mut W, content: &str) -> Result<()> {
+        self.fix_non_empty_element(target)?;
+        if self.config.cdata_to_characters {
+            self.emit_characters(target, content)
+        } else {
+            // TODO: escape ']]>' characters in CDATA as two adjacent CDATA blocks
+            target.write_all(b"<![CDATA[")?;
+            target.write_all(content.as_bytes())?;
+            target.write_all(b"]]>")?;
+
+            self.after_text();
+
+            Ok(())
+        }
+    }
+
+    pub fn emit_characters<W: Write>(&mut self, target: &mut W,
+                                      content: &str) -> Result<()> {
+        self.check_document_started(target)?;
+        self.fix_non_empty_element(target)?;
+        target.write_all(
+            (if self.config.perform_escaping {
+                escape_str_pcdata(content)
+            } else {
+                Cow::Borrowed(content)
+            }).as_bytes()
+        )?;
+        self.after_text();
+        Ok(())
+    }
+
+    pub fn emit_comment<W: Write>(&mut self, target: &mut W, content: &str) -> Result<()> {
+        self.fix_non_empty_element(target)?;
+
+        // TODO: add escaping dashes at the end of the comment
+
+        let autopad_comments = self.config.autopad_comments;
+        let write = |target: &mut W| -> Result<()> {
+            target.write_all(b"<!--")?;
+
+            if autopad_comments && !content.starts_with(char::is_whitespace) {
+                target.write_all(b" ")?;
+            }
+
+            target.write_all(content.as_bytes())?;
+
+            if autopad_comments && !content.ends_with(char::is_whitespace) {
+                target.write_all(b" ")?;
+            }
+
+            target.write_all(b"-->")?;
+
+            Ok(())
+        };
+
+        self.before_markup(target)?;
+        let result = write(target);
+        self.after_markup();
+
+        result
+    }
+}
diff --git a/src/writer/events.rs b/src/writer/events.rs

new file mode 100644 (file)

index 0000000..1f7040f
--- /dev/null
+++ b/src/writer/events.rs
@@ -0,0 +1,241 @@
+//! Contains `XmlEvent` datatype, instances of which are consumed by the writer.
+
+use std::borrow::Cow;
+
+use name::Name;
+use attribute::Attribute;
+use common::XmlVersion;
+use namespace::{Namespace, NS_NO_PREFIX};
+
+/// A part of an XML output stream.
+///
+/// Objects of this enum are consumed by `EventWriter`. They correspond to different parts of
+/// an XML document.
+#[derive(Debug)]
+pub enum XmlEvent<'a> {
+    /// Corresponds to XML document declaration.
+    ///
+    /// This event should always be written before any other event. If it is not written
+    /// at all, a default XML declaration will be outputted if the corresponding option
+    /// is set in the configuration. Otherwise no declaration is written.
+    StartDocument {
+        /// XML version.
+        version: XmlVersion,
+
+        /// XML document encoding.
+        ///
+        /// If `None`, the writer uses `"UTF-8"`.
+        encoding: Option<&'a str>,
+
+        /// XML standalone declaration.
+        ///
+        /// If `None`, the `standalone` pseudo-attribute is not written.
+        standalone: Option<bool>
+    },
+
+    /// Denotes an XML processing instruction.
+    ProcessingInstruction {
+        /// Processing instruction target.
+        name: &'a str,
+
+        /// Processing instruction content.
+        data: Option<&'a str>
+    },
+
+    /// Denotes a beginning of an XML element.
+    StartElement {
+        /// Qualified name of the element.
+        name: Name<'a>,
+
+        /// A list of attributes associated with the element.
+        ///
+        /// Currently attributes are not checked for duplicates (TODO). Attribute values
+        /// will be escaped if the `perform_escaping` option is enabled, that is, characters
+        /// invalid for attribute values like `"` or `<` will be changed into character
+        /// entities.
+        attributes: Cow<'a, [Attribute<'a>]>,
+
+        /// Contents of the namespace mapping at this point of the document.
+        ///
+        /// This mapping will be inspected for "new" entries, and if at this point of the document
+        /// a particular pair of prefix and namespace URI is already defined, no namespace
+        /// attributes will be emitted.
+        namespace: Cow<'a, Namespace>,
+    },
+
+    /// Denotes an end of an XML element.
+    EndElement {
+        /// Optional qualified name of the element.
+        ///
+        /// If `None`, then it is assumed that the element name should be the last valid one.
+        /// If `Some` and element names tracking is enabled, then the writer will check it for
+        /// correctness.
+        name: Option<Name<'a>>
+    },
+
+    /// Denotes CDATA content.
+    ///
+    /// This event contains unparsed data, and no escaping will be performed when writing it
+    /// to the output stream as a CDATA section. If the `cdata_to_characters` option is
+    /// enabled, the data is written as regular character data instead.
+    CData(&'a str),
+
+    /// Denotes a comment.
+    ///
+    /// NOTE(review): the emitter writes the comment text as is (apart from optional
+    /// padding); checking for sequences which are invalid inside comments is still a
+    /// TODO there, so the caller has to make sure that the text is valid.
+    Comment(&'a str),
+
+    /// Denotes character data outside of tags.
+    ///
+    /// Contents of this event will be escaped if `perform_escaping` option is enabled,
+    /// that is, every character invalid for PCDATA will appear as a character entity.
+    Characters(&'a str)
+}
+
+impl<'a> XmlEvent<'a> {
+    /// Returns an writer event for a processing instruction.
+    #[inline]
+    pub fn processing_instruction(name: &'a str, data: Option<&'a str>) -> XmlEvent<'a> {
+        XmlEvent::ProcessingInstruction { name: name, data: data }
+    }
+
+    /// Returns a builder for a starting element.
+    ///
+    /// This builder can then be used to tweak attributes and namespace starting at
+    /// this element.
+    #[inline]
+    pub fn start_element<S>(name: S) -> StartElementBuilder<'a> where S: Into<Name<'a>> {
+        StartElementBuilder {
+            name: name.into(),
+            attributes: Vec::new(),
+            namespace: Namespace::empty().into()
+        }
+    }
+
+    /// Returns a builder for an closing element.
+    ///
+    /// This method, unline `start_element()`, does not accept a name because by default
+    /// the writer is able to determine it automatically. However, when this functionality
+    /// is disabled, it is possible to specify the name with `name()` method on the builder.
+    #[inline]
+    pub fn end_element() -> EndElementBuilder<'a> {
+        EndElementBuilder { name: None }
+    }
+
+    /// Returns a CDATA event.
+    ///
+    /// Naturally, the provided string won't be escaped, except for closing CDATA token `]]>`
+    /// (depending on the configuration).
+    #[inline]
+    pub fn cdata(data: &'a str) -> XmlEvent<'a> { XmlEvent::CData(data) }
+
+    /// Returns a regular characters (PCDATA) event.
+    ///
+    /// All offending symbols, in particular, `&` and `<`, will be escaped by the writer.
+    #[inline]
+    pub fn characters(data: &'a str) -> XmlEvent<'a> { XmlEvent::Characters(data) }
+
+    /// Returns a comment event.
+    #[inline]
+    pub fn comment(data: &'a str) -> XmlEvent<'a> { XmlEvent::Comment(data) }
+}
+
+/// A plain string slice converts into a `Characters` event, so that text can be passed
+/// to the writer directly.
+impl<'a> From<&'a str> for XmlEvent<'a> {
+    #[inline]
+    fn from(s: &'a str) -> XmlEvent<'a> { XmlEvent::Characters(s) }
+}
+
+/// A builder for a closing element event.
+pub struct EndElementBuilder<'a> {
+    // Explicit element name, if one was set through `name()`.
+    name: Option<Name<'a>>
+}
+
+impl<'a> EndElementBuilder<'a> {
+    /// Sets the name of this closing element.
+    ///
+    /// Usually the writer is able to determine closing element names automatically. If
+    /// this functionality is enabled (by default it is), then this name is checked for correctness.
+    /// It is possible, however, to disable such behavior; then the user must ensure that
+    /// closing element name is correct manually.
+    #[inline]
+    pub fn name<N>(mut self, name: N) -> EndElementBuilder<'a> where N: Into<Name<'a>> {
+        self.name = Some(name.into());
+        self
+    }
+}
+
+/// Finishes the builder, producing an `EndElement` event which carries the name set on
+/// the builder, if any.
+impl<'a> From<EndElementBuilder<'a>> for XmlEvent<'a> {
+    fn from(b: EndElementBuilder<'a>) -> XmlEvent<'a> {
+        XmlEvent::EndElement { name: b.name }
+    }
+}
+
+/// A builder for a starting element event.
+pub struct StartElementBuilder<'a> {
+    // Qualified name of the element.
+    name: Name<'a>,
+    // Attributes in the order in which they were added.
+    attributes: Vec<Attribute<'a>>,
+    // Namespace mappings declared on this element.
+    namespace: Namespace
+}
+
+impl<'a> StartElementBuilder<'a> {
+    /// Sets an attribute value of this element to the given string.
+    ///
+    /// This method can be used to add attributes to the starting element. Name is a qualified
+    /// name; its namespace is ignored, but its prefix is checked for correctness, that is,
+    /// it is checked that the prefix is bound to some namespace in the current context.
+    ///
+    /// Currently attributes are not checked for duplicates. Note that duplicate attributes
+    /// are a violation of XML document well-formedness.
+    ///
+    /// The writer checks that you don't specify reserved prefix names, for example `xmlns`.
+    #[inline]
+    pub fn attr<N>(mut self, name: N, value: &'a str) -> StartElementBuilder<'a>
+        where N: Into<Name<'a>>
+    {
+        self.attributes.push(Attribute::new(name.into(), value));
+        self
+    }
+
+    /// Adds a namespace to the current namespace context.
+    ///
+    /// If no namespace URI was bound to the provided prefix at this point of the document,
+    /// then the mapping from the prefix to the provided namespace URI will be written as
+    /// a part of this element attribute set.
+    ///
+    /// If the same namespace URI was bound to the provided prefix at this point of the document,
+    /// then no namespace attributes will be emitted.
+    ///
+    /// If some other namespace URI was bound to the provided prefix at this point of the document,
+    /// then another binding will be added as a part of this element attribute set, shadowing
+    /// the outer binding.
+    #[inline]
+    pub fn ns<S1, S2>(mut self, prefix: S1, uri: S2) -> StartElementBuilder<'a>
+        where S1: Into<String>, S2: Into<String>
+    {
+        self.namespace.put(prefix, uri);
+        self
+    }
+
+    /// Adds a default namespace mapping to the current namespace context.
+    ///
+    /// Same rules as for `ns()` are also valid for the default namespace mapping.
+    #[inline]
+    pub fn default_ns<S>(mut self, uri: S) -> StartElementBuilder<'a>
+        where S: Into<String>
+    {
+        self.namespace.put(NS_NO_PREFIX, uri);
+        self
+    }
+}
+
+impl<'a> From<StartElementBuilder<'a>> for XmlEvent<'a> {
+    #[inline]
+    fn from(b: StartElementBuilder<'a>) -> XmlEvent<'a> {
+        XmlEvent::StartElement {
+            name: b.name,
+            attributes: Cow::Owned(b.attributes),
+            namespace: Cow::Owned(b.namespace)
+        }
+    }
+}
diff --git a/src/writer/mod.rs b/src/writer/mod.rs

new file mode 100644 (file)

index 0000000..ea1b242
--- /dev/null
+++ b/src/writer/mod.rs
@@ -0,0 +1,93 @@
+//! Contains high-level interface for an events-based XML emitter.
+//!
+//! The most important type in this module is `EventWriter` which allows writing an XML document
+//! to some output stream.
+
+pub use self::emitter::Result;
+pub use self::emitter::EmitterError as Error;
+pub use self::config::EmitterConfig;
+pub use self::events::XmlEvent;
+
+use self::emitter::Emitter;
+
+use std::io::prelude::*;
+
+mod emitter;
+mod config;
+pub mod events;
+
+/// A wrapper around an `std::io::Write` instance which emits XML document according to provided
+/// events.
+pub struct EventWriter<W> {
+    // Destination of the generated XML.
+    sink: W,
+    // Stateful emitter which turns events into output written to `sink`.
+    emitter: Emitter
+}
+
+impl<W: Write> EventWriter<W> {
+    /// Creates a new `EventWriter` out of an `std::io::Write` instance using the default
+    /// configuration.
+    #[inline]
+    pub fn new(sink: W) -> EventWriter<W> {
+        EventWriter::new_with_config(sink, EmitterConfig::new())
+    }
+
+    /// Creates a new `EventWriter` out of an `std::io::Write` instance using the provided
+    /// configuration.
+    #[inline]
+    pub fn new_with_config(sink: W, config: EmitterConfig) -> EventWriter<W> {
+        EventWriter {
+            sink,
+            emitter: Emitter::new(config)
+        }
+    }
+
+    /// Writes the next piece of XML document according to the provided event.
+    ///
+    /// Note that output data may not exactly correspond to the written event because
+    /// of various configuration options. For example, `XmlEvent::EndElement` may
+    /// correspond to a separate closing element or it may cause writing an empty element.
+    /// Another example is that `XmlEvent::CData` may be represented as characters in
+    /// the output stream.
+    pub fn write<'a, E>(&mut self, event: E) -> Result<()> where E: Into<XmlEvent<'a>> {
+        match event.into() {
+            XmlEvent::StartDocument { version, encoding, standalone } =>
+                self.emitter.emit_start_document(&mut self.sink, version, encoding.unwrap_or("UTF-8"), standalone),
+            XmlEvent::ProcessingInstruction { name, data } =>
+                self.emitter.emit_processing_instruction(&mut self.sink, name, data),
+            XmlEvent::StartElement { name, attributes, namespace } => {
+                self.emitter.namespace_stack_mut().push_empty().checked_target().extend(namespace.as_ref());
+                self.emitter.emit_start_element(&mut self.sink, name, &attributes)
+            }
+            XmlEvent::EndElement { name } => {
+                let r = self.emitter.emit_end_element(&mut self.sink, name);
+                self.emitter.namespace_stack_mut().try_pop();
+                r
+            }
+            XmlEvent::Comment(content) =>
+                self.emitter.emit_comment(&mut self.sink, content),
+            XmlEvent::CData(content) =>
+                self.emitter.emit_cdata(&mut self.sink, content),
+            XmlEvent::Characters(content) =>
+                self.emitter.emit_characters(&mut self.sink, content)
+        }
+    }
+
+    /// Returns a mutable reference to the underlying `Writer`.
+    ///
+    /// Note that having a reference to the underlying sink makes it very easy to emit invalid XML
+    /// documents. Use this method with care. Valid use cases for this method include accessing
+    /// methods like `Write::flush`, which do not emit new data but rather change the state
+    /// of the stream itself.
+    pub fn inner_mut(&mut self) -> &mut W {
+        &mut self.sink
+    }
+
+    /// Unwraps this `EventWriter`, returning the underlying writer.
+    ///
+    /// Note that this is a destructive operation: unwrapping a writer and then wrapping
+    /// it again with `EventWriter::new()` will create a fresh writer whose state will be
+    /// blank; for example, accumulated namespaces will be reset.
+    pub fn into_inner(self) -> W {
+        self.sink
+    }
+}
diff --git a/tests/documents/sample_1.xml b/tests/documents/sample_1.xml

new file mode 100644 (file)

index 0000000..4d1cbc0
--- /dev/null
+++ b/tests/documents/sample_1.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8" standalone="yes"?>
+<project name="project-name">
+    <libraries>
+        <library groupId="org.example" artifactId="&lt;name&gt;" version="0.1"/>
+        <library groupId="com.example" artifactId="&quot;cool-lib&amp;" version="999"/>
+    </libraries>
+    <module name="module-1">
+        <files>
+            <file name="somefile.java" type="java">
+                Some &lt;java&gt; class
+            </file>
+            <file name="another_file.java" type="java">
+                Another &quot;java&quot; class
+            </file>
+            <file name="config.xml" type="xml">
+                Weird &apos;XML&apos; config
+            </file>
+        </files>
+        <libraries>
+            <library groupId="junit" artifactId="junit" version="1.9.5"/>
+        </libraries>
+    </module>
+    <module name="module-2">
+        <files>
+            <file name="program.js" type="javascript">
+                JavaScript &amp; program
+            </file>
+            <file name="style.css" type="css">
+                Cascading style sheet: &#xA9; - &#1161;
+            </file>
+        </files>
+    </module>
+</project>
+
diff --git a/tests/documents/sample_1_full.txt b/tests/documents/sample_1_full.txt

new file mode 100644 (file)

index 0000000..a8d64d0
--- /dev/null
+++ b/tests/documents/sample_1_full.txt
@@ -0,0 +1,58 @@
+StartDocument(1.0, utf-8)
+StartElement(project [name="project-name"])
+Whitespace("\n    ")
+StartElement(libraries)
+Whitespace("\n        ")
+StartElement(library [groupId="org.example", artifactId="<name>", version="0.1"])
+EndElement(library)
+Whitespace("\n        ")
+StartElement(library [groupId="com.example", artifactId="\"cool-lib&", version="999"])
+EndElement(library)
+Whitespace("\n    ")
+EndElement(libraries)
+Whitespace("\n    ")
+StartElement(module [name="module-1"])
+Whitespace("\n        ")
+StartElement(files)
+Whitespace("\n            ")
+StartElement(file [name="somefile.java", type="java"])
+Characters("\n                Some <java> class\n            ")
+EndElement(file)
+Whitespace("\n            ")
+StartElement(file [name="another_file.java", type="java"])
+Characters("\n                Another \"java\" class\n            ")
+EndElement(file)
+Whitespace("\n            ")
+StartElement(file [name="config.xml", type="xml"])
+Characters("\n                Weird \'XML\' config\n            ")
+EndElement(file)
+Whitespace("\n        ")
+EndElement(files)
+Whitespace("\n        ")
+StartElement(libraries)
+Whitespace("\n            ")
+StartElement(library [groupId="junit", artifactId="junit", version="1.9.5"])
+EndElement(library)
+Whitespace("\n        ")
+EndElement(libraries)
+Whitespace("\n    ")
+EndElement(module)
+Whitespace("\n    ")
+StartElement(module [name="module-2"])
+Whitespace("\n        ")
+StartElement(files)
+Whitespace("\n            ")
+StartElement(file [name="program.js", type="javascript"])
+Characters("\n                JavaScript & program\n            ")
+EndElement(file)
+Whitespace("\n            ")
+StartElement(file [name="style.css", type="css"])
+Characters("\n                Cascading style sheet: © - ҉\n            ")
+EndElement(file)
+Whitespace("\n        ")
+EndElement(files)
+Whitespace("\n    ")
+EndElement(module)
+Whitespace("\n")
+EndElement(project)
+EndDocument
diff --git a/tests/documents/sample_1_short.txt b/tests/documents/sample_1_short.txt

new file mode 100644 (file)

index 0000000..4dbe285
--- /dev/null
+++ b/tests/documents/sample_1_short.txt
@@ -0,0 +1,37 @@
+StartDocument(1.0, utf-8)
+StartElement(project [name="project-name"])
+StartElement(libraries)
+StartElement(library [groupId="org.example", artifactId="<name>", version="0.1"])
+EndElement(library)
+StartElement(library [groupId="com.example", artifactId="\"cool-lib&", version="999"])
+EndElement(library)
+EndElement(libraries)
+StartElement(module [name="module-1"])
+StartElement(files)
+StartElement(file [name="somefile.java", type="java"])
+Characters("Some <java> class")
+EndElement(file)
+StartElement(file [name="another_file.java", type="java"])
+Characters("Another \"java\" class")
+EndElement(file)
+StartElement(file [name="config.xml", type="xml"])
+Characters("Weird \'XML\' config")
+EndElement(file)
+EndElement(files)
+StartElement(libraries)
+StartElement(library [groupId="junit", artifactId="junit", version="1.9.5"])
+EndElement(library)
+EndElement(libraries)
+EndElement(module)
+StartElement(module [name="module-2"])
+StartElement(files)
+StartElement(file [name="program.js", type="javascript"])
+Characters("JavaScript & program")
+EndElement(file)
+StartElement(file [name="style.css", type="css"])
+Characters("Cascading style sheet: © - ҉")
+EndElement(file)
+EndElement(files)
+EndElement(module)
+EndElement(project)
+EndDocument
diff --git a/tests/documents/sample_2.xml b/tests/documents/sample_2.xml

new file mode 100644 (file)

index 0000000..f9543ac
--- /dev/null
+++ b/tests/documents/sample_2.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<p:data xmlns:d="urn:example:double" xmlns:h="urn:example:header" xmlns:p="urn:example:namespace">
+  <p:datum id="34">
+    <p:name>Name</p:name>
+    <d:name>Another name</d:name>
+    <d:arg>0.3</d:arg>
+    <d:arg>0.2</d:arg>
+    <p:arg>0.1</p:arg>
+    <p:arg>0.01</p:arg>
+    <h:header name="Header-1">header 1 value</h:header>
+    <h:header name="Header-2">
+      Some bigger value
+    </h:header>
+  </p:datum>
+</p:data>
diff --git a/tests/documents/sample_2_full.txt b/tests/documents/sample_2_full.txt

new file mode 100644 (file)

index 0000000..75075cd
--- /dev/null
+++ b/tests/documents/sample_2_full.txt
@@ -0,0 +1,41 @@
+StartDocument(1.0, utf-8)
+StartElement({urn:example:namespace}p:data)
+Whitespace("\n  ")
+StartElement({urn:example:namespace}p:datum [id="34"])
+Whitespace("\n    ")
+StartElement({urn:example:namespace}p:name)
+Characters("Name")
+EndElement({urn:example:namespace}p:name)
+Whitespace("\n    ")
+StartElement({urn:example:double}d:name)
+Characters("Another name")
+EndElement({urn:example:double}d:name)
+Whitespace("\n    ")
+StartElement({urn:example:double}d:arg)
+Characters("0.3")
+EndElement({urn:example:double}d:arg)
+Whitespace("\n    ")
+StartElement({urn:example:double}d:arg)
+Characters("0.2")
+EndElement({urn:example:double}d:arg)
+Whitespace("\n    ")
+StartElement({urn:example:namespace}p:arg)
+Characters("0.1")
+EndElement({urn:example:namespace}p:arg)
+Whitespace("\n    ")
+StartElement({urn:example:namespace}p:arg)
+Characters("0.01")
+EndElement({urn:example:namespace}p:arg)
+Whitespace("\n    ")
+StartElement({urn:example:header}h:header [name="Header-1"])
+Characters("header 1 value")
+EndElement({urn:example:header}h:header)
+Whitespace("\n    ")
+StartElement({urn:example:header}h:header [name="Header-2"])
+Characters("\n      Some bigger value\n    ")
+EndElement({urn:example:header}h:header)
+Whitespace("\n  ")
+EndElement({urn:example:namespace}p:datum)
+Whitespace("\n")
+EndElement({urn:example:namespace}p:data)
+EndDocument
diff --git a/tests/documents/sample_2_short.txt b/tests/documents/sample_2_short.txt

new file mode 100644 (file)

index 0000000..2368025
--- /dev/null
+++ b/tests/documents/sample_2_short.txt
@@ -0,0 +1,30 @@
+StartDocument(1.0, utf-8)
+StartElement({urn:example:namespace}p:data)
+StartElement({urn:example:namespace}p:datum [id="34"])
+StartElement({urn:example:namespace}p:name)
+Characters("Name")
+EndElement({urn:example:namespace}p:name)
+StartElement({urn:example:double}d:name)
+Characters("Another name")
+EndElement({urn:example:double}d:name)
+StartElement({urn:example:double}d:arg)
+Characters("0.3")
+EndElement({urn:example:double}d:arg)
+StartElement({urn:example:double}d:arg)
+Characters("0.2")
+EndElement({urn:example:double}d:arg)
+StartElement({urn:example:namespace}p:arg)
+Characters("0.1")
+EndElement({urn:example:namespace}p:arg)
+StartElement({urn:example:namespace}p:arg)
+Characters("0.01")
+EndElement({urn:example:namespace}p:arg)
+StartElement({urn:example:header}h:header [name="Header-1"])
+Characters("header 1 value")
+EndElement({urn:example:header}h:header)
+StartElement({urn:example:header}h:header [name="Header-2"])
+Characters("Some bigger value")
+EndElement({urn:example:header}h:header)
+EndElement({urn:example:namespace}p:datum)
+EndElement({urn:example:namespace}p:data)
+EndDocument
diff --git a/tests/documents/sample_3.xml b/tests/documents/sample_3.xml

new file mode 100644 (file)

index 0000000..657e37d
--- /dev/null
+++ b/tests/documents/sample_3.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="utf-8"?>
+<p:data xmlns:p="urn:x" z=">">
+    <!-- abcd &lt; &gt; &amp; -->
+    <a>test</a>
+    <b>kkss" = ddd' ></b>
+    <![CDATA[
+            <a>ddddd</b>!e3--><!-- ddckx
+    ]]>
+    <c/>
+    <![CDATA[
+    <![CDATA[zzzz]]]]><![CDATA[>]]>
+</p:data>
+
diff --git a/tests/documents/sample_3_full.txt b/tests/documents/sample_3_full.txt

new file mode 100644 (file)

index 0000000..e9a0f7e
--- /dev/null
+++ b/tests/documents/sample_3_full.txt
@@ -0,0 +1,23 @@
+1:1 StartDocument(1.0, utf-8)
+2:1 StartElement({urn:x}p:data [z=">"])
+2:31 Whitespace("\n    ")
+3:5 Comment(" abcd &lt; &gt; &amp; ")
+3:34 Whitespace("\n    ")
+4:5 StartElement(a)
+4:8 Characters("test")
+4:12 EndElement(a)
+4:16 Whitespace("\n    ")
+5:5 StartElement(b)
+5:8 Characters("kkss\" = ddd\' >")
+5:22 EndElement(b)
+5:26 Whitespace("\n    ")
+6:5 CData("\n            <a>ddddd</b>!e3--><!-- ddckx\n    ")
+8:8 Characters("\n    ")
+9:5 StartElement(c)
+9:5 EndElement(c)
+9:9 Whitespace("\n    ")
+10:5 CData("\n    <![CDATA[zzzz]]")
+11:23 CData(">")
+11:36 Characters("\n")
+12:1 EndElement({urn:x}p:data)
+14:1 EndDocument
diff --git a/tests/documents/sample_3_short.txt b/tests/documents/sample_3_short.txt

new file mode 100644 (file)

index 0000000..2582f33
--- /dev/null
+++ b/tests/documents/sample_3_short.txt
@@ -0,0 +1,14 @@
+1:1 StartDocument(1.0, utf-8)
+2:1 StartElement({urn:x}p:data [z=">"])
+4:5 StartElement(a)
+4:8 Characters("test")
+4:12 EndElement(a)
+5:5 StartElement(b)
+5:8 Characters("kkss\" = ddd\' >")
+5:22 EndElement(b)
+6:5 Characters("<a>ddddd</b>!e3--><!-- ddckx")
+9:5 StartElement(c)
+9:5 EndElement(c)
+10:5 Characters("<![CDATA[zzzz]]>")
+12:1 EndElement({urn:x}p:data)
+14:1 EndDocument
diff --git a/tests/documents/sample_4.xml b/tests/documents/sample_4.xml

new file mode 100644 (file)

index 0000000..fb915ff
--- /dev/null
+++ b/tests/documents/sample_4.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE data SYSTEM "abcd.dtd">
+<p:data xmlns:p="urn:x" z=">">
+    <!-- abcd &lt; &gt; &amp; -->
+    <a>test</a>
+    <b>kkss" = ddd' ></b>
+    <![CDATA[
+            <a>ddddd</b>!e3--><!-- ddckx
+    ]]>
+    <c/>
+    <![CDATA[
+    <![CDATA[zzzz]]]]><![CDATA[>]]>
+</p:data>
+
+
diff --git a/tests/documents/sample_4_full.txt b/tests/documents/sample_4_full.txt

new file mode 100644 (file)

index 0000000..4bdadfb
--- /dev/null
+++ b/tests/documents/sample_4_full.txt
@@ -0,0 +1,23 @@
+StartDocument(1.0, utf-8)
+StartElement({urn:x}p:data [z=">"])
+Whitespace("\n    ")
+Comment(" abcd &lt; &gt; &amp; ")
+Whitespace("\n    ")
+StartElement(a)
+Characters("test")
+EndElement(a)
+Whitespace("\n    ")
+StartElement(b)
+Characters("kkss\" = ddd\' >")
+EndElement(b)
+Whitespace("\n    ")
+CData("\n            <a>ddddd</b>!e3--><!-- ddckx\n    ")
+Characters("\n    ")
+StartElement(c)
+EndElement(c)
+Whitespace("\n    ")
+CData("\n    <![CDATA[zzzz]]")
+CData(">")
+Characters("\n")
+EndElement({urn:x}p:data)
+EndDocument
diff --git a/tests/documents/sample_4_short.txt b/tests/documents/sample_4_short.txt

new file mode 100644 (file)

index 0000000..52e4b83
--- /dev/null
+++ b/tests/documents/sample_4_short.txt
@@ -0,0 +1,14 @@
+StartDocument(1.0, utf-8)
+StartElement({urn:x}p:data [z=">"])
+StartElement(a)
+Characters("test")
+EndElement(a)
+StartElement(b)
+Characters("kkss\" = ddd\' >")
+EndElement(b)
+Characters("<a>ddddd</b>!e3--><!-- ddckx")
+StartElement(c)
+EndElement(c)
+Characters("<![CDATA[zzzz]]>")
+EndElement({urn:x}p:data)
+EndDocument
diff --git a/tests/documents/sample_5.xml b/tests/documents/sample_5.xml

new file mode 100644 (file)

index 0000000..92aa31d
--- /dev/null
+++ b/tests/documents/sample_5.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE data SYSTEM "abcd.dtd">
+<p>
+    <a>test&nbsp;&copy;&NotEqualTilde;</a>
+</p>
+
+
diff --git a/tests/documents/sample_5_short.txt b/tests/documents/sample_5_short.txt

new file mode 100644 (file)

index 0000000..3079811
--- /dev/null
+++ b/tests/documents/sample_5_short.txt
@@ -0,0 +1,7 @@
+StartDocument(1.0, utf-8)
+StartElement(p)
+StartElement(a)
+Characters("test ©≂̸")
+EndElement(a)
+EndElement(p)
+EndDocument
diff --git a/tests/documents/sample_6.xml b/tests/documents/sample_6.xml

new file mode 100644 (file)

index 0000000..943c02d
--- /dev/null
+++ b/tests/documents/sample_6.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0"?>
+<?xml-stylesheet href="doc.xsl"?>
+
+<doc>Hello</doc>
diff --git a/tests/documents/sample_6_full.txt b/tests/documents/sample_6_full.txt

new file mode 100644 (file)

index 0000000..debb366
--- /dev/null
+++ b/tests/documents/sample_6_full.txt
@@ -0,0 +1,8 @@
+StartDocument(1.0, UTF-8)
+Whitespace("\n")
+ProcessingInstruction(xml-stylesheet="href=\"doc.xsl\"")
+Whitespace("\n\n")
+StartElement(doc)
+Characters("Hello")
+EndElement(doc)
+EndDocument
diff --git a/tests/event_reader.rs b/tests/event_reader.rs

new file mode 100644 (file)

index 0000000..750dcc4
--- /dev/null
+++ b/tests/event_reader.rs
@@ -0,0 +1,587 @@
+#![forbid(unsafe_code)]
+
+extern crate xml;
+#[macro_use]
+extern crate lazy_static;
+
+use std::env;
+use std::fmt;
+use std::fs::File;
+use std::io::{BufRead, BufReader, Write, stderr};
+use std::path::Path;
+
+use xml::name::OwnedName;
+use xml::common::Position;
+use xml::reader::{Result, XmlEvent, ParserConfig, EventReader};
+
+/// Dummy function that opens a file, parses it, and returns the number of events read.
+/// There can be IO errors (from `File::open`) and XML errors (from the parser).
+/// Having `impl From<std::io::Error> for xml::reader::Error` allows the user to
+/// propagate both kinds with `?` without defining their own error type.
+#[allow(dead_code)]
+fn count_event_in_file(name: &Path) -> Result<usize> {
+    let mut event_count = 0;
+    for event in EventReader::new(BufReader::new(File::open(name)?)) {
+        event?; // stop at the first parser error and return it to the caller
+        event_count += 1;
+    }
+    Ok(event_count)
+}
+
+#[test]
+fn sample_1_short() {
+    test(
+        include_bytes!("documents/sample_1.xml"),
+        include_bytes!("documents/sample_1_short.txt"),
+        ParserConfig::new()
+            .ignore_comments(true)
+            .whitespace_to_characters(true)
+            .cdata_to_characters(true)
+            .trim_whitespace(true)
+            .coalesce_characters(true),
+        false
+    );
+}
+
+#[test]
+fn sample_1_full() {
+    test(
+        include_bytes!("documents/sample_1.xml"),
+        include_bytes!("documents/sample_1_full.txt"),
+        ParserConfig::new()
+            .ignore_comments(false)
+            .whitespace_to_characters(false)
+            .cdata_to_characters(false)
+            .trim_whitespace(false)
+            .coalesce_characters(false),
+        false
+    );
+}
+
+#[test]
+fn sample_2_short() {
+    test(
+        include_bytes!("documents/sample_2.xml"),
+        include_bytes!("documents/sample_2_short.txt"),
+        ParserConfig::new()
+            .ignore_comments(true)
+            .whitespace_to_characters(true)
+            .cdata_to_characters(true)
+            .trim_whitespace(true)
+            .coalesce_characters(true),
+        false
+    );
+}
+
+#[test]
+fn sample_2_full() {
+    test(
+        include_bytes!("documents/sample_2.xml"),
+        include_bytes!("documents/sample_2_full.txt"),
+        ParserConfig::new()
+            .ignore_comments(false)
+            .whitespace_to_characters(false)
+            .cdata_to_characters(false)
+            .trim_whitespace(false)
+            .coalesce_characters(false),
+        false
+    );
+}
+
+#[test]
+fn sample_3_short() {
+    test(
+        include_bytes!("documents/sample_3.xml"),
+        include_bytes!("documents/sample_3_short.txt"),
+        ParserConfig::new()
+            .ignore_comments(true)
+            .whitespace_to_characters(true)
+            .cdata_to_characters(true)
+            .trim_whitespace(true)
+            .coalesce_characters(true),
+        true
+    );
+}
+
+#[test]
+fn sample_3_full() {
+    test(
+        include_bytes!("documents/sample_3.xml"),
+        include_bytes!("documents/sample_3_full.txt"),
+        ParserConfig::new()
+            .ignore_comments(false)
+            .whitespace_to_characters(false)
+            .cdata_to_characters(false)
+            .trim_whitespace(false)
+            .coalesce_characters(false),
+        true
+    );
+}
+
+#[test]
+fn sample_4_short() {
+    test(
+        include_bytes!("documents/sample_4.xml"),
+        include_bytes!("documents/sample_4_short.txt"),
+        ParserConfig::new()
+            .ignore_comments(true)
+            .whitespace_to_characters(true)
+            .cdata_to_characters(true)
+            .trim_whitespace(true)
+            .coalesce_characters(true),
+        false
+    );
+}
+
+#[test]
+fn sample_4_full() {
+    test(
+        include_bytes!("documents/sample_4.xml"),
+        include_bytes!("documents/sample_4_full.txt"),
+        ParserConfig::new()
+            .ignore_comments(false)
+            .whitespace_to_characters(false)
+            .cdata_to_characters(false)
+            .trim_whitespace(false)
+            .coalesce_characters(false),
+        false
+    );
+
+}
+
+#[test]
+fn sample_5_short() {
+    test(
+        include_bytes!("documents/sample_5.xml"),
+        include_bytes!("documents/sample_5_short.txt"),
+        ParserConfig::new()
+            .ignore_comments(true)
+            .whitespace_to_characters(true)
+            .cdata_to_characters(true)
+            .trim_whitespace(true)
+            .coalesce_characters(true)
+            .add_entity("nbsp", " ")
+            .add_entity("copy", "©")
+            .add_entity("NotEqualTilde", "≂̸"),
+        false
+    );
+}
+
+#[test]
+fn sample_6_full() {
+    test(
+        include_bytes!("documents/sample_6.xml"),
+        include_bytes!("documents/sample_6_full.txt"),
+        ParserConfig::new()
+            .ignore_root_level_whitespace(false)
+            .ignore_comments(false)
+            .whitespace_to_characters(false)
+            .cdata_to_characters(false)
+            .trim_whitespace(false)
+            .coalesce_characters(false),
+        false
+    );
+}
+
+#[test]
+fn eof_1() {
+    test(
+        br#"<?xml"#,
+        br#"1:6 Unexpected end of stream: no root element found"#,
+        ParserConfig::new(),
+        false
+    );
+}
+
+#[test]
+fn bad_1() {
+    test(
+        br#"<?xml&.,"#,
+        br#"1:6 Unexpected token: <?xml&"#,
+        ParserConfig::new(),
+        false
+    );
+}
+
+#[test]
+fn dashes_in_comments() {
+    test(
+        br#"<!-- comment -- --><hello/>"#,
+        br#"
+            |1:14 Unexpected token '--' before ' '
+        "#,
+        ParserConfig::new(),
+        false
+    );
+
+    test(
+        br#"<!-- comment ---><hello/>"#,
+        br#"
+            |1:14 Unexpected token '--' before '-'
+        "#,
+        ParserConfig::new(),
+        false
+    );
+}
+
+#[test]
+fn tabs_1() {
+    test(
+        b"\t<a>\t<b/></a>",
+        br#"
+            |1:2 StartDocument(1.0, UTF-8)
+            |1:2 StartElement(a)
+            |1:6 StartElement(b)
+            |1:6 EndElement(b)
+            |1:10 EndElement(a)
+            |1:14 EndDocument
+        "#,
+        ParserConfig::new()
+            .trim_whitespace(true),
+        true
+    );
+}
+
+#[test]
+fn issue_32_unescaped_cdata_end() {
+    test(
+        br#"<hello>]]></hello>"#,
+        br#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement(hello)
+            |Characters("]]>")
+            |EndElement(hello)
+            |EndDocument
+        "#,
+        ParserConfig::new(),
+        false
+    );
+}
+
+#[test]
+fn issue_unescaped_processing_instruction_end() {
+    test(
+        br#"<hello>?></hello>"#,
+        br#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement(hello)
+            |Characters("?>")
+            |EndElement(hello)
+            |EndDocument
+        "#,
+        ParserConfig::new(),
+        false
+    );
+}
+
+#[test]
+fn issue_unescaped_empty_tag_end() {
+    test(
+        br#"<hello>/></hello>"#,
+        br#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement(hello)
+            |Characters("/>")
+            |EndElement(hello)
+            |EndDocument
+        "#,
+        ParserConfig::new(),
+        false
+    );
+}
+
+#[test]
+fn issue_83_duplicate_attributes() {
+    test(
+        br#"<hello><some-tag a='10' a="20"></hello>"#,
+        br#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement(hello)
+            |1:30 Attribute 'a' is redefined
+        "#,
+        ParserConfig::new(),
+        false
+    );
+}
+
+#[test]
+fn issue_93_large_characters_in_entity_references() {
+    test(
+        r#"<hello>&𤶼;</hello>"#.as_bytes(),
+        r#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement(hello)
+            |1:10 Unexpected entity: 𤶼
+        "#.as_bytes(),  // FIXME: it shouldn't be 10, looks like indices are off slightly
+        ParserConfig::new(),
+        false
+    )
+}
+
+#[test]
+fn issue_98_cdata_ending_with_right_bracket() {
+    test(
+        br#"<hello><![CDATA[Foo [Bar]]]></hello>"#,
+        br#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement(hello)
+            |CData("Foo [Bar]")
+            |EndElement(hello)
+            |EndDocument
+        "#,
+        ParserConfig::new(),
+        false
+    )
+}
+
+#[test]
+fn issue_105_unexpected_double_dash() {
+    test(
+        br#"<hello>-- </hello>"#,
+        br#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement(hello)
+            |Characters("-- ")
+            |EndElement(hello)
+            |EndDocument
+        "#,
+        ParserConfig::new(),
+        false
+    );
+
+    test(
+        br#"<hello>--</hello>"#,
+        br#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement(hello)
+            |Characters("--")
+            |EndElement(hello)
+            |EndDocument
+        "#,
+        ParserConfig::new(),
+        false
+    );
+
+    test(
+        br#"<hello>--></hello>"#,
+        br#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement(hello)
+            |Characters("-->")
+            |EndElement(hello)
+            |EndDocument
+        "#,
+        ParserConfig::new(),
+        false
+    );
+
+    test(
+        br#"<hello><![CDATA[--]]></hello>"#,
+        br#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement(hello)
+            |CData("--")
+            |EndElement(hello)
+            |EndDocument
+        "#,
+        ParserConfig::new(),
+        false
+    );
+}
+
+#[test]
+fn issue_attribues_have_no_default_namespace () {
+    test(
+        br#"<hello xmlns="urn:foo" x="y"/>"#,
+        br#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement({urn:foo}hello [x="y"])
+            |EndElement({urn:foo}hello)
+            |EndDocument
+        "#,
+        ParserConfig::new(),
+        false
+    );
+}
+
+#[test]
+fn issue_replacement_character_entity_reference() {
+    test(
+        br#"<doc>&#55357;&#56628;</doc>"#,
+        br#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement(doc)
+            |1:13 Invalid decimal character number in an entity: #55357
+        "#,
+        ParserConfig::new(),
+        false,
+    );
+
+    test(
+        br#"<doc>&#xd83d;&#xdd34;</doc>"#,
+        br#"
+            |StartDocument(1.0, UTF-8)
+            |StartElement(doc)
+            |1:13 Invalid hexadecimal character number in an entity: #xd83d
+        "#,
+        ParserConfig::new(),
+        false,
+    );
+
+    test(
+        br#"<doc>&#55357;&#56628;</doc>"#,
+        format!(
+            r#"
+                |StartDocument(1.0, UTF-8)
+                |StartElement(doc)
+                |Characters("{replacement_character}{replacement_character}")
+                |EndElement(doc)
+                |EndDocument
+            "#,
+            replacement_character = "\u{fffd}"
+        )
+        .as_bytes(),
+        ParserConfig::new()
+            .replace_unknown_entity_references(true),
+        false,
+    );
+
+    test(
+        br#"<doc>&#xd83d;&#xdd34;</doc>"#,
+        format!(
+            r#"
+                |StartDocument(1.0, UTF-8)
+                |StartElement(doc)
+                |Characters("{replacement_character}{replacement_character}")
+                |EndElement(doc)
+                |EndDocument
+            "#,
+            replacement_character = "\u{fffd}"
+        )
+        .as_bytes(),
+        ParserConfig::new()
+            .replace_unknown_entity_references(true),
+        false,
+    );
+}
+
+lazy_static! {
+    // If the PRINT_SPEC env variable is set to "1", print the event lines
+    // to stderr instead of comparing them with the expected output.
+    // It can be used like this:
+    // PRINT_SPEC=1 cargo test --test event_reader sample_1_full 2> sample_1_full.txt
+    static ref PRINT: bool = {
+        // Query PRINT_SPEC directly: iterating `env::vars()` panics when any
+        // unrelated environment variable is not valid Unicode.
+        match env::var("PRINT_SPEC") {
+            Ok(value) => value == "1",
+            Err(_) => false, // unset, or set to a value that is not valid Unicode
+        }
+    };
+}
+
+// clones a lot but that's fine
+fn trim_until_bar(s: String) -> String {
+    let trimmed = s.trim(); // surrounding whitespace only matters for detecting the bar
+    if trimmed.starts_with('|') {
+        return trimmed[1..].to_owned(); // drop the leading '|' and the indentation before it
+    }
+    s // no bar: hand the line back untouched, whitespace included
+}
+
+fn test(input: &[u8], output: &[u8], config: ParserConfig, test_position: bool) {
+    let mut reader = config.create_reader(input);
+    let mut spec_lines = BufReader::new(output).lines() // `output` holds one expected event per line
+        .map(|line| line.unwrap())
+        .enumerate() // numbered before filtering, so reported numbers refer to lines of `output`
+        .map(|(i, line)| (i, trim_until_bar(line)))
+        .filter(|&(_, ref line)| !line.trim().is_empty());
+    // Render every parsed event as text and compare it with the next non-empty spec line.
+    loop {
+        let e = reader.next();
+        let line =
+            if test_position {
+                format!("{} {}", reader.position(), Event(&e)) // position is read after `next()` returned
+            } else {
+                format!("{}", Event(&e))
+            };
+        // With PRINT set the rendered lines are only dumped to stderr; nothing is compared.
+        if *PRINT {
+            writeln!(&mut stderr(), "{}", line).unwrap();
+        } else {
+            if let Some((n, spec)) = spec_lines.next() {
+                if line != spec {
+                    const SPLITTER: &'static str = "-------------------";
+                    panic!("\n{}\nUnexpected event at line {}:\nExpected: {}\nFound:    {}\n{}\n",
+                           SPLITTER, n + 1, spec, line, std::str::from_utf8(output).unwrap());
+                }
+            } else {
+                panic!("Unexpected event: {}", line); // the parser produced more events than the spec lists
+            }
+        }
+        // NOTE(review): spec lines still unread when the loop ends are never checked.
+        match e {
+            Ok(XmlEvent::EndDocument) | Err(_) => break, // stop after the final event or the first error
+            _ => {},
+        }
+    }
+}
+
+// Here we define our own string representation of events so we don't depend
+// on the specifics of Display implementation for XmlEvent and OwnedName.
+
+/// Wrapper that displays an `OwnedName` as `{namespace}prefix:local_name`.
+struct Name<'a>(&'a OwnedName);
+impl <'a> fmt::Display for Name<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        if let Some(ref namespace) = self.0.namespace {
+            write!(f, "{{{}}}", namespace)?; // `{{` and `}}` emit literal braces around the namespace
+        }
+
+        if let Some(ref prefix) = self.0.prefix {
+            write!(f, "{}:", prefix)?; // the prefix is written only when the name has one
+        }
+
+        write!(f, "{}", self.0.local_name)
+    }
+}
+
+/// Wrapper that displays a reader result as one line: the event in spec notation, or the error.
+struct Event<'a>(&'a Result<XmlEvent>);
+impl<'a> fmt::Display for Event<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let empty = String::new(); // printed in place of the data of a processing instruction that has none
+        match *self.0 {
+            Ok(ref e) => match *e {
+                XmlEvent::StartDocument { ref version, ref encoding, .. } =>
+                    write!(f, "StartDocument({}, {})", version, encoding),
+                XmlEvent::EndDocument =>
+                    write!(f, "EndDocument"),
+                XmlEvent::ProcessingInstruction { ref name, ref data } =>
+                    write!(f, "ProcessingInstruction({}={:?})", name,
+                        data.as_ref().unwrap_or(&empty)),
+                XmlEvent::StartElement { ref name, ref attributes, .. } => {
+                    if attributes.is_empty() {
+                        write!(f, "StartElement({})", Name(name)) // no brackets when there are no attributes
+                    }
+                    else {
+                        let attrs: Vec<_> = attributes.iter() // each attribute becomes name="value" via `{:?}`
+                            .map(|a| format!("{}={:?}", Name(&a.name), a.value)) .collect();
+                        write!(f, "StartElement({} [{}])", Name(name), attrs.join(", "))
+                    }
+                },
+                XmlEvent::EndElement { ref name } =>
+                    write!(f, "EndElement({})", Name(name)),
+                XmlEvent::Comment(ref data) => // text payloads below are escaped with `escape_debug` and quoted by hand
+                    write!(f, r#"Comment("{}")"#, data.escape_debug()),
+                XmlEvent::CData(ref data) =>
+                    write!(f, r#"CData("{}")"#, data.escape_debug()),
+                XmlEvent::Characters(ref data) =>
+                    write!(f, r#"Characters("{}")"#, data.escape_debug()),
+                XmlEvent::Whitespace(ref data) =>
+                    write!(f, r#"Whitespace("{}")"#, data.escape_debug()),
+            },
+            Err(ref e) => e.fmt(f), // errors are rendered by the error value's own `fmt`
+        }
+    }
+}
diff --git a/tests/event_writer.rs b/tests/event_writer.rs

new file mode 100644 (file)

index 0000000..dd64a43
--- /dev/null
+++ b/tests/event_writer.rs
@@ -0,0 +1,269 @@
+#![forbid(unsafe_code)]
+
+extern crate xml;
+
+use std::io::{BufReader, SeekFrom};
+use std::io::prelude::*;
+use std::fs::File;
+use std::str;
+
+use xml::reader::EventReader;
+use xml::writer::EmitterConfig;
+
+macro_rules! unwrap_all {
+    ($($e:expr);+) => {{
+        $($e.unwrap();)+
+    }}
+}
+
+#[test]
+fn reading_writing_equal_with_namespaces() {
+    let mut f = File::open("tests/documents/sample_2.xml").unwrap();
+    let mut b = Vec::new();
+
+    {
+        let r = EventReader::new(BufReader::new(&mut f));
+        let mut w = EmitterConfig::default().perform_indent(true).create_writer(&mut b);
+
+        for e in r {
+            match e {
+                Ok(e) => if let Some(e) = e.as_writer_event() {
+                    match w.write(e) {
+                        Ok(_) => {},
+                        Err(e) => panic!("Writer error: {:?}", e)
+                    }
+                },
+                Err(e) => panic!("Error: {}", e)
+            }
+        }
+    }
+
+    f.seek(SeekFrom::Start(0)).unwrap();
+    let mut fs = String::new();
+    f.read_to_string(&mut fs).unwrap();
+
+    let bs = String::from_utf8(b).unwrap();
+
+    assert_eq!(fs.trim(), bs.trim());
+}
+
+#[test]
+fn writing_simple() {
+    use xml::writer::XmlEvent;
+
+    let mut b = Vec::new();
+
+    {
+        let mut w = EmitterConfig::new().write_document_declaration(false).create_writer(&mut b);
+
+        w.write(XmlEvent::start_element("h:hello").ns("h", "urn:hello-world")).unwrap();
+        w.write("hello world").unwrap();
+        w.write(XmlEvent::end_element()).unwrap();
+    }
+
+    assert_eq!(
+        str::from_utf8(&b).unwrap(),
+        r#"<h:hello xmlns:h="urn:hello-world">hello world</h:hello>"#
+    );
+}
+
+#[test]
+fn writing_empty_elements_with_normalizing() {
+    use xml::writer::XmlEvent;
+
+    let mut b = Vec::new();
+
+    {
+        let mut w = EmitterConfig::new().write_document_declaration(false).create_writer(&mut b);
+
+        unwrap_all! {
+            w.write(XmlEvent::start_element("hello"));
+            w.write(XmlEvent::start_element("world"));
+            w.write(XmlEvent::end_element());
+            w.write(XmlEvent::end_element())
+        }
+    }
+
+    assert_eq!(str::from_utf8(&b).unwrap(), r#"<hello><world /></hello>"#);
+}
+
+#[test]
+fn writing_empty_elements_without_normalizing() {
+    use xml::writer::XmlEvent;
+
+    let mut b = Vec::new();
+
+    {
+        let mut w = EmitterConfig::new()
+            .write_document_declaration(false)
+            .normalize_empty_elements(false)
+            .create_writer(&mut b);
+
+        unwrap_all! {
+            w.write(XmlEvent::start_element("hello"));
+            w.write(XmlEvent::start_element("world"));
+            w.write(XmlEvent::end_element());
+            w.write(XmlEvent::end_element())
+        }
+    }
+
+    assert_eq!(str::from_utf8(&b).unwrap(), r#"<hello><world></world></hello>"#);
+}
+
+#[test]
+fn writing_empty_elements_without_pad_self_closing() {
+    use xml::writer::XmlEvent;
+
+    let mut b = Vec::new();
+
+    {
+        let mut w = EmitterConfig::new()
+            .write_document_declaration(false)
+            .pad_self_closing(false)
+            .create_writer(&mut b);
+
+        unwrap_all! {
+            w.write(XmlEvent::start_element("hello"));
+            w.write(XmlEvent::start_element("world"));
+            w.write(XmlEvent::end_element());
+            w.write(XmlEvent::end_element())
+        }
+    }
+
+    assert_eq!(str::from_utf8(&b).unwrap(), r#"<hello><world/></hello>"#);
+}
+#[test]
+fn writing_empty_elements_pad_self_closing_explicit() {
+    use xml::writer::XmlEvent;
+
+    let mut b = Vec::new();
+
+    {
+        let mut w = EmitterConfig::new()
+            .write_document_declaration(false)
+            .pad_self_closing(true)
+            .create_writer(&mut b);
+
+        unwrap_all! {
+            w.write(XmlEvent::start_element("hello"));
+            w.write(XmlEvent::start_element("world"));
+            w.write(XmlEvent::end_element());
+            w.write(XmlEvent::end_element())
+        }
+    }
+
+    assert_eq!(str::from_utf8(&b).unwrap(), r#"<hello><world /></hello>"#);
+}
+
+#[test]
+fn writing_comments_with_indentation() {
+    use xml::writer::XmlEvent;
+
+    let mut b = Vec::new();
+
+    {
+        let mut w = EmitterConfig::new()
+            .write_document_declaration(false)
+            .perform_indent(true)
+            .create_writer(&mut b);
+
+        unwrap_all! {
+            w.write(XmlEvent::start_element("hello"));
+            w.write(XmlEvent::start_element("world"));
+            w.write(XmlEvent::comment("  this is a manually padded comment\t"));
+            w.write(XmlEvent::comment("this is an unpadded comment"));
+            w.write(XmlEvent::end_element());
+            w.write(XmlEvent::end_element())
+        }
+    }
+
+    assert_eq!(
+        str::from_utf8(&b).unwrap(),
+        "<hello>
+  <world>
+    <!--  this is a manually padded comment\t-->
+    <!-- this is an unpadded comment -->
+  </world>
+</hello>");
+}
+
+#[test]
+fn issue_112_overriding_namepace_prefix() {
+    use xml::writer::XmlEvent;
+
+    let mut b = Vec::new();
+
+    {
+        let mut w = EmitterConfig::new()
+            .write_document_declaration(false)
+            .create_writer(&mut b);
+
+        unwrap_all! {
+            w.write(XmlEvent::start_element("iq").ns("", "jabber:client").ns("a", "urn:A"));
+            w.write(XmlEvent::start_element("bind").ns("", "urn:ietf:params:xml:ns:xmpp-bind"));
+            w.write(XmlEvent::end_element());
+            w.write(XmlEvent::start_element("whatever").ns("a", "urn:X"));
+            w.write(XmlEvent::end_element());
+            w.write(XmlEvent::end_element())
+        }
+    }
+
+    assert_eq!(
+        str::from_utf8(&b).unwrap(),
+        r#"<iq xmlns="jabber:client" xmlns:a="urn:A"><bind xmlns="urn:ietf:params:xml:ns:xmpp-bind" /><whatever xmlns:a="urn:X" /></iq>"#
+    )
+}
+
+#[test]
+fn attribute_escaping() {
+    use xml::writer::XmlEvent;
+
+    let mut b = Vec::new();
+
+    {
+        let mut w = EmitterConfig::new()
+            .write_document_declaration(false)
+            .perform_indent(true)
+            .create_writer(&mut b);
+
+        unwrap_all! {
+            w.write(
+                XmlEvent::start_element("hello")
+                    .attr("testLt", "<")
+                    .attr("testGt", ">")
+            );
+            w.write(XmlEvent::end_element());
+            w.write(
+                XmlEvent::start_element("hello")
+                    .attr("testQuot", "\"")
+                    .attr("testApos", "\'")
+            );
+            w.write(XmlEvent::end_element());
+            w.write(
+                XmlEvent::start_element("hello")
+                    .attr("testAmp", "&")
+            );
+            w.write(XmlEvent::end_element());
+            w.write(
+                XmlEvent::start_element("hello")
+                    .attr("testNl", "\n")
+                    .attr("testCr", "\r")
+            );
+            w.write(XmlEvent::end_element());
+            w.write(
+                XmlEvent::start_element("hello")
+                    .attr("testNl", "\\n")
+                    .attr("testCr", "\\r")
+            );
+            w.write(XmlEvent::end_element())
+        }
+    }
+    assert_eq!(
+        str::from_utf8(&b).unwrap(),
+        "<hello testLt=\"&lt;\" testGt=\"&gt;\" />
+<hello testQuot=\"&quot;\" testApos=\"&apos;\" />
+<hello testAmp=\"&amp;\" />
+<hello testNl=\"&#xA;\" testCr=\"&#xD;\" />
+<hello testNl=\"\\n\" testCr=\"\\r\" />"
+    );
+}
+\ No newline at end of file
diff --git a/tests/streaming.rs b/tests/streaming.rs

new file mode 100644 (file)

index 0000000..a577a00
--- /dev/null
+++ b/tests/streaming.rs
@@ -0,0 +1,103 @@
+#![forbid(unsafe_code)]
+
+extern crate xml;
+
+use std::io::{Cursor, Write};
+
+use xml::EventReader;
+use xml::reader::ParserConfig;
+use xml::reader::XmlEvent;
+
+macro_rules! assert_match {
+    ($actual:expr, $expected:pat) => {
+        match $actual {
+            $expected => {},
+            _ => panic!("assertion failed: `(left matches right)` \
+                        (left: `{:?}`, right: `{}`", $actual, stringify!($expected))
+        }
+    };
+    ($actual:expr, $expected:pat if $guard:expr) => {
+        match $actual {
+            $expected if $guard => {},
+            _ => panic!("assertion failed: `(left matches right)` \
+                        (left: `{:?}`, right: `{} if {}`",
+                        $actual, stringify!($expected), stringify!($guard))
+        }
+    }
+}
+
+fn write_and_reset_position<W>(c: &mut Cursor<W>, data: &[u8]) where Cursor<W>: Write {
+    let p = c.position();
+    c.write_all(data).unwrap();
+    c.set_position(p);
+}
+
+#[test]
+fn reading_streamed_content() {
+    let buf = Cursor::new(b"<root>".to_vec());
+    let reader = EventReader::new(buf);
+
+    let mut it = reader.into_iter();
+
+    assert_match!(it.next(), Some(Ok(XmlEvent::StartDocument { .. })));
+    assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root");
+
+    write_and_reset_position(it.source_mut(), b"<child-1>content</child-1>");
+    assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1");
+    assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content");
+    assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1");
+
+    write_and_reset_position(it.source_mut(), b"<child-2/>");
+    assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2");
+    assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2");
+
+    write_and_reset_position(it.source_mut(), b"<child-3/>");
+    assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3");
+    assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-3");
+    // doesn't seem to work because of how tags parsing is done
+//    write_and_reset_position(it.source_mut(), b"some text");
+   // assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "some text");
+
+    write_and_reset_position(it.source_mut(), b"</root>");
+    assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "root");
+    assert_match!(it.next(), Some(Ok(XmlEvent::EndDocument)));
+    assert_match!(it.next(), None);
+}
+
+#[test]
+fn reading_streamed_content2() {
+    let buf = Cursor::new(b"<root>".to_vec());
+    let mut config = ParserConfig::new();
+    config.ignore_end_of_stream = true;
+    let readerb = EventReader::new_with_config(buf, config);
+
+    let mut reader = readerb.into_iter();
+
+    assert_match!(reader.next(), Some(Ok(XmlEvent::StartDocument { .. })));
+    assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root");
+
+    write_and_reset_position(reader.source_mut(), b"<child-1>content</child-1>");
+    assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1");
+    assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content");
+    assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1");
+
+    write_and_reset_position(reader.source_mut(), b"<child-2>content</child-2>");
+
+    assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2");
+    assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content");
+    assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2");
+    assert_match!(reader.next(), Some(Err(_)));
+    write_and_reset_position(reader.source_mut(), b"<child-3></child-3>");
+    assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3");
+    write_and_reset_position(reader.source_mut(), b"<child-4 type='get'");
+    match reader.next() {
+       None |
+       Some(Ok(_)) => {
+          panic!("At this point, parser must not detect something.");
+       },
+       Some(Err(_)) => {}
+    };
+    write_and_reset_position(reader.source_mut(), b" />");
+    assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-4");
+}
+
author	Woohyun Jung <wh0705.jung@samsung.com>
	Fri, 17 Mar 2023 03:45:47 +0000 (12:45 +0900)
committer	Woohyun Jung <wh0705.jung@samsung.com>
	Fri, 17 Mar 2023 03:45:47 +0000 (12:45 +0900)
.cargo_vcs_info.json	[new file with mode: 0644]	patch \| blob
.github/workflows/main.yml	[new file with mode: 0644]	patch \| blob
.gitignore	[new file with mode: 0644]	patch \| blob
Cargo.toml	[new file with mode: 0644]	patch \| blob
Cargo.toml.orig	[new file with mode: 0644]	patch \| blob
Changelog.md	[new file with mode: 0644]	patch \| blob
LICENSE	[new file with mode: 0644]	patch \| blob
Readme.md	[new file with mode: 0644]	patch \| blob
design.md	[new file with mode: 0644]	patch \| blob
src/analyze.rs	[new file with mode: 0644]	patch \| blob
src/attribute.rs	[new file with mode: 0644]	patch \| blob
src/common.rs	[new file with mode: 0644]	patch \| blob
src/escape.rs	[new file with mode: 0644]	patch \| blob
src/lib.rs	[new file with mode: 0644]	patch \| blob
src/macros.rs	[new file with mode: 0644]	patch \| blob
src/name.rs	[new file with mode: 0644]	patch \| blob
src/namespace.rs	[new file with mode: 0644]	patch \| blob
src/reader/config.rs	[new file with mode: 0644]	patch \| blob
src/reader/error.rs	[new file with mode: 0644]	patch \| blob
src/reader/events.rs	[new file with mode: 0644]	patch \| blob
src/reader/lexer.rs	[new file with mode: 0644]	patch \| blob
src/reader/mod.rs	[new file with mode: 0644]	patch \| blob
src/reader/parser/inside_cdata.rs	[new file with mode: 0644]	patch \| blob
src/reader/parser/inside_closing_tag_name.rs	[new file with mode: 0644]	patch \| blob
src/reader/parser/inside_comment.rs	[new file with mode: 0644]	patch \| blob
src/reader/parser/inside_declaration.rs	[new file with mode: 0644]	patch \| blob
src/reader/parser/inside_doctype.rs	[new file with mode: 0644]	patch \| blob
src/reader/parser/inside_opening_tag.rs	[new file with mode: 0644]	patch \| blob
src/reader/parser/inside_processing_instruction.rs	[new file with mode: 0644]	patch \| blob
src/reader/parser/inside_reference.rs	[new file with mode: 0644]	patch \| blob
src/reader/parser/mod.rs	[new file with mode: 0644]	patch \| blob
src/reader/parser/outside_tag.rs	[new file with mode: 0644]	patch \| blob
src/util.rs	[new file with mode: 0644]	patch \| blob
src/writer/config.rs	[new file with mode: 0644]	patch \| blob
src/writer/emitter.rs	[new file with mode: 0644]	patch \| blob
src/writer/events.rs	[new file with mode: 0644]	patch \| blob
src/writer/mod.rs	[new file with mode: 0644]	patch \| blob
tests/documents/sample_1.xml	[new file with mode: 0644]	patch \| blob
tests/documents/sample_1_full.txt	[new file with mode: 0644]	patch \| blob
tests/documents/sample_1_short.txt	[new file with mode: 0644]	patch \| blob
tests/documents/sample_2.xml	[new file with mode: 0644]	patch \| blob
tests/documents/sample_2_full.txt	[new file with mode: 0644]	patch \| blob
tests/documents/sample_2_short.txt	[new file with mode: 0644]	patch \| blob
tests/documents/sample_3.xml	[new file with mode: 0644]	patch \| blob
tests/documents/sample_3_full.txt	[new file with mode: 0644]	patch \| blob
tests/documents/sample_3_short.txt	[new file with mode: 0644]	patch \| blob
tests/documents/sample_4.xml	[new file with mode: 0644]	patch \| blob
tests/documents/sample_4_full.txt	[new file with mode: 0644]	patch \| blob
tests/documents/sample_4_short.txt	[new file with mode: 0644]	patch \| blob
tests/documents/sample_5.xml	[new file with mode: 0644]	patch \| blob
tests/documents/sample_5_short.txt	[new file with mode: 0644]	patch \| blob
tests/documents/sample_6.xml	[new file with mode: 0644]	patch \| blob
tests/documents/sample_6_full.txt	[new file with mode: 0644]	patch \| blob
tests/event_reader.rs	[new file with mode: 0644]	patch \| blob
tests/event_writer.rs	[new file with mode: 0644]	patch \| blob
tests/streaming.rs	[new file with mode: 0644]	patch \| blob