From: Woohyun Jung Date: Fri, 17 Mar 2023 03:45:47 +0000 (+0900) Subject: Import xml-rs 0.8.4 X-Git-Tag: upstream/0.8.4 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=refs%2Fheads%2Fupstream;p=platform%2Fupstream%2Frust-xml-rs.git Import xml-rs 0.8.4 --- 0521f209f7e30aea2aa4230ada2b2385d4ad0224 diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json new file mode 100644 index 0000000..6e0c55d --- /dev/null +++ b/.cargo_vcs_info.json @@ -0,0 +1,5 @@ +{ + "git": { + "sha1": "7cd06954fd6e22b7dbf9ea02ff4e22f9ff6309fd" + } +} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..daca69f --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,31 @@ +name: CI + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + rust: [stable, beta, nightly] + + steps: + - uses: actions/checkout@v2 + + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: ${{ matrix.rust }} + override: true + + - uses: actions-rs/cargo@v1 + with: + command: build + + - uses: actions-rs/cargo@v1 + with: + command: test diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..60b0232 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +*.swp +*.swo +/doc +*~ +/target/ +/Cargo.lock +.idea/ +*.iml \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..e704337 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,36 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +name = "xml-rs" +version = "0.8.4" +authors = ["Vladimir Matveev "] +description = "An XML library in pure Rust" +documentation = "http://docs.rs/xml-rs/" +readme = "Readme.md" +keywords = ["xml", "parsing", "parser"] +categories = ["parsing"] +license = "MIT" +repository = "https://github.com/netvl/xml-rs" + +[lib] +name = "xml" +path = "src/lib.rs" + +[[bin]] +name = "xml-analyze" +path = "src/analyze.rs" +[dev-dependencies.doc-comment] +version = "0.3" + +[dev-dependencies.lazy_static] +version = "1.2.0" diff --git a/Cargo.toml.orig b/Cargo.toml.orig new file mode 100644 index 0000000..c8df8e6 --- /dev/null +++ b/Cargo.toml.orig @@ -0,0 +1,23 @@ +[package] +name = "xml-rs" +version = "0.8.4" +authors = ["Vladimir Matveev "] +license = "MIT" +description = "An XML library in pure Rust" +repository = "https://github.com/netvl/xml-rs" +documentation = "http://docs.rs/xml-rs/" +readme = "Readme.md" +keywords = ["xml", "parsing", "parser"] +categories = ["parsing"] + +[lib] +name = "xml" +path = "src/lib.rs" + +[[bin]] +name = "xml-analyze" +path = "src/analyze.rs" + +[dev-dependencies] +doc-comment = "0.3" +lazy_static = "1.2.0" diff --git a/Changelog.md b/Changelog.md new file mode 100644 index 0000000..3cca8b8 --- /dev/null +++ b/Changelog.md @@ -0,0 +1,126 @@ +## Version 0.8.4 + +* Fixed recognition of `?>`, `]]>` and `/>` tokens as characters. +* Fixed writer output operations to use `write_all` to ensure that the data + is written fully. +* The document declaration is now written before any characters automatically. + +## Version 0.8.3 + +* Added a new parser option, `ignore_root_level_whitespace`, which makes the parser + skip emitting whitespace events outside of the root element when set to `true`. + This helps with certain tasks like canonicalization. + +## Version 0.8.2 + +* Added a new parser option, `replace_unknown_entity_references`, which allows to ignore + invalid Unicode code points and replace them with a Unicode "replacement character" + during parsing. This can be helpful to deal with e.g. UTF-16 surrogate pairs. +* Added a new emitter option, `pad_self_closing`, which determines the style of the self-closing + elements when they are emitted: `` (`true`) vs `` (`false`). + +## Version 0.8.1 + +* Fixed various issues with tests introduced by updates in Rust. +* Adjusted the lexer to ignore contents of the `` tag. +* Removed unnecessary unsafety in tests. +* Added tests for doc comments in the readme file. +* Switched to GitHub Actions from Travis CI. + +## Version 0.8.0 + +* Same as 0.7.1, with 0.7.1 being yanked because of the incorrect semver bump. + +## Version 0.7.1 + +* Removed dependency on bitflags. +* Added the `XmlWriter::inner_mut()` method. +* Fixed some rustdoc warnings. + +## Version 0.7.0 + +* Same as 0.6.2, with 0.6.2 being yanked because of the incompatible bump of minimum required version of rustc. + +## Version 0.6.2 + +* Bumped `bitflags` to 1.0. + +## Version 0.6.1 + +* Fixed the writer to escape some special characters when writing attribute values. + +## Version 0.6.0 + +* Changed the target type of extra entities from `char` to `String`. This is an incompatible + change. + +## Version 0.5.0 + +* Added support for ignoring EOF errors in order to read documents from streams incrementally. +* Bumped `bitflags` to 0.9. + +## Version 0.4.1 + +* Added missing `Debug` implementation to `xml::writer::XmlEvent`. + +## Version 0.4.0 + +* Bumped version number, since changes introduced in 0.3.7 break backwards compatibility. + +## Version 0.3.8 + +* Fixed a problem introduced in 0.3.7 with entities in attributes causing parsing errors. + +## Version 0.3.7 + +* Fixed the problem with parsing non-whitespace character entities as whitespace (issue #140). +* Added support for configuring custom entities in the parser configuration. + +## Version 0.3.6 + +* Added an `Error` implementation for `EmitterError`. +* Fixed escaping of strings with multi-byte code points. + +## Version 0.3.5 + +* Added `Debug` implementation for `XmlVersion`. +* Fixed some failing tests. + +## Version 0.3.3 + +* Updated `bitflags` to 0.7. + +## Version 0.3.2 + +* Added `From` for `xml::reader::Error`, which improves usability of working with parsing errors. + +## Version 0.3.1 + +* Bumped `bitflags` dependency to 0.4, some internal warning fixes. + +## Version 0.3.0 + +* Changed error handling in `EventReader` - now I/O errors are properly bubbled up from the lexer. + +## Version 0.2.4 + +* Fixed #112 - incorrect handling of namespace redefinitions when writing a document. + +## Version 0.2.3 + +* Added `into_inner()` methods to `EventReader` and `EventWriter`. + +## Version 0.2.2 + +* Using `join` instead of the deprecated `connect`. +* Added a simple XML analyzer program which demonstrates library usage and can be used to check XML documents for well-formedness. +* Fixed incorrect handling of unqualified attribute names (#107). +* Added this changelog. + +## Version 0.2.1 + +* Fixed #105 - incorrect handling of double dashes. + +## Version 0.2.0 + +* Major update, includes proper document writing support and significant architecture changes. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6caa1d3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Vladimir Matveev + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Readme.md b/Readme.md new file mode 100644 index 0000000..5ab88f8 --- /dev/null +++ b/Readme.md @@ -0,0 +1,236 @@ +xml-rs, an XML library for Rust +=============================== + +[![Build Status][build-status-img]](https://github.com/netvl/xml-rs/actions?query=workflow%3ACI) +[![crates.io][crates-io-img]](https://crates.io/crates/xml-rs) +[![docs][docs-img]](https://docs.rs/xml-rs/) + +[Documentation](https://docs.rs/xml-rs/) + + [build-status-img]: https://img.shields.io/github/workflow/status/netvl/xml-rs/CI/master?style=flat-square + [crates-io-img]: https://img.shields.io/crates/v/xml-rs.svg?style=flat-square + [docs-img]: https://img.shields.io/badge/docs-latest%20release-6495ed.svg?style=flat-square + +xml-rs is an XML library for [Rust](http://www.rust-lang.org/) programming language. +It is heavily inspired by Java [Streaming API for XML (StAX)][stax]. + + [stax]: https://en.wikipedia.org/wiki/StAX + +This library currently contains pull parser much like [StAX event reader][stax-reader]. +It provides iterator API, so you can leverage Rust's existing iterators library features. + + [stax-reader]: http://docs.oracle.com/javase/8/docs/api/javax/xml/stream/XMLEventReader.html + +It also provides a streaming document writer much like [StAX event writer][stax-writer]. +This writer consumes its own set of events, but reader events can be converted to +writer events easily, and so it is possible to write XML transformation chains in a pretty +clean manner. + + [stax-writer]: http://docs.oracle.com/javase/8/docs/api/javax/xml/stream/XMLEventWriter.html + +This parser is mostly full-featured, however, there are limitations: +* no other encodings but UTF-8 are supported yet, because no stream-based encoding library + is available now; when (or if) one will be available, I'll try to make use of it; +* DTD validation is not supported, `` declarations are completely ignored; thus no + support for custom entities too; internal DTD declarations are likely to cause parsing errors; +* attribute value normalization is not performed, and end-of-line characters are not normalized too. + +Other than that the parser tries to be mostly XML-1.0-compliant. + +Writer is also mostly full-featured with the following limitations: +* no support for encodings other than UTF-8, for the same reason as above; +* no support for emitting `` declarations; +* more validations of input are needed, for example, checking that namespace prefixes are bounded + or comments are well-formed. + +What is planned (highest priority first, approximately): + +0. missing features required by XML standard (e.g. aforementioned normalization and + proper DTD parsing); +1. miscellaneous features of the writer; +2. parsing into a DOM tree and its serialization back to XML text; +3. SAX-like callback-based parser (fairly easy to implement over pull parser); +4. DTD validation; +5. (let's dream a bit) XML Schema validation. + +Building and using +------------------ + +xml-rs uses [Cargo](http://crates.io), so just add a dependency section in your project's manifest: + +```toml +[dependencies] +xml-rs = "0.8" +``` + +The package exposes a single crate called `xml`: + +```rust +extern crate xml; +``` + +Reading XML documents +--------------------- + +`xml::reader::EventReader` requires a `Read` instance to read from. When a proper stream-based encoding +library is available, it is likely that xml-rs will be switched to use whatever character stream structure +this library would provide, but currently it is a `Read`. + +Using `EventReader` is very straightforward. Just provide a `Read` instance to obtain an iterator +over events: + +```rust,no_run +extern crate xml; + +use std::fs::File; +use std::io::BufReader; + +use xml::reader::{EventReader, XmlEvent}; + +fn indent(size: usize) -> String { + const INDENT: &'static str = " "; + (0..size).map(|_| INDENT) + .fold(String::with_capacity(size*INDENT.len()), |r, s| r + s) +} + +fn main() { + let file = File::open("file.xml").unwrap(); + let file = BufReader::new(file); + + let parser = EventReader::new(file); + let mut depth = 0; + for e in parser { + match e { + Ok(XmlEvent::StartElement { name, .. }) => { + println!("{}+{}", indent(depth), name); + depth += 1; + } + Ok(XmlEvent::EndElement { name }) => { + depth -= 1; + println!("{}-{}", indent(depth), name); + } + Err(e) => { + println!("Error: {}", e); + break; + } + _ => {} + } + } +} +``` + +`EventReader` implements `IntoIterator` trait, so you can just use it in a `for` loop directly. +Document parsing can end normally or with an error. Regardless of exact cause, the parsing +process will be stopped, and iterator will terminate normally. + +You can also have finer control over when to pull the next event from the parser using its own +`next()` method: + +```rust,ignore +match parser.next() { + ... +} +``` + +Upon the end of the document or an error the parser will remember that last event and will always +return it in the result of `next()` call afterwards. If iterator is used, then it will yield +error or end-of-document event once and will produce `None` afterwards. + +It is also possible to tweak parsing process a little using `xml::reader::ParserConfig` structure. +See its documentation for more information and examples. + +You can find a more extensive example of using `EventReader` in `src/analyze.rs`, which is a +small program (BTW, it is built with `cargo build` and can be run after that) which shows various +statistics about specified XML document. It can also be used to check for well-formedness of +XML documents - if a document is not well-formed, this program will exit with an error. + +Writing XML documents +--------------------- + +xml-rs also provides a streaming writer much like StAX event writer. With it you can write an +XML document to any `Write` implementor. + +```rust,no_run +extern crate xml; + +use std::fs::File; +use std::io::{self, Write}; + +use xml::writer::{EventWriter, EmitterConfig, XmlEvent, Result}; + +fn handle_event(w: &mut EventWriter, line: String) -> Result<()> { + let line = line.trim(); + let event: XmlEvent = if line.starts_with("+") && line.len() > 1 { + XmlEvent::start_element(&line[1..]).into() + } else if line.starts_with("-") { + XmlEvent::end_element().into() + } else { + XmlEvent::characters(&line).into() + }; + w.write(event) +} + +fn main() { + let mut file = File::create("output.xml").unwrap(); + + let mut input = io::stdin(); + let mut output = io::stdout(); + let mut writer = EmitterConfig::new().perform_indent(true).create_writer(&mut file); + loop { + print!("> "); output.flush().unwrap(); + let mut line = String::new(); + match input.read_line(&mut line) { + Ok(0) => break, + Ok(_) => match handle_event(&mut writer, line) { + Ok(_) => {} + Err(e) => panic!("Write error: {}", e) + }, + Err(e) => panic!("Input error: {}", e) + } + } +} +``` + +The code example above also demonstrates how to create a writer out of its configuration. +Similar thing also works with `EventReader`. + +The library provides an XML event building DSL which helps to construct complex events, +e.g. ones having namespace definitions. Some examples: + +```rust,ignore +// +XmlEvent::start_element("a:hello").attr("a:param", "value").ns("a", "urn:some:document") + +// +XmlEvent::start_element("hello").attr("b:config", "value").default_ns("urn:defaul:uri") + +// +XmlEvent::cdata("some unescaped text") +``` + +Of course, one can create `XmlEvent` enum variants directly instead of using the builder DSL. +There are more examples in `xml::writer::XmlEvent` documentation. + +The writer has multiple configuration options; see `EmitterConfig` documentation for more +information. + +Other things +------------ + +No performance tests or measurements are done. The implementation is rather naive, and no specific +optimizations are made. Hopefully the library is sufficiently fast to process documents of common size. +I intend to add benchmarks in future, but not until more important features are added. + +Known issues +------------ + +All known issues are present on GitHub issue tracker: . +Feel free to post any found problems there. + +License +------- + +This library is licensed under MIT license. + +--- +Copyright (C) Vladimir Matveev, 2014-2020 diff --git a/design.md b/design.md new file mode 100644 index 0000000..da67c7b --- /dev/null +++ b/design.md @@ -0,0 +1,37 @@ +# Reader + +Basic features: + * [x] Parsing XML 1.0 documents and returning a stream of events + - [ ] Support reading embedded DTD schemas + - [ ] Support for embedded entities + * [x] Support for namespaces and emitting namespace information in events + * [ ] \[maybe\] push-based wrapper + * Missing XML features + - [ ] Support for different encodings + - [ ] Attribute values normalization + - [ ] EOL characters normalization + +Advanced features: + * [ ] DTD schema validation + * [ ] XSD schema validation + +# Writer + +Basic features: + * [x] Writing basic XML 1.0 documents in UTF-8 + * [x] Writing XML 1.0 documents with namespace support + * [x] Support for writing elements with empty body as empty elements + * [x] Pretty-printed and compact output + * [ ] Writing XML document with embedded DTDs and DTD references + * Misc features: + - [ ] Support for different encodings + - [x] Support for writing CDATA as characters + - [ ] Checking events for invalid characters (e.g. `--` in comments) + - [ ] Check for namespaces more correctly, i.e. check both for prefix and namespace URI + - [ ] Support checking namespace prefix presence in the current namespace for events with prefix but without namespace + - [ ] Support checking namespace prefix for events with both prefix and namespace URI + +# Other + +DOM-based API: + * [ ] Basic support for DOM-based API diff --git a/src/analyze.rs b/src/analyze.rs new file mode 100644 index 0000000..d369d2f --- /dev/null +++ b/src/analyze.rs @@ -0,0 +1,99 @@ +#![forbid(unsafe_code)] + +extern crate xml; + +use std::cmp; +use std::env; +use std::io::{self, Read, Write, BufReader}; +use std::fs::File; +use std::collections::HashSet; + +use xml::ParserConfig; +use xml::reader::XmlEvent; + +macro_rules! abort { + ($code:expr) => {::std::process::exit($code)}; + ($code:expr, $($args:tt)+) => {{ + writeln!(&mut ::std::io::stderr(), $($args)+).unwrap(); + ::std::process::exit($code); + }} +} + +fn main() { + let mut file; + let mut stdin; + let source: &mut Read = match env::args().nth(1) { + Some(file_name) => { + file = File::open(file_name) + .unwrap_or_else(|e| abort!(1, "Cannot open input file: {}", e)); + &mut file + } + None => { + stdin = io::stdin(); + &mut stdin + } + }; + + let reader = ParserConfig::new() + .whitespace_to_characters(true) + .ignore_comments(false) + .create_reader(BufReader::new(source)); + + let mut processing_instructions = 0; + let mut elements = 0; + let mut character_blocks = 0; + let mut cdata_blocks = 0; + let mut characters = 0; + let mut comment_blocks = 0; + let mut comment_characters = 0; + let mut namespaces = HashSet::new(); + let mut depth = 0; + let mut max_depth = 0; + + for e in reader { + match e { + Ok(e) => match e { + XmlEvent::StartDocument { version, encoding, standalone } => + println!( + "XML document version {}, encoded in {}, {}standalone", + version, encoding, if standalone.unwrap_or(false) { "" } else { "not " } + ), + XmlEvent::EndDocument => println!("Document finished"), + XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1, + XmlEvent::Whitespace(_) => {} // can't happen due to configuration + XmlEvent::Characters(s) => { + character_blocks += 1; + characters += s.len(); + } + XmlEvent::CData(s) => { + cdata_blocks += 1; + characters += s.len(); + } + XmlEvent::Comment(s) => { + comment_blocks += 1; + comment_characters += s.len(); + } + XmlEvent::StartElement { namespace, .. } => { + depth += 1; + max_depth = cmp::max(max_depth, depth); + elements += 1; + namespaces.extend(namespace.0.into_iter().map(|(_, ns_uri)| ns_uri)); + } + XmlEvent::EndElement { .. } => { + depth -= 1; + } + }, + Err(e) => abort!(1, "Error parsing XML document: {}", e) + } + } + namespaces.remove(xml::namespace::NS_EMPTY_URI); + namespaces.remove(xml::namespace::NS_XMLNS_URI); + namespaces.remove(xml::namespace::NS_XML_URI); + + println!("Elements: {}, maximum depth: {}", elements, max_depth); + println!("Namespaces (excluding built-in): {}", namespaces.len()); + println!("Characters: {}, characters blocks: {}, CDATA blocks: {}", + characters, character_blocks, cdata_blocks); + println!("Comment blocks: {}, comment characters: {}", comment_blocks, comment_characters); + println!("Processing instructions (excluding built-in): {}", processing_instructions); +} diff --git a/src/attribute.rs b/src/attribute.rs new file mode 100644 index 0000000..8728f49 --- /dev/null +++ b/src/attribute.rs @@ -0,0 +1,99 @@ +//! Contains XML attributes manipulation types and functions. +//! + +use std::fmt; + +use name::{Name, OwnedName}; +use escape::escape_str_attribute; + +/// A borrowed version of an XML attribute. +/// +/// Consists of a borrowed qualified name and a borrowed string value. +#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)] +pub struct Attribute<'a> { + /// Attribute name. + pub name: Name<'a>, + + /// Attribute value. + pub value: &'a str +} + +impl<'a> fmt::Display for Attribute<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}=\"{}\"", self.name, escape_str_attribute(self.value)) + } +} + +impl<'a> Attribute<'a> { + /// Creates an owned attribute out of this borrowed one. + #[inline] + pub fn to_owned(&self) -> OwnedAttribute { + OwnedAttribute { + name: self.name.into(), + value: self.value.into(), + } + } + + /// Creates a borrowed attribute using the provided borrowed name and a borrowed string value. + #[inline] + pub fn new(name: Name<'a>, value: &'a str) -> Attribute<'a> { + Attribute { name, value, } + } +} + +/// An owned version of an XML attribute. +/// +/// Consists of an owned qualified name and an owned string value. +#[derive(Clone, Eq, PartialEq, Hash, Debug)] +pub struct OwnedAttribute { + /// Attribute name. + pub name: OwnedName, + + /// Attribute value. + pub value: String +} + +impl OwnedAttribute { + /// Returns a borrowed `Attribute` out of this owned one. + pub fn borrow(&self) -> Attribute { + Attribute { + name: self.name.borrow(), + value: &*self.value, + } + } + + /// Creates a new owned attribute using the provided owned name and an owned string value. + #[inline] + pub fn new>(name: OwnedName, value: S) -> OwnedAttribute { + OwnedAttribute { + name, + value: value.into(), + } + } +} + +impl fmt::Display for OwnedAttribute { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}=\"{}\"", self.name, escape_str_attribute(&*self.value)) + } +} + +#[cfg(test)] +mod tests { + use super::{Attribute}; + + use name::Name; + + #[test] + fn attribute_display() { + let attr = Attribute::new( + Name::qualified("attribute", "urn:namespace", Some("n")), + "its value with > & \" ' < weird symbols" + ); + + assert_eq!( + &*attr.to_string(), + "{urn:namespace}n:attribute=\"its value with > & " ' < weird symbols\"" + ) + } +} diff --git a/src/common.rs b/src/common.rs new file mode 100644 index 0000000..029e851 --- /dev/null +++ b/src/common.rs @@ -0,0 +1,142 @@ +//! Contains common types and functions used throughout the library. + +use std::fmt; + +/// Represents a position inside some textual document. +#[derive(Copy, Clone, PartialEq, Eq)] +pub struct TextPosition { + /// Row, counting from 0 + pub row: u64, + /// Column, counting from 0 + pub column: u64, +} + +impl TextPosition { + /// Creates a new position initialized to the beginning of the document + #[inline] + pub fn new() -> TextPosition { + TextPosition { row: 0, column: 0 } + } + + /// Advances the position in a line + #[inline] + pub fn advance(&mut self, count: u8) { + self.column += count as u64; + } + + /// Advances the position in a line to the next tab position + #[inline] + pub fn advance_to_tab(&mut self, width: u8) { + let width = width as u64; + self.column += width - self.column % width + } + + /// Advances the position to the beginning of the next line + #[inline] + pub fn new_line(&mut self) { + self.column = 0; + self.row += 1; + } +} + +impl fmt::Debug for TextPosition { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}:{}", self.row + 1, self.column + 1) + } +} + +impl fmt::Display for TextPosition { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}:{}", self.row + 1, self.column + 1) + } +} + +/// Get the position in the document corresponding to the object +/// +/// This trait is implemented by parsers, lexers and errors. +pub trait Position { + /// Returns the current position or a position corresponding to the object. + fn position(&self) -> TextPosition; +} + +impl Position for TextPosition { + #[inline] + fn position(&self) -> TextPosition { + *self + } +} + +/// XML version enumeration. +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum XmlVersion { + /// XML version 1.0. + Version10, + + /// XML version 1.1. + Version11 +} + +impl fmt::Display for XmlVersion { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + XmlVersion::Version10 => write!(f, "1.0"), + XmlVersion::Version11 => write!(f, "1.1") + } + } +} + +impl fmt::Debug for XmlVersion { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +/// Checks whether the given character is a white space character (`S`) +/// as is defined by XML 1.1 specification, [section 2.3][1]. +/// +/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +pub fn is_whitespace_char(c: char) -> bool { + match c { + '\x20' | '\x09' | '\x0d' | '\x0a' => true, + _ => false + } +} + +/// Checks whether the given string is compound only by white space +/// characters (`S`) using the previous is_whitespace_char to check +/// all characters of this string +pub fn is_whitespace_str(s: &str) -> bool { + s.chars().all(is_whitespace_char) +} + +/// Checks whether the given character is a name start character (`NameStartChar`) +/// as is defined by XML 1.1 specification, [section 2.3][1]. +/// +/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +pub fn is_name_start_char(c: char) -> bool { + match c { + ':' | 'A'...'Z' | '_' | 'a'...'z' | + '\u{C0}'...'\u{D6}' | '\u{D8}'...'\u{F6}' | '\u{F8}'...'\u{2FF}' | + '\u{370}'...'\u{37D}' | '\u{37F}'...'\u{1FFF}' | + '\u{200C}'...'\u{200D}' | '\u{2070}'...'\u{218F}' | + '\u{2C00}'...'\u{2FEF}' | '\u{3001}'...'\u{D7FF}' | + '\u{F900}'...'\u{FDCF}' | '\u{FDF0}'...'\u{FFFD}' | + '\u{10000}'...'\u{EFFFF}' => true, + _ => false + } +} + +/// Checks whether the given character is a name character (`NameChar`) +/// as is defined by XML 1.1 specification, [section 2.3][1]. +/// +/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +pub fn is_name_char(c: char) -> bool { + match c { + _ if is_name_start_char(c) => true, + '-' | '.' | '0'...'9' | '\u{B7}' | + '\u{300}'...'\u{36F}' | '\u{203F}'...'\u{2040}' => true, + _ => false + } +} diff --git a/src/escape.rs b/src/escape.rs new file mode 100644 index 0000000..18298b9 --- /dev/null +++ b/src/escape.rs @@ -0,0 +1,126 @@ +//! Contains functions for performing XML special characters escaping. + +use std::borrow::Cow; + +enum Value { + Char(char), + Str(&'static str) +} + +impl Value { + fn dispatch_for_attribute(c: char) -> Value { + match c { + '<' => Value::Str("<"), + '>' => Value::Str(">"), + '"' => Value::Str("""), + '\'' => Value::Str("'"), + '&' => Value::Str("&"), + '\n' => Value::Str(" "), + '\r' => Value::Str(" "), + _ => Value::Char(c) + } + } + + fn dispatch_for_pcdata(c: char) -> Value { + match c { + '<' => Value::Str("<"), + '&' => Value::Str("&"), + _ => Value::Char(c) + } + } +} + +enum Process<'a> { + Borrowed(&'a str), + Owned(String) +} + +impl<'a> Process<'a> { + fn process(&mut self, (i, next): (usize, Value)) { + match next { + Value::Str(s) => match *self { + Process::Owned(ref mut o) => o.push_str(s), + Process::Borrowed(b) => { + let mut r = String::with_capacity(b.len() + s.len()); + r.push_str(&b[..i]); + r.push_str(s); + *self = Process::Owned(r); + } + }, + Value::Char(c) => match *self { + Process::Borrowed(_) => {} + Process::Owned(ref mut o) => o.push(c) + } + } + } + + fn into_result(self) -> Cow<'a, str> { + match self { + Process::Borrowed(b) => Cow::Borrowed(b), + Process::Owned(o) => Cow::Owned(o) + } + } +} + +impl<'a> Extend<(usize, Value)> for Process<'a> { + fn extend>(&mut self, it: I) { + for v in it.into_iter() { + self.process(v); + } + } +} + +fn escape_str(s: &str, dispatch: fn(char) -> Value) -> Cow { + let mut p = Process::Borrowed(s); + p.extend(s.char_indices().map(|(ind, c)| (ind, dispatch(c)))); + p.into_result() +} + +/// Performs escaping of common XML characters inside an attribute value. +/// +/// This function replaces several important markup characters with their +/// entity equivalents: +/// +/// * `<` → `<` +/// * `>` → `>` +/// * `"` → `"` +/// * `'` → `'` +/// * `&` → `&` +/// +/// The resulting string is safe to use inside XML attribute values or in PCDATA sections. +/// +/// Does not perform allocations if the given string does not contain escapable characters. +#[inline] +pub fn escape_str_attribute(s: &str) -> Cow { + escape_str(s, Value::dispatch_for_attribute) +} + +/// Performs escaping of common XML characters inside PCDATA. +/// +/// This function replaces several important markup characters with their +/// entity equivalents: +/// +/// * `<` → `<` +/// * `&` → `&` +/// +/// The resulting string is safe to use inside PCDATA sections but NOT inside attribute values. +/// +/// Does not perform allocations if the given string does not contain escapable characters. +#[inline] +pub fn escape_str_pcdata(s: &str) -> Cow { + escape_str(s, Value::dispatch_for_pcdata) +} + +#[cfg(test)] +mod tests { + use super::{escape_str_pcdata, escape_str_attribute}; + + // TODO: add more tests + + #[test] + fn test_escape_multibyte_code_points() { + assert_eq!(escape_str_attribute("☃<"), "☃<"); + assert_eq!(escape_str_pcdata("☃<"), "☃<"); + } +} + diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..fb672ef --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,29 @@ +//#![warn(missing_doc)] +#![allow(dead_code)] +#![allow(unused_variables)] +#![forbid(non_camel_case_types)] +#![forbid(unsafe_code)] + +//! This crate currently provides an almost XML 1.0/1.1-compliant pull parser. + +#[cfg(doctest)] +#[macro_use] +extern crate doc_comment; + +#[cfg(doctest)] +doctest!("../Readme.md"); + +pub use reader::EventReader; +pub use reader::ParserConfig; +pub use writer::EventWriter; +pub use writer::EmitterConfig; + +pub mod macros; +pub mod name; +pub mod attribute; +pub mod common; +pub mod escape; +pub mod namespace; +pub mod reader; +pub mod writer; +mod util; diff --git a/src/macros.rs b/src/macros.rs new file mode 100644 index 0000000..1cce3d6 --- /dev/null +++ b/src/macros.rs @@ -0,0 +1,30 @@ +#![macro_use] + +//! Contains several macros used in this crate. + +macro_rules! gen_setter { + ($target:ty, $field:ident : into $t:ty) => { + impl $target { + /// Sets the field to the provided value and returns updated config object. + pub fn $field>(mut self, value: T) -> $target { + self.$field = value.into(); + self + } + } + }; + ($target:ty, $field:ident : val $t:ty) => { + impl $target { + /// Sets the field to the provided value and returns updated config object. + pub fn $field(mut self, value: $t) -> $target { + self.$field = value; + self + } + } + } +} + +macro_rules! gen_setters { + ($target:ty, $($field:ident : $k:tt $tpe:ty),+) => ($( + gen_setter! { $target, $field : $k $tpe } + )+) +} diff --git a/src/name.rs b/src/name.rs new file mode 100644 index 0000000..a20eae2 --- /dev/null +++ b/src/name.rs @@ -0,0 +1,301 @@ +//! Contains XML qualified names manipulation types and functions. +//! + +use std::fmt; +use std::str::FromStr; + +use namespace::NS_NO_PREFIX; + +/// Represents a qualified XML name. +/// +/// A qualified name always consists at least of a local name. It can optionally contain +/// a prefix; when reading an XML document, if it contains a prefix, it must also contain a +/// namespace URI, but this is not enforced statically; see below. The name can contain a +/// namespace without a prefix; in that case a default, empty prefix is assumed. +/// +/// When writing XML documents, it is possible to omit the namespace URI, leaving only +/// the prefix. In this case the writer will check that the specifed prefix is bound to some +/// URI in the current namespace context. If both prefix and namespace URI are specified, +/// it is checked that the current namespace context contains this exact correspondence +/// between prefix and namespace URI. +/// +/// # Prefixes and URIs +/// +/// A qualified name with a prefix must always contain a proper namespace URI --- names with +/// a prefix but without a namespace associated with that prefix are meaningless. However, +/// it is impossible to obtain proper namespace URI by a prefix without a context, and such +/// context is only available when parsing a document (or it can be constructed manually +/// when writing a document). Tying a name to a context statically seems impractical. This +/// may change in future, though. +/// +/// # Conversions +/// +/// `Name` implements some `From` instances for conversion from strings and tuples. For example: +/// +/// ```rust +/// # use xml::name::Name; +/// let n1: Name = "p:some-name".into(); +/// let n2: Name = ("p", "some-name").into(); +/// +/// assert_eq!(n1, n2); +/// assert_eq!(n1.local_name, "some-name"); +/// assert_eq!(n1.prefix, Some("p")); +/// assert!(n1.namespace.is_none()); +/// ``` +/// +/// This is added to support easy specification of XML elements when writing XML documents. +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +pub struct Name<'a> { + /// A local name, e.g. `string` in `xsi:string`. + pub local_name: &'a str, + + /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`. + pub namespace: Option<&'a str>, + + /// A name prefix, e.g. `xsi` in `xsi:string`. + pub prefix: Option<&'a str> +} + +impl<'a> From<&'a str> for Name<'a> { + fn from(s: &'a str) -> Name<'a> { + let mut parts = s.splitn(2, ":").fuse(); + match (parts.next(), parts.next()) { + (Some(name), None) => Name::local(name), + (Some(prefix), Some(name)) => Name::prefixed(name, prefix), + _ => unreachable!() + } + } +} + +impl<'a> From<(&'a str, &'a str)> for Name<'a> { + fn from((prefix, name): (&'a str, &'a str)) -> Name<'a> { + Name::prefixed(name, prefix) + } +} + +impl<'a> fmt::Display for Name<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if let Some(namespace) = self.namespace { + write!(f, "{{{}}}", namespace)?; + } + + if let Some(prefix) = self.prefix { + write!(f, "{}:", prefix)?; + } + + write!(f, "{}", self.local_name) + } +} + +impl<'a> Name<'a> { + /// Returns an owned variant of the qualified name. + pub fn to_owned(&self) -> OwnedName { + OwnedName { + local_name: self.local_name.into(), + namespace: self.namespace.map(|s| s.into()), + prefix: self.prefix.map(|s| s.into()) + } + } + + /// Returns a new `Name` instance representing plain local name. + #[inline] + pub fn local(local_name: &str) -> Name { + Name { + local_name, + prefix: None, + namespace: None + } + } + + /// Returns a new `Name` instance with the given local name and prefix. + #[inline] + pub fn prefixed(local_name: &'a str, prefix: &'a str) -> Name<'a> { + Name { + local_name, + namespace: None, + prefix: Some(prefix) + } + } + + /// Returns a new `Name` instance representing a qualified name with or without a prefix and + /// with a namespace URI. + #[inline] + pub fn qualified(local_name: &'a str, namespace: &'a str, prefix: Option<&'a str>) -> Name<'a> { + Name { + local_name, + namespace: Some(namespace), + prefix, + } + } + + /// Returns a correct XML representation of this local name and prefix. + /// + /// This method is different from the autoimplemented `to_string()` because it does not + /// include namespace URI in the result. + pub fn to_repr(&self) -> String { + self.repr_display().to_string() + } + + /// Returns a structure which can be displayed with `std::fmt` machinery to obtain this + /// local name and prefix. + /// + /// This method is needed for efficiency purposes in order not to create unnecessary + /// allocations. + #[inline] + pub fn repr_display(&self) -> ReprDisplay { + ReprDisplay(self) + } + + /// Returns either a prefix of this name or `namespace::NS_NO_PREFIX` constant. + #[inline] + pub fn prefix_repr(&self) -> &str { + self.prefix.unwrap_or(NS_NO_PREFIX) + } +} + +/// A wrapper around `Name` whose `Display` implementation prints the wrapped name as it is +/// displayed in an XML document. +pub struct ReprDisplay<'a, 'b:'a>(&'a Name<'b>); + +impl<'a, 'b:'a> fmt::Display for ReprDisplay<'a, 'b> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.0.prefix { + Some(prefix) => write!(f, "{}:{}", prefix, self.0.local_name), + None => write!(f, "{}", self.0.local_name) + } + } +} + +/// An owned variant of `Name`. +/// +/// Everything about `Name` applies to this structure as well. +#[derive(Clone, PartialEq, Eq, Hash, Debug)] +pub struct OwnedName { + /// A local name, e.g. `string` in `xsi:string`. + pub local_name: String, + + /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`. + pub namespace: Option, + + /// A name prefix, e.g. `xsi` in `xsi:string`. + pub prefix: Option, +} + +impl fmt::Display for OwnedName { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&self.borrow(), f) + } +} + +impl OwnedName { + /// Constructs a borrowed `Name` based on this owned name. + pub fn borrow(&self) -> Name { + Name { + local_name: &*self.local_name, + namespace: self.namespace.as_ref().map(|s| &**s), + prefix: self.prefix.as_ref().map(|s| &**s), + } + } + + /// Returns a new `OwnedName` instance representing a plain local name. + #[inline] + pub fn local(local_name: S) -> OwnedName where S: Into { + OwnedName { + local_name: local_name.into(), + namespace: None, + prefix: None, + } + } + + /// Returns a new `OwnedName` instance representing a qualified name with or without + /// a prefix and with a namespace URI. + #[inline] + pub fn qualified(local_name: S1, namespace: S2, prefix: Option) -> OwnedName + where S1: Into, S2: Into, S3: Into + { + OwnedName { + local_name: local_name.into(), + namespace: Some(namespace.into()), + prefix: prefix.map(|v| v.into()) + } + } + + /// Returns an optional prefix by reference, equivalent to `self.borrow().prefix` + /// but avoids extra work. + #[inline] + pub fn prefix_ref(&self) -> Option<&str> { + self.prefix.as_ref().map(|s| &**s) + } + + /// Returns an optional namespace by reference, equivalen to `self.borrow().namespace` + /// but avoids extra work. + #[inline] + pub fn namespace_ref(&self) -> Option<&str> { + self.namespace.as_ref().map(|s| &**s) + } +} + +impl<'a> From> for OwnedName { + #[inline] + fn from(n: Name<'a>) -> OwnedName { + n.to_owned() + } +} + +impl FromStr for OwnedName { + type Err = (); + + /// Parses the given string slice into a qualified name. + /// + /// This function, when finishes sucessfully, always return a qualified + /// name without a namespace (`name.namespace == None`). It should be filled later + /// using proper `NamespaceStack`. + /// + /// It is supposed that all characters in the argument string are correct + /// as defined by the XML specification. No additional checks except a check + /// for emptiness are done. + fn from_str(s: &str) -> Result { + let mut it = s.split(':'); + + let r = match (it.next(), it.next(), it.next()) { + (Some(prefix), Some(local_name), None) if !prefix.is_empty() && + !local_name.is_empty() => + Some((local_name.into(), Some(prefix.into()))), + (Some(local_name), None, None) if !local_name.is_empty() => + Some((local_name.into(), None)), + (_, _, _) => None + }; + r.map(|(local_name, prefix)| OwnedName { + local_name, + namespace: None, + prefix + }).ok_or(()) + } +} + +#[cfg(test)] +mod tests { + use super::OwnedName; + + #[test] + fn test_owned_name_from_str() { + assert_eq!("prefix:name".parse(), Ok(OwnedName { + local_name: "name".into(), + namespace: None, + prefix: Some("prefix".into()) + })); + + assert_eq!("name".parse(), Ok(OwnedName { + local_name: "name".into(), + namespace: None, + prefix: None + })); + + assert_eq!("".parse(), Err::(())); + assert_eq!(":".parse(), Err::(())); + assert_eq!(":a".parse(), Err::(())); + assert_eq!("a:".parse(), Err::(())); + assert_eq!("a:b:c".parse(), Err::(())); + } +} diff --git a/src/namespace.rs b/src/namespace.rs new file mode 100644 index 0000000..1ab4a5c --- /dev/null +++ b/src/namespace.rs @@ -0,0 +1,485 @@ +//! Contains namespace manipulation types and functions. + +use std::iter::{Map, Rev}; +use std::collections::btree_map::{BTreeMap, Entry}; +use std::collections::btree_map::Iter as Entries; +use std::collections::HashSet; +use std::slice::Iter; + +/// Designates prefix for namespace definitions. +/// +/// See [Namespaces in XML][namespace] spec for more information. +/// +/// [namespace]: http://www.w3.org/TR/xml-names/#ns-decl +pub const NS_XMLNS_PREFIX: &'static str = "xmlns"; + +/// Designates the standard URI for `xmlns` prefix. +/// +/// See [A Namespace Name for xmlns Attributes][1] for more information. +/// +/// [namespace]: http://www.w3.org/2000/xmlns/ +pub const NS_XMLNS_URI: &'static str = "http://www.w3.org/2000/xmlns/"; + +/// Designates prefix for a namespace containing several special predefined attributes. +/// +/// See [2.10 White Space handling][1], [2.1 Language Identification][2], +/// [XML Base specification][3] and [xml:id specification][4] for more information. +/// +/// [1]: http://www.w3.org/TR/REC-xml/#sec-white-space +/// [2]: http://www.w3.org/TR/REC-xml/#sec-lang-tag +/// [3]: http://www.w3.org/TR/xmlbase/ +/// [4]: http://www.w3.org/TR/xml-id/ +pub const NS_XML_PREFIX: &'static str = "xml"; + +/// Designates the standard URI for `xml` prefix. +/// +/// See `NS_XML_PREFIX` documentation for more information. +pub const NS_XML_URI: &'static str = "http://www.w3.org/XML/1998/namespace"; + +/// Designates the absence of prefix in a qualified name. +/// +/// This constant should be used to define or query default namespace which should be used +/// for element or attribute names without prefix. For example, if a namespace mapping +/// at a particular point in the document contains correspondence like +/// +/// ```none +/// NS_NO_PREFIX --> urn:some:namespace +/// ``` +/// +/// then all names declared without an explicit prefix `urn:some:namespace` is assumed as +/// a namespace URI. +/// +/// By default empty prefix corresponds to absence of namespace, but this can change either +/// when writing an XML document (manually) or when reading an XML document (based on namespace +/// declarations). +pub const NS_NO_PREFIX: &'static str = ""; + +/// Designates an empty namespace URI, which is equivalent to absence of namespace. +/// +/// This constant should not usually be used directly; it is used to designate that +/// empty prefix corresponds to absent namespace in `NamespaceStack` instances created with +/// `NamespaceStack::default()`. Therefore, it can be used to restore `NS_NO_PREFIX` mapping +/// in a namespace back to its default value. +pub const NS_EMPTY_URI: &'static str = ""; + +/// Namespace is a map from prefixes to namespace URIs. +/// +/// No prefix (i.e. default namespace) is designated by `NS_NO_PREFIX` constant. +#[derive(PartialEq, Eq, Clone, Debug)] +pub struct Namespace(pub BTreeMap); + +impl Namespace { + /// Returns an empty namespace. + #[inline] + pub fn empty() -> Namespace { Namespace(BTreeMap::new()) } + + /// Checks whether this namespace is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Checks whether this namespace is essentially empty, that is, it does not contain + /// anything but default mappings. + pub fn is_essentially_empty(&self) -> bool { + // a shortcut for a namespace which is definitely not empty + if self.0.len() > 3 { return false; } + + self.0.iter().all(|(k, v)| match (&**k, &**v) { + (NS_NO_PREFIX, NS_EMPTY_URI) => true, + (NS_XMLNS_PREFIX, NS_XMLNS_URI) => true, + (NS_XML_PREFIX, NS_XML_URI) => true, + _ => false + }) + } + + /// Checks whether this namespace mapping contains the given prefix. + /// + /// # Parameters + /// * `prefix` --- namespace prefix. + /// + /// # Return value + /// `true` if this namespace contains the given prefix, `false` otherwise. + #[inline] + pub fn contains>(&self, prefix: &P) -> bool { + self.0.contains_key(prefix.as_ref()) + } + + /// Puts a mapping into this namespace. + /// + /// This method does not override any already existing mappings. + /// + /// Returns a boolean flag indicating whether the map already contained + /// the given prefix. + /// + /// # Parameters + /// * `prefix` --- namespace prefix; + /// * `uri` --- namespace URI. + /// + /// # Return value + /// `true` if `prefix` has been inserted successfully; `false` if the `prefix` + /// was already present in the namespace. + pub fn put(&mut self, prefix: P, uri: U) -> bool + where P: Into, U: Into + { + match self.0.entry(prefix.into()) { + Entry::Occupied(_) => false, + Entry::Vacant(ve) => { + ve.insert(uri.into()); + true + } + } + } + + /// Puts a mapping into this namespace forcefully. + /// + /// This method, unlike `put()`, does replace an already existing mapping. + /// + /// Returns previous URI which was assigned to the given prefix, if it is present. + /// + /// # Parameters + /// * `prefix` --- namespace prefix; + /// * `uri` --- namespace URI. + /// + /// # Return value + /// `Some(uri)` with `uri` being a previous URI assigned to the `prefix`, or + /// `None` if such prefix was not present in the namespace before. + pub fn force_put(&mut self, prefix: P, uri: U) -> Option + where P: Into, U: Into + { + self.0.insert(prefix.into(), uri.into()) + } + + /// Queries the namespace for the given prefix. + /// + /// # Parameters + /// * `prefix` --- namespace prefix. + /// + /// # Return value + /// Namespace URI corresponding to the given prefix, if it is present. + pub fn get<'a, P: ?Sized+AsRef>(&'a self, prefix: &P) -> Option<&'a str> { + self.0.get(prefix.as_ref()).map(|s| &**s) + } +} + +/// An alias for iterator type for namespace mappings contained in a namespace. +pub type NamespaceMappings<'a> = Map< + Entries<'a, String, String>, + for<'b> fn((&'b String, &'b String)) -> UriMapping<'b> +>; + +impl<'a> IntoIterator for &'a Namespace { + type Item = UriMapping<'a>; + type IntoIter = NamespaceMappings<'a>; + + fn into_iter(self) -> Self::IntoIter { + fn mapper<'a>((prefix, uri): (&'a String, &'a String)) -> UriMapping<'a> { + (&*prefix, &*uri) + } + self.0.iter().map(mapper) + } +} + +/// Namespace stack is a sequence of namespaces. +/// +/// Namespace stack is used to represent cumulative namespace consisting of +/// combined namespaces from nested elements. +#[derive(Clone, Eq, PartialEq, Debug)] +pub struct NamespaceStack(pub Vec); + +impl NamespaceStack { + /// Returns an empty namespace stack. + #[inline] + pub fn empty() -> NamespaceStack { NamespaceStack(Vec::with_capacity(2)) } + + /// Returns a namespace stack with default items in it. + /// + /// Default items are the following: + /// + /// * `xml` → `http://www.w3.org/XML/1998/namespace`; + /// * `xmlns` → `http://www.w3.org/2000/xmlns/`. + #[inline] + pub fn default() -> NamespaceStack { + let mut nst = NamespaceStack::empty(); + nst.push_empty(); + // xml namespace + nst.put(NS_XML_PREFIX, NS_XML_URI); + // xmlns namespace + nst.put(NS_XMLNS_PREFIX, NS_XMLNS_URI); + // empty namespace + nst.put(NS_NO_PREFIX, NS_EMPTY_URI); + nst + } + + /// Adds an empty namespace to the top of this stack. + #[inline] + pub fn push_empty(&mut self) -> &mut NamespaceStack { + self.0.push(Namespace::empty()); + self + } + + /// Removes the topmost namespace in this stack. + /// + /// Panics if the stack is empty. + #[inline] + pub fn pop(&mut self) -> Namespace { + self.0.pop().unwrap() + } + + /// Removes the topmost namespace in this stack. + /// + /// Returns `Some(namespace)` if this stack is not empty and `None` otherwise. + #[inline] + pub fn try_pop(&mut self) -> Option { + self.0.pop() + } + + /// Borrows the topmost namespace mutably, leaving the stack intact. + /// + /// Panics if the stack is empty. + #[inline] + pub fn peek_mut(&mut self) -> &mut Namespace { + self.0.last_mut().unwrap() + } + + /// Borrows the topmost namespace immutably, leaving the stack intact. + /// + /// Panics if the stack is empty. + #[inline] + pub fn peek(&self) -> &Namespace { + self.0.last().unwrap() + } + + /// Puts a mapping into the topmost namespace if this stack does not already contain one. + /// + /// Returns a boolean flag indicating whether the insertion has completed successfully. + /// Note that both key and value are matched and the mapping is inserted if either + /// namespace prefix is not already mapped, or if it is mapped, but to a different URI. + /// + /// # Parameters + /// * `prefix` --- namespace prefix; + /// * `uri` --- namespace URI. + /// + /// # Return value + /// `true` if `prefix` has been inserted successfully; `false` if the `prefix` + /// was already present in the namespace stack. + pub fn put_checked(&mut self, prefix: P, uri: U) -> bool + where P: Into + AsRef, + U: Into + AsRef + { + if self.0.iter().any(|ns| ns.get(&prefix) == Some(uri.as_ref())) { + false + } else { + self.put(prefix, uri); + true + } + } + + /// Puts a mapping into the topmost namespace in this stack. + /// + /// This method does not override a mapping in the topmost namespace if it is + /// already present, however, it does not depend on other namespaces in the stack, + /// so it is possible to put a mapping which is present in lower namespaces. + /// + /// Returns a boolean flag indicating whether the insertion has completed successfully. + /// + /// # Parameters + /// * `prefix` --- namespace prefix; + /// * `uri` --- namespace URI. + /// + /// # Return value + /// `true` if `prefix` has been inserted successfully; `false` if the `prefix` + /// was already present in the namespace. + #[inline] + pub fn put(&mut self, prefix: P, uri: U) -> bool + where P: Into, U: Into + { + self.0.last_mut().unwrap().put(prefix, uri) + } + + /// Performs a search for the given prefix in the whole stack. + /// + /// This method walks the stack from top to bottom, querying each namespace + /// in order for the given prefix. If none of the namespaces contains the prefix, + /// `None` is returned. + /// + /// # Parameters + /// * `prefix` --- namespace prefix. + #[inline] + pub fn get<'a, P: ?Sized+AsRef>(&'a self, prefix: &P) -> Option<&'a str> { + let prefix = prefix.as_ref(); + for ns in self.0.iter().rev() { + match ns.get(prefix) { + None => {}, + r => return r, + } + } + None + } + + /// Combines this stack of namespaces into a single namespace. + /// + /// Namespaces are combined in left-to-right order, that is, rightmost namespace + /// elements take priority over leftmost ones. + pub fn squash(&self) -> Namespace { + let mut result = BTreeMap::new(); + for ns in self.0.iter() { + result.extend(ns.0.iter().map(|(k, v)| (k.clone(), v.clone()))); + } + Namespace(result) + } + + /// Returns an object which implements `Extend` using `put_checked()` instead of `put()`. + /// + /// See `CheckedTarget` for more information. + #[inline] + pub fn checked_target(&mut self) -> CheckedTarget { + CheckedTarget(self) + } + + /// Returns an iterator over all mappings in this namespace stack. + #[inline] + pub fn iter(&self) -> NamespaceStackMappings { + self.into_iter() + } +} + +/// An iterator over mappings from prefixes to URIs in a namespace stack. +/// +/// # Example +/// ``` +/// # use xml::namespace::NamespaceStack; +/// let mut nst = NamespaceStack::empty(); +/// nst.push_empty(); +/// nst.put("a", "urn:A"); +/// nst.put("b", "urn:B"); +/// nst.push_empty(); +/// nst.put("c", "urn:C"); +/// +/// assert_eq!(vec![("c", "urn:C"), ("a", "urn:A"), ("b", "urn:B")], nst.iter().collect::>()); +/// ``` +pub struct NamespaceStackMappings<'a> { + namespaces: Rev>, + current_namespace: Option>, + used_keys: HashSet<&'a str> +} + +impl<'a> NamespaceStackMappings<'a> { + fn go_to_next_namespace(&mut self) -> bool { + self.current_namespace = self.namespaces.next().map(|ns| ns.into_iter()); + self.current_namespace.is_some() + } +} + +impl<'a> Iterator for NamespaceStackMappings<'a> { + type Item = UriMapping<'a>; + + fn next(&mut self) -> Option> { + // If there is no current namespace and no next namespace, we're finished + if self.current_namespace.is_none() && !self.go_to_next_namespace() { + return None; + } + let next_item = self.current_namespace.as_mut().unwrap().next(); + + match next_item { + // There is an element in the current namespace + Some((k, v)) => if self.used_keys.contains(&k) { + // If the current key is used, go to the next one + self.next() + } else { + // Otherwise insert the current key to the set of used keys and + // return the mapping + self.used_keys.insert(k); + Some((k, v)) + }, + // Current namespace is exhausted + None => if self.go_to_next_namespace() { + // If there is next namespace, continue from it + self.next() + } else { + // No next namespace, exiting + None + } + } + } +} + +impl<'a> IntoIterator for &'a NamespaceStack { + type Item = UriMapping<'a>; + type IntoIter = NamespaceStackMappings<'a>; + + fn into_iter(self) -> Self::IntoIter { + NamespaceStackMappings { + namespaces: self.0.iter().rev(), + current_namespace: None, + used_keys: HashSet::new() + } + } +} + +/// A type alias for a pair of `(prefix, uri)` values returned by namespace iterators. +pub type UriMapping<'a> = (&'a str, &'a str); + +impl<'a> Extend> for Namespace { + fn extend(&mut self, iterable: T) where T: IntoIterator> { + for (prefix, uri) in iterable { + self.put(prefix, uri); + } + } +} + +impl<'a> Extend> for NamespaceStack { + fn extend(&mut self, iterable: T) where T: IntoIterator> { + for (prefix, uri) in iterable { + self.put(prefix, uri); + } + } +} + +/// A wrapper around `NamespaceStack` which implements `Extend` using `put_checked()`. +/// +/// # Example +/// +/// ``` +/// # use xml::namespace::NamespaceStack; +/// +/// let mut nst = NamespaceStack::empty(); +/// nst.push_empty(); +/// nst.put("a", "urn:A"); +/// nst.put("b", "urn:B"); +/// nst.push_empty(); +/// nst.put("c", "urn:C"); +/// +/// nst.checked_target().extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]); +/// assert_eq!( +/// vec![("a", "urn:Z"), ("c", "urn:C"), ("d", "urn:D"), ("b", "urn:B")], +/// nst.iter().collect::>() +/// ); +/// ``` +/// +/// Compare: +/// +/// ``` +/// # use xml::namespace::NamespaceStack; +/// # let mut nst = NamespaceStack::empty(); +/// # nst.push_empty(); +/// # nst.put("a", "urn:A"); +/// # nst.put("b", "urn:B"); +/// # nst.push_empty(); +/// # nst.put("c", "urn:C"); +/// +/// nst.extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]); +/// assert_eq!( +/// vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:C"), ("d", "urn:D")], +/// nst.iter().collect::>() +/// ); +/// ``` +pub struct CheckedTarget<'a>(&'a mut NamespaceStack); + +impl<'a, 'b> Extend> for CheckedTarget<'a> { + fn extend(&mut self, iterable: T) where T: IntoIterator> { + for (prefix, uri) in iterable { + self.0.put_checked(prefix, uri); + } + } +} diff --git a/src/reader/config.rs b/src/reader/config.rs new file mode 100644 index 0000000..0abb165 --- /dev/null +++ b/src/reader/config.rs @@ -0,0 +1,181 @@ +//! Contains parser configuration structure. +use std::io::Read; +use std::collections::HashMap; + +use reader::EventReader; + +/// Parser configuration structure. +/// +/// This structure contains various configuration options which affect +/// behavior of the parser. +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct ParserConfig { + /// Whether or not should whitespace in textual events be removed. Default is false. + /// + /// When true, all standalone whitespace will be removed (this means no + /// `Whitespace` events will be emitted), and leading and trailing whitespace + /// from `Character` events will be deleted. If after trimming `Characters` + /// event will be empty, it will also be omitted from output stream. This is + /// possible, however, only if `whitespace_to_characters` or + /// `cdata_to_characters` options are set. + /// + /// This option does not affect CDATA events, unless `cdata_to_characters` + /// option is also set. In that case CDATA content will also be trimmed. + pub trim_whitespace: bool, + + /// Whether or not should whitespace be converted to characters. + /// Default is false. + /// + /// If true, instead of `Whitespace` events `Characters` events with the + /// same content will be emitted. If `trim_whitespace` is also true, these + /// events will be trimmed to nothing and, consequently, not emitted. + pub whitespace_to_characters: bool, + + /// Whether or not should CDATA be converted to characters. + /// Default is false. + /// + /// If true, instead of `CData` events `Characters` events with the same + /// content will be emitted. If `trim_whitespace` is also true, these events + /// will be trimmed. If corresponding CDATA contained nothing but whitespace, + /// this event will be omitted from the stream. + pub cdata_to_characters: bool, + + /// Whether or not should comments be omitted. Default is true. + /// + /// If true, `Comment` events will not be emitted at all. + pub ignore_comments: bool, + + /// Whether or not should sequential `Characters` events be merged. + /// Default is true. + /// + /// If true, multiple sequential `Characters` events will be merged into + /// a single event, that is, their data will be concatenated. + /// + /// Multiple sequential `Characters` events are only possible if either + /// `cdata_to_characters` or `ignore_comments` are set. Otherwise character + /// events will always be separated by other events. + pub coalesce_characters: bool, + + /// A map of extra entities recognized by the parser. Default is an empty map. + /// + /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes, + /// however, it is convenient to make the parser recognize additional entities which + /// are also not available through the DTD definitions (especially given that at the moment + /// DTD parsing is not supported). + pub extra_entities: HashMap, + + /// Whether or not the parser should ignore the end of stream. Default is false. + /// + /// By default the parser will either error out when it encounters a premature end of + /// stream or complete normally if the end of stream was expected. If you want to continue + /// reading from a stream whose input is supplied progressively, you can set this option to true. + /// In this case the parser will allow you to invoke the next() method even if a supposed end + /// of stream has happened. + /// + /// Note that support for this functionality is incomplete; for example, the parser will fail if + /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk. + pub ignore_end_of_stream: bool, + + /// Whether or not non-unicode entity references get replaced with the replacement character + /// + /// When true, any decimal or hexadecimal character reference that cannot be converted from a + /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html) + /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD). + pub replace_unknown_entity_references: bool, + + /// Whether or not whitespace at the root level of the document is ignored. Default is true. + /// + /// By default any whitespace that is not enclosed within at least one level of elements will be + /// ignored. Setting this value to false will cause root level whitespace events to be emitted. + pub ignore_root_level_whitespace: bool, +} + +impl ParserConfig { + /// Returns a new config with default values. + /// + /// You can tweak default values using builder-like pattern: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let config = ParserConfig::new() + /// .trim_whitespace(true) + /// .ignore_comments(true) + /// .coalesce_characters(false); + /// ``` + pub fn new() -> ParserConfig { + ParserConfig { + trim_whitespace: false, + whitespace_to_characters: false, + cdata_to_characters: false, + ignore_comments: true, + coalesce_characters: true, + extra_entities: HashMap::new(), + ignore_end_of_stream: false, + replace_unknown_entity_references: false, + ignore_root_level_whitespace: true, + } + } + + /// Creates an XML reader with this configuration. + /// + /// This is a convenience method for configuring and creating a reader at the same time: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let mut source: &[u8] = b"..."; + /// + /// let reader = ParserConfig::new() + /// .trim_whitespace(true) + /// .ignore_comments(true) + /// .coalesce_characters(false) + /// .create_reader(&mut source); + /// ``` + /// + /// This method is exactly equivalent to calling `EventReader::new_with_config()` with + /// this configuration object. + #[inline] + pub fn create_reader(self, source: R) -> EventReader { + EventReader::new_with_config(source, self) + } + + /// Adds a new entity mapping and returns an updated config object. + /// + /// This is a convenience method for adding external entities mappings to the XML parser. + /// An example: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let mut source: &[u8] = b"..."; + /// + /// let reader = ParserConfig::new() + /// .add_entity("nbsp", " ") + /// .add_entity("copy", "©") + /// .add_entity("reg", "®") + /// .create_reader(&mut source); + /// ``` + pub fn add_entity, T: Into>(mut self, entity: S, value: T) -> ParserConfig { + self.extra_entities.insert(entity.into(), value.into()); + self + } +} + +impl Default for ParserConfig { + #[inline] + fn default() -> ParserConfig { + ParserConfig::new() + } +} + +gen_setters! { ParserConfig, + trim_whitespace: val bool, + whitespace_to_characters: val bool, + cdata_to_characters: val bool, + ignore_comments: val bool, + coalesce_characters: val bool, + ignore_end_of_stream: val bool, + replace_unknown_entity_references: val bool, + ignore_root_level_whitespace: val bool +} diff --git a/src/reader/error.rs b/src/reader/error.rs new file mode 100644 index 0000000..92378e6 --- /dev/null +++ b/src/reader/error.rs @@ -0,0 +1,121 @@ + +use std::io; +use std::borrow::Cow; +use std::fmt; +use std::error; +use std::str; + +use util; +use common::{Position, TextPosition}; + +#[derive(Debug)] +pub enum ErrorKind { + Syntax(Cow<'static, str>), + Io(io::Error), + Utf8(str::Utf8Error), + UnexpectedEof, +} + +/// An XML parsing error. +/// +/// Consists of a 2D position in a document and a textual message describing the error. +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct Error { + pos: TextPosition, + kind: ErrorKind, +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{} {}", self.pos, self.msg()) + } +} + +impl Position for Error { + #[inline] + fn position(&self) -> TextPosition { self.pos } +} + +impl Error { + /// Returns a reference to a message which is contained inside this error. + #[inline] + pub fn msg(&self) -> &str { + use self::ErrorKind::*; + match self.kind { + UnexpectedEof => &"Unexpected EOF", + Utf8(ref reason) => error_description(reason), + Io(ref io_error) => error_description(io_error), + Syntax(ref msg) => msg.as_ref(), + } + } + + pub fn kind(&self) -> &ErrorKind { &self.kind } +} + +impl error::Error for Error { + #[inline] + fn description(&self) -> &str { self.msg() } +} + +impl<'a, P, M> From<(&'a P, M)> for Error where P: Position, M: Into> { + fn from(orig: (&'a P, M)) -> Self { + Error{ + pos: orig.0.position(), + kind: ErrorKind::Syntax(orig.1.into()) + } + } +} + +impl From for Error { + fn from(e: util::CharReadError) -> Self { + use util::CharReadError::*; + Error{ + pos: TextPosition::new(), + kind: match e { + UnexpectedEof => ErrorKind::UnexpectedEof, + Utf8(reason) => ErrorKind::Utf8(reason), + Io(io_error) => ErrorKind::Io(io_error), + } + } + } +} + +impl From for Error { + fn from(e: io::Error) -> Self { + Error { + pos: TextPosition::new(), + kind: ErrorKind::Io(e), + } + } +} + +impl Clone for ErrorKind { + fn clone(&self) -> Self { + use self::ErrorKind::*; + match *self { + UnexpectedEof => UnexpectedEof, + Utf8(ref reason) => Utf8(reason.clone()), + Io(ref io_error) => Io(io::Error::new(io_error.kind(), error_description(io_error))), + Syntax(ref msg) => Syntax(msg.clone()), + } + } +} +impl PartialEq for ErrorKind { + fn eq(&self, other: &ErrorKind) -> bool { + use self::ErrorKind::*; + match (self, other) { + (&UnexpectedEof, &UnexpectedEof) => true, + (&Utf8(ref left), &Utf8(ref right)) => left == right, + (&Io(ref left), &Io(ref right)) => + left.kind() == right.kind() && + error_description(left) == error_description(right), + (&Syntax(ref left), &Syntax(ref right)) => + left == right, + + (_, _) => false, + } + } +} +impl Eq for ErrorKind {} + +fn error_description(e: &error::Error) -> &str { e.description() } diff --git a/src/reader/events.rs b/src/reader/events.rs new file mode 100644 index 0000000..46d7621 --- /dev/null +++ b/src/reader/events.rs @@ -0,0 +1,219 @@ +//! Contains `XmlEvent` datatype, instances of which are emitted by the parser. + +use std::fmt; +use std::borrow::Cow; + +use name::OwnedName; +use attribute::OwnedAttribute; +use common::XmlVersion; +use namespace::Namespace; + +/// An element of an XML input stream. +/// +/// Items of this enum are emitted by `reader::EventReader`. They correspond to different +/// elements of an XML document. +#[derive(PartialEq, Clone)] +pub enum XmlEvent { + /// Corresponds to XML document declaration. + /// + /// This event is always emitted before any other event. It is emitted + /// even if the actual declaration is not present in the document. + StartDocument { + /// XML version. + /// + /// If XML declaration is not present, defaults to `Version10`. + version: XmlVersion, + + /// XML document encoding. + /// + /// If XML declaration is not present or does not contain `encoding` attribute, + /// defaults to `"UTF-8"`. This field is currently used for no other purpose than + /// informational. + encoding: String, + + /// XML standalone declaration. + /// + /// If XML document is not present or does not contain `standalone` attribute, + /// defaults to `None`. This field is currently used for no other purpose than + /// informational. + standalone: Option + }, + + /// Denotes to the end of the document stream. + /// + /// This event is always emitted after any other event (except `Error`). After it + /// is emitted for the first time, it will always be emitted on next event pull attempts. + EndDocument, + + /// Denotes an XML processing instruction. + /// + /// This event contains a processing instruction target (`name`) and opaque `data`. It + /// is up to the application to process them. + ProcessingInstruction { + /// Processing instruction target. + name: String, + + /// Processing instruction content. + data: Option + }, + + /// Denotes a beginning of an XML element. + /// + /// This event is emitted after parsing opening tags or after parsing bodiless tags. In the + /// latter case `EndElement` event immediately follows. + StartElement { + /// Qualified name of the element. + name: OwnedName, + + /// A list of attributes associated with the element. + /// + /// Currently attributes are not checked for duplicates (TODO) + attributes: Vec, + + /// Contents of the namespace mapping at this point of the document. + namespace: Namespace, + }, + + /// Denotes an end of an XML element. + /// + /// This event is emitted after parsing closing tags or after parsing bodiless tags. In the + /// latter case it is emitted immediately after corresponding `StartElement` event. + EndElement { + /// Qualified name of the element. + name: OwnedName + }, + + /// Denotes CDATA content. + /// + /// This event contains unparsed data. No unescaping will be performed. + /// + /// It is possible to configure a parser to emit `Characters` event instead of `CData`. See + /// `pull::ParserConfiguration` structure for more information. + CData(String), + + /// Denotes a comment. + /// + /// It is possible to configure a parser to ignore comments, so this event will never be emitted. + /// See `pull::ParserConfiguration` structure for more information. + Comment(String), + + /// Denotes character data outside of tags. + /// + /// Contents of this event will always be unescaped, so no entities like `<` or `&` or `{` + /// will appear in it. + /// + /// It is possible to configure a parser to trim leading and trailing whitespace for this event. + /// See `pull::ParserConfiguration` structure for more information. + Characters(String), + + /// Denotes a chunk of whitespace outside of tags. + /// + /// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`. + /// See `pull::ParserConfiguration` structure for more information. When combined with whitespace + /// trimming, it will eliminate standalone whitespace from the event stream completely. + Whitespace(String) +} + +impl fmt::Debug for XmlEvent { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + XmlEvent::StartDocument { ref version, ref encoding, ref standalone } => + write!(f, "StartDocument({}, {}, {:?})", version, *encoding, *standalone), + XmlEvent::EndDocument => + write!(f, "EndDocument"), + XmlEvent::ProcessingInstruction { ref name, ref data } => + write!(f, "ProcessingInstruction({}{})", *name, match *data { + Some(ref data) => format!(", {}", data), + None => String::new() + }), + XmlEvent::StartElement { ref name, ref attributes, namespace: Namespace(ref namespace) } => + write!(f, "StartElement({}, {:?}{})", name, namespace, if attributes.is_empty() { + String::new() + } else { + let attributes: Vec = attributes.iter().map( + |a| format!("{} -> {}", a.name, a.value) + ).collect(); + format!(", [{}]", attributes.join(", ")) + }), + XmlEvent::EndElement { ref name } => + write!(f, "EndElement({})", name), + XmlEvent::Comment(ref data) => + write!(f, "Comment({})", data), + XmlEvent::CData(ref data) => + write!(f, "CData({})", data), + XmlEvent::Characters(ref data) => + write!(f, "Characters({})", data), + XmlEvent::Whitespace(ref data) => + write!(f, "Whitespace({})", data) + } + } +} + +impl XmlEvent { + /// Obtains a writer event from this reader event. + /// + /// This method is useful for streaming processing of XML documents where the output + /// is also an XML document. With this method it is possible to process some events + /// while passing other events through to the writer unchanged: + /// + /// ```rust + /// use std::str; + /// + /// use xml::{EventReader, EventWriter}; + /// use xml::reader::XmlEvent as ReaderEvent; + /// use xml::writer::XmlEvent as WriterEvent; + /// + /// let mut input: &[u8] = b"world"; + /// let mut output: Vec = Vec::new(); + /// + /// { + /// let mut reader = EventReader::new(&mut input); + /// let mut writer = EventWriter::new(&mut output); + /// + /// for e in reader { + /// match e.unwrap() { + /// ReaderEvent::Characters(s) => + /// writer.write(WriterEvent::characters(&s.to_uppercase())).unwrap(), + /// e => if let Some(e) = e.as_writer_event() { + /// writer.write(e).unwrap() + /// } + /// } + /// } + /// } + /// + /// assert_eq!( + /// str::from_utf8(&output).unwrap(), + /// r#"WORLD"# + /// ); + /// ``` + /// + /// Note that this API may change or get additions in future to improve its ergonomics. + pub fn as_writer_event<'a>(&'a self) -> Option<::writer::events::XmlEvent<'a>> { + match *self { + XmlEvent::StartDocument { version, ref encoding, standalone } => + Some(::writer::events::XmlEvent::StartDocument { + version: version, + encoding: Some(encoding), + standalone: standalone + }), + XmlEvent::ProcessingInstruction { ref name, ref data } => + Some(::writer::events::XmlEvent::ProcessingInstruction { + name: name, + data: data.as_ref().map(|s| &s[..]) + }), + XmlEvent::StartElement { ref name, ref attributes, ref namespace } => + Some(::writer::events::XmlEvent::StartElement { + name: name.borrow(), + attributes: attributes.iter().map(|a| a.borrow()).collect(), + namespace: Cow::Borrowed(namespace) + }), + XmlEvent::EndElement { ref name } => + Some(::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }), + XmlEvent::Comment(ref data) => Some(::writer::events::XmlEvent::Comment(data)), + XmlEvent::CData(ref data) => Some(::writer::events::XmlEvent::CData(data)), + XmlEvent::Characters(ref data) => Some(::writer::events::XmlEvent::Characters(data)), + XmlEvent::Whitespace(ref data) => Some(::writer::events::XmlEvent::Characters(data)), + _ => None + } + } +} diff --git a/src/reader/lexer.rs b/src/reader/lexer.rs new file mode 100644 index 0000000..c466db9 --- /dev/null +++ b/src/reader/lexer.rs @@ -0,0 +1,867 @@ +//! Contains simple lexer for XML documents. +//! +//! This module is for internal use. Use `xml::pull` module to do parsing. + +use std::fmt; +use std::collections::VecDeque; +use std::io::Read; +use std::result; +use std::borrow::Cow; + +use common::{Position, TextPosition, is_whitespace_char, is_name_char}; +use reader::Error; +use util; + +/// `Token` represents a single lexeme of an XML document. These lexemes +/// are used to perform actual parsing. +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub enum Token { + /// `` + ProcessingInstructionEnd, + /// `` + TagEnd, + /// `/>` + EmptyTagEnd, + /// `` + CommentEnd, + /// A chunk of characters, used for errors recovery. + Chunk(&'static str), + /// Any non-special character except whitespace. + Character(char), + /// Whitespace character. + Whitespace(char), + /// `=` + EqualsSign, + /// `'` + SingleQuote, + /// `"` + DoubleQuote, + /// `` + CDataEnd, + /// `&` + ReferenceStart, + /// `;` + ReferenceEnd, +} + +impl fmt::Display for Token { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Token::Chunk(s) => write!(f, "{}", s), + Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c), + other => write!(f, "{}", match other { + Token::OpeningTagStart => "<", + Token::ProcessingInstructionStart => " " " "", + Token::CDataEnd => "]]>", + Token::ReferenceStart => "&", + Token::ReferenceEnd => ";", + Token::EqualsSign => "=", + Token::SingleQuote => "'", + Token::DoubleQuote => "\"", + _ => unreachable!() + }) + } + } +} + +impl Token { + pub fn as_static_str(&self) -> Option<&'static str> { + match *self { + Token::OpeningTagStart => Some("<"), + Token::ProcessingInstructionStart => Some(" Some(" Some(" Some(""), + Token::CDataEnd => Some("]]>"), + Token::ReferenceStart => Some("&"), + Token::ReferenceEnd => Some(";"), + Token::EqualsSign => Some("="), + Token::SingleQuote => Some("'"), + Token::DoubleQuote => Some("\""), + Token::Chunk(s) => Some(s), + _ => None + } + } + + // using String.push_str(token.to_string()) is simply way too slow + pub fn push_to_string(&self, target: &mut String) { + match self.as_static_str() { + Some(s) => { target.push_str(s); } + None => { + match *self { + Token::Character(c) | Token::Whitespace(c) => target.push(c), + _ => unreachable!() + } + } + } + } + + /// Returns `true` if this token contains data that can be interpreted + /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'. + #[inline] + pub fn contains_char_data(&self) -> bool { + match *self { + Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd | + Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd | + Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true, + _ => false + } + } + + /// Returns `true` if this token corresponds to a white space character. + #[inline] + pub fn is_whitespace(&self) -> bool { + match *self { + Token::Whitespace(_) => true, + _ => false + } + } +} + +enum State { + /// Triggered on '<' + TagStarted, + /// Triggered on ', Error>; + +/// Helps to set up a dispatch table for lexing large unambigous tokens like +/// ` ( + match $s { + $( + $st => match $c { + $stc => $_self.move_to($is($next_st)), + _ => $_self.handle_error($chunk, $c) + }, + )+ + $end_st => match $c { + $end_c => $e, + _ => $_self.handle_error($end_chunk, $c) + } + } + ) +); + +/// `Lexer` is a lexer for XML documents, which implements pull API. +/// +/// Main method is `next_token` which accepts an `std::io::Read` instance and +/// tries to read the next lexeme from it. +/// +/// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s. +/// When it is not set, errors will be reported as `Err` objects with a string message. +/// By default this flag is not set. Use `enable_errors` and `disable_errors` methods +/// to toggle the behavior. +pub struct Lexer { + pos: TextPosition, + head_pos: TextPosition, + char_queue: VecDeque, + st: State, + skip_errors: bool, + inside_comment: bool, + inside_token: bool, + eof_handled: bool +} + +impl Position for Lexer { + #[inline] + /// Returns the position of the last token produced by the lexer + fn position(&self) -> TextPosition { self.pos } +} + +impl Lexer { + /// Returns a new lexer with default state. + pub fn new() -> Lexer { + Lexer { + pos: TextPosition::new(), + head_pos: TextPosition::new(), + char_queue: VecDeque::with_capacity(4), // TODO: check size + st: State::Normal, + skip_errors: false, + inside_comment: false, + inside_token: false, + eof_handled: false + } + } + + /// Enables error handling so `next_token` will return `Some(Err(..))` + /// upon invalid lexeme. + #[inline] + pub fn enable_errors(&mut self) { self.skip_errors = false; } + + /// Disables error handling so `next_token` will return `Some(Chunk(..))` + /// upon invalid lexeme with this lexeme content. + #[inline] + pub fn disable_errors(&mut self) { self.skip_errors = true; } + + /// Enables special handling of some lexemes which should be done when we're parsing comment + /// internals. + #[inline] + pub fn inside_comment(&mut self) { self.inside_comment = true; } + + /// Disables the effect of `inside_comment()` method. + #[inline] + pub fn outside_comment(&mut self) { self.inside_comment = false; } + + /// Reset the eof handled flag of the lexer. + #[inline] + pub fn reset_eof_handled(&mut self) { self.eof_handled = false; } + + /// Tries to read the next token from the buffer. + /// + /// It is possible to pass different instaces of `BufReader` each time + /// this method is called, but the resulting behavior is undefined in this case. + /// + /// Return value: + /// * `Err(reason) where reason: reader::Error` - when an error occurs; + /// * `Ok(None)` - upon end of stream is reached; + /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream. + pub fn next_token(&mut self, b: &mut B) -> Result { + // Already reached end of buffer + if self.eof_handled { + return Ok(None); + } + + if !self.inside_token { + self.pos = self.head_pos; + self.inside_token = true; + } + + // Check if we have saved a char or two for ourselves + while let Some(c) = self.char_queue.pop_front() { + match try!(self.read_next_token(c)) { + Some(t) => { + self.inside_token = false; + return Ok(Some(t)); + } + None => {} // continue + } + } + + loop { + // TODO: this should handle multiple encodings + let c = match try!(util::next_char_from(b)) { + Some(c) => c, // got next char + None => break, // nothing to read left + }; + + match try!(self.read_next_token(c)) { + Some(t) => { + self.inside_token = false; + return Ok(Some(t)); + } + None => { + // continue + } + } + } + + // Handle end of stream + self.eof_handled = true; + self.pos = self.head_pos; + match self.st { + State::TagStarted | State::CommentOrCDataOrDoctypeStarted | + State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) | + State::CommentClosing(ClosingSubstate::Second) | + State::DoctypeFinishing(_) => + Err(self.error("Unexpected end of stream")), + State::ProcessingInstructionClosing => + Ok(Some(Token::Character('?'))), + State::EmptyTagClosing => + Ok(Some(Token::Character('/'))), + State::CommentClosing(ClosingSubstate::First) => + Ok(Some(Token::Character('-'))), + State::CDataClosing(ClosingSubstate::First) => + Ok(Some(Token::Character(']'))), + State::CDataClosing(ClosingSubstate::Second) => + Ok(Some(Token::Chunk("]]"))), + State::Normal => + Ok(None) + } + } + + #[inline] + fn error>>(&self, msg: M) -> Error { + (self, msg).into() + } + + #[inline] + fn read_next_token(&mut self, c: char) -> Result { + let res = self.dispatch_char(c); + if self.char_queue.is_empty() { + if c == '\n' { + self.head_pos.new_line(); + } else { + self.head_pos.advance(1); + } + } + res + } + + fn dispatch_char(&mut self, c: char) -> Result { + match self.st { + State::Normal => self.normal(c), + State::TagStarted => self.tag_opened(c), + State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c), + State::CommentStarted => self.comment_started(c), + State::CDataStarted(s) => self.cdata_started(c, s), + State::DoctypeStarted(s) => self.doctype_started(c, s), + State::DoctypeFinishing(d) => self.doctype_finishing(c, d), + State::ProcessingInstructionClosing => self.processing_instruction_closing(c), + State::EmptyTagClosing => self.empty_element_closing(c), + State::CommentClosing(s) => self.comment_closing(c, s), + State::CDataClosing(s) => self.cdata_closing(c, s) + } + } + + #[inline] + fn move_to(&mut self, st: State) -> Result { + self.st = st; + Ok(None) + } + + #[inline] + fn move_to_with(&mut self, st: State, token: Token) -> Result { + self.st = st; + Ok(Some(token)) + } + + #[inline] + fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result { + self.char_queue.extend(cs.iter().cloned()); + self.move_to_with(st, token) + } + + fn handle_error(&mut self, chunk: &'static str, c: char) -> Result { + self.char_queue.push_back(c); + if self.skip_errors || (self.inside_comment && chunk != "--") { // FIXME: looks hacky + self.move_to_with(State::Normal, Token::Chunk(chunk)) + } else { + Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c))) + } + } + + /// Encountered a char + fn normal(&mut self, c: char) -> Result { + match c { + '<' => self.move_to(State::TagStarted), + '>' => Ok(Some(Token::TagEnd)), + '/' => self.move_to(State::EmptyTagClosing), + '=' => Ok(Some(Token::EqualsSign)), + '"' => Ok(Some(Token::DoubleQuote)), + '\'' => Ok(Some(Token::SingleQuote)), + '?' => self.move_to(State::ProcessingInstructionClosing), + '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), + ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)), + '&' => Ok(Some(Token::ReferenceStart)), + ';' => Ok(Some(Token::ReferenceEnd)), + _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))), + _ => Ok(Some(Token::Character(c))) + } + } + + /// Encountered '<' + fn tag_opened(&mut self, c: char) -> Result { + match c { + '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart), + '/' => self.move_to_with(State::Normal, Token::ClosingTagStart), + '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted), + _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), + _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), + _ => self.handle_error("<", c) + } + } + + /// Encountered ' Result { + match c { + '-' => self.move_to(State::CommentStarted), + '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)), + 'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)), + _ => self.handle_error(" Result { + match c { + '-' => self.move_to_with(State::Normal, Token::CommentStart), + _ => self.handle_error(" Result { + use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA}; + dispatch_on_enum_state!(self, s, c, State::CDataStarted, + E ; 'C' ; C ; " Result { + use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP}; + dispatch_on_enum_state!(self, s, c, State::DoctypeStarted, + D ; 'O' ; DO ; " Result { + match c { + '<' => self.move_to(State::DoctypeFinishing(d + 1)), + '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd), + '>' => self.move_to(State::DoctypeFinishing(d - 1)), + _ => Ok(None), + } + } + + /// Encountered '?' + fn processing_instruction_closing(&mut self, c: char) -> Result { + match c { + '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')), + } + } + + /// Encountered '/' + fn empty_element_closing(&mut self, c: char) -> Result { + match c { + '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')), + } + } + + /// Encountered '-' + fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result { + match s { + ClosingSubstate::First => match c { + '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-')) + }, + ClosingSubstate::Second => match c { + '>' => self.move_to_with(State::Normal, Token::CommentEnd), + // double dash not followed by a greater-than is a hard error inside comment + _ if self.inside_comment => self.handle_error("--", c), + // nothing else except comment closing starts with a double dash, and comment + // closing can never be after another dash, and also we're outside of a comment, + // therefore it is safe to push only the last read character to the list of unread + // characters and pass the double dash directly to the output + _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--")) + } + } + } + + /// Encountered ']' + fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result { + match s { + ClosingSubstate::First => match c { + ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')) + }, + ClosingSubstate::Second => match c { + '>' => self.move_to_with(State::Normal, Token::CDataEnd), + _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')) + } + } + } +} + +#[cfg(test)] +mod tests { + use common::{Position}; + use std::io::{BufReader, Cursor}; + + use super::{Lexer, Token}; + + macro_rules! assert_oks( + (for $lex:ident and $buf:ident ; $($e:expr)+) => ({ + $( + assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf)); + )+ + }) + ); + + macro_rules! assert_err( + (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({ + let err = $lex.next_token(&mut $buf); + assert!(err.is_err()); + let err = err.unwrap_err(); + assert_eq!($r as u64, err.position().row); + assert_eq!($c as u64, err.position().column); + assert_eq!($s, err.msg()); + }) + ); + + macro_rules! assert_none( + (for $lex:ident and $buf:ident) => ( + assert_eq!(Ok(None), $lex.next_token(&mut $buf)); + ) + ); + + fn make_lex_and_buf(s: &str) -> (Lexer, BufReader>>) { + (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes()))) + } + + #[test] + fn simple_lexer_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#" xd

 "# + ); + + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::Whitespace(' ') + Token::Character('p') + Token::EqualsSign + Token::SingleQuote + Token::Character('q') + Token::SingleQuote + Token::TagEnd + Token::Whitespace(' ') + Token::Character('x') + Token::OpeningTagStart + Token::Character('b') + Token::Whitespace(' ') + Token::Character('z') + Token::EqualsSign + Token::DoubleQuote + Token::Character('y') + Token::DoubleQuote + Token::TagEnd + Token::Character('d') + Token::Whitespace('\t') + Token::ClosingTagStart + Token::Character('b') + Token::TagEnd + Token::ClosingTagStart + Token::Character('a') + Token::TagEnd + Token::OpeningTagStart + Token::Character('p') + Token::EmptyTagEnd + Token::Whitespace(' ') + Token::ProcessingInstructionStart + Token::Character('n') + Token::Character('m') + Token::Whitespace(' ') + Token::ProcessingInstructionEnd + Token::Whitespace(' ') + Token::CommentStart + Token::Whitespace(' ') + Token::Character('a') + Token::Whitespace(' ') + Token::Character('c') + Token::Whitespace(' ') + Token::CommentEnd + Token::Whitespace(' ') + Token::ReferenceStart + Token::Character('n') + Token::Character('b') + Token::Character('s') + Token::Character('p') + Token::ReferenceEnd + ); + assert_none!(for lex and buf); + } + + #[test] + fn special_chars_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"?x!+ // -| ]z]]"# + ); + + assert_oks!(for lex and buf ; + Token::Character('?') + Token::Character('x') + Token::Character('!') + Token::Character('+') + Token::Whitespace(' ') + Token::Character('/') + Token::Character('/') + Token::Whitespace(' ') + Token::Character('-') + Token::Character('|') + Token::Whitespace(' ') + Token::Character(']') + Token::Character('z') + Token::Chunk("]]") + ); + assert_none!(for lex and buf); + } + + #[test] + fn cdata_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#" "# + ); + + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::CDataStart + Token::Character('x') + Token::Whitespace(' ') + Token::Character('y') + Token::Whitespace(' ') + Token::Character('?') + Token::CDataEnd + Token::Whitespace(' ') + Token::ClosingTagStart + Token::Character('a') + Token::TagEnd + ); + assert_none!(for lex and buf); + } + + #[test] + fn doctype_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#" "# + ); + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::DoctypeStart + Token::TagEnd + Token::Whitespace(' ') + ); + assert_none!(for lex and buf) + } + + #[test] + fn doctype_with_internal_subset_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#" ]> "# + ); + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::DoctypeStart + Token::TagEnd + Token::Whitespace(' ') + ); + assert_none!(for lex and buf) + } + + #[test] + fn end_of_stream_handling_ok() { + macro_rules! eof_check( + ($data:expr ; $token:expr) => ({ + let (mut lex, mut buf) = make_lex_and_buf($data); + assert_oks!(for lex and buf ; $token); + assert_none!(for lex and buf); + }) + ); + eof_check!("?" ; Token::Character('?')); + eof_check!("/" ; Token::Character('/')); + eof_check!("-" ; Token::Character('-')); + eof_check!("]" ; Token::Character(']')); + eof_check!("]]" ; Token::Chunk("]]")); + } + + #[test] + fn end_of_stream_handling_error() { + macro_rules! eof_check( + ($data:expr; $r:expr, $c:expr) => ({ + let (mut lex, mut buf) = make_lex_and_buf($data); + assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream"); + assert_none!(for lex and buf); + }) + ); + eof_check!("<" ; 0, 1); + eof_check!(" ({ + let (mut lex, mut buf) = make_lex_and_buf($data); + assert_err!(for lex and buf expect row $r ; $c, $s); + + let (mut lex, mut buf) = make_lex_and_buf($data); + lex.disable_errors(); + assert_oks!(for lex and buf ; + Token::Chunk($chunk) + Token::Character($app) + ); + assert_none!(for lex and buf); + }) + ); + + #[test] + fn error_in_cdata_started() { + check_case!(""# + ); + + assert_oks!(for lex and buf ; + Token::CDataStart + Token::Character('F') + Token::Character('o') + Token::Character('o') + Token::Whitespace(' ') + Token::Character('[') + Token::Character('B') + Token::Character('a') + Token::Character('r') + Token::Character(']') + Token::CDataEnd + ); + assert_none!(for lex and buf); + } +} diff --git a/src/reader/mod.rs b/src/reader/mod.rs new file mode 100644 index 0000000..90f5b52 --- /dev/null +++ b/src/reader/mod.rs @@ -0,0 +1,129 @@ +//! Contains high-level interface for a pull-based XML parser. +//! +//! The most important type in this module is `EventReader`, which provides an iterator +//! view for events in XML document. + +use std::io::{Read}; +use std::result; + +use common::{Position, TextPosition}; + +pub use self::config::ParserConfig; +pub use self::events::XmlEvent; + +use self::parser::PullParser; + +mod lexer; +mod parser; +mod config; +mod events; + +mod error; +pub use self::error::{Error, ErrorKind}; + +/// A result type yielded by `XmlReader`. +pub type Result = result::Result; + +/// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing. +pub struct EventReader { + source: R, + parser: PullParser +} + +impl EventReader { + /// Creates a new reader, consuming the given stream. + #[inline] + pub fn new(source: R) -> EventReader { + EventReader::new_with_config(source, ParserConfig::new()) + } + + /// Creates a new reader with the provded configuration, consuming the given stream. + #[inline] + pub fn new_with_config(source: R, config: ParserConfig) -> EventReader { + EventReader { source: source, parser: PullParser::new(config) } + } + + /// Pulls and returns next XML event from the stream. + /// + /// If returned event is `XmlEvent::Error` or `XmlEvent::EndDocument`, then + /// further calls to this method will return this event again. + #[inline] + pub fn next(&mut self) -> Result { + self.parser.next(&mut self.source) + } + + pub fn source(&self) -> &R { &self.source } + pub fn source_mut(&mut self) -> &mut R { &mut self.source } + + /// Unwraps this `EventReader`, returning the underlying reader. + /// + /// Note that this operation is destructive; unwrapping the reader and wrapping it + /// again with `EventReader::new()` will create a fresh reader which will attempt + /// to parse an XML document from the beginning. + pub fn into_inner(self) -> R { + self.source + } +} + +impl Position for EventReader { + /// Returns the position of the last event produced by the reader. + #[inline] + fn position(&self) -> TextPosition { + self.parser.position() + } +} + +impl IntoIterator for EventReader { + type Item = Result; + type IntoIter = Events; + + fn into_iter(self) -> Events { + Events { reader: self, finished: false } + } +} + +/// An iterator over XML events created from some type implementing `Read`. +/// +/// When the next event is `xml::event::Error` or `xml::event::EndDocument`, then +/// it will be returned by the iterator once, and then it will stop producing events. +pub struct Events { + reader: EventReader, + finished: bool +} + +impl Events { + /// Unwraps the iterator, returning the internal `EventReader`. + #[inline] + pub fn into_inner(self) -> EventReader { + self.reader + } + + pub fn source(&self) -> &R { &self.reader.source } + pub fn source_mut(&mut self) -> &mut R { &mut self.reader.source } + +} + +impl Iterator for Events { + type Item = Result; + + #[inline] + fn next(&mut self) -> Option> { + if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { None } + else { + let ev = self.reader.next(); + match ev { + Ok(XmlEvent::EndDocument) | Err(_) => self.finished = true, + _ => {} + } + Some(ev) + } + } +} + +impl<'r> EventReader<&'r [u8]> { + /// A convenience method to create an `XmlReader` from a string slice. + #[inline] + pub fn from_str(source: &'r str) -> EventReader<&'r [u8]> { + EventReader::new(source.as_bytes()) + } +} diff --git a/src/reader/parser/inside_cdata.rs b/src/reader/parser/inside_cdata.rs new file mode 100644 index 0000000..3269fb4 --- /dev/null +++ b/src/reader/parser/inside_cdata.rs @@ -0,0 +1,32 @@ +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_cdata(&mut self, t: Token) -> Option { + match t { + Token::CDataEnd => { + self.lexer.enable_errors(); + let event = if self.config.cdata_to_characters { + None + } else { + let data = self.take_buf(); + Some(Ok(XmlEvent::CData(data))) + }; + self.into_state(State::OutsideTag, event) + } + + Token::Whitespace(_) => { + t.push_to_string(&mut self.buf); + None + } + + _ => { + self.inside_whitespace = false; + t.push_to_string(&mut self.buf); + None + } + } + } +} diff --git a/src/reader/parser/inside_closing_tag_name.rs b/src/reader/parser/inside_closing_tag_name.rs new file mode 100644 index 0000000..1d8074a --- /dev/null +++ b/src/reader/parser/inside_closing_tag_name.rs @@ -0,0 +1,34 @@ +use namespace; + +use reader::lexer::Token; + +use super::{Result, PullParser, State, QualifiedNameTarget, ClosingTagSubstate}; + +impl PullParser { + pub fn inside_closing_tag_name(&mut self, t: Token, s: ClosingTagSubstate) -> Option { + match s { + ClosingTagSubstate::CTInsideName => self.read_qualified_name(t, QualifiedNameTarget::ClosingTagNameTarget, |this, token, name| { + match name.prefix_ref() { + Some(prefix) if prefix == namespace::NS_XML_PREFIX || + prefix == namespace::NS_XMLNS_PREFIX => + // TODO: {:?} is bad, need something better + Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)), + _ => { + this.data.element_name = Some(name.clone()); + match token { + Token::Whitespace(_) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)), + Token::TagEnd => this.emit_end_element(), + _ => Some(self_error!(this; "Unexpected token inside closing tag: {}", token)) + } + } + } + }), + ClosingTagSubstate::CTAfterName => match t { + Token::Whitespace(_) => None, // Skip whitespace + Token::TagEnd => self.emit_end_element(), + _ => Some(self_error!(self; "Unexpected token inside closing tag: {}", t)) + } + } + } + +} diff --git a/src/reader/parser/inside_comment.rs b/src/reader/parser/inside_comment.rs new file mode 100644 index 0000000..fc98320 --- /dev/null +++ b/src/reader/parser/inside_comment.rs @@ -0,0 +1,32 @@ +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_comment(&mut self, t: Token) -> Option { + match t { + // Double dash is illegal inside a comment + Token::Chunk(ref s) if &s[..] == "--" => Some(self_error!(self; "Unexpected token inside a comment: --")), + + Token::CommentEnd if self.config.ignore_comments => { + self.lexer.outside_comment(); + self.into_state_continue(State::OutsideTag) + } + + Token::CommentEnd => { + self.lexer.outside_comment(); + let data = self.take_buf(); + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data))) + } + + _ if self.config.ignore_comments => None, // Do not modify buffer if ignoring the comment + + _ => { + t.push_to_string(&mut self.buf); + None + } + } + } + +} diff --git a/src/reader/parser/inside_declaration.rs b/src/reader/parser/inside_declaration.rs new file mode 100644 index 0000000..af39d10 --- /dev/null +++ b/src/reader/parser/inside_declaration.rs @@ -0,0 +1,151 @@ + +use common::XmlVersion; + +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{ + Result, PullParser, State, DeclarationSubstate, QualifiedNameTarget, + DEFAULT_VERSION, DEFAULT_ENCODING +}; + +impl PullParser { + // TODO: remove redundancy via macros or extra methods + pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option { + macro_rules! unexpected_token( + ($this:expr; $t:expr) => (Some($this.error(format!("Unexpected token inside XML declaration: {}", $t)))); + ($t:expr) => (unexpected_token!(self; $t)); + ); + + #[inline] + fn emit_start_document(this: &mut PullParser) -> Option { + this.parsed_declaration = true; + let version = this.data.take_version(); + let encoding = this.data.take_encoding(); + let standalone = this.data.take_standalone(); + this.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument { + version: version.unwrap_or(DEFAULT_VERSION), + encoding: encoding.unwrap_or(DEFAULT_ENCODING.into()), + standalone: standalone + })) + } + + match s { + DeclarationSubstate::BeforeVersion => match t { + Token::Whitespace(_) => None, // continue + Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + match &name.local_name[..] { + "ersion" if name.namespace.is_none() => + this.into_state_continue(State::InsideDeclaration( + if token == Token::EqualsSign { + DeclarationSubstate::InsideVersionValue + } else { + DeclarationSubstate::AfterVersion + } + )), + _ => unexpected_token!(this; name) + } + }), + + DeclarationSubstate::AfterVersion => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| { + this.data.version = match &value[..] { + "1.0" => Some(XmlVersion::Version10), + "1.1" => Some(XmlVersion::Version11), + _ => None + }; + if this.data.version.is_some() { + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue)) + } else { + Some(self_error!(this; "Unexpected XML version value: {}", value)) + } + }), + + DeclarationSubstate::AfterVersionValue => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)), + Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), + Token::ProcessingInstructionEnd => emit_start_document(self), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + match &name.local_name[..] { + "ncoding" if name.namespace.is_none() => + this.into_state_continue(State::InsideDeclaration( + if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding } + )), + _ => unexpected_token!(this; name) + } + }), + + DeclarationSubstate::AfterEncoding => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| { + this.data.encoding = Some(value); + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)) + }), + + DeclarationSubstate::BeforeStandaloneDecl => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), + Token::ProcessingInstructionEnd => emit_start_document(self), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + match &name.local_name[..] { + "tandalone" if name.namespace.is_none() => + this.into_state_continue(State::InsideDeclaration( + if token == Token::EqualsSign { + DeclarationSubstate::InsideStandaloneDeclValue + } else { + DeclarationSubstate::AfterStandaloneDecl + } + )), + _ => unexpected_token!(this; name) + } + }), + + DeclarationSubstate::AfterStandaloneDecl => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| { + let standalone = match &value[..] { + "yes" => Some(true), + "no" => Some(false), + _ => None + }; + if standalone.is_some() { + this.data.standalone = standalone; + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue)) + } else { + Some(self_error!(this; "Invalid standalone declaration value: {}", value)) + } + }), + + DeclarationSubstate::AfterStandaloneDeclValue => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::ProcessingInstructionEnd => emit_start_document(self), + _ => unexpected_token!(t) + } + } + } + +} diff --git a/src/reader/parser/inside_doctype.rs b/src/reader/parser/inside_doctype.rs new file mode 100644 index 0000000..8dcf367 --- /dev/null +++ b/src/reader/parser/inside_doctype.rs @@ -0,0 +1,16 @@ +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_doctype(&mut self, t: Token) -> Option { + match t { + Token::TagEnd => { + self.lexer.enable_errors(); + self.into_state_continue(State::OutsideTag) + } + + _ => None + } + } +} diff --git a/src/reader/parser/inside_opening_tag.rs b/src/reader/parser/inside_opening_tag.rs new file mode 100644 index 0000000..533874f --- /dev/null +++ b/src/reader/parser/inside_opening_tag.rs @@ -0,0 +1,108 @@ +use common::is_name_start_char; +use attribute::OwnedAttribute; +use namespace; + +use reader::lexer::Token; + +use super::{Result, PullParser, State, OpeningTagSubstate, QualifiedNameTarget}; + +impl PullParser { + pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option { + macro_rules! unexpected_token(($t:expr) => (Some(self_error!(self; "Unexpected token inside opening tag: {}", $t)))); + match s { + OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| { + match name.prefix_ref() { + Some(prefix) if prefix == namespace::NS_XML_PREFIX || + prefix == namespace::NS_XMLNS_PREFIX => + Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)), + _ => { + this.data.element_name = Some(name.clone()); + match token { + Token::TagEnd => this.emit_start_element(false), + Token::EmptyTagEnd => this.emit_start_element(true), + Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), + _ => unreachable!() + } + } + } + }), + + OpeningTagSubstate::InsideTag => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::Character(c) if is_name_start_char(c) => { + self.buf.push(c); + self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName)) + } + Token::TagEnd => self.emit_start_element(false), + Token::EmptyTagEnd => self.emit_start_element(true), + _ => unexpected_token!(t) + }, + + OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + this.data.attr_name = Some(name); + match token { + Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)), + Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), + _ => unreachable!() + } + }), + + OpeningTagSubstate::AfterAttributeName => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), + _ => unexpected_token!(t) + }, + + OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| { + let name = this.data.take_attr_name().unwrap(); // unwrap() will always succeed here + + // check that no attribute with such name is already present + // if there is one, XML is not well-formed + if this.data.attributes.iter().find(|a| a.name == name).is_some() { // TODO: looks bad + // TODO: ideally this error should point to the beginning of the attribute, + // TODO: not the end of its value + Some(self_error!(this; "Attribute '{}' is redefined", name)) + } else { + match name.prefix_ref() { + // declaring a new prefix; it is sufficient to check prefix only + // because "xmlns" prefix is reserved + Some(namespace::NS_XMLNS_PREFIX) => { + let ln = &name.local_name[..]; + if ln == namespace::NS_XMLNS_PREFIX { + Some(self_error!(this; "Cannot redefine prefix '{}'", namespace::NS_XMLNS_PREFIX)) + } else if ln == namespace::NS_XML_PREFIX && &value[..] != namespace::NS_XML_URI { + Some(self_error!(this; "Prefix '{}' cannot be rebound to another value", namespace::NS_XML_PREFIX)) + } else if value.is_empty() { + Some(self_error!(this; "Cannot undefine prefix '{}'", ln)) + } else { + this.nst.put(name.local_name.clone(), value); + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + } + } + + // declaring default namespace + None if &name.local_name[..] == namespace::NS_XMLNS_PREFIX => + match &value[..] { + namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX => + Some(self_error!(this; "Namespace '{}' cannot be default", value)), + _ => { + this.nst.put(namespace::NS_NO_PREFIX, value.clone()); + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + } + }, + + // regular attribute + _ => { + this.data.attributes.push(OwnedAttribute { + name: name.clone(), + value: value + }); + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + } + } + } + }) + } + } + +} diff --git a/src/reader/parser/inside_processing_instruction.rs b/src/reader/parser/inside_processing_instruction.rs new file mode 100644 index 0000000..8ddf6b8 --- /dev/null +++ b/src/reader/parser/inside_processing_instruction.rs @@ -0,0 +1,96 @@ +use common::{ + is_name_start_char, is_name_char, +}; + +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{Result, PullParser, State, ProcessingInstructionSubstate, DeclarationSubstate}; + +impl PullParser { + pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option { + match s { + ProcessingInstructionSubstate::PIInsideName => match t { + Token::Character(c) if !self.buf_has_data() && is_name_start_char(c) || + self.buf_has_data() && is_name_char(c) => self.append_char_continue(c), + + Token::ProcessingInstructionEnd => { + // self.buf contains PI name + let name = self.take_buf(); + + // Don't need to check for declaration because it has mandatory attributes + // but there is none + match &name[..] { + // Name is empty, it is an error + "" => Some(self_error!(self; "Encountered processing instruction without name")), + + // Found + Some(self_error!(self; "Invalid processing instruction: { + self.into_state_emit( + State::OutsideTag, + Ok(XmlEvent::ProcessingInstruction { + name: name, + data: None + }) + ) + } + } + } + + Token::Whitespace(_) => { + // self.buf contains PI name + let name = self.take_buf(); + + match &name[..] { + // We have not ever encountered an element and have not parsed XML declaration + "xml" if !self.encountered_element && !self.parsed_declaration => + self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)), + + // Found + Some(self_error!(self; "Invalid processing instruction: { + self.lexer.disable_errors(); // data is arbitrary, so disable errors + self.data.name = name; + self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideData)) + } + + } + } + + _ => Some(self_error!(self; "Unexpected token: match t { + Token::ProcessingInstructionEnd => { + self.lexer.enable_errors(); + let name = self.data.take_name(); + let data = self.take_buf(); + self.into_state_emit( + State::OutsideTag, + Ok(XmlEvent::ProcessingInstruction { + name: name, + data: Some(data) + }) + ) + }, + + // Any other token should be treated as plain characters + _ => { + t.push_to_string(&mut self.buf); + None + } + }, + } + } + +} diff --git a/src/reader/parser/inside_reference.rs b/src/reader/parser/inside_reference.rs new file mode 100644 index 0000000..60026d5 --- /dev/null +++ b/src/reader/parser/inside_reference.rs @@ -0,0 +1,89 @@ +use std::char; + +use common::{is_name_start_char, is_name_char, is_whitespace_str}; + +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_reference(&mut self, t: Token, prev_st: State) -> Option { + match t { + Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) || + self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => { + self.data.ref_data.push(c); + None + } + + Token::ReferenceEnd => { + // TODO: check for unicode correctness + let name = self.data.take_ref_data(); + let name_len = name.len(); // compute once + let c = match &name[..] { + "lt" => Ok('<'.to_string()), + "gt" => Ok('>'.to_string()), + "amp" => Ok('&'.to_string()), + "apos" => Ok('\''.to_string()), + "quot" => Ok('"'.to_string()), + "" => Err(self_error!(self; "Encountered empty entity")), + _ if name_len > 2 && name.starts_with("#x") => { + let num_str = &name[2..name_len]; + if num_str == "0" { + Err(self_error!(self; "Null character entity is not allowed")) + } else { + if self.config.replace_unknown_entity_references { + match u32::from_str_radix(num_str, 16).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name)) + } + } else { + match u32::from_str_radix(num_str, 16).ok().and_then(char::from_u32) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name)) + } + } + } + } + _ if name_len > 1 && name.starts_with('#') => { + let num_str = &name[1..name_len]; + if num_str == "0" { + Err(self_error!(self; "Null character entity is not allowed")) + } else { + if self.config.replace_unknown_entity_references { + match u32::from_str_radix(num_str, 10).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name)) + } + } + else { + match u32::from_str_radix(num_str, 10).ok().and_then(char::from_u32) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name)) + } + } + } + }, + _ => { + if let Some(v) = self.config.extra_entities.get(&name) { + Ok(v.clone()) + } else { + Err(self_error!(self; "Unexpected entity: {}", name)) + } + } + }; + match c { + Ok(c) => { + self.buf.push_str(&c); + if prev_st == State::OutsideTag && !is_whitespace_str(&c) { + self.inside_whitespace = false; + } + self.into_state_continue(prev_st) + } + Err(e) => Some(e) + } + } + + _ => Some(self_error!(self; "Unexpected token inside an entity: {}", t)) + } + } +} diff --git a/src/reader/parser/mod.rs b/src/reader/parser/mod.rs new file mode 100644 index 0000000..58ca3a6 --- /dev/null +++ b/src/reader/parser/mod.rs @@ -0,0 +1,622 @@ +//! Contains an implementation of pull-based XML parser. + +use std::mem; +use std::borrow::Cow; +use std::io::prelude::*; + +use common::{ + self, + XmlVersion, Position, TextPosition, + is_name_start_char, is_name_char, +}; +use name::OwnedName; +use attribute::OwnedAttribute; +use namespace::NamespaceStack; + +use reader::events::XmlEvent; +use reader::config::ParserConfig; +use reader::lexer::{Lexer, Token}; + +macro_rules! gen_takes( + ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => ( + $( + impl MarkupData { + #[inline] + fn $method(&mut self) -> $t { + mem::replace(&mut self.$field, $def) + } + } + )+ + ) +); + +gen_takes!( + name -> take_name, String, String::new(); + ref_data -> take_ref_data, String, String::new(); + + version -> take_version, Option, None; + encoding -> take_encoding, Option, None; + standalone -> take_standalone, Option, None; + + element_name -> take_element_name, Option, None; + + attr_name -> take_attr_name, Option, None; + attributes -> take_attributes, Vec, vec!() +); + +macro_rules! self_error( + ($this:ident; $msg:expr) => ($this.error($msg)); + ($this:ident; $fmt:expr, $($arg:expr),+) => ($this.error(format!($fmt, $($arg),+))) +); + +mod outside_tag; +mod inside_processing_instruction; +mod inside_declaration; +mod inside_doctype; +mod inside_opening_tag; +mod inside_closing_tag_name; +mod inside_comment; +mod inside_cdata; +mod inside_reference; + +static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; +static DEFAULT_ENCODING: &'static str = "UTF-8"; +static DEFAULT_STANDALONE: Option = None; + +type ElementStack = Vec; +pub type Result = super::Result; + +/// Pull-based XML parser. +pub struct PullParser { + config: ParserConfig, + lexer: Lexer, + st: State, + buf: String, + nst: NamespaceStack, + + data: MarkupData, + final_result: Option, + next_event: Option, + est: ElementStack, + pos: Vec, + + encountered_element: bool, + parsed_declaration: bool, + inside_whitespace: bool, + read_prefix_separator: bool, + pop_namespace: bool +} + +impl PullParser { + /// Returns a new parser using the given config. + pub fn new(config: ParserConfig) -> PullParser { + PullParser { + config: config, + lexer: Lexer::new(), + st: State::OutsideTag, + buf: String::new(), + nst: NamespaceStack::default(), + + data: MarkupData { + name: String::new(), + version: None, + encoding: None, + standalone: None, + ref_data: String::new(), + element_name: None, + quote: None, + attr_name: None, + attributes: Vec::new() + }, + final_result: None, + next_event: None, + est: Vec::new(), + pos: vec![TextPosition::new()], + + encountered_element: false, + parsed_declaration: false, + inside_whitespace: true, + read_prefix_separator: false, + pop_namespace: false + } + } + + /// Checks if this parser ignores the end of stream errors. + pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream } +} + +impl Position for PullParser { + /// Returns the position of the last event produced by the parser + #[inline] + fn position(&self) -> TextPosition { + self.pos[0] + } +} + +#[derive(Clone, PartialEq)] +pub enum State { + OutsideTag, + InsideOpeningTag(OpeningTagSubstate), + InsideClosingTag(ClosingTagSubstate), + InsideProcessingInstruction(ProcessingInstructionSubstate), + InsideComment, + InsideCData, + InsideDeclaration(DeclarationSubstate), + InsideDoctype, + InsideReference(Box) +} + +#[derive(Clone, PartialEq)] +pub enum OpeningTagSubstate { + InsideName, + + InsideTag, + + InsideAttributeName, + AfterAttributeName, + + InsideAttributeValue, +} + +#[derive(Clone, PartialEq)] +pub enum ClosingTagSubstate { + CTInsideName, + CTAfterName +} + +#[derive(Clone, PartialEq)] +pub enum ProcessingInstructionSubstate { + PIInsideName, + PIInsideData +} + +#[derive(Clone, PartialEq)] +pub enum DeclarationSubstate { + BeforeVersion, + InsideVersion, + AfterVersion, + + InsideVersionValue, + AfterVersionValue, + + InsideEncoding, + AfterEncoding, + + InsideEncodingValue, + + BeforeStandaloneDecl, + InsideStandaloneDecl, + AfterStandaloneDecl, + + InsideStandaloneDeclValue, + AfterStandaloneDeclValue +} + +#[derive(PartialEq)] +enum QualifiedNameTarget { + AttributeNameTarget, + OpeningTagNameTarget, + ClosingTagNameTarget +} + +#[derive(Copy, Clone, PartialEq, Eq)] +enum QuoteToken { + SingleQuoteToken, + DoubleQuoteToken +} + +impl QuoteToken { + fn from_token(t: &Token) -> QuoteToken { + match *t { + Token::SingleQuote => QuoteToken::SingleQuoteToken, + Token::DoubleQuote => QuoteToken::DoubleQuoteToken, + _ => panic!("Unexpected token: {}", t) + } + } + + fn as_token(self) -> Token { + match self { + QuoteToken::SingleQuoteToken => Token::SingleQuote, + QuoteToken::DoubleQuoteToken => Token::DoubleQuote + } + } +} + +struct MarkupData { + name: String, // used for processing instruction name + ref_data: String, // used for reference content + + version: Option, // used for XML declaration version + encoding: Option, // used for XML declaration encoding + standalone: Option, // used for XML declaration standalone parameter + + element_name: Option, // used for element name + + quote: Option, // used to hold opening quote for attribute value + attr_name: Option, // used to hold attribute name + attributes: Vec // used to hold all accumulated attributes +} + +impl PullParser { + /// Returns next event read from the given buffer. + /// + /// This method should be always called with the same buffer. If you call it + /// providing different buffers each time, the result will be undefined. + pub fn next(&mut self, r: &mut R) -> Result { + if let Some(ref ev) = self.final_result { + return ev.clone(); + } + + if let Some(ev) = self.next_event.take() { + return ev; + } + + if self.pop_namespace { + self.pop_namespace = false; + self.nst.pop(); + } + + loop { + // While lexer gives us Ok(maybe_token) -- we loop. + // Upon having a complete XML-event -- we return from the whole function. + match self.lexer.next_token(r) { + Ok(maybe_token) => + match maybe_token { + None => break, + Some(token) => + match self.dispatch_token(token) { + None => {} // continue + Some(Ok(XmlEvent::EndDocument)) => + return { + self.next_pos(); + self.set_final_result(Ok(XmlEvent::EndDocument)) + }, + Some(Ok(xml_event)) => + return { + self.next_pos(); + Ok(xml_event) + }, + Some(Err(xml_error)) => + return { + self.next_pos(); + self.set_final_result(Err(xml_error)) + }, + } + }, + Err(lexer_error) => + return self.set_final_result(Err(lexer_error)), + } + } + + // Handle end of stream + // Forward pos to the lexer head + self.next_pos(); + let ev = if self.depth() == 0 { + if self.encountered_element && self.st == State::OutsideTag { // all is ok + Ok(XmlEvent::EndDocument) + } else if !self.encountered_element { + self_error!(self; "Unexpected end of stream: no root element found") + } else { // self.st != State::OutsideTag + self_error!(self; "Unexpected end of stream") // TODO: add expected hint? + } + } else { + if self.config.ignore_end_of_stream { + self.final_result = None; + self.lexer.reset_eof_handled(); + return self_error!(self; "Unexpected end of stream: still inside the root element"); + } else { + self_error!(self; "Unexpected end of stream: still inside the root element") + } + }; + self.set_final_result(ev) + } + + // This function is to be called when a terminal event is reached. + // The function sets up the `self.final_result` into `Some(result)` and return `result`. + fn set_final_result(&mut self, result: Result) -> Result { + self.final_result = Some(result.clone()); + result + } + + #[inline] + fn error>>(&self, msg: M) -> Result { + Err((&self.lexer, msg).into()) + } + + #[inline] + fn next_pos(&mut self) { + if self.pos.len() > 1 { + self.pos.remove(0); + } else { + self.pos[0] = self.lexer.position(); + } + } + + #[inline] + fn push_pos(&mut self) { + self.pos.push(self.lexer.position()); + } + + fn dispatch_token(&mut self, t: Token) -> Option { + match self.st.clone() { + State::OutsideTag => self.outside_tag(t), + State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), + State::InsideDeclaration(s) => self.inside_declaration(t, s), + State::InsideDoctype => self.inside_doctype(t), + State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), + State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), + State::InsideComment => self.inside_comment(t), + State::InsideCData => self.inside_cdata(t), + State::InsideReference(s) => self.inside_reference(t, *s) + } + } + + #[inline] + fn depth(&self) -> usize { + self.est.len() + } + + #[inline] + fn buf_has_data(&self) -> bool { + self.buf.len() > 0 + } + + #[inline] + fn take_buf(&mut self) -> String { + mem::replace(&mut self.buf, String::new()) + } + + #[inline] + fn append_char_continue(&mut self, c: char) -> Option { + self.buf.push(c); + None + } + + #[inline] + fn into_state(&mut self, st: State, ev: Option) -> Option { + self.st = st; + ev + } + + #[inline] + fn into_state_continue(&mut self, st: State) -> Option { + self.into_state(st, None) + } + + #[inline] + fn into_state_emit(&mut self, st: State, ev: Result) -> Option { + self.into_state(st, Some(ev)) + } + + /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed, + /// an error is returned. + /// + /// # Parameters + /// * `t` --- next token; + /// * `on_name` --- a callback which is executed when whitespace is encountered. + fn read_qualified_name(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option + where F: Fn(&mut PullParser, Token, OwnedName) -> Option { + // We can get here for the first time only when self.data.name contains zero or one character, + // but first character cannot be a colon anyway + if self.buf.len() <= 1 { + self.read_prefix_separator = false; + } + + let invoke_callback = |this: &mut PullParser, t| { + let name = this.take_buf(); + match name.parse() { + Ok(name) => on_name(this, t, name), + Err(_) => Some(self_error!(this; "Qualified name is invalid: {}", name)) + } + }; + + match t { + // There can be only one colon, and not as the first character + Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => { + self.buf.push(':'); + self.read_prefix_separator = true; + None + } + + Token::Character(c) if c != ':' && (!self.buf_has_data() && is_name_start_char(c) || + self.buf_has_data() && is_name_char(c)) => + self.append_char_continue(c), + + Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t), + + Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t), + + Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget || + target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t), + + Token::Whitespace(_) => invoke_callback(self, t), + + _ => Some(self_error!(self; "Unexpected token inside qualified name: {}", t)) + } + } + + /// Dispatches tokens in order to process attribute value. + /// + /// # Parameters + /// * `t` --- next token; + /// * `on_value` --- a callback which is called when terminating quote is encountered. + fn read_attribute_value(&mut self, t: Token, on_value: F) -> Option + where F: Fn(&mut PullParser, String) -> Option { + match t { + Token::Whitespace(_) if self.data.quote.is_none() => None, // skip leading whitespace + + Token::DoubleQuote | Token::SingleQuote => match self.data.quote { + None => { // Entered attribute value + self.data.quote = Some(QuoteToken::from_token(&t)); + None + } + Some(q) if q.as_token() == t => { + self.data.quote = None; + let value = self.take_buf(); + on_value(self, value) + } + _ => { + t.push_to_string(&mut self.buf); + None + } + }, + + Token::ReferenceStart => { + let st = Box::new(self.st.clone()); + self.into_state_continue(State::InsideReference(st)) + } + + Token::OpeningTagStart => + Some(self_error!(self; "Unexpected token inside attribute value: <")), + + // Every character except " and ' and < is okay + _ => { + t.push_to_string(&mut self.buf); + None + } + } + } + + fn emit_start_element(&mut self, emit_end_element: bool) -> Option { + let mut name = self.data.take_element_name().unwrap(); + let mut attributes = self.data.take_attributes(); + + // check whether the name prefix is bound and fix its namespace + match self.nst.get(name.borrow().prefix_repr()) { + Some("") => name.namespace = None, // default namespace + Some(ns) => name.namespace = Some(ns.into()), + None => return Some(self_error!(self; "Element {} prefix is unbound", name)) + } + + // check and fix accumulated attributes prefixes + for attr in attributes.iter_mut() { + if let Some(ref pfx) = attr.name.prefix { + let new_ns = match self.nst.get(pfx) { + Some("") => None, // default namespace + Some(ns) => Some(ns.into()), + None => return Some(self_error!(self; "Attribute {} prefix is unbound", attr.name)) + }; + attr.name.namespace = new_ns; + } + } + + if emit_end_element { + self.pop_namespace = true; + self.next_event = Some(Ok(XmlEvent::EndElement { + name: name.clone() + })); + } else { + self.est.push(name.clone()); + } + let namespace = self.nst.squash(); + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { + name: name, + attributes: attributes, + namespace: namespace + })) + } + + fn emit_end_element(&mut self) -> Option { + let mut name = self.data.take_element_name().unwrap(); + + // check whether the name prefix is bound and fix its namespace + match self.nst.get(name.borrow().prefix_repr()) { + Some("") => name.namespace = None, // default namespace + Some(ns) => name.namespace = Some(ns.into()), + None => return Some(self_error!(self; "Element {} prefix is unbound", name)) + } + + let op_name = self.est.pop().unwrap(); + + if name == op_name { + self.pop_namespace = true; + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name: name })) + } else { + Some(self_error!(self; "Unexpected closing tag: {}, expected {}", name, op_name)) + } + } + +} + +#[cfg(test)] +mod tests { + use std::io::BufReader; + + use common::{Position, TextPosition}; + use name::OwnedName; + use attribute::OwnedAttribute; + use reader::parser::PullParser; + use reader::ParserConfig; + use reader::events::XmlEvent; + + fn new_parser() -> PullParser { + PullParser::new(ParserConfig::new()) + } + + macro_rules! expect_event( + ($r:expr, $p:expr, $t:pat) => ( + match $p.next(&mut $r) { + $t => {} + e => panic!("Unexpected event: {:?}", e) + } + ); + ($r:expr, $p:expr, $t:pat => $c:expr ) => ( + match $p.next(&mut $r) { + $t if $c => {} + e => panic!("Unexpected event: {:?}", e) + } + ) + ); + + macro_rules! test_data( + ($d:expr) => ({ + static DATA: &'static str = $d; + let r = BufReader::new(DATA.as_bytes()); + let p = new_parser(); + (r, p) + }) + ); + + #[test] + fn issue_3_semicolon_in_attribute_value() { + let (mut r, mut p) = test_data!(r#" + + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) => + *name == OwnedName::local("a") && + attributes.len() == 1 && + attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") && + namespace.is_essentially_empty() + ); + expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a")); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + } + + #[test] + fn issue_140_entity_reference_inside_tag() { + let (mut r, mut p) = test_data!(r#" + + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla")); + expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}"); + expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla")); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + } + + #[test] + fn opening_tag_in_attribute_value() { + let (mut r, mut p) = test_data!(r#" + + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Err(ref e) => + e.msg() == "Unexpected token inside attribute value: <" && + e.position() == TextPosition { row: 1, column: 24 } + ); + } +} diff --git a/src/reader/parser/outside_tag.rs b/src/reader/parser/outside_tag.rs new file mode 100644 index 0000000..d3f7598 --- /dev/null +++ b/src/reader/parser/outside_tag.rs @@ -0,0 +1,130 @@ +use common::is_whitespace_char; + +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{ + Result, PullParser, State, ClosingTagSubstate, OpeningTagSubstate, + ProcessingInstructionSubstate, DEFAULT_VERSION, DEFAULT_ENCODING, DEFAULT_STANDALONE +}; + +impl PullParser { + pub fn outside_tag(&mut self, t: Token) -> Option { + match t { + Token::ReferenceStart => + self.into_state_continue(State::InsideReference(Box::new(State::OutsideTag))), + + Token::Whitespace(_) if self.depth() == 0 && self.config.ignore_root_level_whitespace => None, // skip whitespace outside of the root element + + Token::Whitespace(_) if self.config.trim_whitespace && !self.buf_has_data() => None, + + Token::Whitespace(c) => { + if !self.buf_has_data() { + self.push_pos(); + } + self.append_char_continue(c) + } + + _ if t.contains_char_data() && self.depth() == 0 => + Some(self_error!(self; "Unexpected characters outside the root element: {}", t)), + + _ if t.contains_char_data() => { // Non-whitespace char data + if !self.buf_has_data() { + self.push_pos(); + } + self.inside_whitespace = false; + t.push_to_string(&mut self.buf); + None + } + + Token::ReferenceEnd => { // Semi-colon in a text outside an entity + self.inside_whitespace = false; + Token::ReferenceEnd.push_to_string(&mut self.buf); + None + } + + Token::CommentStart if self.config.coalesce_characters && self.config.ignore_comments => { + // We need to switch the lexer into a comment mode inside comments + self.lexer.inside_comment(); + self.into_state_continue(State::InsideComment) + } + + Token::CDataStart if self.config.coalesce_characters && self.config.cdata_to_characters => { + if !self.buf_has_data() { + self.push_pos(); + } + // We need to disable lexing errors inside CDATA + self.lexer.disable_errors(); + self.into_state_continue(State::InsideCData) + } + + _ => { + // Encountered some markup event, flush the buffer as characters + // or a whitespace + let mut next_event = if self.buf_has_data() { + let buf = self.take_buf(); + if self.inside_whitespace && self.config.trim_whitespace { + None + } else if self.inside_whitespace && !self.config.whitespace_to_characters { + Some(Ok(XmlEvent::Whitespace(buf))) + } else if self.config.trim_whitespace { + Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into()))) + } else { + Some(Ok(XmlEvent::Characters(buf))) + } + } else { None }; + self.inside_whitespace = true; // Reset inside_whitespace flag + self.push_pos(); + match t { + Token::ProcessingInstructionStart => + self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event), + + Token::DoctypeStart if !self.encountered_element => { + // We don't have a doctype event so skip this position + // FIXME: update when we have a doctype event + self.next_pos(); + self.lexer.disable_errors(); + self.into_state(State::InsideDoctype, next_event) + } + + Token::OpeningTagStart => { + // If declaration was not parsed and we have encountered an element, + // emit this declaration as the next event. + if !self.parsed_declaration { + self.parsed_declaration = true; + let sd_event = XmlEvent::StartDocument { + version: DEFAULT_VERSION, + encoding: DEFAULT_ENCODING.into(), + standalone: DEFAULT_STANDALONE + }; + // next_event is always none here because we're outside of + // the root element + next_event = Some(Ok(sd_event)); + self.push_pos(); + } + self.encountered_element = true; + self.nst.push_empty(); + self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) + } + + Token::ClosingTagStart if self.depth() > 0 => + self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event), + + Token::CommentStart => { + // We need to switch the lexer into a comment mode inside comments + self.lexer.inside_comment(); + self.into_state(State::InsideComment, next_event) + } + + Token::CDataStart => { + // We need to disable lexing errors inside CDATA + self.lexer.disable_errors(); + self.into_state(State::InsideCData, next_event) + } + + _ => Some(self_error!(self; "Unexpected token: {}", t)) + } + } + } + } +} diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..23fee04 --- /dev/null +++ b/src/util.rs @@ -0,0 +1,107 @@ +use std::io::{self, Read}; +use std::str; +use std::fmt; + +#[derive(Debug)] +pub enum CharReadError { + UnexpectedEof, + Utf8(str::Utf8Error), + Io(io::Error) +} + +impl From for CharReadError { + fn from(e: str::Utf8Error) -> CharReadError { + CharReadError::Utf8(e) + } +} + +impl From for CharReadError { + fn from(e: io::Error) -> CharReadError { + CharReadError::Io(e) + } +} + +impl fmt::Display for CharReadError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::CharReadError::*; + match *self { + UnexpectedEof => write!(f, "unexpected end of stream"), + Utf8(ref e) => write!(f, "UTF-8 decoding error: {}", e), + Io(ref e) => write!(f, "I/O error: {}", e) + } + } +} + +pub fn next_char_from(source: &mut R) -> Result, CharReadError> { + const MAX_CODEPOINT_LEN: usize = 4; + + let mut bytes = source.bytes(); + let mut buf = [0u8; MAX_CODEPOINT_LEN]; + let mut pos = 0; + + loop { + let next = match bytes.next() { + Some(Ok(b)) => b, + Some(Err(e)) => return Err(e.into()), + None if pos == 0 => return Ok(None), + None => return Err(CharReadError::UnexpectedEof) + }; + buf[pos] = next; + pos += 1; + + match str::from_utf8(&buf[..pos]) { + Ok(s) => return Ok(s.chars().next()), // always Some(..) + Err(_) if pos < MAX_CODEPOINT_LEN => {}, + Err(e) => return Err(e.into()) + } + } +} + +#[cfg(test)] +mod tests { + #[test] + fn test_next_char_from() { + use std::io; + use std::error::Error; + + let mut bytes: &[u8] = "correct".as_bytes(); // correct ASCII + assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('c')); + + let mut bytes: &[u8] = "правильно".as_bytes(); // correct BMP + assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('п')); + + let mut bytes: &[u8] = "😊".as_bytes(); // correct non-BMP + assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('😊')); + + let mut bytes: &[u8] = b""; // empty + assert_eq!(super::next_char_from(&mut bytes).unwrap(), None); + + let mut bytes: &[u8] = b"\xf0\x9f\x98"; // incomplete code point + match super::next_char_from(&mut bytes).unwrap_err() { + super::CharReadError::UnexpectedEof => {}, + e => panic!("Unexpected result: {:?}", e) + }; + + let mut bytes: &[u8] = b"\xff\x9f\x98\x32"; // invalid code point + match super::next_char_from(&mut bytes).unwrap_err() { + super::CharReadError::Utf8(_) => {}, + e => panic!("Unexpected result: {:?}", e) + }; + + + // error during read + struct ErrorReader; + impl io::Read for ErrorReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + Err(io::Error::new(io::ErrorKind::Other, "test error")) + } + } + + let mut r = ErrorReader; + match super::next_char_from(&mut r).unwrap_err() { + super::CharReadError::Io(ref e) if e.kind() == io::ErrorKind::Other && + e.description() == "test error" => {}, + e => panic!("Unexpected result: {:?}", e) + } + } +} diff --git a/src/writer/config.rs b/src/writer/config.rs new file mode 100644 index 0000000..ebabf18 --- /dev/null +++ b/src/writer/config.rs @@ -0,0 +1,157 @@ +//! Contains emitter configuration structure. + +use std::io::Write; +use std::borrow::Cow; + +use writer::EventWriter; + +/// Emitter configuration structure. +/// +/// This structure contains various options which control XML document emitter behavior. +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct EmitterConfig { + /// Line separator used to separate lines in formatted output. Default is `"\n"`. + pub line_separator: Cow<'static, str>, + + /// A string which will be used for a single level of indentation. Default is `" "` + /// (two spaces). + pub indent_string: Cow<'static, str>, + + /// Whether or not the emitted document should be indented. Default is false. + /// + /// The emitter is capable to perform automatic indentation of the emitted XML document. + /// It is done in stream-like fashion and does not require the knowledge of the whole + /// document in advance. + /// + /// Sometimes, however, automatic indentation is undesirable, e.g. when you want to keep + /// existing layout when processing an existing XML document. Also the indentiation algorithm + /// is not thoroughly tested. Hence by default it is disabled. + pub perform_indent: bool, + + /// Whether or not characters in output events will be escaped. Default is true. + /// + /// The emitter can automatically escape characters which can't appear in PCDATA sections + /// or element attributes of an XML document, like `<` or `"` (in attributes). This may + /// introduce some overhead because then every corresponding piece of character data + /// should be scanned for invalid characters. + /// + /// If this option is disabled, the XML writer may produce non-well-formed documents, so + /// use `false` value for this option with care. + pub perform_escaping: bool, + + /// Whether or not to write XML document declaration at the beginning of a document. + /// Default is true. + /// + /// This option controls whether the document declaration should be emitted automatically + /// before a root element is written if it was not emitted explicitly by the user. + pub write_document_declaration: bool, + + /// Whether or not to convert elements with empty content to empty elements. Default is true. + /// + /// This option allows turning elements like `` (an element with empty content) + /// into `` (an empty element). + pub normalize_empty_elements: bool, + + /// Whether or not to emit CDATA events as plain characters. Default is false. + /// + /// This option forces the emitter to convert CDATA events into regular character events, + /// performing all the necessary escaping beforehand. This may be occasionally useful + /// for feeding the document into incorrect parsers which do not support CDATA. + pub cdata_to_characters: bool, + + /// Whether or not to keep element names to support `EndElement` events without explicit names. + /// Default is true. + /// + /// This option makes the emitter to keep names of written elements in order to allow + /// omitting names when writing closing element tags. This could incur some memory overhead. + pub keep_element_names_stack: bool, + + /// Whether or not to automatically insert leading and trailing spaces in emitted comments, + /// if necessary. Default is true. + /// + /// This is a convenience option in order for the user not to append spaces before and after + /// comments text in order to get more pretty comments: `` instead of + /// ``. + pub autopad_comments: bool, + + /// Whether or not to automatically insert spaces before the trailing `/>` in self-closing + /// elements. Default is true. + /// + /// This option is only meaningful if `normalize_empty_elements` is true. For example, the + /// element `` would be unaffected. When `normalize_empty_elements` is true, then when + /// this option is also true, the same element would appear ``. If this option is false, + /// then the same element would appear ``. + pub pad_self_closing: bool, +} + +impl EmitterConfig { + /// Creates an emitter configuration with default values. + /// + /// You can tweak default options with builder-like pattern: + /// + /// ```rust + /// use xml::writer::EmitterConfig; + /// + /// let config = EmitterConfig::new() + /// .line_separator("\r\n") + /// .perform_indent(true) + /// .normalize_empty_elements(false); + /// ``` + #[inline] + pub fn new() -> EmitterConfig { + EmitterConfig { + line_separator: "\n".into(), + indent_string: " ".into(), // two spaces + perform_indent: false, + perform_escaping: true, + write_document_declaration: true, + normalize_empty_elements: true, + cdata_to_characters: false, + keep_element_names_stack: true, + autopad_comments: true, + pad_self_closing: true + } + } + + /// Creates an XML writer with this configuration. + /// + /// This is a convenience method for configuring and creating a writer at the same time: + /// + /// ```rust + /// use xml::writer::EmitterConfig; + /// + /// let mut target: Vec = Vec::new(); + /// + /// let writer = EmitterConfig::new() + /// .line_separator("\r\n") + /// .perform_indent(true) + /// .normalize_empty_elements(false) + /// .create_writer(&mut target); + /// ``` + /// + /// This method is exactly equivalent to calling `EventWriter::new_with_config()` with + /// this configuration object. + #[inline] + pub fn create_writer(self, sink: W) -> EventWriter { + EventWriter::new_with_config(sink, self) + } +} + +impl Default for EmitterConfig { + #[inline] + fn default() -> EmitterConfig { + EmitterConfig::new() + } +} + +gen_setters!(EmitterConfig, + line_separator: into Cow<'static, str>, + indent_string: into Cow<'static, str>, + perform_indent: val bool, + write_document_declaration: val bool, + normalize_empty_elements: val bool, + cdata_to_characters: val bool, + keep_element_names_stack: val bool, + autopad_comments: val bool, + pad_self_closing: val bool +); diff --git a/src/writer/emitter.rs b/src/writer/emitter.rs new file mode 100644 index 0000000..ba80f66 --- /dev/null +++ b/src/writer/emitter.rs @@ -0,0 +1,447 @@ +use std::io; +use std::io::prelude::*; +use std::fmt; +use std::result; +use std::borrow::Cow; +use std::error::Error; + +use common; +use name::{Name, OwnedName}; +use attribute::Attribute; +use escape::{escape_str_attribute, escape_str_pcdata}; +use common::XmlVersion; +use namespace::{NamespaceStack, NS_NO_PREFIX, NS_EMPTY_URI, NS_XMLNS_PREFIX, NS_XML_PREFIX}; + +use writer::config::EmitterConfig; + +/// An error which may be returned by `XmlWriter` when writing XML events. +#[derive(Debug)] +pub enum EmitterError { + /// An I/O error occured in the underlying `Write` instance. + Io(io::Error), + + /// Document declaration has already been written to the output stream. + DocumentStartAlreadyEmitted, + + /// The name of the last opening element is not available. + LastElementNameNotAvailable, + + /// The name of the last opening element is not equal to the name of the provided + /// closing element. + EndElementNameIsNotEqualToLastStartElementName, + + /// End element name is not specified when it is needed, for example, when automatic + /// closing is not enabled in configuration. + EndElementNameIsNotSpecified +} + +impl From for EmitterError { + fn from(err: io::Error) -> EmitterError { + EmitterError::Io(err) + } +} + +impl fmt::Display for EmitterError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + + write!(f, "emitter error: ")?; + match *self { + EmitterError::Io(ref e) => + write!(f, "I/O error: {}", e), + ref other => + write!(f, "{}", other.description()), + } + } +} + +impl Error for EmitterError { + fn description(&self) -> &str { + match *self { + EmitterError::Io(_) => + "I/O error", + EmitterError::DocumentStartAlreadyEmitted => + "document start event has already been emitted", + EmitterError::LastElementNameNotAvailable => + "last element name is not available", + EmitterError::EndElementNameIsNotEqualToLastStartElementName => + "end element name is not equal to last start element name", + EmitterError::EndElementNameIsNotSpecified => + "end element name is not specified and can't be inferred", + } + } +} + +/// A result type yielded by `XmlWriter`. +pub type Result = result::Result; + +// TODO: split into a low-level fast writer without any checks and formatting logic and a +// high-level indenting validating writer +pub struct Emitter { + config: EmitterConfig, + + nst: NamespaceStack, + + indent_level: usize, + indent_stack: Vec, + + element_names: Vec, + + start_document_emitted: bool, + just_wrote_start_element: bool +} + +impl Emitter { + pub fn new(config: EmitterConfig) -> Emitter { + Emitter { + config, + + nst: NamespaceStack::empty(), + + indent_level: 0, + indent_stack: vec![IndentFlags::WroteNothing], + + element_names: Vec::new(), + + start_document_emitted: false, + just_wrote_start_element: false + } + } +} + +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +enum IndentFlags { + WroteNothing, + WroteMarkup, + WroteText, +} + +impl Emitter { + /// Returns the current state of namespaces. + #[inline] + pub fn namespace_stack_mut(&mut self) -> &mut NamespaceStack { + &mut self.nst + } + + #[inline] + fn wrote_text(&self) -> bool { + *self.indent_stack.last().unwrap() == IndentFlags::WroteText + } + + #[inline] + fn wrote_markup(&self) -> bool { + *self.indent_stack.last().unwrap() == IndentFlags::WroteMarkup + } + + #[inline] + fn set_wrote_text(&mut self) { + *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteText; + } + + #[inline] + fn set_wrote_markup(&mut self) { + *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteMarkup; + } + + #[inline] + fn reset_state(&mut self) { + *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteNothing; + } + + fn write_newline(&mut self, target: &mut W, level: usize) -> Result<()> { + target.write_all(self.config.line_separator.as_bytes())?; + for _ in 0..level { + target.write_all(self.config.indent_string.as_bytes())?; + } + Ok(()) + } + + fn before_markup(&mut self, target: &mut W) -> Result<()> { + if self.config.perform_indent && !self.wrote_text() && + (self.indent_level > 0 || self.wrote_markup()) { + let indent_level = self.indent_level; + self.write_newline(target, indent_level)?; + if self.indent_level > 0 && self.config.indent_string.len() > 0 { + self.after_markup(); + } + } + Ok(()) + } + + fn after_markup(&mut self) { + self.set_wrote_markup(); + } + + fn before_start_element(&mut self, target: &mut W) -> Result<()> { + self.before_markup(target)?; + self.indent_stack.push(IndentFlags::WroteNothing); + Ok(()) + } + + fn after_start_element(&mut self) { + self.after_markup(); + self.indent_level += 1; + } + + fn before_end_element(&mut self, target: &mut W) -> Result<()> { + if self.config.perform_indent && self.indent_level > 0 && self.wrote_markup() && + !self.wrote_text() { + let indent_level = self.indent_level; + self.write_newline(target, indent_level - 1) + } else { + Ok(()) + } + } + + fn after_end_element(&mut self) { + if self.indent_level > 0 { + self.indent_level -= 1; + self.indent_stack.pop(); + } + self.set_wrote_markup(); + } + + fn after_text(&mut self) { + self.set_wrote_text(); + } + + pub fn emit_start_document(&mut self, target: &mut W, + version: XmlVersion, + encoding: &str, + standalone: Option) -> Result<()> { + if self.start_document_emitted { + return Err(EmitterError::DocumentStartAlreadyEmitted); + } + self.start_document_emitted = true; + + self.before_markup(target)?; + let result = { + let mut write = move || { + write!(target, "")?; + + Ok(()) + }; + write() + }; + self.after_markup(); + + result + } + + fn check_document_started(&mut self, target: &mut W) -> Result<()> { + if !self.start_document_emitted && self.config.write_document_declaration { + self.emit_start_document(target, common::XmlVersion::Version10, "utf-8", None) + } else { + Ok(()) + } + } + + fn fix_non_empty_element(&mut self, target: &mut W) -> Result<()> { + if self.config.normalize_empty_elements && self.just_wrote_start_element { + self.just_wrote_start_element = false; + target.write_all(b">").map_err(From::from) + } else { + Ok(()) + } + } + + pub fn emit_processing_instruction(&mut self, + target: &mut W, + name: &str, + data: Option<&str>) -> Result<()> { + self.check_document_started(target)?; + self.fix_non_empty_element(target)?; + + self.before_markup(target)?; + + let result = { + let mut write = || { + write!(target, "")?; + + Ok(()) + }; + write() + }; + + self.after_markup(); + + result + } + + fn emit_start_element_initial(&mut self, target: &mut W, + name: Name, + attributes: &[Attribute]) -> Result<()> + where W: Write + { + self.check_document_started(target)?; + self.fix_non_empty_element(target)?; + self.before_start_element(target)?; + write!(target, "<{}", name.repr_display())?; + self.emit_current_namespace_attributes(target)?; + self.emit_attributes(target, attributes)?; + self.after_start_element(); + Ok(()) + } + + pub fn emit_start_element(&mut self, target: &mut W, + name: Name, + attributes: &[Attribute]) -> Result<()> + where W: Write + { + if self.config.keep_element_names_stack { + self.element_names.push(name.to_owned()); + } + + self.emit_start_element_initial(target, name, attributes)?; + self.just_wrote_start_element = true; + + if !self.config.normalize_empty_elements { + write!(target, ">")?; + } + + Ok(()) + } + + pub fn emit_current_namespace_attributes(&mut self, target: &mut W) -> Result<()> + where W: Write + { + for (prefix, uri) in self.nst.peek() { + match prefix { + // internal namespaces are not emitted + NS_XMLNS_PREFIX | NS_XML_PREFIX => Ok(()), + //// there is already a namespace binding with this prefix in scope + //prefix if self.nst.get(prefix) == Some(uri) => Ok(()), + // emit xmlns only if it is overridden + NS_NO_PREFIX => if uri != NS_EMPTY_URI { + write!(target, " xmlns=\"{}\"", uri) + } else { Ok(()) }, + // everything else + prefix => write!(target, " xmlns:{}=\"{}\"", prefix, uri) + }?; + } + Ok(()) + } + + pub fn emit_attributes(&mut self, target: &mut W, + attributes: &[Attribute]) -> Result<()> { + for attr in attributes.iter() { + write!( + target, " {}=\"{}\"", + attr.name.repr_display(), + if self.config.perform_escaping { escape_str_attribute(attr.value) } else { Cow::Borrowed(attr.value) } + )? + } + Ok(()) + } + + pub fn emit_end_element(&mut self, target: &mut W, + name: Option) -> Result<()> { + let owned_name = if self.config.keep_element_names_stack { + Some(self.element_names.pop().ok_or(EmitterError::LastElementNameNotAvailable)?) + } else { + None + }; + + // Check that last started element name equals to the provided name, if there are both + if let Some(ref last_name) = owned_name { + if let Some(ref name) = name { + if last_name.borrow() != *name { + return Err(EmitterError::EndElementNameIsNotEqualToLastStartElementName); + } + } + } + + if let Some(name) = owned_name.as_ref().map(|n| n.borrow()).or(name) { + if self.config.normalize_empty_elements && self.just_wrote_start_element { + self.just_wrote_start_element = false; + let termination = if self.config.pad_self_closing { " />" } else { "/>" }; + let result = target.write_all(termination.as_bytes()).map_err(From::from); + self.after_end_element(); + result + } else { + self.just_wrote_start_element = false; + + self.before_end_element(target)?; + let result = write!(target, "", name.repr_display()).map_err(From::from); + self.after_end_element(); + + result + } + } else { + Err(EmitterError::EndElementNameIsNotSpecified) + } + } + + pub fn emit_cdata(&mut self, target: &mut W, content: &str) -> Result<()> { + self.fix_non_empty_element(target)?; + if self.config.cdata_to_characters { + self.emit_characters(target, content) + } else { + // TODO: escape ']]>' characters in CDATA as two adjacent CDATA blocks + target.write_all(b"")?; + + self.after_text(); + + Ok(()) + } + } + + pub fn emit_characters(&mut self, target: &mut W, + content: &str) -> Result<()> { + self.check_document_started(target)?; + self.fix_non_empty_element(target)?; + target.write_all( + (if self.config.perform_escaping { + escape_str_pcdata(content) + } else { + Cow::Borrowed(content) + }).as_bytes() + )?; + self.after_text(); + Ok(()) + } + + pub fn emit_comment(&mut self, target: &mut W, content: &str) -> Result<()> { + self.fix_non_empty_element(target)?; + + // TODO: add escaping dashes at the end of the comment + + let autopad_comments = self.config.autopad_comments; + let write = |target: &mut W| -> Result<()> { + target.write_all(b"")?; + + Ok(()) + }; + + self.before_markup(target)?; + let result = write(target); + self.after_markup(); + + result + } +} diff --git a/src/writer/events.rs b/src/writer/events.rs new file mode 100644 index 0000000..1f7040f --- /dev/null +++ b/src/writer/events.rs @@ -0,0 +1,241 @@ +//! Contains `XmlEvent` datatype, instances of which are consumed by the writer. + +use std::borrow::Cow; + +use name::Name; +use attribute::Attribute; +use common::XmlVersion; +use namespace::{Namespace, NS_NO_PREFIX}; + +/// A part of an XML output stream. +/// +/// Objects of this enum are consumed by `EventWriter`. They correspond to different parts of +/// an XML document. +#[derive(Debug)] +pub enum XmlEvent<'a> { + /// Corresponds to XML document declaration. + /// + /// This event should always be written before any other event. If it is not written + /// at all, a default XML declaration will be outputted if the corresponding option + /// is set in the configuration. Otherwise an error will be returned. + StartDocument { + /// XML version. + /// + /// Defaults to `XmlVersion::Version10`. + version: XmlVersion, + + /// XML document encoding. + /// + /// Defaults to `Some("UTF-8")`. + encoding: Option<&'a str>, + + /// XML standalone declaration. + /// + /// Defaults to `None`. + standalone: Option + }, + + /// Denotes an XML processing instruction. + ProcessingInstruction { + /// Processing instruction target. + name: &'a str, + + /// Processing instruction content. + data: Option<&'a str> + }, + + /// Denotes a beginning of an XML element. + StartElement { + /// Qualified name of the element. + name: Name<'a>, + + /// A list of attributes associated with the element. + /// + /// Currently attributes are not checked for duplicates (TODO). Attribute values + /// will be escaped, and all characters invalid for attribute values like `"` or `<` + /// will be changed into character entities. + attributes: Cow<'a, [Attribute<'a>]>, + + /// Contents of the namespace mapping at this point of the document. + /// + /// This mapping will be inspected for "new" entries, and if at this point of the document + /// a particular pair of prefix and namespace URI is already defined, no namespace + /// attributes will be emitted. + namespace: Cow<'a, Namespace>, + }, + + /// Denotes an end of an XML element. + EndElement { + /// Optional qualified name of the element. + /// + /// If `None`, then it is assumed that the element name should be the last valid one. + /// If `Some` and element names tracking is enabled, then the writer will check it for + /// correctness. + name: Option> + }, + + /// Denotes CDATA content. + /// + /// This event contains unparsed data, and no escaping will be performed when writing it + /// to the output stream. + CData(&'a str), + + /// Denotes a comment. + /// + /// The string will be checked for invalid sequences and error will be returned by the + /// write operation + Comment(&'a str), + + /// Denotes character data outside of tags. + /// + /// Contents of this event will be escaped if `perform_escaping` option is enabled, + /// that is, every character invalid for PCDATA will appear as a character entity. + Characters(&'a str) +} + +impl<'a> XmlEvent<'a> { + /// Returns an writer event for a processing instruction. + #[inline] + pub fn processing_instruction(name: &'a str, data: Option<&'a str>) -> XmlEvent<'a> { + XmlEvent::ProcessingInstruction { name: name, data: data } + } + + /// Returns a builder for a starting element. + /// + /// This builder can then be used to tweak attributes and namespace starting at + /// this element. + #[inline] + pub fn start_element(name: S) -> StartElementBuilder<'a> where S: Into> { + StartElementBuilder { + name: name.into(), + attributes: Vec::new(), + namespace: Namespace::empty().into() + } + } + + /// Returns a builder for an closing element. + /// + /// This method, unline `start_element()`, does not accept a name because by default + /// the writer is able to determine it automatically. However, when this functionality + /// is disabled, it is possible to specify the name with `name()` method on the builder. + #[inline] + pub fn end_element() -> EndElementBuilder<'a> { + EndElementBuilder { name: None } + } + + /// Returns a CDATA event. + /// + /// Naturally, the provided string won't be escaped, except for closing CDATA token `]]>` + /// (depending on the configuration). + #[inline] + pub fn cdata(data: &'a str) -> XmlEvent<'a> { XmlEvent::CData(data) } + + /// Returns a regular characters (PCDATA) event. + /// + /// All offending symbols, in particular, `&` and `<`, will be escaped by the writer. + #[inline] + pub fn characters(data: &'a str) -> XmlEvent<'a> { XmlEvent::Characters(data) } + + /// Returns a comment event. + #[inline] + pub fn comment(data: &'a str) -> XmlEvent<'a> { XmlEvent::Comment(data) } +} + +impl<'a> From<&'a str> for XmlEvent<'a> { + #[inline] + fn from(s: &'a str) -> XmlEvent<'a> { XmlEvent::Characters(s) } +} + +pub struct EndElementBuilder<'a> { + name: Option> +} + +/// A builder for a closing element event. +impl<'a> EndElementBuilder<'a> { + /// Sets the name of this closing element. + /// + /// Usually the writer is able to determine closing element names automatically. If + /// this functionality is enabled (by default it is), then this name is checked for correctness. + /// It is possible, however, to disable such behavior; then the user must ensure that + /// closing element name is correct manually. + #[inline] + pub fn name(mut self, name: N) -> EndElementBuilder<'a> where N: Into> { + self.name = Some(name.into()); + self + } +} + +impl<'a> From> for XmlEvent<'a> { + fn from(b: EndElementBuilder<'a>) -> XmlEvent<'a> { + XmlEvent::EndElement { name: b.name } + } +} + +/// A builder for a starting element event. +pub struct StartElementBuilder<'a> { + name: Name<'a>, + attributes: Vec>, + namespace: Namespace +} + +impl<'a> StartElementBuilder<'a> { + /// Sets an attribute value of this element to the given string. + /// + /// This method can be used to add attributes to the starting element. Name is a qualified + /// name; its namespace is ignored, but its prefix is checked for correctness, that is, + /// it is checked that the prefix is bound to some namespace in the current context. + /// + /// Currently attributes are not checked for duplicates. Note that duplicate attributes + /// are a violation of XML document well-formedness. + /// + /// The writer checks that you don't specify reserved prefix names, for example `xmlns`. + #[inline] + pub fn attr(mut self, name: N, value: &'a str) -> StartElementBuilder<'a> + where N: Into> + { + self.attributes.push(Attribute::new(name.into(), value)); + self + } + + /// Adds a namespace to the current namespace context. + /// + /// If no namespace URI was bound to the provided prefix at this point of the document, + /// then the mapping from the prefix to the provided namespace URI will be written as + /// a part of this element attribute set. + /// + /// If the same namespace URI was bound to the provided prefix at this point of the document, + /// then no namespace attributes will be emitted. + /// + /// If some other namespace URI was bound to the provided prefix at this point of the document, + /// then another binding will be added as a part of this element attribute set, shadowing + /// the outer binding. + #[inline] + pub fn ns(mut self, prefix: S1, uri: S2) -> StartElementBuilder<'a> + where S1: Into, S2: Into + { + self.namespace.put(prefix, uri); + self + } + + /// Adds a default namespace mapping to the current namespace context. + /// + /// Same rules as for `ns()` are also valid for the default namespace mapping. + #[inline] + pub fn default_ns(mut self, uri: S) -> StartElementBuilder<'a> + where S: Into + { + self.namespace.put(NS_NO_PREFIX, uri); + self + } +} + +impl<'a> From> for XmlEvent<'a> { + #[inline] + fn from(b: StartElementBuilder<'a>) -> XmlEvent<'a> { + XmlEvent::StartElement { + name: b.name, + attributes: Cow::Owned(b.attributes), + namespace: Cow::Owned(b.namespace) + } + } +} diff --git a/src/writer/mod.rs b/src/writer/mod.rs new file mode 100644 index 0000000..ea1b242 --- /dev/null +++ b/src/writer/mod.rs @@ -0,0 +1,93 @@ +//! Contains high-level interface for an events-based XML emitter. +//! +//! The most important type in this module is `EventWriter` which allows writing an XML document +//! to some output stream. + +pub use self::emitter::Result; +pub use self::emitter::EmitterError as Error; +pub use self::config::EmitterConfig; +pub use self::events::XmlEvent; + +use self::emitter::Emitter; + +use std::io::prelude::*; + +mod emitter; +mod config; +pub mod events; + +/// A wrapper around an `std::io::Write` instance which emits XML document according to provided +/// events. +pub struct EventWriter { + sink: W, + emitter: Emitter +} + +impl EventWriter { + /// Creates a new `EventWriter` out of an `std::io::Write` instance using the default + /// configuration. + #[inline] + pub fn new(sink: W) -> EventWriter { + EventWriter::new_with_config(sink, EmitterConfig::new()) + } + + /// Creates a new `EventWriter` out of an `std::io::Write` instance using the provided + /// configuration. + #[inline] + pub fn new_with_config(sink: W, config: EmitterConfig) -> EventWriter { + EventWriter { + sink, + emitter: Emitter::new(config) + } + } + + /// Writes the next piece of XML document according to the provided event. + /// + /// Note that output data may not exactly correspond to the written event because + /// of various configuration options. For example, `XmlEvent::EndElement` may + /// correspond to a separate closing element or it may cause writing an empty element. + /// Another example is that `XmlEvent::CData` may be represented as characters in + /// the output stream. + pub fn write<'a, E>(&mut self, event: E) -> Result<()> where E: Into> { + match event.into() { + XmlEvent::StartDocument { version, encoding, standalone } => + self.emitter.emit_start_document(&mut self.sink, version, encoding.unwrap_or("UTF-8"), standalone), + XmlEvent::ProcessingInstruction { name, data } => + self.emitter.emit_processing_instruction(&mut self.sink, name, data), + XmlEvent::StartElement { name, attributes, namespace } => { + self.emitter.namespace_stack_mut().push_empty().checked_target().extend(namespace.as_ref()); + self.emitter.emit_start_element(&mut self.sink, name, &attributes) + } + XmlEvent::EndElement { name } => { + let r = self.emitter.emit_end_element(&mut self.sink, name); + self.emitter.namespace_stack_mut().try_pop(); + r + } + XmlEvent::Comment(content) => + self.emitter.emit_comment(&mut self.sink, content), + XmlEvent::CData(content) => + self.emitter.emit_cdata(&mut self.sink, content), + XmlEvent::Characters(content) => + self.emitter.emit_characters(&mut self.sink, content) + } + } + + /// Returns a mutable reference to the underlying `Writer`. + /// + /// Note that having a reference to the underlying sink makes it very easy to emit invalid XML + /// documents. Use this method with care. Valid use cases for this method include accessing + /// methods like `Write::flush`, which do not emit new data but rather change the state + /// of the stream itself. + pub fn inner_mut(&mut self) -> &mut W { + &mut self.sink + } + + /// Unwraps this `EventWriter`, returning the underlying writer. + /// + /// Note that this is a destructive operation: unwrapping a writer and then wrapping + /// it again with `EventWriter::new()` will create a fresh writer whose state will be + /// blank; for example, accumulated namespaces will be reset. + pub fn into_inner(self) -> W { + self.sink + } +} diff --git a/tests/documents/sample_1.xml b/tests/documents/sample_1.xml new file mode 100644 index 0000000..4d1cbc0 --- /dev/null +++ b/tests/documents/sample_1.xml @@ -0,0 +1,34 @@ + + + + + + + + + + Some <java> class + + + Another "java" class + + + Weird 'XML' config + + + + + + + + + + JavaScript & program + + + Cascading style sheet: © - ҉ + + + + + diff --git a/tests/documents/sample_1_full.txt b/tests/documents/sample_1_full.txt new file mode 100644 index 0000000..a8d64d0 --- /dev/null +++ b/tests/documents/sample_1_full.txt @@ -0,0 +1,58 @@ +StartDocument(1.0, utf-8) +StartElement(project [name="project-name"]) +Whitespace("\n ") +StartElement(libraries) +Whitespace("\n ") +StartElement(library [groupId="org.example", artifactId="", version="0.1"]) +EndElement(library) +Whitespace("\n ") +StartElement(library [groupId="com.example", artifactId="\"cool-lib&", version="999"]) +EndElement(library) +Whitespace("\n ") +EndElement(libraries) +Whitespace("\n ") +StartElement(module [name="module-1"]) +Whitespace("\n ") +StartElement(files) +Whitespace("\n ") +StartElement(file [name="somefile.java", type="java"]) +Characters("\n Some class\n ") +EndElement(file) +Whitespace("\n ") +StartElement(file [name="another_file.java", type="java"]) +Characters("\n Another \"java\" class\n ") +EndElement(file) +Whitespace("\n ") +StartElement(file [name="config.xml", type="xml"]) +Characters("\n Weird \'XML\' config\n ") +EndElement(file) +Whitespace("\n ") +EndElement(files) +Whitespace("\n ") +StartElement(libraries) +Whitespace("\n ") +StartElement(library [groupId="junit", artifactId="junit", version="1.9.5"]) +EndElement(library) +Whitespace("\n ") +EndElement(libraries) +Whitespace("\n ") +EndElement(module) +Whitespace("\n ") +StartElement(module [name="module-2"]) +Whitespace("\n ") +StartElement(files) +Whitespace("\n ") +StartElement(file [name="program.js", type="javascript"]) +Characters("\n JavaScript & program\n ") +EndElement(file) +Whitespace("\n ") +StartElement(file [name="style.css", type="css"]) +Characters("\n Cascading style sheet: © - ҉\n ") +EndElement(file) +Whitespace("\n ") +EndElement(files) +Whitespace("\n ") +EndElement(module) +Whitespace("\n") +EndElement(project) +EndDocument diff --git a/tests/documents/sample_1_short.txt b/tests/documents/sample_1_short.txt new file mode 100644 index 0000000..4dbe285 --- /dev/null +++ b/tests/documents/sample_1_short.txt @@ -0,0 +1,37 @@ +StartDocument(1.0, utf-8) +StartElement(project [name="project-name"]) +StartElement(libraries) +StartElement(library [groupId="org.example", artifactId="", version="0.1"]) +EndElement(library) +StartElement(library [groupId="com.example", artifactId="\"cool-lib&", version="999"]) +EndElement(library) +EndElement(libraries) +StartElement(module [name="module-1"]) +StartElement(files) +StartElement(file [name="somefile.java", type="java"]) +Characters("Some class") +EndElement(file) +StartElement(file [name="another_file.java", type="java"]) +Characters("Another \"java\" class") +EndElement(file) +StartElement(file [name="config.xml", type="xml"]) +Characters("Weird \'XML\' config") +EndElement(file) +EndElement(files) +StartElement(libraries) +StartElement(library [groupId="junit", artifactId="junit", version="1.9.5"]) +EndElement(library) +EndElement(libraries) +EndElement(module) +StartElement(module [name="module-2"]) +StartElement(files) +StartElement(file [name="program.js", type="javascript"]) +Characters("JavaScript & program") +EndElement(file) +StartElement(file [name="style.css", type="css"]) +Characters("Cascading style sheet: © - ҉") +EndElement(file) +EndElement(files) +EndElement(module) +EndElement(project) +EndDocument diff --git a/tests/documents/sample_2.xml b/tests/documents/sample_2.xml new file mode 100644 index 0000000..f9543ac --- /dev/null +++ b/tests/documents/sample_2.xml @@ -0,0 +1,15 @@ + + + + Name + Another name + 0.3 + 0.2 + 0.1 + 0.01 + header 1 value + + Some bigger value + + + diff --git a/tests/documents/sample_2_full.txt b/tests/documents/sample_2_full.txt new file mode 100644 index 0000000..75075cd --- /dev/null +++ b/tests/documents/sample_2_full.txt @@ -0,0 +1,41 @@ +StartDocument(1.0, utf-8) +StartElement({urn:example:namespace}p:data) +Whitespace("\n ") +StartElement({urn:example:namespace}p:datum [id="34"]) +Whitespace("\n ") +StartElement({urn:example:namespace}p:name) +Characters("Name") +EndElement({urn:example:namespace}p:name) +Whitespace("\n ") +StartElement({urn:example:double}d:name) +Characters("Another name") +EndElement({urn:example:double}d:name) +Whitespace("\n ") +StartElement({urn:example:double}d:arg) +Characters("0.3") +EndElement({urn:example:double}d:arg) +Whitespace("\n ") +StartElement({urn:example:double}d:arg) +Characters("0.2") +EndElement({urn:example:double}d:arg) +Whitespace("\n ") +StartElement({urn:example:namespace}p:arg) +Characters("0.1") +EndElement({urn:example:namespace}p:arg) +Whitespace("\n ") +StartElement({urn:example:namespace}p:arg) +Characters("0.01") +EndElement({urn:example:namespace}p:arg) +Whitespace("\n ") +StartElement({urn:example:header}h:header [name="Header-1"]) +Characters("header 1 value") +EndElement({urn:example:header}h:header) +Whitespace("\n ") +StartElement({urn:example:header}h:header [name="Header-2"]) +Characters("\n Some bigger value\n ") +EndElement({urn:example:header}h:header) +Whitespace("\n ") +EndElement({urn:example:namespace}p:datum) +Whitespace("\n") +EndElement({urn:example:namespace}p:data) +EndDocument diff --git a/tests/documents/sample_2_short.txt b/tests/documents/sample_2_short.txt new file mode 100644 index 0000000..2368025 --- /dev/null +++ b/tests/documents/sample_2_short.txt @@ -0,0 +1,30 @@ +StartDocument(1.0, utf-8) +StartElement({urn:example:namespace}p:data) +StartElement({urn:example:namespace}p:datum [id="34"]) +StartElement({urn:example:namespace}p:name) +Characters("Name") +EndElement({urn:example:namespace}p:name) +StartElement({urn:example:double}d:name) +Characters("Another name") +EndElement({urn:example:double}d:name) +StartElement({urn:example:double}d:arg) +Characters("0.3") +EndElement({urn:example:double}d:arg) +StartElement({urn:example:double}d:arg) +Characters("0.2") +EndElement({urn:example:double}d:arg) +StartElement({urn:example:namespace}p:arg) +Characters("0.1") +EndElement({urn:example:namespace}p:arg) +StartElement({urn:example:namespace}p:arg) +Characters("0.01") +EndElement({urn:example:namespace}p:arg) +StartElement({urn:example:header}h:header [name="Header-1"]) +Characters("header 1 value") +EndElement({urn:example:header}h:header) +StartElement({urn:example:header}h:header [name="Header-2"]) +Characters("Some bigger value") +EndElement({urn:example:header}h:header) +EndElement({urn:example:namespace}p:datum) +EndElement({urn:example:namespace}p:data) +EndDocument diff --git a/tests/documents/sample_3.xml b/tests/documents/sample_3.xml new file mode 100644 index 0000000..657e37d --- /dev/null +++ b/tests/documents/sample_3.xml @@ -0,0 +1,13 @@ + + + + test + kkss" = ddd' > + ddddd!e3--> + test + kkss" = ddd' > + ddddd!e3-->"#, + br#" + |1:14 Unexpected token '--' before ' ' + "#, + ParserConfig::new(), + false + ); + + test( + br#""#, + br#" + |1:14 Unexpected token '--' before '-' + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn tabs_1() { + test( + b"\t\t", + br#" + |1:2 StartDocument(1.0, UTF-8) + |1:2 StartElement(a) + |1:6 StartElement(b) + |1:6 EndElement(b) + |1:10 EndElement(a) + |1:14 EndDocument + "#, + ParserConfig::new() + .trim_whitespace(true), + true + ); +} + +#[test] +fn issue_32_unescaped_cdata_end() { + test( + br#"]]>"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |Characters("]]>") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn issue_unescaped_processing_instruction_end() { + test( + br#"?>"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |Characters("?>") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn issue_unescaped_empty_tag_end() { + test( + br#"/>"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |Characters("/>") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn issue_83_duplicate_attributes() { + test( + br#""#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |1:30 Attribute 'a' is redefined + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn issue_93_large_characters_in_entity_references() { + test( + r#"&𤶼;"#.as_bytes(), + r#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |1:10 Unexpected entity: 𤶼 + "#.as_bytes(), // FIXME: it shouldn't be 10, looks like indices are off slightly + ParserConfig::new(), + false + ) +} + +#[test] +fn issue_98_cdata_ending_with_right_bracket() { + test( + br#""#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |CData("Foo [Bar]") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ) +} + +#[test] +fn issue_105_unexpected_double_dash() { + test( + br#"-- "#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |Characters("-- ") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); + + test( + br#"--"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |Characters("--") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); + + test( + br#"-->"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |Characters("-->") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); + + test( + br#""#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |CData("--") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn issue_attribues_have_no_default_namespace () { + test( + br#""#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement({urn:foo}hello [x="y"]) + |EndElement({urn:foo}hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn issue_replacement_character_entity_reference() { + test( + br#"��"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(doc) + |1:13 Invalid decimal character number in an entity: #55357 + "#, + ParserConfig::new(), + false, + ); + + test( + br#"��"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(doc) + |1:13 Invalid hexadecimal character number in an entity: #xd83d + "#, + ParserConfig::new(), + false, + ); + + test( + br#"��"#, + format!( + r#" + |StartDocument(1.0, UTF-8) + |StartElement(doc) + |Characters("{replacement_character}{replacement_character}") + |EndElement(doc) + |EndDocument + "#, + replacement_character = "\u{fffd}" + ) + .as_bytes(), + ParserConfig::new() + .replace_unknown_entity_references(true), + false, + ); + + test( + br#"��"#, + format!( + r#" + |StartDocument(1.0, UTF-8) + |StartElement(doc) + |Characters("{replacement_character}{replacement_character}") + |EndElement(doc) + |EndDocument + "#, + replacement_character = "\u{fffd}" + ) + .as_bytes(), + ParserConfig::new() + .replace_unknown_entity_references(true), + false, + ); +} + +lazy_static! { + // If PRINT_SPEC env variable is set, print the lines + // to stderr instead of comparing with the output + // it can be used like this: + // PRINT_SPEC=1 cargo test --test event_reader sample_1_full 2> sample_1_full.txt + static ref PRINT: bool = { + for (key, value) in env::vars() { + if key == "PRINT_SPEC" && value == "1" { + return true; + } + } + false + }; +} + +// clones a lot but that's fine +fn trim_until_bar(s: String) -> String { + match s.trim() { + ts if ts.starts_with('|') => return ts[1..].to_owned(), + _ => {} + } + s +} + +fn test(input: &[u8], output: &[u8], config: ParserConfig, test_position: bool) { + let mut reader = config.create_reader(input); + let mut spec_lines = BufReader::new(output).lines() + .map(|line| line.unwrap()) + .enumerate() + .map(|(i, line)| (i, trim_until_bar(line))) + .filter(|&(_, ref line)| !line.trim().is_empty()); + + loop { + let e = reader.next(); + let line = + if test_position { + format!("{} {}", reader.position(), Event(&e)) + } else { + format!("{}", Event(&e)) + }; + + if *PRINT { + writeln!(&mut stderr(), "{}", line).unwrap(); + } else { + if let Some((n, spec)) = spec_lines.next() { + if line != spec { + const SPLITTER: &'static str = "-------------------"; + panic!("\n{}\nUnexpected event at line {}:\nExpected: {}\nFound: {}\n{}\n", + SPLITTER, n + 1, spec, line, std::str::from_utf8(output).unwrap()); + } + } else { + panic!("Unexpected event: {}", line); + } + } + + match e { + Ok(XmlEvent::EndDocument) | Err(_) => break, + _ => {}, + } + } +} + +// Here we define our own string representation of events so we don't depend +// on the specifics of Display implementation for XmlEvent and OwnedName. + +struct Name<'a>(&'a OwnedName); + +impl <'a> fmt::Display for Name<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if let Some(ref namespace) = self.0.namespace { + try! { write!(f, "{{{}}}", namespace) } + } + + if let Some(ref prefix) = self.0.prefix { + try! { write!(f, "{}:", prefix) } + } + + write!(f, "{}", self.0.local_name) + } +} + +struct Event<'a>(&'a Result); + +impl<'a> fmt::Display for Event<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let empty = String::new(); + match *self.0 { + Ok(ref e) => match *e { + XmlEvent::StartDocument { ref version, ref encoding, .. } => + write!(f, "StartDocument({}, {})", version, encoding), + XmlEvent::EndDocument => + write!(f, "EndDocument"), + XmlEvent::ProcessingInstruction { ref name, ref data } => + write!(f, "ProcessingInstruction({}={:?})", name, + data.as_ref().unwrap_or(&empty)), + XmlEvent::StartElement { ref name, ref attributes, .. } => { + if attributes.is_empty() { + write!(f, "StartElement({})", Name(name)) + } + else { + let attrs: Vec<_> = attributes.iter() + .map(|a| format!("{}={:?}", Name(&a.name), a.value)) .collect(); + write!(f, "StartElement({} [{}])", Name(name), attrs.join(", ")) + } + }, + XmlEvent::EndElement { ref name } => + write!(f, "EndElement({})", Name(name)), + XmlEvent::Comment(ref data) => + write!(f, r#"Comment("{}")"#, data.escape_debug()), + XmlEvent::CData(ref data) => + write!(f, r#"CData("{}")"#, data.escape_debug()), + XmlEvent::Characters(ref data) => + write!(f, r#"Characters("{}")"#, data.escape_debug()), + XmlEvent::Whitespace(ref data) => + write!(f, r#"Whitespace("{}")"#, data.escape_debug()), + }, + Err(ref e) => e.fmt(f), + } + } +} diff --git a/tests/event_writer.rs b/tests/event_writer.rs new file mode 100644 index 0000000..dd64a43 --- /dev/null +++ b/tests/event_writer.rs @@ -0,0 +1,269 @@ +#![forbid(unsafe_code)] + +extern crate xml; + +use std::io::{BufReader, SeekFrom}; +use std::io::prelude::*; +use std::fs::File; +use std::str; + +use xml::reader::EventReader; +use xml::writer::EmitterConfig; + +macro_rules! unwrap_all { + ($($e:expr);+) => {{ + $($e.unwrap();)+ + }} +} + +#[test] +fn reading_writing_equal_with_namespaces() { + let mut f = File::open("tests/documents/sample_2.xml").unwrap(); + let mut b = Vec::new(); + + { + let r = EventReader::new(BufReader::new(&mut f)); + let mut w = EmitterConfig::default().perform_indent(true).create_writer(&mut b); + + for e in r { + match e { + Ok(e) => if let Some(e) = e.as_writer_event() { + match w.write(e) { + Ok(_) => {}, + Err(e) => panic!("Writer error: {:?}", e) + } + }, + Err(e) => panic!("Error: {}", e) + } + } + } + + f.seek(SeekFrom::Start(0)).unwrap(); + let mut fs = String::new(); + f.read_to_string(&mut fs).unwrap(); + + let bs = String::from_utf8(b).unwrap(); + + assert_eq!(fs.trim(), bs.trim()); +} + +#[test] +fn writing_simple() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new().write_document_declaration(false).create_writer(&mut b); + + w.write(XmlEvent::start_element("h:hello").ns("h", "urn:hello-world")).unwrap(); + w.write("hello world").unwrap(); + w.write(XmlEvent::end_element()).unwrap(); + } + + assert_eq!( + str::from_utf8(&b).unwrap(), + r#"hello world"# + ); +} + +#[test] +fn writing_empty_elements_with_normalizing() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new().write_document_declaration(false).create_writer(&mut b); + + unwrap_all! { + w.write(XmlEvent::start_element("hello")); + w.write(XmlEvent::start_element("world")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::end_element()) + } + } + + assert_eq!(str::from_utf8(&b).unwrap(), r#""#); +} + +#[test] +fn writing_empty_elements_without_normalizing() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new() + .write_document_declaration(false) + .normalize_empty_elements(false) + .create_writer(&mut b); + + unwrap_all! { + w.write(XmlEvent::start_element("hello")); + w.write(XmlEvent::start_element("world")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::end_element()) + } + } + + assert_eq!(str::from_utf8(&b).unwrap(), r#""#); +} + +#[test] +fn writing_empty_elements_without_pad_self_closing() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new() + .write_document_declaration(false) + .pad_self_closing(false) + .create_writer(&mut b); + + unwrap_all! { + w.write(XmlEvent::start_element("hello")); + w.write(XmlEvent::start_element("world")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::end_element()) + } + } + + assert_eq!(str::from_utf8(&b).unwrap(), r#""#); +} +#[test] +fn writing_empty_elements_pad_self_closing_explicit() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new() + .write_document_declaration(false) + .pad_self_closing(true) + .create_writer(&mut b); + + unwrap_all! { + w.write(XmlEvent::start_element("hello")); + w.write(XmlEvent::start_element("world")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::end_element()) + } + } + + assert_eq!(str::from_utf8(&b).unwrap(), r#""#); +} + +#[test] +fn writing_comments_with_indentation() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new() + .write_document_declaration(false) + .perform_indent(true) + .create_writer(&mut b); + + unwrap_all! { + w.write(XmlEvent::start_element("hello")); + w.write(XmlEvent::start_element("world")); + w.write(XmlEvent::comment(" this is a manually padded comment\t")); + w.write(XmlEvent::comment("this is an unpadded comment")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::end_element()) + } + } + + assert_eq!( + str::from_utf8(&b).unwrap(), + " + + + + +"); +} + +#[test] +fn issue_112_overriding_namepace_prefix() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new() + .write_document_declaration(false) + .create_writer(&mut b); + + unwrap_all! { + w.write(XmlEvent::start_element("iq").ns("", "jabber:client").ns("a", "urn:A")); + w.write(XmlEvent::start_element("bind").ns("", "urn:ietf:params:xml:ns:xmpp-bind")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::start_element("whatever").ns("a", "urn:X")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::end_element()) + } + } + + assert_eq!( + str::from_utf8(&b).unwrap(), + r#""# + ) +} + +#[test] +fn attribute_escaping() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new() + .write_document_declaration(false) + .perform_indent(true) + .create_writer(&mut b); + + unwrap_all! { + w.write( + XmlEvent::start_element("hello") + .attr("testLt", "<") + .attr("testGt", ">") + ); + w.write(XmlEvent::end_element()); + w.write( + XmlEvent::start_element("hello") + .attr("testQuot", "\"") + .attr("testApos", "\'") + ); + w.write(XmlEvent::end_element()); + w.write( + XmlEvent::start_element("hello") + .attr("testAmp", "&") + ); + w.write(XmlEvent::end_element()); + w.write( + XmlEvent::start_element("hello") + .attr("testNl", "\n") + .attr("testCr", "\r") + ); + w.write(XmlEvent::end_element()); + w.write( + XmlEvent::start_element("hello") + .attr("testNl", "\\n") + .attr("testCr", "\\r") + ); + w.write(XmlEvent::end_element()) + } + } + assert_eq!( + str::from_utf8(&b).unwrap(), + " + + + +" + ); +} \ No newline at end of file diff --git a/tests/streaming.rs b/tests/streaming.rs new file mode 100644 index 0000000..a577a00 --- /dev/null +++ b/tests/streaming.rs @@ -0,0 +1,103 @@ +#![forbid(unsafe_code)] + +extern crate xml; + +use std::io::{Cursor, Write}; + +use xml::EventReader; +use xml::reader::ParserConfig; +use xml::reader::XmlEvent; + +macro_rules! assert_match { + ($actual:expr, $expected:pat) => { + match $actual { + $expected => {}, + _ => panic!("assertion failed: `(left matches right)` \ + (left: `{:?}`, right: `{}`", $actual, stringify!($expected)) + } + }; + ($actual:expr, $expected:pat if $guard:expr) => { + match $actual { + $expected if $guard => {}, + _ => panic!("assertion failed: `(left matches right)` \ + (left: `{:?}`, right: `{} if {}`", + $actual, stringify!($expected), stringify!($guard)) + } + } +} + +fn write_and_reset_position(c: &mut Cursor, data: &[u8]) where Cursor: Write { + let p = c.position(); + c.write_all(data).unwrap(); + c.set_position(p); +} + +#[test] +fn reading_streamed_content() { + let buf = Cursor::new(b"".to_vec()); + let reader = EventReader::new(buf); + + let mut it = reader.into_iter(); + + assert_match!(it.next(), Some(Ok(XmlEvent::StartDocument { .. }))); + assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root"); + + write_and_reset_position(it.source_mut(), b"content"); + assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1"); + assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); + assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1"); + + write_and_reset_position(it.source_mut(), b""); + assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2"); + assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2"); + + write_and_reset_position(it.source_mut(), b""); + assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3"); + assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-3"); + // doesn't seem to work because of how tags parsing is done +// write_and_reset_position(it.source_mut(), b"some text"); + // assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "some text"); + + write_and_reset_position(it.source_mut(), b""); + assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "root"); + assert_match!(it.next(), Some(Ok(XmlEvent::EndDocument))); + assert_match!(it.next(), None); +} + +#[test] +fn reading_streamed_content2() { + let buf = Cursor::new(b"".to_vec()); + let mut config = ParserConfig::new(); + config.ignore_end_of_stream = true; + let readerb = EventReader::new_with_config(buf, config); + + let mut reader = readerb.into_iter(); + + assert_match!(reader.next(), Some(Ok(XmlEvent::StartDocument { .. }))); + assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root"); + + write_and_reset_position(reader.source_mut(), b"content"); + assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1"); + assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); + assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1"); + + write_and_reset_position(reader.source_mut(), b"content"); + + assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2"); + assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); + assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2"); + assert_match!(reader.next(), Some(Err(_))); + write_and_reset_position(reader.source_mut(), b""); + assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3"); + write_and_reset_position(reader.source_mut(), b" { + panic!("At this point, parser must not detect something."); + }, + Some(Err(_)) => {} + }; + write_and_reset_position(reader.source_mut(), b" />"); + assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-4"); +} +