From 0521f209f7e30aea2aa4230ada2b2385d4ad0224 Mon Sep 17 00:00:00 2001 From: Woohyun Jung Date: Fri, 17 Mar 2023 12:45:47 +0900 Subject: [PATCH 1/1] Import xml-rs 0.8.4 --- .cargo_vcs_info.json | 5 + .github/workflows/main.yml | 31 + .gitignore | 8 + Cargo.toml | 36 + Cargo.toml.orig | 23 + Changelog.md | 126 +++ LICENSE | 21 + Readme.md | 236 ++++++ design.md | 37 + src/analyze.rs | 99 +++ src/attribute.rs | 99 +++ src/common.rs | 142 ++++ src/escape.rs | 126 +++ src/lib.rs | 29 + src/macros.rs | 30 + src/name.rs | 301 +++++++ src/namespace.rs | 485 ++++++++++++ src/reader/config.rs | 181 +++++ src/reader/error.rs | 121 +++ src/reader/events.rs | 219 ++++++ src/reader/lexer.rs | 867 +++++++++++++++++++++ src/reader/mod.rs | 129 +++ src/reader/parser/inside_cdata.rs | 32 + src/reader/parser/inside_closing_tag_name.rs | 34 + src/reader/parser/inside_comment.rs | 32 + src/reader/parser/inside_declaration.rs | 151 ++++ src/reader/parser/inside_doctype.rs | 16 + src/reader/parser/inside_opening_tag.rs | 108 +++ src/reader/parser/inside_processing_instruction.rs | 96 +++ src/reader/parser/inside_reference.rs | 89 +++ src/reader/parser/mod.rs | 622 +++++++++++++++ src/reader/parser/outside_tag.rs | 130 +++ src/util.rs | 107 +++ src/writer/config.rs | 157 ++++ src/writer/emitter.rs | 447 +++++++++++ src/writer/events.rs | 241 ++++++ src/writer/mod.rs | 93 +++ tests/documents/sample_1.xml | 34 + tests/documents/sample_1_full.txt | 58 ++ tests/documents/sample_1_short.txt | 37 + tests/documents/sample_2.xml | 15 + tests/documents/sample_2_full.txt | 41 + tests/documents/sample_2_short.txt | 30 + tests/documents/sample_3.xml | 13 + tests/documents/sample_3_full.txt | 23 + tests/documents/sample_3_short.txt | 14 + tests/documents/sample_4.xml | 15 + tests/documents/sample_4_full.txt | 23 + tests/documents/sample_4_short.txt | 14 + tests/documents/sample_5.xml | 7 + tests/documents/sample_5_short.txt | 7 + tests/documents/sample_6.xml | 4 + tests/documents/sample_6_full.txt | 
8 + tests/event_reader.rs | 587 ++++++++++++++ tests/event_writer.rs | 269 +++++++ tests/streaming.rs | 103 +++ 56 files changed, 7008 insertions(+) create mode 100644 .cargo_vcs_info.json create mode 100644 .github/workflows/main.yml create mode 100644 .gitignore create mode 100644 Cargo.toml create mode 100644 Cargo.toml.orig create mode 100644 Changelog.md create mode 100644 LICENSE create mode 100644 Readme.md create mode 100644 design.md create mode 100644 src/analyze.rs create mode 100644 src/attribute.rs create mode 100644 src/common.rs create mode 100644 src/escape.rs create mode 100644 src/lib.rs create mode 100644 src/macros.rs create mode 100644 src/name.rs create mode 100644 src/namespace.rs create mode 100644 src/reader/config.rs create mode 100644 src/reader/error.rs create mode 100644 src/reader/events.rs create mode 100644 src/reader/lexer.rs create mode 100644 src/reader/mod.rs create mode 100644 src/reader/parser/inside_cdata.rs create mode 100644 src/reader/parser/inside_closing_tag_name.rs create mode 100644 src/reader/parser/inside_comment.rs create mode 100644 src/reader/parser/inside_declaration.rs create mode 100644 src/reader/parser/inside_doctype.rs create mode 100644 src/reader/parser/inside_opening_tag.rs create mode 100644 src/reader/parser/inside_processing_instruction.rs create mode 100644 src/reader/parser/inside_reference.rs create mode 100644 src/reader/parser/mod.rs create mode 100644 src/reader/parser/outside_tag.rs create mode 100644 src/util.rs create mode 100644 src/writer/config.rs create mode 100644 src/writer/emitter.rs create mode 100644 src/writer/events.rs create mode 100644 src/writer/mod.rs create mode 100644 tests/documents/sample_1.xml create mode 100644 tests/documents/sample_1_full.txt create mode 100644 tests/documents/sample_1_short.txt create mode 100644 tests/documents/sample_2.xml create mode 100644 tests/documents/sample_2_full.txt create mode 100644 tests/documents/sample_2_short.txt create mode 100644 
tests/documents/sample_3.xml create mode 100644 tests/documents/sample_3_full.txt create mode 100644 tests/documents/sample_3_short.txt create mode 100644 tests/documents/sample_4.xml create mode 100644 tests/documents/sample_4_full.txt create mode 100644 tests/documents/sample_4_short.txt create mode 100644 tests/documents/sample_5.xml create mode 100644 tests/documents/sample_5_short.txt create mode 100644 tests/documents/sample_6.xml create mode 100644 tests/documents/sample_6_full.txt create mode 100644 tests/event_reader.rs create mode 100644 tests/event_writer.rs create mode 100644 tests/streaming.rs diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json new file mode 100644 index 0000000..6e0c55d --- /dev/null +++ b/.cargo_vcs_info.json @@ -0,0 +1,5 @@ +{ + "git": { + "sha1": "7cd06954fd6e22b7dbf9ea02ff4e22f9ff6309fd" + } +} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..daca69f --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,31 @@ +name: CI + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + rust: [stable, beta, nightly] + + steps: + - uses: actions/checkout@v2 + + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: ${{ matrix.rust }} + override: true + + - uses: actions-rs/cargo@v1 + with: + command: build + + - uses: actions-rs/cargo@v1 + with: + command: test diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..60b0232 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +*.swp +*.swo +/doc +*~ +/target/ +/Cargo.lock +.idea/ +*.iml \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..e704337 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,36 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# 
with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +name = "xml-rs" +version = "0.8.4" +authors = ["Vladimir Matveev "] +description = "An XML library in pure Rust" +documentation = "http://docs.rs/xml-rs/" +readme = "Readme.md" +keywords = ["xml", "parsing", "parser"] +categories = ["parsing"] +license = "MIT" +repository = "https://github.com/netvl/xml-rs" + +[lib] +name = "xml" +path = "src/lib.rs" + +[[bin]] +name = "xml-analyze" +path = "src/analyze.rs" +[dev-dependencies.doc-comment] +version = "0.3" + +[dev-dependencies.lazy_static] +version = "1.2.0" diff --git a/Cargo.toml.orig b/Cargo.toml.orig new file mode 100644 index 0000000..c8df8e6 --- /dev/null +++ b/Cargo.toml.orig @@ -0,0 +1,23 @@ +[package] +name = "xml-rs" +version = "0.8.4" +authors = ["Vladimir Matveev "] +license = "MIT" +description = "An XML library in pure Rust" +repository = "https://github.com/netvl/xml-rs" +documentation = "http://docs.rs/xml-rs/" +readme = "Readme.md" +keywords = ["xml", "parsing", "parser"] +categories = ["parsing"] + +[lib] +name = "xml" +path = "src/lib.rs" + +[[bin]] +name = "xml-analyze" +path = "src/analyze.rs" + +[dev-dependencies] +doc-comment = "0.3" +lazy_static = "1.2.0" diff --git a/Changelog.md b/Changelog.md new file mode 100644 index 0000000..3cca8b8 --- /dev/null +++ b/Changelog.md @@ -0,0 +1,126 @@ +## Version 0.8.4 + +* Fixed recognition of `?>`, `]]>` and `/>` tokens as characters. +* Fixed writer output operations to use `write_all` to ensure that the data + is written fully. +* The document declaration is now written before any characters automatically. 
+ +## Version 0.8.3 + +* Added a new parser option, `ignore_root_level_whitespace`, which makes the parser + skip emitting whitespace events outside of the root element when set to `true`. + This helps with certain tasks like canonicalization. + +## Version 0.8.2 + +* Added a new parser option, `replace_unknown_entity_references`, which allows to ignore + invalid Unicode code points and replace them with a Unicode "replacement character" + during parsing. This can be helpful to deal with e.g. UTF-16 surrogate pairs. +* Added a new emitter option, `pad_self_closing`, which determines the style of the self-closing + elements when they are emitted: `<a />` (`true`) vs `<a/>` (`false`). + +## Version 0.8.1 + +* Fixed various issues with tests introduced by updates in Rust. +* Adjusted the lexer to ignore contents of the `<!DOCTYPE>` tag. +* Removed unnecessary unsafety in tests. +* Added tests for doc comments in the readme file. +* Switched to GitHub Actions from Travis CI. + +## Version 0.8.0 + +* Same as 0.7.1, with 0.7.1 being yanked because of the incorrect semver bump. + +## Version 0.7.1 + +* Removed dependency on bitflags. +* Added the `XmlWriter::inner_mut()` method. +* Fixed some rustdoc warnings. + +## Version 0.7.0 + +* Same as 0.6.2, with 0.6.2 being yanked because of the incompatible bump of minimum required version of rustc. + +## Version 0.6.2 + +* Bumped `bitflags` to 1.0. + +## Version 0.6.1 + +* Fixed the writer to escape some special characters when writing attribute values. + +## Version 0.6.0 + +* Changed the target type of extra entities from `char` to `String`. This is an incompatible + change. + +## Version 0.5.0 + +* Added support for ignoring EOF errors in order to read documents from streams incrementally. +* Bumped `bitflags` to 0.9. + +## Version 0.4.1 + +* Added missing `Debug` implementation to `xml::writer::XmlEvent`. + +## Version 0.4.0 + +* Bumped version number, since changes introduced in 0.3.7 break backwards compatibility. 
+ +## Version 0.3.8 + +* Fixed a problem introduced in 0.3.7 with entities in attributes causing parsing errors. + +## Version 0.3.7 + +* Fixed the problem with parsing non-whitespace character entities as whitespace (issue #140). +* Added support for configuring custom entities in the parser configuration. + +## Version 0.3.6 + +* Added an `Error` implementation for `EmitterError`. +* Fixed escaping of strings with multi-byte code points. + +## Version 0.3.5 + +* Added `Debug` implementation for `XmlVersion`. +* Fixed some failing tests. + +## Version 0.3.3 + +* Updated `bitflags` to 0.7. + +## Version 0.3.2 + +* Added `From` for `xml::reader::Error`, which improves usability of working with parsing errors. + +## Version 0.3.1 + +* Bumped `bitflags` dependency to 0.4, some internal warning fixes. + +## Version 0.3.0 + +* Changed error handling in `EventReader` - now I/O errors are properly bubbled up from the lexer. + +## Version 0.2.4 + +* Fixed #112 - incorrect handling of namespace redefinitions when writing a document. + +## Version 0.2.3 + +* Added `into_inner()` methods to `EventReader` and `EventWriter`. + +## Version 0.2.2 + +* Using `join` instead of the deprecated `connect`. +* Added a simple XML analyzer program which demonstrates library usage and can be used to check XML documents for well-formedness. +* Fixed incorrect handling of unqualified attribute names (#107). +* Added this changelog. + +## Version 0.2.1 + +* Fixed #105 - incorrect handling of double dashes. + +## Version 0.2.0 + +* Major update, includes proper document writing support and significant architecture changes. 
diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6caa1d3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Vladimir Matveev + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/Readme.md b/Readme.md new file mode 100644 index 0000000..5ab88f8 --- /dev/null +++ b/Readme.md @@ -0,0 +1,236 @@ +xml-rs, an XML library for Rust +=============================== + +[![Build Status][build-status-img]](https://github.com/netvl/xml-rs/actions?query=workflow%3ACI) +[![crates.io][crates-io-img]](https://crates.io/crates/xml-rs) +[![docs][docs-img]](https://docs.rs/xml-rs/) + +[Documentation](https://docs.rs/xml-rs/) + + [build-status-img]: https://img.shields.io/github/workflow/status/netvl/xml-rs/CI/master?style=flat-square + [crates-io-img]: https://img.shields.io/crates/v/xml-rs.svg?style=flat-square + [docs-img]: https://img.shields.io/badge/docs-latest%20release-6495ed.svg?style=flat-square + +xml-rs is an XML library for [Rust](http://www.rust-lang.org/) programming language. +It is heavily inspired by Java [Streaming API for XML (StAX)][stax]. + + [stax]: https://en.wikipedia.org/wiki/StAX + +This library currently contains pull parser much like [StAX event reader][stax-reader]. +It provides iterator API, so you can leverage Rust's existing iterators library features. + + [stax-reader]: http://docs.oracle.com/javase/8/docs/api/javax/xml/stream/XMLEventReader.html + +It also provides a streaming document writer much like [StAX event writer][stax-writer]. +This writer consumes its own set of events, but reader events can be converted to +writer events easily, and so it is possible to write XML transformation chains in a pretty +clean manner. 
+ + [stax-writer]: http://docs.oracle.com/javase/8/docs/api/javax/xml/stream/XMLEventWriter.html + +This parser is mostly full-featured, however, there are limitations: +* no other encodings but UTF-8 are supported yet, because no stream-based encoding library + is available now; when (or if) one will be available, I'll try to make use of it; +* DTD validation is not supported, `<!DOCTYPE>` declarations are completely ignored; thus no + support for custom entities too; internal DTD declarations are likely to cause parsing errors; +* attribute value normalization is not performed, and end-of-line characters are not normalized too. + +Other than that the parser tries to be mostly XML-1.0-compliant. + +Writer is also mostly full-featured with the following limitations: +* no support for encodings other than UTF-8, for the same reason as above; +* no support for emitting `<!DOCTYPE>` declarations; +* more validations of input are needed, for example, checking that namespace prefixes are bounded + or comments are well-formed. + +What is planned (highest priority first, approximately): + +0. missing features required by XML standard (e.g. aforementioned normalization and + proper DTD parsing); +1. miscellaneous features of the writer; +2. parsing into a DOM tree and its serialization back to XML text; +3. SAX-like callback-based parser (fairly easy to implement over pull parser); +4. DTD validation; +5. (let's dream a bit) XML Schema validation. + +Building and using +------------------ + +xml-rs uses [Cargo](http://crates.io), so just add a dependency section in your project's manifest: + +```toml +[dependencies] +xml-rs = "0.8" +``` + +The package exposes a single crate called `xml`: + +```rust +extern crate xml; +``` + +Reading XML documents +--------------------- + +`xml::reader::EventReader` requires a `Read` instance to read from. 
When a proper stream-based encoding +library is available, it is likely that xml-rs will be switched to use whatever character stream structure +this library would provide, but currently it is a `Read`. + +Using `EventReader` is very straightforward. Just provide a `Read` instance to obtain an iterator +over events: + +```rust,no_run +extern crate xml; + +use std::fs::File; +use std::io::BufReader; + +use xml::reader::{EventReader, XmlEvent}; + +fn indent(size: usize) -> String { + const INDENT: &'static str = " "; + (0..size).map(|_| INDENT) + .fold(String::with_capacity(size*INDENT.len()), |r, s| r + s) +} + +fn main() { + let file = File::open("file.xml").unwrap(); + let file = BufReader::new(file); + + let parser = EventReader::new(file); + let mut depth = 0; + for e in parser { + match e { + Ok(XmlEvent::StartElement { name, .. }) => { + println!("{}+{}", indent(depth), name); + depth += 1; + } + Ok(XmlEvent::EndElement { name }) => { + depth -= 1; + println!("{}-{}", indent(depth), name); + } + Err(e) => { + println!("Error: {}", e); + break; + } + _ => {} + } + } +} +``` + +`EventReader` implements `IntoIterator` trait, so you can just use it in a `for` loop directly. +Document parsing can end normally or with an error. Regardless of exact cause, the parsing +process will be stopped, and iterator will terminate normally. + +You can also have finer control over when to pull the next event from the parser using its own +`next()` method: + +```rust,ignore +match parser.next() { + ... +} +``` + +Upon the end of the document or an error the parser will remember that last event and will always +return it in the result of `next()` call afterwards. If iterator is used, then it will yield +error or end-of-document event once and will produce `None` afterwards. + +It is also possible to tweak parsing process a little using `xml::reader::ParserConfig` structure. +See its documentation for more information and examples. 
+ +You can find a more extensive example of using `EventReader` in `src/analyze.rs`, which is a +small program (BTW, it is built with `cargo build` and can be run after that) which shows various +statistics about specified XML document. It can also be used to check for well-formedness of +XML documents - if a document is not well-formed, this program will exit with an error. + +Writing XML documents +--------------------- + +xml-rs also provides a streaming writer much like StAX event writer. With it you can write an +XML document to any `Write` implementor. + +```rust,no_run +extern crate xml; + +use std::fs::File; +use std::io::{self, Write}; + +use xml::writer::{EventWriter, EmitterConfig, XmlEvent, Result}; + +fn handle_event(w: &mut EventWriter, line: String) -> Result<()> { + let line = line.trim(); + let event: XmlEvent = if line.starts_with("+") && line.len() > 1 { + XmlEvent::start_element(&line[1..]).into() + } else if line.starts_with("-") { + XmlEvent::end_element().into() + } else { + XmlEvent::characters(&line).into() + }; + w.write(event) +} + +fn main() { + let mut file = File::create("output.xml").unwrap(); + + let mut input = io::stdin(); + let mut output = io::stdout(); + let mut writer = EmitterConfig::new().perform_indent(true).create_writer(&mut file); + loop { + print!("> "); output.flush().unwrap(); + let mut line = String::new(); + match input.read_line(&mut line) { + Ok(0) => break, + Ok(_) => match handle_event(&mut writer, line) { + Ok(_) => {} + Err(e) => panic!("Write error: {}", e) + }, + Err(e) => panic!("Input error: {}", e) + } + } +} +``` + +The code example above also demonstrates how to create a writer out of its configuration. +Similar thing also works with `EventReader`. + +The library provides an XML event building DSL which helps to construct complex events, +e.g. ones having namespace definitions. 
Some examples: + +```rust,ignore +// <a:hello a:param="value" xmlns:a="urn:some:document"> +XmlEvent::start_element("a:hello").attr("a:param", "value").ns("a", "urn:some:document") + +// <hello b:config="value" xmlns="urn:default:uri"> +XmlEvent::start_element("hello").attr("b:config", "value").default_ns("urn:default:uri") + +// <![CDATA[some unescaped text]]> +XmlEvent::cdata("some unescaped text") +``` + +Of course, one can create `XmlEvent` enum variants directly instead of using the builder DSL. +There are more examples in `xml::writer::XmlEvent` documentation. + +The writer has multiple configuration options; see `EmitterConfig` documentation for more +information. + +Other things +------------ + +No performance tests or measurements are done. The implementation is rather naive, and no specific +optimizations are made. Hopefully the library is sufficiently fast to process documents of common size. +I intend to add benchmarks in future, but not until more important features are added. + +Known issues +------------ + +All known issues are present on GitHub issue tracker: <https://github.com/netvl/xml-rs/issues>. +Feel free to post any found problems there. + +License +------- + +This library is licensed under MIT license. 
+ +--- +Copyright (C) Vladimir Matveev, 2014-2020 diff --git a/design.md b/design.md new file mode 100644 index 0000000..da67c7b --- /dev/null +++ b/design.md @@ -0,0 +1,37 @@ +# Reader + +Basic features: + * [x] Parsing XML 1.0 documents and returning a stream of events + - [ ] Support reading embedded DTD schemas + - [ ] Support for embedded entities + * [x] Support for namespaces and emitting namespace information in events + * [ ] \[maybe\] push-based wrapper + * Missing XML features + - [ ] Support for different encodings + - [ ] Attribute values normalization + - [ ] EOL characters normalization + +Advanced features: + * [ ] DTD schema validation + * [ ] XSD schema validation + +# Writer + +Basic features: + * [x] Writing basic XML 1.0 documents in UTF-8 + * [x] Writing XML 1.0 documents with namespace support + * [x] Support for writing elements with empty body as empty elements + * [x] Pretty-printed and compact output + * [ ] Writing XML document with embedded DTDs and DTD references + * Misc features: + - [ ] Support for different encodings + - [x] Support for writing CDATA as characters + - [ ] Checking events for invalid characters (e.g. `--` in comments) + - [ ] Check for namespaces more correctly, i.e. check both for prefix and namespace URI + - [ ] Support checking namespace prefix presence in the current namespace for events with prefix but without namespace + - [ ] Support checking namespace prefix for events with both prefix and namespace URI + +# Other + +DOM-based API: + * [ ] Basic support for DOM-based API diff --git a/src/analyze.rs b/src/analyze.rs new file mode 100644 index 0000000..d369d2f --- /dev/null +++ b/src/analyze.rs @@ -0,0 +1,99 @@ +#![forbid(unsafe_code)] + +extern crate xml; + +use std::cmp; +use std::env; +use std::io::{self, Read, Write, BufReader}; +use std::fs::File; +use std::collections::HashSet; + +use xml::ParserConfig; +use xml::reader::XmlEvent; + +macro_rules! 
abort { + ($code:expr) => {::std::process::exit($code)}; + ($code:expr, $($args:tt)+) => {{ + writeln!(&mut ::std::io::stderr(), $($args)+).unwrap(); + ::std::process::exit($code); + }} +} + +fn main() { + let mut file; + let mut stdin; + let source: &mut Read = match env::args().nth(1) { + Some(file_name) => { + file = File::open(file_name) + .unwrap_or_else(|e| abort!(1, "Cannot open input file: {}", e)); + &mut file + } + None => { + stdin = io::stdin(); + &mut stdin + } + }; + + let reader = ParserConfig::new() + .whitespace_to_characters(true) + .ignore_comments(false) + .create_reader(BufReader::new(source)); + + let mut processing_instructions = 0; + let mut elements = 0; + let mut character_blocks = 0; + let mut cdata_blocks = 0; + let mut characters = 0; + let mut comment_blocks = 0; + let mut comment_characters = 0; + let mut namespaces = HashSet::new(); + let mut depth = 0; + let mut max_depth = 0; + + for e in reader { + match e { + Ok(e) => match e { + XmlEvent::StartDocument { version, encoding, standalone } => + println!( + "XML document version {}, encoded in {}, {}standalone", + version, encoding, if standalone.unwrap_or(false) { "" } else { "not " } + ), + XmlEvent::EndDocument => println!("Document finished"), + XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1, + XmlEvent::Whitespace(_) => {} // can't happen due to configuration + XmlEvent::Characters(s) => { + character_blocks += 1; + characters += s.len(); + } + XmlEvent::CData(s) => { + cdata_blocks += 1; + characters += s.len(); + } + XmlEvent::Comment(s) => { + comment_blocks += 1; + comment_characters += s.len(); + } + XmlEvent::StartElement { namespace, .. } => { + depth += 1; + max_depth = cmp::max(max_depth, depth); + elements += 1; + namespaces.extend(namespace.0.into_iter().map(|(_, ns_uri)| ns_uri)); + } + XmlEvent::EndElement { .. 
} => { + depth -= 1; + } + }, + Err(e) => abort!(1, "Error parsing XML document: {}", e) + } + } + namespaces.remove(xml::namespace::NS_EMPTY_URI); + namespaces.remove(xml::namespace::NS_XMLNS_URI); + namespaces.remove(xml::namespace::NS_XML_URI); + + println!("Elements: {}, maximum depth: {}", elements, max_depth); + println!("Namespaces (excluding built-in): {}", namespaces.len()); + println!("Characters: {}, characters blocks: {}, CDATA blocks: {}", + characters, character_blocks, cdata_blocks); + println!("Comment blocks: {}, comment characters: {}", comment_blocks, comment_characters); + println!("Processing instructions (excluding built-in): {}", processing_instructions); +} diff --git a/src/attribute.rs b/src/attribute.rs new file mode 100644 index 0000000..8728f49 --- /dev/null +++ b/src/attribute.rs @@ -0,0 +1,99 @@ +//! Contains XML attributes manipulation types and functions. +//! + +use std::fmt; + +use name::{Name, OwnedName}; +use escape::escape_str_attribute; + +/// A borrowed version of an XML attribute. +/// +/// Consists of a borrowed qualified name and a borrowed string value. +#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)] +pub struct Attribute<'a> { + /// Attribute name. + pub name: Name<'a>, + + /// Attribute value. + pub value: &'a str +} + +impl<'a> fmt::Display for Attribute<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}=\"{}\"", self.name, escape_str_attribute(self.value)) + } +} + +impl<'a> Attribute<'a> { + /// Creates an owned attribute out of this borrowed one. + #[inline] + pub fn to_owned(&self) -> OwnedAttribute { + OwnedAttribute { + name: self.name.into(), + value: self.value.into(), + } + } + + /// Creates a borrowed attribute using the provided borrowed name and a borrowed string value. + #[inline] + pub fn new(name: Name<'a>, value: &'a str) -> Attribute<'a> { + Attribute { name, value, } + } +} + +/// An owned version of an XML attribute. 
+/// +/// Consists of an owned qualified name and an owned string value. +#[derive(Clone, Eq, PartialEq, Hash, Debug)] +pub struct OwnedAttribute { + /// Attribute name. + pub name: OwnedName, + + /// Attribute value. + pub value: String +} + +impl OwnedAttribute { + /// Returns a borrowed `Attribute` out of this owned one. + pub fn borrow(&self) -> Attribute { + Attribute { + name: self.name.borrow(), + value: &*self.value, + } + } + + /// Creates a new owned attribute using the provided owned name and an owned string value. + #[inline] + pub fn new>(name: OwnedName, value: S) -> OwnedAttribute { + OwnedAttribute { + name, + value: value.into(), + } + } +} + +impl fmt::Display for OwnedAttribute { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}=\"{}\"", self.name, escape_str_attribute(&*self.value)) + } +} + +#[cfg(test)] +mod tests { + use super::{Attribute}; + + use name::Name; + + #[test] + fn attribute_display() { + let attr = Attribute::new( + Name::qualified("attribute", "urn:namespace", Some("n")), + "its value with > & \" ' < weird symbols" + ); + + assert_eq!( + &*attr.to_string(), + "{urn:namespace}n:attribute=\"its value with > & " ' < weird symbols\"" + ) + } +} diff --git a/src/common.rs b/src/common.rs new file mode 100644 index 0000000..029e851 --- /dev/null +++ b/src/common.rs @@ -0,0 +1,142 @@ +//! Contains common types and functions used throughout the library. + +use std::fmt; + +/// Represents a position inside some textual document. 
+#[derive(Copy, Clone, PartialEq, Eq)] +pub struct TextPosition { + /// Row, counting from 0 + pub row: u64, + /// Column, counting from 0 + pub column: u64, +} + +impl TextPosition { + /// Creates a new position initialized to the beginning of the document + #[inline] + pub fn new() -> TextPosition { + TextPosition { row: 0, column: 0 } + } + + /// Advances the position in a line + #[inline] + pub fn advance(&mut self, count: u8) { + self.column += count as u64; + } + + /// Advances the position in a line to the next tab position + #[inline] + pub fn advance_to_tab(&mut self, width: u8) { + let width = width as u64; + self.column += width - self.column % width + } + + /// Advances the position to the beginning of the next line + #[inline] + pub fn new_line(&mut self) { + self.column = 0; + self.row += 1; + } +} + +impl fmt::Debug for TextPosition { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}:{}", self.row + 1, self.column + 1) + } +} + +impl fmt::Display for TextPosition { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}:{}", self.row + 1, self.column + 1) + } +} + +/// Get the position in the document corresponding to the object +/// +/// This trait is implemented by parsers, lexers and errors. +pub trait Position { + /// Returns the current position or a position corresponding to the object. + fn position(&self) -> TextPosition; +} + +impl Position for TextPosition { + #[inline] + fn position(&self) -> TextPosition { + *self + } +} + +/// XML version enumeration. +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum XmlVersion { + /// XML version 1.0. + Version10, + + /// XML version 1.1. 
+ Version11 +} + +impl fmt::Display for XmlVersion { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + XmlVersion::Version10 => write!(f, "1.0"), + XmlVersion::Version11 => write!(f, "1.1") + } + } +} + +impl fmt::Debug for XmlVersion { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +/// Checks whether the given character is a white space character (`S`) +/// as is defined by XML 1.1 specification, [section 2.3][1]. +/// +/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +pub fn is_whitespace_char(c: char) -> bool { + match c { + '\x20' | '\x09' | '\x0d' | '\x0a' => true, + _ => false + } +} + +/// Checks whether the given string is compound only by white space +/// characters (`S`) using the previous is_whitespace_char to check +/// all characters of this string +pub fn is_whitespace_str(s: &str) -> bool { + s.chars().all(is_whitespace_char) +} + +/// Checks whether the given character is a name start character (`NameStartChar`) +/// as is defined by XML 1.1 specification, [section 2.3][1]. +/// +/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +pub fn is_name_start_char(c: char) -> bool { + match c { + ':' | 'A'...'Z' | '_' | 'a'...'z' | + '\u{C0}'...'\u{D6}' | '\u{D8}'...'\u{F6}' | '\u{F8}'...'\u{2FF}' | + '\u{370}'...'\u{37D}' | '\u{37F}'...'\u{1FFF}' | + '\u{200C}'...'\u{200D}' | '\u{2070}'...'\u{218F}' | + '\u{2C00}'...'\u{2FEF}' | '\u{3001}'...'\u{D7FF}' | + '\u{F900}'...'\u{FDCF}' | '\u{FDF0}'...'\u{FFFD}' | + '\u{10000}'...'\u{EFFFF}' => true, + _ => false + } +} + +/// Checks whether the given character is a name character (`NameChar`) +/// as is defined by XML 1.1 specification, [section 2.3][1]. +/// +/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn +pub fn is_name_char(c: char) -> bool { + match c { + _ if is_name_start_char(c) => true, + '-' | '.' 
| '0'...'9' | '\u{B7}' | + '\u{300}'...'\u{36F}' | '\u{203F}'...'\u{2040}' => true, + _ => false + } +} diff --git a/src/escape.rs b/src/escape.rs new file mode 100644 index 0000000..18298b9 --- /dev/null +++ b/src/escape.rs @@ -0,0 +1,126 @@ +//! Contains functions for performing XML special characters escaping. + +use std::borrow::Cow; + +enum Value { + Char(char), + Str(&'static str) +} + +impl Value { + fn dispatch_for_attribute(c: char) -> Value { + match c { + '<' => Value::Str("<"), + '>' => Value::Str(">"), + '"' => Value::Str("""), + '\'' => Value::Str("'"), + '&' => Value::Str("&"), + '\n' => Value::Str(" "), + '\r' => Value::Str(" "), + _ => Value::Char(c) + } + } + + fn dispatch_for_pcdata(c: char) -> Value { + match c { + '<' => Value::Str("<"), + '&' => Value::Str("&"), + _ => Value::Char(c) + } + } +} + +enum Process<'a> { + Borrowed(&'a str), + Owned(String) +} + +impl<'a> Process<'a> { + fn process(&mut self, (i, next): (usize, Value)) { + match next { + Value::Str(s) => match *self { + Process::Owned(ref mut o) => o.push_str(s), + Process::Borrowed(b) => { + let mut r = String::with_capacity(b.len() + s.len()); + r.push_str(&b[..i]); + r.push_str(s); + *self = Process::Owned(r); + } + }, + Value::Char(c) => match *self { + Process::Borrowed(_) => {} + Process::Owned(ref mut o) => o.push(c) + } + } + } + + fn into_result(self) -> Cow<'a, str> { + match self { + Process::Borrowed(b) => Cow::Borrowed(b), + Process::Owned(o) => Cow::Owned(o) + } + } +} + +impl<'a> Extend<(usize, Value)> for Process<'a> { + fn extend>(&mut self, it: I) { + for v in it.into_iter() { + self.process(v); + } + } +} + +fn escape_str(s: &str, dispatch: fn(char) -> Value) -> Cow { + let mut p = Process::Borrowed(s); + p.extend(s.char_indices().map(|(ind, c)| (ind, dispatch(c)))); + p.into_result() +} + +/// Performs escaping of common XML characters inside an attribute value. 
+/// +/// This function replaces several important markup characters with their +/// entity equivalents: +/// +/// * `<` → `&lt;` +/// * `>` → `&gt;` +/// * `"` → `&quot;` +/// * `'` → `&apos;` +/// * `&` → `&amp;` +/// +/// The resulting string is safe to use inside XML attribute values or in PCDATA sections. +/// +/// Does not perform allocations if the given string does not contain escapable characters. +#[inline] +pub fn escape_str_attribute(s: &str) -> Cow { + escape_str(s, Value::dispatch_for_attribute) +} + +/// Performs escaping of common XML characters inside PCDATA. +/// +/// This function replaces several important markup characters with their +/// entity equivalents: +/// +/// * `<` → `&lt;` +/// * `&` → `&amp;` +/// +/// The resulting string is safe to use inside PCDATA sections but NOT inside attribute values. +/// +/// Does not perform allocations if the given string does not contain escapable characters. +#[inline] +pub fn escape_str_pcdata(s: &str) -> Cow { + escape_str(s, Value::dispatch_for_pcdata) +} + +#[cfg(test)] +mod tests { + use super::{escape_str_pcdata, escape_str_attribute}; + + // TODO: add more tests + + #[test] + fn test_escape_multibyte_code_points() { + assert_eq!(escape_str_attribute("☃<"), "☃<"); + assert_eq!(escape_str_pcdata("☃<"), "☃<"); + } +} + diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..fb672ef --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,29 @@ +//#![warn(missing_doc)] +#![allow(dead_code)] +#![allow(unused_variables)] +#![forbid(non_camel_case_types)] +#![forbid(unsafe_code)] + +//! This crate currently provides an almost XML 1.0/1.1-compliant pull parser.
+ +#[cfg(doctest)] +#[macro_use] +extern crate doc_comment; + +#[cfg(doctest)] +doctest!("../Readme.md"); + +pub use reader::EventReader; +pub use reader::ParserConfig; +pub use writer::EventWriter; +pub use writer::EmitterConfig; + +pub mod macros; +pub mod name; +pub mod attribute; +pub mod common; +pub mod escape; +pub mod namespace; +pub mod reader; +pub mod writer; +mod util; diff --git a/src/macros.rs b/src/macros.rs new file mode 100644 index 0000000..1cce3d6 --- /dev/null +++ b/src/macros.rs @@ -0,0 +1,30 @@ +#![macro_use] + +//! Contains several macros used in this crate. + +macro_rules! gen_setter { + ($target:ty, $field:ident : into $t:ty) => { + impl $target { + /// Sets the field to the provided value and returns updated config object. + pub fn $field>(mut self, value: T) -> $target { + self.$field = value.into(); + self + } + } + }; + ($target:ty, $field:ident : val $t:ty) => { + impl $target { + /// Sets the field to the provided value and returns updated config object. + pub fn $field(mut self, value: $t) -> $target { + self.$field = value; + self + } + } + } +} + +macro_rules! gen_setters { + ($target:ty, $($field:ident : $k:tt $tpe:ty),+) => ($( + gen_setter! { $target, $field : $k $tpe } + )+) +} diff --git a/src/name.rs b/src/name.rs new file mode 100644 index 0000000..a20eae2 --- /dev/null +++ b/src/name.rs @@ -0,0 +1,301 @@ +//! Contains XML qualified names manipulation types and functions. +//! + +use std::fmt; +use std::str::FromStr; + +use namespace::NS_NO_PREFIX; + +/// Represents a qualified XML name. +/// +/// A qualified name always consists at least of a local name. It can optionally contain +/// a prefix; when reading an XML document, if it contains a prefix, it must also contain a +/// namespace URI, but this is not enforced statically; see below. The name can contain a +/// namespace without a prefix; in that case a default, empty prefix is assumed. 
+/// +/// When writing XML documents, it is possible to omit the namespace URI, leaving only +/// the prefix. In this case the writer will check that the specified prefix is bound to some +/// URI in the current namespace context. If both prefix and namespace URI are specified, +/// it is checked that the current namespace context contains this exact correspondence +/// between prefix and namespace URI. +/// +/// # Prefixes and URIs +/// +/// A qualified name with a prefix must always contain a proper namespace URI --- names with +/// a prefix but without a namespace associated with that prefix are meaningless. However, +/// it is impossible to obtain a proper namespace URI from a prefix without a context, and such +/// context is only available when parsing a document (or it can be constructed manually +/// when writing a document). Tying a name to a context statically seems impractical. This +/// may change in the future, though. +/// +/// # Conversions +/// +/// `Name` implements some `From` instances for conversion from strings and tuples. For example: +/// +/// ```rust +/// # use xml::name::Name; +/// let n1: Name = "p:some-name".into(); +/// let n2: Name = ("p", "some-name").into(); +/// +/// assert_eq!(n1, n2); +/// assert_eq!(n1.local_name, "some-name"); +/// assert_eq!(n1.prefix, Some("p")); +/// assert!(n1.namespace.is_none()); +/// ``` +/// +/// This is added to support easy specification of XML elements when writing XML documents. +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +pub struct Name<'a> { + /// A local name, e.g. `string` in `xsi:string`. + pub local_name: &'a str, + + /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`. + pub namespace: Option<&'a str>, + + /// A name prefix, e.g. `xsi` in `xsi:string`.
+ pub prefix: Option<&'a str> +} + +impl<'a> From<&'a str> for Name<'a> { + fn from(s: &'a str) -> Name<'a> { + let mut parts = s.splitn(2, ":").fuse(); + match (parts.next(), parts.next()) { + (Some(name), None) => Name::local(name), + (Some(prefix), Some(name)) => Name::prefixed(name, prefix), + _ => unreachable!() + } + } +} + +impl<'a> From<(&'a str, &'a str)> for Name<'a> { + fn from((prefix, name): (&'a str, &'a str)) -> Name<'a> { + Name::prefixed(name, prefix) + } +} + +impl<'a> fmt::Display for Name<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if let Some(namespace) = self.namespace { + write!(f, "{{{}}}", namespace)?; + } + + if let Some(prefix) = self.prefix { + write!(f, "{}:", prefix)?; + } + + write!(f, "{}", self.local_name) + } +} + +impl<'a> Name<'a> { + /// Returns an owned variant of the qualified name. + pub fn to_owned(&self) -> OwnedName { + OwnedName { + local_name: self.local_name.into(), + namespace: self.namespace.map(|s| s.into()), + prefix: self.prefix.map(|s| s.into()) + } + } + + /// Returns a new `Name` instance representing plain local name. + #[inline] + pub fn local(local_name: &str) -> Name { + Name { + local_name, + prefix: None, + namespace: None + } + } + + /// Returns a new `Name` instance with the given local name and prefix. + #[inline] + pub fn prefixed(local_name: &'a str, prefix: &'a str) -> Name<'a> { + Name { + local_name, + namespace: None, + prefix: Some(prefix) + } + } + + /// Returns a new `Name` instance representing a qualified name with or without a prefix and + /// with a namespace URI. + #[inline] + pub fn qualified(local_name: &'a str, namespace: &'a str, prefix: Option<&'a str>) -> Name<'a> { + Name { + local_name, + namespace: Some(namespace), + prefix, + } + } + + /// Returns a correct XML representation of this local name and prefix. + /// + /// This method is different from the autoimplemented `to_string()` because it does not + /// include namespace URI in the result. 
+ pub fn to_repr(&self) -> String { + self.repr_display().to_string() + } + + /// Returns a structure which can be displayed with `std::fmt` machinery to obtain this + /// local name and prefix. + /// + /// This method is needed for efficiency purposes in order not to create unnecessary + /// allocations. + #[inline] + pub fn repr_display(&self) -> ReprDisplay { + ReprDisplay(self) + } + + /// Returns either a prefix of this name or `namespace::NS_NO_PREFIX` constant. + #[inline] + pub fn prefix_repr(&self) -> &str { + self.prefix.unwrap_or(NS_NO_PREFIX) + } +} + +/// A wrapper around `Name` whose `Display` implementation prints the wrapped name as it is +/// displayed in an XML document. +pub struct ReprDisplay<'a, 'b:'a>(&'a Name<'b>); + +impl<'a, 'b:'a> fmt::Display for ReprDisplay<'a, 'b> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.0.prefix { + Some(prefix) => write!(f, "{}:{}", prefix, self.0.local_name), + None => write!(f, "{}", self.0.local_name) + } + } +} + +/// An owned variant of `Name`. +/// +/// Everything about `Name` applies to this structure as well. +#[derive(Clone, PartialEq, Eq, Hash, Debug)] +pub struct OwnedName { + /// A local name, e.g. `string` in `xsi:string`. + pub local_name: String, + + /// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`. + pub namespace: Option, + + /// A name prefix, e.g. `xsi` in `xsi:string`. + pub prefix: Option, +} + +impl fmt::Display for OwnedName { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&self.borrow(), f) + } +} + +impl OwnedName { + /// Constructs a borrowed `Name` based on this owned name. + pub fn borrow(&self) -> Name { + Name { + local_name: &*self.local_name, + namespace: self.namespace.as_ref().map(|s| &**s), + prefix: self.prefix.as_ref().map(|s| &**s), + } + } + + /// Returns a new `OwnedName` instance representing a plain local name. 
+ #[inline] + pub fn local(local_name: S) -> OwnedName where S: Into { + OwnedName { + local_name: local_name.into(), + namespace: None, + prefix: None, + } + } + + /// Returns a new `OwnedName` instance representing a qualified name with or without + /// a prefix and with a namespace URI. + #[inline] + pub fn qualified(local_name: S1, namespace: S2, prefix: Option) -> OwnedName + where S1: Into, S2: Into, S3: Into + { + OwnedName { + local_name: local_name.into(), + namespace: Some(namespace.into()), + prefix: prefix.map(|v| v.into()) + } + } + + /// Returns an optional prefix by reference, equivalent to `self.borrow().prefix` + /// but avoids extra work. + #[inline] + pub fn prefix_ref(&self) -> Option<&str> { + self.prefix.as_ref().map(|s| &**s) + } + + /// Returns an optional namespace by reference, equivalent to `self.borrow().namespace` + /// but avoids extra work. + #[inline] + pub fn namespace_ref(&self) -> Option<&str> { + self.namespace.as_ref().map(|s| &**s) + } +} + +impl<'a> From> for OwnedName { + #[inline] + fn from(n: Name<'a>) -> OwnedName { + n.to_owned() + } +} + +impl FromStr for OwnedName { + type Err = (); + + /// Parses the given string slice into a qualified name. + /// + /// This function, when it finishes successfully, always returns a qualified + /// name without a namespace (`name.namespace == None`). It should be filled in later + /// using a proper `NamespaceStack`. + /// + /// It is assumed that all characters in the argument string are correct + /// as defined by the XML specification. No checks are done other than that each + /// part is non-empty and that there is at most one `:` separator.
+ fn from_str(s: &str) -> Result { + let mut it = s.split(':'); + + let r = match (it.next(), it.next(), it.next()) { + (Some(prefix), Some(local_name), None) if !prefix.is_empty() && + !local_name.is_empty() => + Some((local_name.into(), Some(prefix.into()))), + (Some(local_name), None, None) if !local_name.is_empty() => + Some((local_name.into(), None)), + (_, _, _) => None + }; + r.map(|(local_name, prefix)| OwnedName { + local_name, + namespace: None, + prefix + }).ok_or(()) + } +} + +#[cfg(test)] +mod tests { + use super::OwnedName; + + #[test] + fn test_owned_name_from_str() { + assert_eq!("prefix:name".parse(), Ok(OwnedName { + local_name: "name".into(), + namespace: None, + prefix: Some("prefix".into()) + })); + + assert_eq!("name".parse(), Ok(OwnedName { + local_name: "name".into(), + namespace: None, + prefix: None + })); + + assert_eq!("".parse(), Err::(())); + assert_eq!(":".parse(), Err::(())); + assert_eq!(":a".parse(), Err::(())); + assert_eq!("a:".parse(), Err::(())); + assert_eq!("a:b:c".parse(), Err::(())); + } +} diff --git a/src/namespace.rs b/src/namespace.rs new file mode 100644 index 0000000..1ab4a5c --- /dev/null +++ b/src/namespace.rs @@ -0,0 +1,485 @@ +//! Contains namespace manipulation types and functions. + +use std::iter::{Map, Rev}; +use std::collections::btree_map::{BTreeMap, Entry}; +use std::collections::btree_map::Iter as Entries; +use std::collections::HashSet; +use std::slice::Iter; + +/// Designates prefix for namespace definitions. +/// +/// See [Namespaces in XML][namespace] spec for more information. +/// +/// [namespace]: http://www.w3.org/TR/xml-names/#ns-decl +pub const NS_XMLNS_PREFIX: &'static str = "xmlns"; + +/// Designates the standard URI for `xmlns` prefix. +/// +/// See [A Namespace Name for xmlns Attributes][1] for more information. 
+/// +/// [1]: http://www.w3.org/2000/xmlns/ +pub const NS_XMLNS_URI: &'static str = "http://www.w3.org/2000/xmlns/"; + +/// Designates the prefix for a namespace containing several special predefined attributes. +/// +/// See [2.10 White Space handling][1], [2.1 Language Identification][2], +/// [XML Base specification][3] and [xml:id specification][4] for more information. +/// +/// [1]: http://www.w3.org/TR/REC-xml/#sec-white-space +/// [2]: http://www.w3.org/TR/REC-xml/#sec-lang-tag +/// [3]: http://www.w3.org/TR/xmlbase/ +/// [4]: http://www.w3.org/TR/xml-id/ +pub const NS_XML_PREFIX: &'static str = "xml"; + +/// Designates the standard URI for the `xml` prefix. +/// +/// See `NS_XML_PREFIX` documentation for more information. +pub const NS_XML_URI: &'static str = "http://www.w3.org/XML/1998/namespace"; + +/// Designates the absence of a prefix in a qualified name. +/// +/// This constant should be used to define or query the default namespace which should be used +/// for element or attribute names without a prefix. For example, if a namespace mapping +/// at a particular point in the document contains a correspondence like +/// +/// ```none +/// NS_NO_PREFIX --> urn:some:namespace +/// ``` +/// +/// then for all names declared without an explicit prefix, `urn:some:namespace` is assumed as +/// the namespace URI. +/// +/// By default the empty prefix corresponds to the absence of a namespace, but this can change either +/// when writing an XML document (manually) or when reading an XML document (based on namespace +/// declarations). +pub const NS_NO_PREFIX: &'static str = ""; + +/// Designates an empty namespace URI, which is equivalent to the absence of a namespace. +/// +/// This constant should not usually be used directly; it is used to designate that the +/// empty prefix corresponds to an absent namespace in `NamespaceStack` instances created with +/// `NamespaceStack::default()`. Therefore, it can be used to restore the `NS_NO_PREFIX` mapping +/// in a namespace back to its default value.
+pub const NS_EMPTY_URI: &'static str = ""; + +/// Namespace is a map from prefixes to namespace URIs. +/// +/// No prefix (i.e. default namespace) is designated by `NS_NO_PREFIX` constant. +#[derive(PartialEq, Eq, Clone, Debug)] +pub struct Namespace(pub BTreeMap); + +impl Namespace { + /// Returns an empty namespace. + #[inline] + pub fn empty() -> Namespace { Namespace(BTreeMap::new()) } + + /// Checks whether this namespace is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Checks whether this namespace is essentially empty, that is, it does not contain + /// anything but default mappings. + pub fn is_essentially_empty(&self) -> bool { + // a shortcut for a namespace which is definitely not empty + if self.0.len() > 3 { return false; } + + self.0.iter().all(|(k, v)| match (&**k, &**v) { + (NS_NO_PREFIX, NS_EMPTY_URI) => true, + (NS_XMLNS_PREFIX, NS_XMLNS_URI) => true, + (NS_XML_PREFIX, NS_XML_URI) => true, + _ => false + }) + } + + /// Checks whether this namespace mapping contains the given prefix. + /// + /// # Parameters + /// * `prefix` --- namespace prefix. + /// + /// # Return value + /// `true` if this namespace contains the given prefix, `false` otherwise. + #[inline] + pub fn contains>(&self, prefix: &P) -> bool { + self.0.contains_key(prefix.as_ref()) + } + + /// Puts a mapping into this namespace. + /// + /// This method does not override any already existing mappings. + /// + /// Returns a boolean flag indicating whether the map already contained + /// the given prefix. + /// + /// # Parameters + /// * `prefix` --- namespace prefix; + /// * `uri` --- namespace URI. + /// + /// # Return value + /// `true` if `prefix` has been inserted successfully; `false` if the `prefix` + /// was already present in the namespace. 
+ pub fn put(&mut self, prefix: P, uri: U) -> bool + where P: Into, U: Into + { + match self.0.entry(prefix.into()) { + Entry::Occupied(_) => false, + Entry::Vacant(ve) => { + ve.insert(uri.into()); + true + } + } + } + + /// Puts a mapping into this namespace forcefully. + /// + /// This method, unlike `put()`, does replace an already existing mapping. + /// + /// Returns previous URI which was assigned to the given prefix, if it is present. + /// + /// # Parameters + /// * `prefix` --- namespace prefix; + /// * `uri` --- namespace URI. + /// + /// # Return value + /// `Some(uri)` with `uri` being a previous URI assigned to the `prefix`, or + /// `None` if such prefix was not present in the namespace before. + pub fn force_put(&mut self, prefix: P, uri: U) -> Option + where P: Into, U: Into + { + self.0.insert(prefix.into(), uri.into()) + } + + /// Queries the namespace for the given prefix. + /// + /// # Parameters + /// * `prefix` --- namespace prefix. + /// + /// # Return value + /// Namespace URI corresponding to the given prefix, if it is present. + pub fn get<'a, P: ?Sized+AsRef>(&'a self, prefix: &P) -> Option<&'a str> { + self.0.get(prefix.as_ref()).map(|s| &**s) + } +} + +/// An alias for iterator type for namespace mappings contained in a namespace. +pub type NamespaceMappings<'a> = Map< + Entries<'a, String, String>, + for<'b> fn((&'b String, &'b String)) -> UriMapping<'b> +>; + +impl<'a> IntoIterator for &'a Namespace { + type Item = UriMapping<'a>; + type IntoIter = NamespaceMappings<'a>; + + fn into_iter(self) -> Self::IntoIter { + fn mapper<'a>((prefix, uri): (&'a String, &'a String)) -> UriMapping<'a> { + (&*prefix, &*uri) + } + self.0.iter().map(mapper) + } +} + +/// Namespace stack is a sequence of namespaces. +/// +/// Namespace stack is used to represent cumulative namespace consisting of +/// combined namespaces from nested elements. 
+#[derive(Clone, Eq, PartialEq, Debug)] +pub struct NamespaceStack(pub Vec); + +impl NamespaceStack { + /// Returns an empty namespace stack. + #[inline] + pub fn empty() -> NamespaceStack { NamespaceStack(Vec::with_capacity(2)) } + + /// Returns a namespace stack with default items in it. + /// + /// Default items are the following: + /// + /// * `xml` → `http://www.w3.org/XML/1998/namespace`; + /// * `xmlns` → `http://www.w3.org/2000/xmlns/`. + #[inline] + pub fn default() -> NamespaceStack { + let mut nst = NamespaceStack::empty(); + nst.push_empty(); + // xml namespace + nst.put(NS_XML_PREFIX, NS_XML_URI); + // xmlns namespace + nst.put(NS_XMLNS_PREFIX, NS_XMLNS_URI); + // empty namespace + nst.put(NS_NO_PREFIX, NS_EMPTY_URI); + nst + } + + /// Adds an empty namespace to the top of this stack. + #[inline] + pub fn push_empty(&mut self) -> &mut NamespaceStack { + self.0.push(Namespace::empty()); + self + } + + /// Removes the topmost namespace in this stack. + /// + /// Panics if the stack is empty. + #[inline] + pub fn pop(&mut self) -> Namespace { + self.0.pop().unwrap() + } + + /// Removes the topmost namespace in this stack. + /// + /// Returns `Some(namespace)` if this stack is not empty and `None` otherwise. + #[inline] + pub fn try_pop(&mut self) -> Option { + self.0.pop() + } + + /// Borrows the topmost namespace mutably, leaving the stack intact. + /// + /// Panics if the stack is empty. + #[inline] + pub fn peek_mut(&mut self) -> &mut Namespace { + self.0.last_mut().unwrap() + } + + /// Borrows the topmost namespace immutably, leaving the stack intact. + /// + /// Panics if the stack is empty. + #[inline] + pub fn peek(&self) -> &Namespace { + self.0.last().unwrap() + } + + /// Puts a mapping into the topmost namespace if this stack does not already contain one. + /// + /// Returns a boolean flag indicating whether the insertion has completed successfully. 
+ /// Note that both key and value are matched and the mapping is inserted if either + /// namespace prefix is not already mapped, or if it is mapped, but to a different URI. + /// + /// # Parameters + /// * `prefix` --- namespace prefix; + /// * `uri` --- namespace URI. + /// + /// # Return value + /// `true` if the mapping was not yet in the stack and an insertion into the topmost namespace + /// was attempted; `false` if this exact `prefix`-to-`uri` mapping was already present in the stack. + pub fn put_checked(&mut self, prefix: P, uri: U) -> bool + where P: Into + AsRef, + U: Into + AsRef + { + if self.0.iter().any(|ns| ns.get(&prefix) == Some(uri.as_ref())) { + false + } else { + self.put(prefix, uri); + true + } + } + + /// Puts a mapping into the topmost namespace in this stack. + /// + /// This method does not override a mapping in the topmost namespace if it is + /// already present; however, it does not depend on other namespaces in the stack, + /// so it is possible to put a mapping which is present in lower namespaces. + /// + /// Returns a boolean flag indicating whether the insertion has completed successfully. + /// + /// # Parameters + /// * `prefix` --- namespace prefix; + /// * `uri` --- namespace URI. + /// + /// # Return value + /// `true` if `prefix` has been inserted successfully; `false` if the `prefix` + /// was already present in the topmost namespace. + #[inline] + pub fn put(&mut self, prefix: P, uri: U) -> bool + where P: Into, U: Into + { + self.0.last_mut().unwrap().put(prefix, uri) + } + + /// Performs a search for the given prefix in the whole stack. + /// + /// This method walks the stack from top to bottom, querying each namespace + /// in order for the given prefix. If none of the namespaces contains the prefix, + /// `None` is returned. + /// + /// # Parameters + /// * `prefix` --- namespace prefix.
+ #[inline] + pub fn get<'a, P: ?Sized+AsRef>(&'a self, prefix: &P) -> Option<&'a str> { + let prefix = prefix.as_ref(); + for ns in self.0.iter().rev() { + match ns.get(prefix) { + None => {}, + r => return r, + } + } + None + } + + /// Combines this stack of namespaces into a single namespace. + /// + /// Namespaces are combined in left-to-right order, that is, rightmost namespace + /// elements take priority over leftmost ones. + pub fn squash(&self) -> Namespace { + let mut result = BTreeMap::new(); + for ns in self.0.iter() { + result.extend(ns.0.iter().map(|(k, v)| (k.clone(), v.clone()))); + } + Namespace(result) + } + + /// Returns an object which implements `Extend` using `put_checked()` instead of `put()`. + /// + /// See `CheckedTarget` for more information. + #[inline] + pub fn checked_target(&mut self) -> CheckedTarget { + CheckedTarget(self) + } + + /// Returns an iterator over all mappings in this namespace stack. + #[inline] + pub fn iter(&self) -> NamespaceStackMappings { + self.into_iter() + } +} + +/// An iterator over mappings from prefixes to URIs in a namespace stack. 
+/// +/// # Example +/// ``` +/// # use xml::namespace::NamespaceStack; +/// let mut nst = NamespaceStack::empty(); +/// nst.push_empty(); +/// nst.put("a", "urn:A"); +/// nst.put("b", "urn:B"); +/// nst.push_empty(); +/// nst.put("c", "urn:C"); +/// +/// assert_eq!(vec![("c", "urn:C"), ("a", "urn:A"), ("b", "urn:B")], nst.iter().collect::>()); +/// ``` +pub struct NamespaceStackMappings<'a> { + namespaces: Rev>, + current_namespace: Option>, + used_keys: HashSet<&'a str> +} + +impl<'a> NamespaceStackMappings<'a> { + fn go_to_next_namespace(&mut self) -> bool { + self.current_namespace = self.namespaces.next().map(|ns| ns.into_iter()); + self.current_namespace.is_some() + } +} + +impl<'a> Iterator for NamespaceStackMappings<'a> { + type Item = UriMapping<'a>; + + fn next(&mut self) -> Option> { + // If there is no current namespace and no next namespace, we're finished + if self.current_namespace.is_none() && !self.go_to_next_namespace() { + return None; + } + let next_item = self.current_namespace.as_mut().unwrap().next(); + + match next_item { + // There is an element in the current namespace + Some((k, v)) => if self.used_keys.contains(&k) { + // If the current key is used, go to the next one + self.next() + } else { + // Otherwise insert the current key to the set of used keys and + // return the mapping + self.used_keys.insert(k); + Some((k, v)) + }, + // Current namespace is exhausted + None => if self.go_to_next_namespace() { + // If there is next namespace, continue from it + self.next() + } else { + // No next namespace, exiting + None + } + } + } +} + +impl<'a> IntoIterator for &'a NamespaceStack { + type Item = UriMapping<'a>; + type IntoIter = NamespaceStackMappings<'a>; + + fn into_iter(self) -> Self::IntoIter { + NamespaceStackMappings { + namespaces: self.0.iter().rev(), + current_namespace: None, + used_keys: HashSet::new() + } + } +} + +/// A type alias for a pair of `(prefix, uri)` values returned by namespace iterators. 
+pub type UriMapping<'a> = (&'a str, &'a str); + +impl<'a> Extend> for Namespace { + fn extend(&mut self, iterable: T) where T: IntoIterator> { + for (prefix, uri) in iterable { + self.put(prefix, uri); + } + } +} + +impl<'a> Extend> for NamespaceStack { + fn extend(&mut self, iterable: T) where T: IntoIterator> { + for (prefix, uri) in iterable { + self.put(prefix, uri); + } + } +} + +/// A wrapper around `NamespaceStack` which implements `Extend` using `put_checked()`. +/// +/// # Example +/// +/// ``` +/// # use xml::namespace::NamespaceStack; +/// +/// let mut nst = NamespaceStack::empty(); +/// nst.push_empty(); +/// nst.put("a", "urn:A"); +/// nst.put("b", "urn:B"); +/// nst.push_empty(); +/// nst.put("c", "urn:C"); +/// +/// nst.checked_target().extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]); +/// assert_eq!( +/// vec![("a", "urn:Z"), ("c", "urn:C"), ("d", "urn:D"), ("b", "urn:B")], +/// nst.iter().collect::>() +/// ); +/// ``` +/// +/// Compare: +/// +/// ``` +/// # use xml::namespace::NamespaceStack; +/// # let mut nst = NamespaceStack::empty(); +/// # nst.push_empty(); +/// # nst.put("a", "urn:A"); +/// # nst.put("b", "urn:B"); +/// # nst.push_empty(); +/// # nst.put("c", "urn:C"); +/// +/// nst.extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]); +/// assert_eq!( +/// vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:C"), ("d", "urn:D")], +/// nst.iter().collect::>() +/// ); +/// ``` +pub struct CheckedTarget<'a>(&'a mut NamespaceStack); + +impl<'a, 'b> Extend> for CheckedTarget<'a> { + fn extend(&mut self, iterable: T) where T: IntoIterator> { + for (prefix, uri) in iterable { + self.0.put_checked(prefix, uri); + } + } +} diff --git a/src/reader/config.rs b/src/reader/config.rs new file mode 100644 index 0000000..0abb165 --- /dev/null +++ b/src/reader/config.rs @@ -0,0 +1,181 @@ +//! Contains parser configuration structure. 
+use std::io::Read; +use std::collections::HashMap; + +use reader::EventReader; + +/// Parser configuration structure. +/// +/// This structure contains various configuration options which affect +/// the behavior of the parser. +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct ParserConfig { + /// Whether or not whitespace in textual events should be removed. Default is false. + /// + /// When true, all standalone whitespace will be removed (this means no + /// `Whitespace` events will be emitted), and leading and trailing whitespace + /// from `Characters` events will be deleted. If a `Characters` event is empty + /// after trimming, it will also be omitted from the output stream. This is + /// possible, however, only if `whitespace_to_characters` or + /// `cdata_to_characters` options are set. + /// + /// This option does not affect CDATA events, unless `cdata_to_characters` + /// option is also set. In that case CDATA content will also be trimmed. + pub trim_whitespace: bool, + + /// Whether or not whitespace should be converted to characters. + /// Default is false. + /// + /// If true, instead of `Whitespace` events `Characters` events with the + /// same content will be emitted. If `trim_whitespace` is also true, these + /// events will be trimmed to nothing and, consequently, not emitted. + pub whitespace_to_characters: bool, + + /// Whether or not CDATA should be converted to characters. + /// Default is false. + /// + /// If true, instead of `CData` events `Characters` events with the same + /// content will be emitted. If `trim_whitespace` is also true, these events + /// will be trimmed. If the corresponding CDATA contained nothing but whitespace, + /// this event will be omitted from the stream. + pub cdata_to_characters: bool, + + /// Whether or not comments should be omitted. Default is true. + /// + /// If true, `Comment` events will not be emitted at all. + pub ignore_comments: bool, + + /// Whether or not sequential `Characters` events should be merged.
+ /// Default is true. + /// + /// If true, multiple sequential `Characters` events will be merged into + /// a single event, that is, their data will be concatenated. + /// + /// Multiple sequential `Characters` events are only possible if either + /// `cdata_to_characters` or `ignore_comments` are set. Otherwise character + /// events will always be separated by other events. + pub coalesce_characters: bool, + + /// A map of extra entities recognized by the parser. Default is an empty map. + /// + /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes, + /// however, it is convenient to make the parser recognize additional entities which + /// are also not available through the DTD definitions (especially given that at the moment + /// DTD parsing is not supported). + pub extra_entities: HashMap, + + /// Whether or not the parser should ignore the end of stream. Default is false. + /// + /// By default the parser will either error out when it encounters a premature end of + /// stream or complete normally if the end of stream was expected. If you want to continue + /// reading from a stream whose input is supplied progressively, you can set this option to true. + /// In this case the parser will allow you to invoke the next() method even if a supposed end + /// of stream has happened. + /// + /// Note that support for this functionality is incomplete; for example, the parser will fail if + /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk. + pub ignore_end_of_stream: bool, + + /// Whether or not non-unicode entity references get replaced with the replacement character + /// + /// When true, any decimal or hexadecimal character reference that cannot be converted from a + /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html) + /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD). 
+ pub replace_unknown_entity_references: bool, + + /// Whether or not whitespace at the root level of the document is ignored. Default is true. + /// + /// By default any whitespace that is not enclosed within at least one level of elements will be + /// ignored. Setting this value to false will cause root level whitespace events to be emitted. + pub ignore_root_level_whitespace: bool, +} + +impl ParserConfig { + /// Returns a new config with default values. + /// + /// You can tweak default values using builder-like pattern: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let config = ParserConfig::new() + /// .trim_whitespace(true) + /// .ignore_comments(true) + /// .coalesce_characters(false); + /// ``` + pub fn new() -> ParserConfig { + ParserConfig { + trim_whitespace: false, + whitespace_to_characters: false, + cdata_to_characters: false, + ignore_comments: true, + coalesce_characters: true, + extra_entities: HashMap::new(), + ignore_end_of_stream: false, + replace_unknown_entity_references: false, + ignore_root_level_whitespace: true, + } + } + + /// Creates an XML reader with this configuration. + /// + /// This is a convenience method for configuring and creating a reader at the same time: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let mut source: &[u8] = b"..."; + /// + /// let reader = ParserConfig::new() + /// .trim_whitespace(true) + /// .ignore_comments(true) + /// .coalesce_characters(false) + /// .create_reader(&mut source); + /// ``` + /// + /// This method is exactly equivalent to calling `EventReader::new_with_config()` with + /// this configuration object. + #[inline] + pub fn create_reader(self, source: R) -> EventReader { + EventReader::new_with_config(source, self) + } + + /// Adds a new entity mapping and returns an updated config object. + /// + /// This is a convenience method for adding external entities mappings to the XML parser. 
+ /// An example: + /// + /// ```rust + /// use xml::reader::ParserConfig; + /// + /// let mut source: &[u8] = b"..."; + /// + /// let reader = ParserConfig::new() + /// .add_entity("nbsp", " ") + /// .add_entity("copy", "©") + /// .add_entity("reg", "®") + /// .create_reader(&mut source); + /// ``` + pub fn add_entity, T: Into>(mut self, entity: S, value: T) -> ParserConfig { + self.extra_entities.insert(entity.into(), value.into()); + self + } +} + +impl Default for ParserConfig { + #[inline] + fn default() -> ParserConfig { + ParserConfig::new() + } +} + +gen_setters! { ParserConfig, + trim_whitespace: val bool, + whitespace_to_characters: val bool, + cdata_to_characters: val bool, + ignore_comments: val bool, + coalesce_characters: val bool, + ignore_end_of_stream: val bool, + replace_unknown_entity_references: val bool, + ignore_root_level_whitespace: val bool +} diff --git a/src/reader/error.rs b/src/reader/error.rs new file mode 100644 index 0000000..92378e6 --- /dev/null +++ b/src/reader/error.rs @@ -0,0 +1,121 @@ + +use std::io; +use std::borrow::Cow; +use std::fmt; +use std::error; +use std::str; + +use util; +use common::{Position, TextPosition}; + +#[derive(Debug)] +pub enum ErrorKind { + Syntax(Cow<'static, str>), + Io(io::Error), + Utf8(str::Utf8Error), + UnexpectedEof, +} + +/// An XML parsing error. +/// +/// Consists of a 2D position in a document and a textual message describing the error. +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct Error { + pos: TextPosition, + kind: ErrorKind, +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{} {}", self.pos, self.msg()) + } +} + +impl Position for Error { + #[inline] + fn position(&self) -> TextPosition { self.pos } +} + +impl Error { + /// Returns a reference to a message which is contained inside this error. 
+ #[inline] + pub fn msg(&self) -> &str { + use self::ErrorKind::*; + match self.kind { + UnexpectedEof => &"Unexpected EOF", + Utf8(ref reason) => error_description(reason), + Io(ref io_error) => error_description(io_error), + Syntax(ref msg) => msg.as_ref(), + } + } + + pub fn kind(&self) -> &ErrorKind { &self.kind } +} + +impl error::Error for Error { + #[inline] + fn description(&self) -> &str { self.msg() } +} + +impl<'a, P, M> From<(&'a P, M)> for Error where P: Position, M: Into> { + fn from(orig: (&'a P, M)) -> Self { + Error{ + pos: orig.0.position(), + kind: ErrorKind::Syntax(orig.1.into()) + } + } +} + +impl From for Error { + fn from(e: util::CharReadError) -> Self { + use util::CharReadError::*; + Error{ + pos: TextPosition::new(), + kind: match e { + UnexpectedEof => ErrorKind::UnexpectedEof, + Utf8(reason) => ErrorKind::Utf8(reason), + Io(io_error) => ErrorKind::Io(io_error), + } + } + } +} + +impl From for Error { + fn from(e: io::Error) -> Self { + Error { + pos: TextPosition::new(), + kind: ErrorKind::Io(e), + } + } +} + +impl Clone for ErrorKind { + fn clone(&self) -> Self { + use self::ErrorKind::*; + match *self { + UnexpectedEof => UnexpectedEof, + Utf8(ref reason) => Utf8(reason.clone()), + Io(ref io_error) => Io(io::Error::new(io_error.kind(), error_description(io_error))), + Syntax(ref msg) => Syntax(msg.clone()), + } + } +} +impl PartialEq for ErrorKind { + fn eq(&self, other: &ErrorKind) -> bool { + use self::ErrorKind::*; + match (self, other) { + (&UnexpectedEof, &UnexpectedEof) => true, + (&Utf8(ref left), &Utf8(ref right)) => left == right, + (&Io(ref left), &Io(ref right)) => + left.kind() == right.kind() && + error_description(left) == error_description(right), + (&Syntax(ref left), &Syntax(ref right)) => + left == right, + + (_, _) => false, + } + } +} +impl Eq for ErrorKind {} + +fn error_description(e: &error::Error) -> &str { e.description() } diff --git a/src/reader/events.rs b/src/reader/events.rs new file mode 100644 index 
0000000..46d7621 --- /dev/null +++ b/src/reader/events.rs @@ -0,0 +1,219 @@ +//! Contains `XmlEvent` datatype, instances of which are emitted by the parser. + +use std::fmt; +use std::borrow::Cow; + +use name::OwnedName; +use attribute::OwnedAttribute; +use common::XmlVersion; +use namespace::Namespace; + +/// An element of an XML input stream. +/// +/// Items of this enum are emitted by `reader::EventReader`. They correspond to different +/// elements of an XML document. +#[derive(PartialEq, Clone)] +pub enum XmlEvent { + /// Corresponds to XML document declaration. + /// + /// This event is always emitted before any other event. It is emitted + /// even if the actual declaration is not present in the document. + StartDocument { + /// XML version. + /// + /// If XML declaration is not present, defaults to `Version10`. + version: XmlVersion, + + /// XML document encoding. + /// + /// If XML declaration is not present or does not contain `encoding` attribute, + /// defaults to `"UTF-8"`. This field is currently used for no other purpose than + /// informational. + encoding: String, + + /// XML standalone declaration. + /// + /// If XML declaration is not present or does not contain `standalone` attribute, + /// defaults to `None`. This field is currently used for no other purpose than + /// informational. + standalone: Option + }, + + /// Denotes the end of the document stream. + /// + /// This event is always emitted after any other event (except `Error`). After it + /// is emitted for the first time, it will always be emitted on next event pull attempts. + EndDocument, + + /// Denotes an XML processing instruction. + /// + /// This event contains a processing instruction target (`name`) and opaque `data`. It + /// is up to the application to process them. + ProcessingInstruction { + /// Processing instruction target. + name: String, + + /// Processing instruction content. + data: Option + }, + + /// Denotes a beginning of an XML element. 
+ /// + /// This event is emitted after parsing opening tags or after parsing bodiless tags. In the + /// latter case `EndElement` event immediately follows. + StartElement { + /// Qualified name of the element. + name: OwnedName, + + /// A list of attributes associated with the element. + /// + /// Currently attributes are not checked for duplicates (TODO) + attributes: Vec, + + /// Contents of the namespace mapping at this point of the document. + namespace: Namespace, + }, + + /// Denotes an end of an XML element. + /// + /// This event is emitted after parsing closing tags or after parsing bodiless tags. In the + /// latter case it is emitted immediately after corresponding `StartElement` event. + EndElement { + /// Qualified name of the element. + name: OwnedName + }, + + /// Denotes CDATA content. + /// + /// This event contains unparsed data. No unescaping will be performed. + /// + /// It is possible to configure a parser to emit `Characters` event instead of `CData`. See + /// `reader::ParserConfig` structure for more information. + CData(String), + + /// Denotes a comment. + /// + /// It is possible to configure a parser to ignore comments, so this event will never be emitted. + /// See `reader::ParserConfig` structure for more information. + Comment(String), + + /// Denotes character data outside of tags. + /// + /// Contents of this event will always be unescaped, so no entities like `&lt;` or `&amp;` or `&#123;` + /// will appear in it. + /// + /// It is possible to configure a parser to trim leading and trailing whitespace for this event. + /// See `reader::ParserConfig` structure for more information. + Characters(String), + + /// Denotes a chunk of whitespace outside of tags. + /// + /// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`. + /// See `reader::ParserConfig` structure for more information. 
When combined with whitespace + /// trimming, it will eliminate standalone whitespace from the event stream completely. + Whitespace(String) +} + +impl fmt::Debug for XmlEvent { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + XmlEvent::StartDocument { ref version, ref encoding, ref standalone } => + write!(f, "StartDocument({}, {}, {:?})", version, *encoding, *standalone), + XmlEvent::EndDocument => + write!(f, "EndDocument"), + XmlEvent::ProcessingInstruction { ref name, ref data } => + write!(f, "ProcessingInstruction({}{})", *name, match *data { + Some(ref data) => format!(", {}", data), + None => String::new() + }), + XmlEvent::StartElement { ref name, ref attributes, namespace: Namespace(ref namespace) } => + write!(f, "StartElement({}, {:?}{})", name, namespace, if attributes.is_empty() { + String::new() + } else { + let attributes: Vec = attributes.iter().map( + |a| format!("{} -> {}", a.name, a.value) + ).collect(); + format!(", [{}]", attributes.join(", ")) + }), + XmlEvent::EndElement { ref name } => + write!(f, "EndElement({})", name), + XmlEvent::Comment(ref data) => + write!(f, "Comment({})", data), + XmlEvent::CData(ref data) => + write!(f, "CData({})", data), + XmlEvent::Characters(ref data) => + write!(f, "Characters({})", data), + XmlEvent::Whitespace(ref data) => + write!(f, "Whitespace({})", data) + } + } +} + +impl XmlEvent { + /// Obtains a writer event from this reader event. + /// + /// This method is useful for streaming processing of XML documents where the output + /// is also an XML document. 
With this method it is possible to process some events + /// while passing other events through to the writer unchanged: + /// + /// ```rust + /// use std::str; + /// + /// use xml::{EventReader, EventWriter}; + /// use xml::reader::XmlEvent as ReaderEvent; + /// use xml::writer::XmlEvent as WriterEvent; + /// + /// let mut input: &[u8] = b"world"; + /// let mut output: Vec = Vec::new(); + /// + /// { + /// let mut reader = EventReader::new(&mut input); + /// let mut writer = EventWriter::new(&mut output); + /// + /// for e in reader { + /// match e.unwrap() { + /// ReaderEvent::Characters(s) => + /// writer.write(WriterEvent::characters(&s.to_uppercase())).unwrap(), + /// e => if let Some(e) = e.as_writer_event() { + /// writer.write(e).unwrap() + /// } + /// } + /// } + /// } + /// + /// assert_eq!( + /// str::from_utf8(&output).unwrap(), + /// r#"WORLD"# + /// ); + /// ``` + /// + /// Note that this API may change or get additions in future to improve its ergonomics. + pub fn as_writer_event<'a>(&'a self) -> Option<::writer::events::XmlEvent<'a>> { + match *self { + XmlEvent::StartDocument { version, ref encoding, standalone } => + Some(::writer::events::XmlEvent::StartDocument { + version: version, + encoding: Some(encoding), + standalone: standalone + }), + XmlEvent::ProcessingInstruction { ref name, ref data } => + Some(::writer::events::XmlEvent::ProcessingInstruction { + name: name, + data: data.as_ref().map(|s| &s[..]) + }), + XmlEvent::StartElement { ref name, ref attributes, ref namespace } => + Some(::writer::events::XmlEvent::StartElement { + name: name.borrow(), + attributes: attributes.iter().map(|a| a.borrow()).collect(), + namespace: Cow::Borrowed(namespace) + }), + XmlEvent::EndElement { ref name } => + Some(::writer::events::XmlEvent::EndElement { name: Some(name.borrow()) }), + XmlEvent::Comment(ref data) => Some(::writer::events::XmlEvent::Comment(data)), + XmlEvent::CData(ref data) => Some(::writer::events::XmlEvent::CData(data)), + 
XmlEvent::Characters(ref data) => Some(::writer::events::XmlEvent::Characters(data)), + XmlEvent::Whitespace(ref data) => Some(::writer::events::XmlEvent::Characters(data)), + _ => None + } + } +} diff --git a/src/reader/lexer.rs b/src/reader/lexer.rs new file mode 100644 index 0000000..c466db9 --- /dev/null +++ b/src/reader/lexer.rs @@ -0,0 +1,867 @@ +//! Contains simple lexer for XML documents. +//! +//! This module is for internal use. Use `xml::reader` module to do parsing. + +use std::fmt; +use std::collections::VecDeque; +use std::io::Read; +use std::result; +use std::borrow::Cow; + +use common::{Position, TextPosition, is_whitespace_char, is_name_char}; +use reader::Error; +use util; + +/// `Token` represents a single lexeme of an XML document. These lexemes +/// are used to perform actual parsing. +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub enum Token { + /// `` + ProcessingInstructionEnd, + /// `` + TagEnd, + /// `/>` + EmptyTagEnd, + /// `` + CommentEnd, + /// A chunk of characters, used for error recovery. + Chunk(&'static str), + /// Any non-special character except whitespace. + Character(char), + /// Whitespace character. 
+ Whitespace(char), + /// `=` + EqualsSign, + /// `'` + SingleQuote, + /// `"` + DoubleQuote, + /// `` + CDataEnd, + /// `&` + ReferenceStart, + /// `;` + ReferenceEnd, +} + +impl fmt::Display for Token { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Token::Chunk(s) => write!(f, "{}", s), + Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c), + other => write!(f, "{}", match other { + Token::OpeningTagStart => "<", + Token::ProcessingInstructionStart => " " " "", + Token::CDataEnd => "]]>", + Token::ReferenceStart => "&", + Token::ReferenceEnd => ";", + Token::EqualsSign => "=", + Token::SingleQuote => "'", + Token::DoubleQuote => "\"", + _ => unreachable!() + }) + } + } +} + +impl Token { + pub fn as_static_str(&self) -> Option<&'static str> { + match *self { + Token::OpeningTagStart => Some("<"), + Token::ProcessingInstructionStart => Some(" Some(" Some(" Some(""), + Token::CDataEnd => Some("]]>"), + Token::ReferenceStart => Some("&"), + Token::ReferenceEnd => Some(";"), + Token::EqualsSign => Some("="), + Token::SingleQuote => Some("'"), + Token::DoubleQuote => Some("\""), + Token::Chunk(s) => Some(s), + _ => None + } + } + + // using String.push_str(token.to_string()) is simply way too slow + pub fn push_to_string(&self, target: &mut String) { + match self.as_static_str() { + Some(s) => { target.push_str(s); } + None => { + match *self { + Token::Character(c) | Token::Whitespace(c) => target.push(c), + _ => unreachable!() + } + } + } + } + + /// Returns `true` if this token contains data that can be interpreted + /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'. 
+ #[inline] + pub fn contains_char_data(&self) -> bool { + match *self { + Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd | + Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd | + Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true, + _ => false + } + } + + /// Returns `true` if this token corresponds to a white space character. + #[inline] + pub fn is_whitespace(&self) -> bool { + match *self { + Token::Whitespace(_) => true, + _ => false + } + } +} + +enum State { + /// Triggered on '<' + TagStarted, + /// Triggered on ', Error>; + +/// Helps to set up a dispatch table for lexing large unambiguous tokens like +/// ` ( + match $s { + $( + $st => match $c { + $stc => $_self.move_to($is($next_st)), + _ => $_self.handle_error($chunk, $c) + }, + )+ + $end_st => match $c { + $end_c => $e, + _ => $_self.handle_error($end_chunk, $c) + } + } + ) +); + +/// `Lexer` is a lexer for XML documents, which implements pull API. +/// +/// Main method is `next_token` which accepts an `std::io::Read` instance and +/// tries to read the next lexeme from it. +/// +/// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s. +/// When it is not set, errors will be reported as `Err` objects with a string message. +/// By default this flag is not set. Use `enable_errors` and `disable_errors` methods +/// to toggle the behavior. +pub struct Lexer { + pos: TextPosition, + head_pos: TextPosition, + char_queue: VecDeque, + st: State, + skip_errors: bool, + inside_comment: bool, + inside_token: bool, + eof_handled: bool +} + +impl Position for Lexer { + #[inline] + /// Returns the position of the last token produced by the lexer + fn position(&self) -> TextPosition { self.pos } +} + +impl Lexer { + /// Returns a new lexer with default state. 
+ pub fn new() -> Lexer { + Lexer { + pos: TextPosition::new(), + head_pos: TextPosition::new(), + char_queue: VecDeque::with_capacity(4), // TODO: check size + st: State::Normal, + skip_errors: false, + inside_comment: false, + inside_token: false, + eof_handled: false + } + } + + /// Enables error handling so `next_token` will return `Some(Err(..))` + /// upon invalid lexeme. + #[inline] + pub fn enable_errors(&mut self) { self.skip_errors = false; } + + /// Disables error handling so `next_token` will return `Some(Chunk(..))` + /// upon invalid lexeme with this lexeme content. + #[inline] + pub fn disable_errors(&mut self) { self.skip_errors = true; } + + /// Enables special handling of some lexemes which should be done when we're parsing comment + /// internals. + #[inline] + pub fn inside_comment(&mut self) { self.inside_comment = true; } + + /// Disables the effect of `inside_comment()` method. + #[inline] + pub fn outside_comment(&mut self) { self.inside_comment = false; } + + /// Reset the eof handled flag of the lexer. + #[inline] + pub fn reset_eof_handled(&mut self) { self.eof_handled = false; } + + /// Tries to read the next token from the buffer. + /// + /// It is possible to pass different instances of `BufReader` each time + /// this method is called, but the resulting behavior is undefined in this case. + /// + /// Return value: + /// * `Err(reason) where reason: reader::Error` - when an error occurs; + /// * `Ok(None)` - when the end of stream is reached; + /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream. 
+ pub fn next_token(&mut self, b: &mut B) -> Result { + // Already reached end of buffer + if self.eof_handled { + return Ok(None); + } + + if !self.inside_token { + self.pos = self.head_pos; + self.inside_token = true; + } + + // Check if we have saved a char or two for ourselves + while let Some(c) = self.char_queue.pop_front() { + match try!(self.read_next_token(c)) { + Some(t) => { + self.inside_token = false; + return Ok(Some(t)); + } + None => {} // continue + } + } + + loop { + // TODO: this should handle multiple encodings + let c = match try!(util::next_char_from(b)) { + Some(c) => c, // got next char + None => break, // nothing to read left + }; + + match try!(self.read_next_token(c)) { + Some(t) => { + self.inside_token = false; + return Ok(Some(t)); + } + None => { + // continue + } + } + } + + // Handle end of stream + self.eof_handled = true; + self.pos = self.head_pos; + match self.st { + State::TagStarted | State::CommentOrCDataOrDoctypeStarted | + State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) | + State::CommentClosing(ClosingSubstate::Second) | + State::DoctypeFinishing(_) => + Err(self.error("Unexpected end of stream")), + State::ProcessingInstructionClosing => + Ok(Some(Token::Character('?'))), + State::EmptyTagClosing => + Ok(Some(Token::Character('/'))), + State::CommentClosing(ClosingSubstate::First) => + Ok(Some(Token::Character('-'))), + State::CDataClosing(ClosingSubstate::First) => + Ok(Some(Token::Character(']'))), + State::CDataClosing(ClosingSubstate::Second) => + Ok(Some(Token::Chunk("]]"))), + State::Normal => + Ok(None) + } + } + + #[inline] + fn error>>(&self, msg: M) -> Error { + (self, msg).into() + } + + #[inline] + fn read_next_token(&mut self, c: char) -> Result { + let res = self.dispatch_char(c); + if self.char_queue.is_empty() { + if c == '\n' { + self.head_pos.new_line(); + } else { + self.head_pos.advance(1); + } + } + res + } + + fn dispatch_char(&mut self, c: char) -> Result { + match self.st { 
+ State::Normal => self.normal(c), + State::TagStarted => self.tag_opened(c), + State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c), + State::CommentStarted => self.comment_started(c), + State::CDataStarted(s) => self.cdata_started(c, s), + State::DoctypeStarted(s) => self.doctype_started(c, s), + State::DoctypeFinishing(d) => self.doctype_finishing(c, d), + State::ProcessingInstructionClosing => self.processing_instruction_closing(c), + State::EmptyTagClosing => self.empty_element_closing(c), + State::CommentClosing(s) => self.comment_closing(c, s), + State::CDataClosing(s) => self.cdata_closing(c, s) + } + } + + #[inline] + fn move_to(&mut self, st: State) -> Result { + self.st = st; + Ok(None) + } + + #[inline] + fn move_to_with(&mut self, st: State, token: Token) -> Result { + self.st = st; + Ok(Some(token)) + } + + #[inline] + fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result { + self.char_queue.extend(cs.iter().cloned()); + self.move_to_with(st, token) + } + + fn handle_error(&mut self, chunk: &'static str, c: char) -> Result { + self.char_queue.push_back(c); + if self.skip_errors || (self.inside_comment && chunk != "--") { // FIXME: looks hacky + self.move_to_with(State::Normal, Token::Chunk(chunk)) + } else { + Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c))) + } + } + + /// Encountered a char + fn normal(&mut self, c: char) -> Result { + match c { + '<' => self.move_to(State::TagStarted), + '>' => Ok(Some(Token::TagEnd)), + '/' => self.move_to(State::EmptyTagClosing), + '=' => Ok(Some(Token::EqualsSign)), + '"' => Ok(Some(Token::DoubleQuote)), + '\'' => Ok(Some(Token::SingleQuote)), + '?' 
=> self.move_to(State::ProcessingInstructionClosing), + '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), + ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)), + '&' => Ok(Some(Token::ReferenceStart)), + ';' => Ok(Some(Token::ReferenceEnd)), + _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))), + _ => Ok(Some(Token::Character(c))) + } + } + + /// Encountered '<' + fn tag_opened(&mut self, c: char) -> Result { + match c { + '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart), + '/' => self.move_to_with(State::Normal, Token::ClosingTagStart), + '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted), + _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), + _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), + _ => self.handle_error("<", c) + } + } + + /// Encountered ' Result { + match c { + '-' => self.move_to(State::CommentStarted), + '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)), + 'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)), + _ => self.handle_error(" Result { + match c { + '-' => self.move_to_with(State::Normal, Token::CommentStart), + _ => self.handle_error(" Result { + use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA}; + dispatch_on_enum_state!(self, s, c, State::CDataStarted, + E ; 'C' ; C ; " Result { + use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP}; + dispatch_on_enum_state!(self, s, c, State::DoctypeStarted, + D ; 'O' ; DO ; " Result { + match c { + '<' => self.move_to(State::DoctypeFinishing(d + 1)), + '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd), + '>' => self.move_to(State::DoctypeFinishing(d - 1)), + _ => Ok(None), + } + } + + /// Encountered '?' 
+ fn processing_instruction_closing(&mut self, c: char) -> Result { + match c { + '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')), + } + } + + /// Encountered '/' + fn empty_element_closing(&mut self, c: char) -> Result { + match c { + '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')), + } + } + + /// Encountered '-' + fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result { + match s { + ClosingSubstate::First => match c { + '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-')) + }, + ClosingSubstate::Second => match c { + '>' => self.move_to_with(State::Normal, Token::CommentEnd), + // double dash not followed by a greater-than is a hard error inside comment + _ if self.inside_comment => self.handle_error("--", c), + // nothing else except comment closing starts with a double dash, and comment + // closing can never be after another dash, and also we're outside of a comment, + // therefore it is safe to push only the last read character to the list of unread + // characters and pass the double dash directly to the output + _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--")) + } + } + } + + /// Encountered ']' + fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result { + match s { + ClosingSubstate::First => match c { + ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)), + _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')) + }, + ClosingSubstate::Second => match c { + '>' => self.move_to_with(State::Normal, Token::CDataEnd), + _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')) + } + } + } +} + +#[cfg(test)] +mod tests { + use common::{Position}; + use std::io::{BufReader, Cursor}; 
+ + use super::{Lexer, Token}; + + macro_rules! assert_oks( + (for $lex:ident and $buf:ident ; $($e:expr)+) => ({ + $( + assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf)); + )+ + }) + ); + + macro_rules! assert_err( + (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({ + let err = $lex.next_token(&mut $buf); + assert!(err.is_err()); + let err = err.unwrap_err(); + assert_eq!($r as u64, err.position().row); + assert_eq!($c as u64, err.position().column); + assert_eq!($s, err.msg()); + }) + ); + + macro_rules! assert_none( + (for $lex:ident and $buf:ident) => ( + assert_eq!(Ok(None), $lex.next_token(&mut $buf)); + ) + ); + + fn make_lex_and_buf(s: &str) -> (Lexer, BufReader>>) { + (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes()))) + } + + #[test] + fn simple_lexer_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#" xd

 "# + ); + + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::Whitespace(' ') + Token::Character('p') + Token::EqualsSign + Token::SingleQuote + Token::Character('q') + Token::SingleQuote + Token::TagEnd + Token::Whitespace(' ') + Token::Character('x') + Token::OpeningTagStart + Token::Character('b') + Token::Whitespace(' ') + Token::Character('z') + Token::EqualsSign + Token::DoubleQuote + Token::Character('y') + Token::DoubleQuote + Token::TagEnd + Token::Character('d') + Token::Whitespace('\t') + Token::ClosingTagStart + Token::Character('b') + Token::TagEnd + Token::ClosingTagStart + Token::Character('a') + Token::TagEnd + Token::OpeningTagStart + Token::Character('p') + Token::EmptyTagEnd + Token::Whitespace(' ') + Token::ProcessingInstructionStart + Token::Character('n') + Token::Character('m') + Token::Whitespace(' ') + Token::ProcessingInstructionEnd + Token::Whitespace(' ') + Token::CommentStart + Token::Whitespace(' ') + Token::Character('a') + Token::Whitespace(' ') + Token::Character('c') + Token::Whitespace(' ') + Token::CommentEnd + Token::Whitespace(' ') + Token::ReferenceStart + Token::Character('n') + Token::Character('b') + Token::Character('s') + Token::Character('p') + Token::ReferenceEnd + ); + assert_none!(for lex and buf); + } + + #[test] + fn special_chars_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#"?x!+ // -| ]z]]"# + ); + + assert_oks!(for lex and buf ; + Token::Character('?') + Token::Character('x') + Token::Character('!') + Token::Character('+') + Token::Whitespace(' ') + Token::Character('/') + Token::Character('/') + Token::Whitespace(' ') + Token::Character('-') + Token::Character('|') + Token::Whitespace(' ') + Token::Character(']') + Token::Character('z') + Token::Chunk("]]") + ); + assert_none!(for lex and buf); + } + + #[test] + fn cdata_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#" "# + ); + + assert_oks!(for lex and buf ; + Token::OpeningTagStart + 
Token::Character('a') + Token::TagEnd + Token::CDataStart + Token::Character('x') + Token::Whitespace(' ') + Token::Character('y') + Token::Whitespace(' ') + Token::Character('?') + Token::CDataEnd + Token::Whitespace(' ') + Token::ClosingTagStart + Token::Character('a') + Token::TagEnd + ); + assert_none!(for lex and buf); + } + + #[test] + fn doctype_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#" "# + ); + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::DoctypeStart + Token::TagEnd + Token::Whitespace(' ') + ); + assert_none!(for lex and buf) + } + + #[test] + fn doctype_with_internal_subset_test() { + let (mut lex, mut buf) = make_lex_and_buf( + r#" ]> "# + ); + assert_oks!(for lex and buf ; + Token::OpeningTagStart + Token::Character('a') + Token::TagEnd + Token::DoctypeStart + Token::TagEnd + Token::Whitespace(' ') + ); + assert_none!(for lex and buf) + } + + #[test] + fn end_of_stream_handling_ok() { + macro_rules! eof_check( + ($data:expr ; $token:expr) => ({ + let (mut lex, mut buf) = make_lex_and_buf($data); + assert_oks!(for lex and buf ; $token); + assert_none!(for lex and buf); + }) + ); + eof_check!("?" ; Token::Character('?')); + eof_check!("/" ; Token::Character('/')); + eof_check!("-" ; Token::Character('-')); + eof_check!("]" ; Token::Character(']')); + eof_check!("]]" ; Token::Chunk("]]")); + } + + #[test] + fn end_of_stream_handling_error() { + macro_rules! 
eof_check( + ($data:expr; $r:expr, $c:expr) => ({ + let (mut lex, mut buf) = make_lex_and_buf($data); + assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream"); + assert_none!(for lex and buf); + }) + ); + eof_check!("<" ; 0, 1); + eof_check!(" ({ + let (mut lex, mut buf) = make_lex_and_buf($data); + assert_err!(for lex and buf expect row $r ; $c, $s); + + let (mut lex, mut buf) = make_lex_and_buf($data); + lex.disable_errors(); + assert_oks!(for lex and buf ; + Token::Chunk($chunk) + Token::Character($app) + ); + assert_none!(for lex and buf); + }) + ); + + #[test] + fn error_in_cdata_started() { + check_case!(""# + ); + + assert_oks!(for lex and buf ; + Token::CDataStart + Token::Character('F') + Token::Character('o') + Token::Character('o') + Token::Whitespace(' ') + Token::Character('[') + Token::Character('B') + Token::Character('a') + Token::Character('r') + Token::Character(']') + Token::CDataEnd + ); + assert_none!(for lex and buf); + } +} diff --git a/src/reader/mod.rs b/src/reader/mod.rs new file mode 100644 index 0000000..90f5b52 --- /dev/null +++ b/src/reader/mod.rs @@ -0,0 +1,129 @@ +//! Contains high-level interface for a pull-based XML parser. +//! +//! The most important type in this module is `EventReader`, which provides an iterator +//! view for events in XML document. + +use std::io::{Read}; +use std::result; + +use common::{Position, TextPosition}; + +pub use self::config::ParserConfig; +pub use self::events::XmlEvent; + +use self::parser::PullParser; + +mod lexer; +mod parser; +mod config; +mod events; + +mod error; +pub use self::error::{Error, ErrorKind}; + +/// A result type yielded by `XmlReader`. +pub type Result = result::Result; + +/// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing. +pub struct EventReader { + source: R, + parser: PullParser +} + +impl EventReader { + /// Creates a new reader, consuming the given stream. 
+ #[inline] + pub fn new(source: R) -> EventReader { + EventReader::new_with_config(source, ParserConfig::new()) + } + + /// Creates a new reader with the provided configuration, consuming the given stream. + #[inline] + pub fn new_with_config(source: R, config: ParserConfig) -> EventReader { + EventReader { source: source, parser: PullParser::new(config) } + } + + /// Pulls and returns next XML event from the stream. + /// + /// If the returned result is an `Err` or `XmlEvent::EndDocument`, then + /// further calls to this method will return the same result again. + #[inline] + pub fn next(&mut self) -> Result { + self.parser.next(&mut self.source) + } + + pub fn source(&self) -> &R { &self.source } + pub fn source_mut(&mut self) -> &mut R { &mut self.source } + + /// Unwraps this `EventReader`, returning the underlying reader. + /// + /// Note that this operation is destructive; unwrapping the reader and wrapping it + /// again with `EventReader::new()` will create a fresh reader which will attempt + /// to parse an XML document from the beginning. + pub fn into_inner(self) -> R { + self.source + } +} + +impl Position for EventReader { + /// Returns the position of the last event produced by the reader. + #[inline] + fn position(&self) -> TextPosition { + self.parser.position() + } +} + +impl IntoIterator for EventReader { + type Item = Result; + type IntoIter = Events; + + fn into_iter(self) -> Events { + Events { reader: self, finished: false } + } +} + +/// An iterator over XML events created from some type implementing `Read`. +/// +/// When the next item is an `Err` or `XmlEvent::EndDocument`, then +/// it will be returned by the iterator once, and then it will stop producing events. +pub struct Events { + reader: EventReader, + finished: bool +} + +impl Events { + /// Unwraps the iterator, returning the internal `EventReader`. 
+ #[inline] + pub fn into_inner(self) -> EventReader { + self.reader + } + + pub fn source(&self) -> &R { &self.reader.source } + pub fn source_mut(&mut self) -> &mut R { &mut self.reader.source } + +} + +impl Iterator for Events { + type Item = Result; + + #[inline] + fn next(&mut self) -> Option> { + if self.finished && !self.reader.parser.is_ignoring_end_of_stream() { None } + else { + let ev = self.reader.next(); + match ev { + Ok(XmlEvent::EndDocument) | Err(_) => self.finished = true, + _ => {} + } + Some(ev) + } + } +} + +impl<'r> EventReader<&'r [u8]> { + /// A convenience method to create an `XmlReader` from a string slice. + #[inline] + pub fn from_str(source: &'r str) -> EventReader<&'r [u8]> { + EventReader::new(source.as_bytes()) + } +} diff --git a/src/reader/parser/inside_cdata.rs b/src/reader/parser/inside_cdata.rs new file mode 100644 index 0000000..3269fb4 --- /dev/null +++ b/src/reader/parser/inside_cdata.rs @@ -0,0 +1,32 @@ +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_cdata(&mut self, t: Token) -> Option { + match t { + Token::CDataEnd => { + self.lexer.enable_errors(); + let event = if self.config.cdata_to_characters { + None + } else { + let data = self.take_buf(); + Some(Ok(XmlEvent::CData(data))) + }; + self.into_state(State::OutsideTag, event) + } + + Token::Whitespace(_) => { + t.push_to_string(&mut self.buf); + None + } + + _ => { + self.inside_whitespace = false; + t.push_to_string(&mut self.buf); + None + } + } + } +} diff --git a/src/reader/parser/inside_closing_tag_name.rs b/src/reader/parser/inside_closing_tag_name.rs new file mode 100644 index 0000000..1d8074a --- /dev/null +++ b/src/reader/parser/inside_closing_tag_name.rs @@ -0,0 +1,34 @@ +use namespace; + +use reader::lexer::Token; + +use super::{Result, PullParser, State, QualifiedNameTarget, ClosingTagSubstate}; + +impl PullParser { + pub fn inside_closing_tag_name(&mut self, t: Token, 
s: ClosingTagSubstate) -> Option { + match s { + ClosingTagSubstate::CTInsideName => self.read_qualified_name(t, QualifiedNameTarget::ClosingTagNameTarget, |this, token, name| { + match name.prefix_ref() { + Some(prefix) if prefix == namespace::NS_XML_PREFIX || + prefix == namespace::NS_XMLNS_PREFIX => + // TODO: {:?} is bad, need something better + Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)), + _ => { + this.data.element_name = Some(name.clone()); + match token { + Token::Whitespace(_) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)), + Token::TagEnd => this.emit_end_element(), + _ => Some(self_error!(this; "Unexpected token inside closing tag: {}", token)) + } + } + } + }), + ClosingTagSubstate::CTAfterName => match t { + Token::Whitespace(_) => None, // Skip whitespace + Token::TagEnd => self.emit_end_element(), + _ => Some(self_error!(self; "Unexpected token inside closing tag: {}", t)) + } + } + } + +} diff --git a/src/reader/parser/inside_comment.rs b/src/reader/parser/inside_comment.rs new file mode 100644 index 0000000..fc98320 --- /dev/null +++ b/src/reader/parser/inside_comment.rs @@ -0,0 +1,32 @@ +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_comment(&mut self, t: Token) -> Option { + match t { + // Double dash is illegal inside a comment + Token::Chunk(ref s) if &s[..] 
== "--" => Some(self_error!(self; "Unexpected token inside a comment: --")), + + Token::CommentEnd if self.config.ignore_comments => { + self.lexer.outside_comment(); + self.into_state_continue(State::OutsideTag) + } + + Token::CommentEnd => { + self.lexer.outside_comment(); + let data = self.take_buf(); + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data))) + } + + _ if self.config.ignore_comments => None, // Do not modify buffer if ignoring the comment + + _ => { + t.push_to_string(&mut self.buf); + None + } + } + } + +} diff --git a/src/reader/parser/inside_declaration.rs b/src/reader/parser/inside_declaration.rs new file mode 100644 index 0000000..af39d10 --- /dev/null +++ b/src/reader/parser/inside_declaration.rs @@ -0,0 +1,151 @@ + +use common::XmlVersion; + +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{ + Result, PullParser, State, DeclarationSubstate, QualifiedNameTarget, + DEFAULT_VERSION, DEFAULT_ENCODING +}; + +impl PullParser { + // TODO: remove redundancy via macros or extra methods + pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option { + macro_rules! 
unexpected_token( + ($this:expr; $t:expr) => (Some($this.error(format!("Unexpected token inside XML declaration: {}", $t)))); + ($t:expr) => (unexpected_token!(self; $t)); + ); + + #[inline] + fn emit_start_document(this: &mut PullParser) -> Option { + this.parsed_declaration = true; + let version = this.data.take_version(); + let encoding = this.data.take_encoding(); + let standalone = this.data.take_standalone(); + this.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument { + version: version.unwrap_or(DEFAULT_VERSION), + encoding: encoding.unwrap_or(DEFAULT_ENCODING.into()), + standalone: standalone + })) + } + + match s { + DeclarationSubstate::BeforeVersion => match t { + Token::Whitespace(_) => None, // continue + Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + match &name.local_name[..] { + "ersion" if name.namespace.is_none() => + this.into_state_continue(State::InsideDeclaration( + if token == Token::EqualsSign { + DeclarationSubstate::InsideVersionValue + } else { + DeclarationSubstate::AfterVersion + } + )), + _ => unexpected_token!(this; name) + } + }), + + DeclarationSubstate::AfterVersion => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| { + this.data.version = match &value[..] 
{ + "1.0" => Some(XmlVersion::Version10), + "1.1" => Some(XmlVersion::Version11), + _ => None + }; + if this.data.version.is_some() { + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue)) + } else { + Some(self_error!(this; "Unexpected XML version value: {}", value)) + } + }), + + DeclarationSubstate::AfterVersionValue => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)), + Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), + Token::ProcessingInstructionEnd => emit_start_document(self), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + match &name.local_name[..] { + "ncoding" if name.namespace.is_none() => + this.into_state_continue(State::InsideDeclaration( + if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding } + )), + _ => unexpected_token!(this; name) + } + }), + + DeclarationSubstate::AfterEncoding => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| { + this.data.encoding = Some(value); + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)) + }), + + DeclarationSubstate::BeforeStandaloneDecl => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)), + Token::ProcessingInstructionEnd => emit_start_document(self), + _ => unexpected_token!(t) + }, + + 
DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + match &name.local_name[..] { + "tandalone" if name.namespace.is_none() => + this.into_state_continue(State::InsideDeclaration( + if token == Token::EqualsSign { + DeclarationSubstate::InsideStandaloneDeclValue + } else { + DeclarationSubstate::AfterStandaloneDecl + } + )), + _ => unexpected_token!(this; name) + } + }), + + DeclarationSubstate::AfterStandaloneDecl => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)), + _ => unexpected_token!(t) + }, + + DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| { + let standalone = match &value[..] { + "yes" => Some(true), + "no" => Some(false), + _ => None + }; + if standalone.is_some() { + this.data.standalone = standalone; + this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue)) + } else { + Some(self_error!(this; "Invalid standalone declaration value: {}", value)) + } + }), + + DeclarationSubstate::AfterStandaloneDeclValue => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::ProcessingInstructionEnd => emit_start_document(self), + _ => unexpected_token!(t) + } + } + } + +} diff --git a/src/reader/parser/inside_doctype.rs b/src/reader/parser/inside_doctype.rs new file mode 100644 index 0000000..8dcf367 --- /dev/null +++ b/src/reader/parser/inside_doctype.rs @@ -0,0 +1,16 @@ +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_doctype(&mut self, t: Token) -> Option { + match t { + Token::TagEnd => { + self.lexer.enable_errors(); + self.into_state_continue(State::OutsideTag) + } + + _ => None + } + } +} diff --git a/src/reader/parser/inside_opening_tag.rs b/src/reader/parser/inside_opening_tag.rs new file mode 100644 
index 0000000..533874f --- /dev/null +++ b/src/reader/parser/inside_opening_tag.rs @@ -0,0 +1,108 @@ +use common::is_name_start_char; +use attribute::OwnedAttribute; +use namespace; + +use reader::lexer::Token; + +use super::{Result, PullParser, State, OpeningTagSubstate, QualifiedNameTarget}; + +impl PullParser { + pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option { + macro_rules! unexpected_token(($t:expr) => (Some(self_error!(self; "Unexpected token inside opening tag: {}", $t)))); + match s { + OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| { + match name.prefix_ref() { + Some(prefix) if prefix == namespace::NS_XML_PREFIX || + prefix == namespace::NS_XMLNS_PREFIX => + Some(self_error!(this; "'{:?}' cannot be an element name prefix", name.prefix)), + _ => { + this.data.element_name = Some(name.clone()); + match token { + Token::TagEnd => this.emit_start_element(false), + Token::EmptyTagEnd => this.emit_start_element(true), + Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)), + _ => unreachable!() + } + } + } + }), + + OpeningTagSubstate::InsideTag => match t { + Token::Whitespace(_) => None, // skip whitespace + Token::Character(c) if is_name_start_char(c) => { + self.buf.push(c); + self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName)) + } + Token::TagEnd => self.emit_start_element(false), + Token::EmptyTagEnd => self.emit_start_element(true), + _ => unexpected_token!(t) + }, + + OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| { + this.data.attr_name = Some(name); + match token { + Token::Whitespace(_) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)), + Token::EqualsSign => 
this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), + _ => unreachable!() + } + }), + + OpeningTagSubstate::AfterAttributeName => match t { + Token::Whitespace(_) => None, + Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)), + _ => unexpected_token!(t) + }, + + OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| { + let name = this.data.take_attr_name().unwrap(); // unwrap() will always succeed here + + // check that no attribute with such name is already present + // if there is one, XML is not well-formed + if this.data.attributes.iter().find(|a| a.name == name).is_some() { // TODO: looks bad + // TODO: ideally this error should point to the beginning of the attribute, + // TODO: not the end of its value + Some(self_error!(this; "Attribute '{}' is redefined", name)) + } else { + match name.prefix_ref() { + // declaring a new prefix; it is sufficient to check prefix only + // because "xmlns" prefix is reserved + Some(namespace::NS_XMLNS_PREFIX) => { + let ln = &name.local_name[..]; + if ln == namespace::NS_XMLNS_PREFIX { + Some(self_error!(this; "Cannot redefine prefix '{}'", namespace::NS_XMLNS_PREFIX)) + } else if ln == namespace::NS_XML_PREFIX && &value[..] != namespace::NS_XML_URI { + Some(self_error!(this; "Prefix '{}' cannot be rebound to another value", namespace::NS_XML_PREFIX)) + } else if value.is_empty() { + Some(self_error!(this; "Cannot undefine prefix '{}'", ln)) + } else { + this.nst.put(name.local_name.clone(), value); + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + } + } + + // declaring default namespace + None if &name.local_name[..] == namespace::NS_XMLNS_PREFIX => + match &value[..] 
{ + namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX => + Some(self_error!(this; "Namespace '{}' cannot be default", value)), + _ => { + this.nst.put(namespace::NS_NO_PREFIX, value.clone()); + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + } + }, + + // regular attribute + _ => { + this.data.attributes.push(OwnedAttribute { + name: name.clone(), + value: value + }); + this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)) + } + } + } + }) + } + } + +} diff --git a/src/reader/parser/inside_processing_instruction.rs b/src/reader/parser/inside_processing_instruction.rs new file mode 100644 index 0000000..8ddf6b8 --- /dev/null +++ b/src/reader/parser/inside_processing_instruction.rs @@ -0,0 +1,96 @@ +use common::{ + is_name_start_char, is_name_char, +}; + +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{Result, PullParser, State, ProcessingInstructionSubstate, DeclarationSubstate}; + +impl PullParser { + pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option { + match s { + ProcessingInstructionSubstate::PIInsideName => match t { + Token::Character(c) if !self.buf_has_data() && is_name_start_char(c) || + self.buf_has_data() && is_name_char(c) => self.append_char_continue(c), + + Token::ProcessingInstructionEnd => { + // self.buf contains PI name + let name = self.take_buf(); + + // Don't need to check for declaration because it has mandatory attributes + // but there is none + match &name[..] { + // Name is empty, it is an error + "" => Some(self_error!(self; "Encountered processing instruction without name")), + + // Found + Some(self_error!(self; "Invalid processing instruction: { + self.into_state_emit( + State::OutsideTag, + Ok(XmlEvent::ProcessingInstruction { + name: name, + data: None + }) + ) + } + } + } + + Token::Whitespace(_) => { + // self.buf contains PI name + let name = self.take_buf(); + + match &name[..] 
{ + // We have not ever encountered an element and have not parsed XML declaration + "xml" if !self.encountered_element && !self.parsed_declaration => + self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)), + + // Found + Some(self_error!(self; "Invalid processing instruction: { + self.lexer.disable_errors(); // data is arbitrary, so disable errors + self.data.name = name; + self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideData)) + } + + } + } + + _ => Some(self_error!(self; "Unexpected token: match t { + Token::ProcessingInstructionEnd => { + self.lexer.enable_errors(); + let name = self.data.take_name(); + let data = self.take_buf(); + self.into_state_emit( + State::OutsideTag, + Ok(XmlEvent::ProcessingInstruction { + name: name, + data: Some(data) + }) + ) + }, + + // Any other token should be treated as plain characters + _ => { + t.push_to_string(&mut self.buf); + None + } + }, + } + } + +} diff --git a/src/reader/parser/inside_reference.rs b/src/reader/parser/inside_reference.rs new file mode 100644 index 0000000..60026d5 --- /dev/null +++ b/src/reader/parser/inside_reference.rs @@ -0,0 +1,89 @@ +use std::char; + +use common::{is_name_start_char, is_name_char, is_whitespace_str}; + +use reader::lexer::Token; + +use super::{Result, PullParser, State}; + +impl PullParser { + pub fn inside_reference(&mut self, t: Token, prev_st: State) -> Option { + match t { + Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) || + self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => { + self.data.ref_data.push(c); + None + } + + Token::ReferenceEnd => { + // TODO: check for unicode correctness + let name = self.data.take_ref_data(); + let name_len = name.len(); // compute once + let c = match &name[..] 
{ + "lt" => Ok('<'.to_string()), + "gt" => Ok('>'.to_string()), + "amp" => Ok('&'.to_string()), + "apos" => Ok('\''.to_string()), + "quot" => Ok('"'.to_string()), + "" => Err(self_error!(self; "Encountered empty entity")), + _ if name_len > 2 && name.starts_with("#x") => { + let num_str = &name[2..name_len]; + if num_str == "0" { + Err(self_error!(self; "Null character entity is not allowed")) + } else { + if self.config.replace_unknown_entity_references { + match u32::from_str_radix(num_str, 16).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name)) + } + } else { + match u32::from_str_radix(num_str, 16).ok().and_then(char::from_u32) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid hexadecimal character number in an entity: {}", name)) + } + } + } + } + _ if name_len > 1 && name.starts_with('#') => { + let num_str = &name[1..name_len]; + if num_str == "0" { + Err(self_error!(self; "Null character entity is not allowed")) + } else { + if self.config.replace_unknown_entity_references { + match u32::from_str_radix(num_str, 10).ok().map(|i| char::from_u32(i).unwrap_or('\u{fffd}')) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name)) + } + } + else { + match u32::from_str_radix(num_str, 10).ok().and_then(char::from_u32) { + Some(c) => Ok(c.to_string()), + None => Err(self_error!(self; "Invalid decimal character number in an entity: {}", name)) + } + } + } + }, + _ => { + if let Some(v) = self.config.extra_entities.get(&name) { + Ok(v.clone()) + } else { + Err(self_error!(self; "Unexpected entity: {}", name)) + } + } + }; + match c { + Ok(c) => { + self.buf.push_str(&c); + if prev_st == State::OutsideTag && !is_whitespace_str(&c) { + self.inside_whitespace = false; + } + self.into_state_continue(prev_st) + } + Err(e) => Some(e) + } + } + + _ => 
Some(self_error!(self; "Unexpected token inside an entity: {}", t)) + } + } +} diff --git a/src/reader/parser/mod.rs b/src/reader/parser/mod.rs new file mode 100644 index 0000000..58ca3a6 --- /dev/null +++ b/src/reader/parser/mod.rs @@ -0,0 +1,622 @@ +//! Contains an implementation of pull-based XML parser. + +use std::mem; +use std::borrow::Cow; +use std::io::prelude::*; + +use common::{ + self, + XmlVersion, Position, TextPosition, + is_name_start_char, is_name_char, +}; +use name::OwnedName; +use attribute::OwnedAttribute; +use namespace::NamespaceStack; + +use reader::events::XmlEvent; +use reader::config::ParserConfig; +use reader::lexer::{Lexer, Token}; + +macro_rules! gen_takes( + ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => ( + $( + impl MarkupData { + #[inline] + fn $method(&mut self) -> $t { + mem::replace(&mut self.$field, $def) + } + } + )+ + ) +); + +gen_takes!( + name -> take_name, String, String::new(); + ref_data -> take_ref_data, String, String::new(); + + version -> take_version, Option, None; + encoding -> take_encoding, Option, None; + standalone -> take_standalone, Option, None; + + element_name -> take_element_name, Option, None; + + attr_name -> take_attr_name, Option, None; + attributes -> take_attributes, Vec, vec!() +); + +macro_rules! self_error( + ($this:ident; $msg:expr) => ($this.error($msg)); + ($this:ident; $fmt:expr, $($arg:expr),+) => ($this.error(format!($fmt, $($arg),+))) +); + +mod outside_tag; +mod inside_processing_instruction; +mod inside_declaration; +mod inside_doctype; +mod inside_opening_tag; +mod inside_closing_tag_name; +mod inside_comment; +mod inside_cdata; +mod inside_reference; + +static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; +static DEFAULT_ENCODING: &'static str = "UTF-8"; +static DEFAULT_STANDALONE: Option = None; + +type ElementStack = Vec; +pub type Result = super::Result; + +/// Pull-based XML parser. 
+pub struct PullParser { + config: ParserConfig, + lexer: Lexer, + st: State, + buf: String, + nst: NamespaceStack, + + data: MarkupData, + final_result: Option, + next_event: Option, + est: ElementStack, + pos: Vec, + + encountered_element: bool, + parsed_declaration: bool, + inside_whitespace: bool, + read_prefix_separator: bool, + pop_namespace: bool +} + +impl PullParser { + /// Returns a new parser using the given config. + pub fn new(config: ParserConfig) -> PullParser { + PullParser { + config: config, + lexer: Lexer::new(), + st: State::OutsideTag, + buf: String::new(), + nst: NamespaceStack::default(), + + data: MarkupData { + name: String::new(), + version: None, + encoding: None, + standalone: None, + ref_data: String::new(), + element_name: None, + quote: None, + attr_name: None, + attributes: Vec::new() + }, + final_result: None, + next_event: None, + est: Vec::new(), + pos: vec![TextPosition::new()], + + encountered_element: false, + parsed_declaration: false, + inside_whitespace: true, + read_prefix_separator: false, + pop_namespace: false + } + } + + /// Checks if this parser ignores the end of stream errors. 
+ pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream } +} + +impl Position for PullParser { + /// Returns the position of the last event produced by the parser + #[inline] + fn position(&self) -> TextPosition { + self.pos[0] + } +} + +#[derive(Clone, PartialEq)] +pub enum State { + OutsideTag, + InsideOpeningTag(OpeningTagSubstate), + InsideClosingTag(ClosingTagSubstate), + InsideProcessingInstruction(ProcessingInstructionSubstate), + InsideComment, + InsideCData, + InsideDeclaration(DeclarationSubstate), + InsideDoctype, + InsideReference(Box) +} + +#[derive(Clone, PartialEq)] +pub enum OpeningTagSubstate { + InsideName, + + InsideTag, + + InsideAttributeName, + AfterAttributeName, + + InsideAttributeValue, +} + +#[derive(Clone, PartialEq)] +pub enum ClosingTagSubstate { + CTInsideName, + CTAfterName +} + +#[derive(Clone, PartialEq)] +pub enum ProcessingInstructionSubstate { + PIInsideName, + PIInsideData +} + +#[derive(Clone, PartialEq)] +pub enum DeclarationSubstate { + BeforeVersion, + InsideVersion, + AfterVersion, + + InsideVersionValue, + AfterVersionValue, + + InsideEncoding, + AfterEncoding, + + InsideEncodingValue, + + BeforeStandaloneDecl, + InsideStandaloneDecl, + AfterStandaloneDecl, + + InsideStandaloneDeclValue, + AfterStandaloneDeclValue +} + +#[derive(PartialEq)] +enum QualifiedNameTarget { + AttributeNameTarget, + OpeningTagNameTarget, + ClosingTagNameTarget +} + +#[derive(Copy, Clone, PartialEq, Eq)] +enum QuoteToken { + SingleQuoteToken, + DoubleQuoteToken +} + +impl QuoteToken { + fn from_token(t: &Token) -> QuoteToken { + match *t { + Token::SingleQuote => QuoteToken::SingleQuoteToken, + Token::DoubleQuote => QuoteToken::DoubleQuoteToken, + _ => panic!("Unexpected token: {}", t) + } + } + + fn as_token(self) -> Token { + match self { + QuoteToken::SingleQuoteToken => Token::SingleQuote, + QuoteToken::DoubleQuoteToken => Token::DoubleQuote + } + } +} + +struct MarkupData { + name: String, // used for processing 
instruction name + ref_data: String, // used for reference content + + version: Option, // used for XML declaration version + encoding: Option, // used for XML declaration encoding + standalone: Option, // used for XML declaration standalone parameter + + element_name: Option, // used for element name + + quote: Option, // used to hold opening quote for attribute value + attr_name: Option, // used to hold attribute name + attributes: Vec // used to hold all accumulated attributes +} + +impl PullParser { + /// Returns next event read from the given buffer. + /// + /// This method should be always called with the same buffer. If you call it + /// providing different buffers each time, the result will be undefined. + pub fn next(&mut self, r: &mut R) -> Result { + if let Some(ref ev) = self.final_result { + return ev.clone(); + } + + if let Some(ev) = self.next_event.take() { + return ev; + } + + if self.pop_namespace { + self.pop_namespace = false; + self.nst.pop(); + } + + loop { + // While lexer gives us Ok(maybe_token) -- we loop. + // Upon having a complete XML-event -- we return from the whole function. 
+ match self.lexer.next_token(r) { + Ok(maybe_token) => + match maybe_token { + None => break, + Some(token) => + match self.dispatch_token(token) { + None => {} // continue + Some(Ok(XmlEvent::EndDocument)) => + return { + self.next_pos(); + self.set_final_result(Ok(XmlEvent::EndDocument)) + }, + Some(Ok(xml_event)) => + return { + self.next_pos(); + Ok(xml_event) + }, + Some(Err(xml_error)) => + return { + self.next_pos(); + self.set_final_result(Err(xml_error)) + }, + } + }, + Err(lexer_error) => + return self.set_final_result(Err(lexer_error)), + } + } + + // Handle end of stream + // Forward pos to the lexer head + self.next_pos(); + let ev = if self.depth() == 0 { + if self.encountered_element && self.st == State::OutsideTag { // all is ok + Ok(XmlEvent::EndDocument) + } else if !self.encountered_element { + self_error!(self; "Unexpected end of stream: no root element found") + } else { // self.st != State::OutsideTag + self_error!(self; "Unexpected end of stream") // TODO: add expected hint? + } + } else { + if self.config.ignore_end_of_stream { + self.final_result = None; + self.lexer.reset_eof_handled(); + return self_error!(self; "Unexpected end of stream: still inside the root element"); + } else { + self_error!(self; "Unexpected end of stream: still inside the root element") + } + }; + self.set_final_result(ev) + } + + // This function is to be called when a terminal event is reached. + // The function sets up the `self.final_result` into `Some(result)` and return `result`. 
+ fn set_final_result(&mut self, result: Result) -> Result { + self.final_result = Some(result.clone()); + result + } + + #[inline] + fn error>>(&self, msg: M) -> Result { + Err((&self.lexer, msg).into()) + } + + #[inline] + fn next_pos(&mut self) { + if self.pos.len() > 1 { + self.pos.remove(0); + } else { + self.pos[0] = self.lexer.position(); + } + } + + #[inline] + fn push_pos(&mut self) { + self.pos.push(self.lexer.position()); + } + + fn dispatch_token(&mut self, t: Token) -> Option { + match self.st.clone() { + State::OutsideTag => self.outside_tag(t), + State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), + State::InsideDeclaration(s) => self.inside_declaration(t, s), + State::InsideDoctype => self.inside_doctype(t), + State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), + State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), + State::InsideComment => self.inside_comment(t), + State::InsideCData => self.inside_cdata(t), + State::InsideReference(s) => self.inside_reference(t, *s) + } + } + + #[inline] + fn depth(&self) -> usize { + self.est.len() + } + + #[inline] + fn buf_has_data(&self) -> bool { + self.buf.len() > 0 + } + + #[inline] + fn take_buf(&mut self) -> String { + mem::replace(&mut self.buf, String::new()) + } + + #[inline] + fn append_char_continue(&mut self, c: char) -> Option { + self.buf.push(c); + None + } + + #[inline] + fn into_state(&mut self, st: State, ev: Option) -> Option { + self.st = st; + ev + } + + #[inline] + fn into_state_continue(&mut self, st: State) -> Option { + self.into_state(st, None) + } + + #[inline] + fn into_state_emit(&mut self, st: State, ev: Result) -> Option { + self.into_state(st, Some(ev)) + } + + /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed, + /// an error is returned. + /// + /// # Parameters + /// * `t` --- next token; + /// * `on_name` --- a callback which is executed when whitespace is encountered. 
+ fn read_qualified_name(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option + where F: Fn(&mut PullParser, Token, OwnedName) -> Option { + // We can get here for the first time only when self.data.name contains zero or one character, + // but first character cannot be a colon anyway + if self.buf.len() <= 1 { + self.read_prefix_separator = false; + } + + let invoke_callback = |this: &mut PullParser, t| { + let name = this.take_buf(); + match name.parse() { + Ok(name) => on_name(this, t, name), + Err(_) => Some(self_error!(this; "Qualified name is invalid: {}", name)) + } + }; + + match t { + // There can be only one colon, and not as the first character + Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => { + self.buf.push(':'); + self.read_prefix_separator = true; + None + } + + Token::Character(c) if c != ':' && (!self.buf_has_data() && is_name_start_char(c) || + self.buf_has_data() && is_name_char(c)) => + self.append_char_continue(c), + + Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t), + + Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t), + + Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget || + target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t), + + Token::Whitespace(_) => invoke_callback(self, t), + + _ => Some(self_error!(self; "Unexpected token inside qualified name: {}", t)) + } + } + + /// Dispatches tokens in order to process attribute value. + /// + /// # Parameters + /// * `t` --- next token; + /// * `on_value` --- a callback which is called when terminating quote is encountered. 
+ fn read_attribute_value(&mut self, t: Token, on_value: F) -> Option + where F: Fn(&mut PullParser, String) -> Option { + match t { + Token::Whitespace(_) if self.data.quote.is_none() => None, // skip leading whitespace + + Token::DoubleQuote | Token::SingleQuote => match self.data.quote { + None => { // Entered attribute value + self.data.quote = Some(QuoteToken::from_token(&t)); + None + } + Some(q) if q.as_token() == t => { + self.data.quote = None; + let value = self.take_buf(); + on_value(self, value) + } + _ => { + t.push_to_string(&mut self.buf); + None + } + }, + + Token::ReferenceStart => { + let st = Box::new(self.st.clone()); + self.into_state_continue(State::InsideReference(st)) + } + + Token::OpeningTagStart => + Some(self_error!(self; "Unexpected token inside attribute value: <")), + + // Every character except " and ' and < is okay + _ => { + t.push_to_string(&mut self.buf); + None + } + } + } + + fn emit_start_element(&mut self, emit_end_element: bool) -> Option { + let mut name = self.data.take_element_name().unwrap(); + let mut attributes = self.data.take_attributes(); + + // check whether the name prefix is bound and fix its namespace + match self.nst.get(name.borrow().prefix_repr()) { + Some("") => name.namespace = None, // default namespace + Some(ns) => name.namespace = Some(ns.into()), + None => return Some(self_error!(self; "Element {} prefix is unbound", name)) + } + + // check and fix accumulated attributes prefixes + for attr in attributes.iter_mut() { + if let Some(ref pfx) = attr.name.prefix { + let new_ns = match self.nst.get(pfx) { + Some("") => None, // default namespace + Some(ns) => Some(ns.into()), + None => return Some(self_error!(self; "Attribute {} prefix is unbound", attr.name)) + }; + attr.name.namespace = new_ns; + } + } + + if emit_end_element { + self.pop_namespace = true; + self.next_event = Some(Ok(XmlEvent::EndElement { + name: name.clone() + })); + } else { + self.est.push(name.clone()); + } + let namespace = 
self.nst.squash(); + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { + name: name, + attributes: attributes, + namespace: namespace + })) + } + + fn emit_end_element(&mut self) -> Option { + let mut name = self.data.take_element_name().unwrap(); + + // check whether the name prefix is bound and fix its namespace + match self.nst.get(name.borrow().prefix_repr()) { + Some("") => name.namespace = None, // default namespace + Some(ns) => name.namespace = Some(ns.into()), + None => return Some(self_error!(self; "Element {} prefix is unbound", name)) + } + + let op_name = self.est.pop().unwrap(); + + if name == op_name { + self.pop_namespace = true; + self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name: name })) + } else { + Some(self_error!(self; "Unexpected closing tag: {}, expected {}", name, op_name)) + } + } + +} + +#[cfg(test)] +mod tests { + use std::io::BufReader; + + use common::{Position, TextPosition}; + use name::OwnedName; + use attribute::OwnedAttribute; + use reader::parser::PullParser; + use reader::ParserConfig; + use reader::events::XmlEvent; + + fn new_parser() -> PullParser { + PullParser::new(ParserConfig::new()) + } + + macro_rules! expect_event( + ($r:expr, $p:expr, $t:pat) => ( + match $p.next(&mut $r) { + $t => {} + e => panic!("Unexpected event: {:?}", e) + } + ); + ($r:expr, $p:expr, $t:pat => $c:expr ) => ( + match $p.next(&mut $r) { + $t if $c => {} + e => panic!("Unexpected event: {:?}", e) + } + ) + ); + + macro_rules! test_data( + ($d:expr) => ({ + static DATA: &'static str = $d; + let r = BufReader::new(DATA.as_bytes()); + let p = new_parser(); + (r, p) + }) + ); + + #[test] + fn issue_3_semicolon_in_attribute_value() { + let (mut r, mut p) = test_data!(r#" + + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. 
})); + expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) => + *name == OwnedName::local("a") && + attributes.len() == 1 && + attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") && + namespace.is_essentially_empty() + ); + expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a")); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + } + + #[test] + fn issue_140_entity_reference_inside_tag() { + let (mut r, mut p) = test_data!(r#" + + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); + expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla")); + expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}"); + expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla")); + expect_event!(r, p, Ok(XmlEvent::EndDocument)); + } + + #[test] + fn opening_tag_in_attribute_value() { + let (mut r, mut p) = test_data!(r#" + + "#); + + expect_event!(r, p, Ok(XmlEvent::StartDocument { .. 
})); + expect_event!(r, p, Err(ref e) => + e.msg() == "Unexpected token inside attribute value: <" && + e.position() == TextPosition { row: 1, column: 24 } + ); + } +} diff --git a/src/reader/parser/outside_tag.rs b/src/reader/parser/outside_tag.rs new file mode 100644 index 0000000..d3f7598 --- /dev/null +++ b/src/reader/parser/outside_tag.rs @@ -0,0 +1,130 @@ +use common::is_whitespace_char; + +use reader::events::XmlEvent; +use reader::lexer::Token; + +use super::{ + Result, PullParser, State, ClosingTagSubstate, OpeningTagSubstate, + ProcessingInstructionSubstate, DEFAULT_VERSION, DEFAULT_ENCODING, DEFAULT_STANDALONE +}; + +impl PullParser { + pub fn outside_tag(&mut self, t: Token) -> Option { + match t { + Token::ReferenceStart => + self.into_state_continue(State::InsideReference(Box::new(State::OutsideTag))), + + Token::Whitespace(_) if self.depth() == 0 && self.config.ignore_root_level_whitespace => None, // skip whitespace outside of the root element + + Token::Whitespace(_) if self.config.trim_whitespace && !self.buf_has_data() => None, + + Token::Whitespace(c) => { + if !self.buf_has_data() { + self.push_pos(); + } + self.append_char_continue(c) + } + + _ if t.contains_char_data() && self.depth() == 0 => + Some(self_error!(self; "Unexpected characters outside the root element: {}", t)), + + _ if t.contains_char_data() => { // Non-whitespace char data + if !self.buf_has_data() { + self.push_pos(); + } + self.inside_whitespace = false; + t.push_to_string(&mut self.buf); + None + } + + Token::ReferenceEnd => { // Semi-colon in a text outside an entity + self.inside_whitespace = false; + Token::ReferenceEnd.push_to_string(&mut self.buf); + None + } + + Token::CommentStart if self.config.coalesce_characters && self.config.ignore_comments => { + // We need to switch the lexer into a comment mode inside comments + self.lexer.inside_comment(); + self.into_state_continue(State::InsideComment) + } + + Token::CDataStart if self.config.coalesce_characters && 
self.config.cdata_to_characters => { + if !self.buf_has_data() { + self.push_pos(); + } + // We need to disable lexing errors inside CDATA + self.lexer.disable_errors(); + self.into_state_continue(State::InsideCData) + } + + _ => { + // Encountered some markup event, flush the buffer as characters + // or a whitespace + let mut next_event = if self.buf_has_data() { + let buf = self.take_buf(); + if self.inside_whitespace && self.config.trim_whitespace { + None + } else if self.inside_whitespace && !self.config.whitespace_to_characters { + Some(Ok(XmlEvent::Whitespace(buf))) + } else if self.config.trim_whitespace { + Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into()))) + } else { + Some(Ok(XmlEvent::Characters(buf))) + } + } else { None }; + self.inside_whitespace = true; // Reset inside_whitespace flag + self.push_pos(); + match t { + Token::ProcessingInstructionStart => + self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event), + + Token::DoctypeStart if !self.encountered_element => { + // We don't have a doctype event so skip this position + // FIXME: update when we have a doctype event + self.next_pos(); + self.lexer.disable_errors(); + self.into_state(State::InsideDoctype, next_event) + } + + Token::OpeningTagStart => { + // If declaration was not parsed and we have encountered an element, + // emit this declaration as the next event. 
+ if !self.parsed_declaration { + self.parsed_declaration = true; + let sd_event = XmlEvent::StartDocument { + version: DEFAULT_VERSION, + encoding: DEFAULT_ENCODING.into(), + standalone: DEFAULT_STANDALONE + }; + // next_event is always none here because we're outside of + // the root element + next_event = Some(Ok(sd_event)); + self.push_pos(); + } + self.encountered_element = true; + self.nst.push_empty(); + self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) + } + + Token::ClosingTagStart if self.depth() > 0 => + self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event), + + Token::CommentStart => { + // We need to switch the lexer into a comment mode inside comments + self.lexer.inside_comment(); + self.into_state(State::InsideComment, next_event) + } + + Token::CDataStart => { + // We need to disable lexing errors inside CDATA + self.lexer.disable_errors(); + self.into_state(State::InsideCData, next_event) + } + + _ => Some(self_error!(self; "Unexpected token: {}", t)) + } + } + } + } +} diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..23fee04 --- /dev/null +++ b/src/util.rs @@ -0,0 +1,107 @@ +use std::io::{self, Read}; +use std::str; +use std::fmt; + +#[derive(Debug)] +pub enum CharReadError { + UnexpectedEof, + Utf8(str::Utf8Error), + Io(io::Error) +} + +impl From for CharReadError { + fn from(e: str::Utf8Error) -> CharReadError { + CharReadError::Utf8(e) + } +} + +impl From for CharReadError { + fn from(e: io::Error) -> CharReadError { + CharReadError::Io(e) + } +} + +impl fmt::Display for CharReadError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::CharReadError::*; + match *self { + UnexpectedEof => write!(f, "unexpected end of stream"), + Utf8(ref e) => write!(f, "UTF-8 decoding error: {}", e), + Io(ref e) => write!(f, "I/O error: {}", e) + } + } +} + +pub fn next_char_from(source: &mut R) -> Result, CharReadError> { + const MAX_CODEPOINT_LEN: 
usize = 4; + + let mut bytes = source.bytes(); + let mut buf = [0u8; MAX_CODEPOINT_LEN]; + let mut pos = 0; + + loop { + let next = match bytes.next() { + Some(Ok(b)) => b, + Some(Err(e)) => return Err(e.into()), + None if pos == 0 => return Ok(None), + None => return Err(CharReadError::UnexpectedEof) + }; + buf[pos] = next; + pos += 1; + + match str::from_utf8(&buf[..pos]) { + Ok(s) => return Ok(s.chars().next()), // always Some(..) + Err(_) if pos < MAX_CODEPOINT_LEN => {}, + Err(e) => return Err(e.into()) + } + } +} + +#[cfg(test)] +mod tests { + #[test] + fn test_next_char_from() { + use std::io; + use std::error::Error; + + let mut bytes: &[u8] = "correct".as_bytes(); // correct ASCII + assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('c')); + + let mut bytes: &[u8] = "правильно".as_bytes(); // correct BMP + assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('п')); + + let mut bytes: &[u8] = "😊".as_bytes(); // correct non-BMP + assert_eq!(super::next_char_from(&mut bytes).unwrap(), Some('😊')); + + let mut bytes: &[u8] = b""; // empty + assert_eq!(super::next_char_from(&mut bytes).unwrap(), None); + + let mut bytes: &[u8] = b"\xf0\x9f\x98"; // incomplete code point + match super::next_char_from(&mut bytes).unwrap_err() { + super::CharReadError::UnexpectedEof => {}, + e => panic!("Unexpected result: {:?}", e) + }; + + let mut bytes: &[u8] = b"\xff\x9f\x98\x32"; // invalid code point + match super::next_char_from(&mut bytes).unwrap_err() { + super::CharReadError::Utf8(_) => {}, + e => panic!("Unexpected result: {:?}", e) + }; + + + // error during read + struct ErrorReader; + impl io::Read for ErrorReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + Err(io::Error::new(io::ErrorKind::Other, "test error")) + } + } + + let mut r = ErrorReader; + match super::next_char_from(&mut r).unwrap_err() { + super::CharReadError::Io(ref e) if e.kind() == io::ErrorKind::Other && + e.description() == "test error" => {}, + e => panic!("Unexpected 
result: {:?}", e) + } + } +} diff --git a/src/writer/config.rs b/src/writer/config.rs new file mode 100644 index 0000000..ebabf18 --- /dev/null +++ b/src/writer/config.rs @@ -0,0 +1,157 @@ +//! Contains emitter configuration structure. + +use std::io::Write; +use std::borrow::Cow; + +use writer::EventWriter; + +/// Emitter configuration structure. +/// +/// This structure contains various options which control XML document emitter behavior. +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct EmitterConfig { + /// Line separator used to separate lines in formatted output. Default is `"\n"`. + pub line_separator: Cow<'static, str>, + + /// A string which will be used for a single level of indentation. Default is `" "` + /// (two spaces). + pub indent_string: Cow<'static, str>, + + /// Whether or not the emitted document should be indented. Default is false. + /// + /// The emitter is capable to perform automatic indentation of the emitted XML document. + /// It is done in stream-like fashion and does not require the knowledge of the whole + /// document in advance. + /// + /// Sometimes, however, automatic indentation is undesirable, e.g. when you want to keep + /// existing layout when processing an existing XML document. Also the indentiation algorithm + /// is not thoroughly tested. Hence by default it is disabled. + pub perform_indent: bool, + + /// Whether or not characters in output events will be escaped. Default is true. + /// + /// The emitter can automatically escape characters which can't appear in PCDATA sections + /// or element attributes of an XML document, like `<` or `"` (in attributes). This may + /// introduce some overhead because then every corresponding piece of character data + /// should be scanned for invalid characters. + /// + /// If this option is disabled, the XML writer may produce non-well-formed documents, so + /// use `false` value for this option with care. 
+ pub perform_escaping: bool, + + /// Whether or not to write XML document declaration at the beginning of a document. + /// Default is true. + /// + /// This option controls whether the document declaration should be emitted automatically + /// before a root element is written if it was not emitted explicitly by the user. + pub write_document_declaration: bool, + + /// Whether or not to convert elements with empty content to empty elements. Default is true. + /// + /// This option allows turning elements like `` (an element with empty content) + /// into `` (an empty element). + pub normalize_empty_elements: bool, + + /// Whether or not to emit CDATA events as plain characters. Default is false. + /// + /// This option forces the emitter to convert CDATA events into regular character events, + /// performing all the necessary escaping beforehand. This may be occasionally useful + /// for feeding the document into incorrect parsers which do not support CDATA. + pub cdata_to_characters: bool, + + /// Whether or not to keep element names to support `EndElement` events without explicit names. + /// Default is true. + /// + /// This option makes the emitter to keep names of written elements in order to allow + /// omitting names when writing closing element tags. This could incur some memory overhead. + pub keep_element_names_stack: bool, + + /// Whether or not to automatically insert leading and trailing spaces in emitted comments, + /// if necessary. Default is true. + /// + /// This is a convenience option in order for the user not to append spaces before and after + /// comments text in order to get more pretty comments: `` instead of + /// ``. + pub autopad_comments: bool, + + /// Whether or not to automatically insert spaces before the trailing `/>` in self-closing + /// elements. Default is true. + /// + /// This option is only meaningful if `normalize_empty_elements` is true. For example, the + /// element `` would be unaffected. 
When `normalize_empty_elements` is true, then when + /// this option is also true, the same element would appear ``. If this option is false, + /// then the same element would appear ``. + pub pad_self_closing: bool, +} + +impl EmitterConfig { + /// Creates an emitter configuration with default values. + /// + /// You can tweak default options with builder-like pattern: + /// + /// ```rust + /// use xml::writer::EmitterConfig; + /// + /// let config = EmitterConfig::new() + /// .line_separator("\r\n") + /// .perform_indent(true) + /// .normalize_empty_elements(false); + /// ``` + #[inline] + pub fn new() -> EmitterConfig { + EmitterConfig { + line_separator: "\n".into(), + indent_string: " ".into(), // two spaces + perform_indent: false, + perform_escaping: true, + write_document_declaration: true, + normalize_empty_elements: true, + cdata_to_characters: false, + keep_element_names_stack: true, + autopad_comments: true, + pad_self_closing: true + } + } + + /// Creates an XML writer with this configuration. + /// + /// This is a convenience method for configuring and creating a writer at the same time: + /// + /// ```rust + /// use xml::writer::EmitterConfig; + /// + /// let mut target: Vec = Vec::new(); + /// + /// let writer = EmitterConfig::new() + /// .line_separator("\r\n") + /// .perform_indent(true) + /// .normalize_empty_elements(false) + /// .create_writer(&mut target); + /// ``` + /// + /// This method is exactly equivalent to calling `EventWriter::new_with_config()` with + /// this configuration object. 
+    #[inline]
+    pub fn create_writer<W: Write>(self, sink: W) -> EventWriter<W> {
+        EventWriter::new_with_config(sink, self)
+    }
+}
+
+impl Default for EmitterConfig {
+    /// Returns the same configuration as `EmitterConfig::new()`.
+    #[inline]
+    fn default() -> EmitterConfig {
+        EmitterConfig::new()
+    }
+}
+
+// One setter per public configuration field, so that every option (including
+// `perform_escaping`) can be tweaked with the builder-like pattern.
+gen_setters!(EmitterConfig,
+    line_separator: into Cow<'static, str>,
+    indent_string: into Cow<'static, str>,
+    perform_indent: val bool,
+    perform_escaping: val bool,
+    write_document_declaration: val bool,
+    normalize_empty_elements: val bool,
+    cdata_to_characters: val bool,
+    keep_element_names_stack: val bool,
+    autopad_comments: val bool,
+    pad_self_closing: val bool
+);
diff --git a/src/writer/emitter.rs b/src/writer/emitter.rs
new file mode 100644
index 0000000..ba80f66
--- /dev/null
+++ b/src/writer/emitter.rs
@@ -0,0 +1,447 @@
+use std::io;
+use std::io::prelude::*;
+use std::fmt;
+use std::result;
+use std::borrow::Cow;
+use std::error::Error;
+
+use common;
+use name::{Name, OwnedName};
+use attribute::Attribute;
+use escape::{escape_str_attribute, escape_str_pcdata};
+use common::XmlVersion;
+use namespace::{NamespaceStack, NS_NO_PREFIX, NS_EMPTY_URI, NS_XMLNS_PREFIX, NS_XML_PREFIX};
+
+use writer::config::EmitterConfig;
+
+/// An error which may be returned by `EventWriter` when writing XML events.
+#[derive(Debug)]
+pub enum EmitterError {
+    /// An I/O error occurred in the underlying `Write` instance.
+    Io(io::Error),
+
+    /// Document declaration has already been written to the output stream.
+    DocumentStartAlreadyEmitted,
+
+    /// The name of the last opening element is not available.
+    LastElementNameNotAvailable,
+
+    /// The name of the last opening element is not equal to the name of the provided
+    /// closing element.
+    EndElementNameIsNotEqualToLastStartElementName,
+
+    /// End element name is not specified when it is needed, for example, when automatic
+    /// closing is not enabled in configuration.
+    EndElementNameIsNotSpecified
+}
+
+impl EmitterError {
+    // Static summary of the error kind. Shared by `Display` and `Error::description()`
+    // so that `Display` does not have to call the deprecated `description()` method.
+    fn message(&self) -> &'static str {
+        match *self {
+            EmitterError::Io(_) =>
+                "I/O error",
+            EmitterError::DocumentStartAlreadyEmitted =>
+                "document start event has already been emitted",
+            EmitterError::LastElementNameNotAvailable =>
+                "last element name is not available",
+            EmitterError::EndElementNameIsNotEqualToLastStartElementName =>
+                "end element name is not equal to last start element name",
+            EmitterError::EndElementNameIsNotSpecified =>
+                "end element name is not specified and can't be inferred",
+        }
+    }
+}
+
+impl From<io::Error> for EmitterError {
+    fn from(err: io::Error) -> EmitterError {
+        EmitterError::Io(err)
+    }
+}
+
+impl fmt::Display for EmitterError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "emitter error: ")?;
+        match *self {
+            // I/O errors carry the underlying error, which is printed in full
+            EmitterError::Io(ref e) =>
+                write!(f, "I/O error: {}", e),
+            ref other =>
+                write!(f, "{}", other.message()),
+        }
+    }
+}
+
+impl Error for EmitterError {
+    fn description(&self) -> &str {
+        self.message()
+    }
+}
+
+/// A result type yielded by `EventWriter`.
+pub type Result<T> = result::Result<T, EmitterError>;
+
+// TODO: split into a low-level fast writer without any checks and formatting logic and a
+// high-level indenting validating writer
+pub struct Emitter {
+    config: EmitterConfig,
+
+    // namespace mappings of the elements which are currently open
+    nst: NamespaceStack,
+
+    // current nesting depth, used to compute indentation
+    indent_level: usize,
+    // the top entry records what kind of content was written last at the current depth
+    indent_stack: Vec<IndentFlags>,
+
+    // names of the currently open elements; only filled when
+    // `keep_element_names_stack` is enabled in the configuration
+    element_names: Vec<OwnedName>,
+
+    start_document_emitted: bool,
+    just_wrote_start_element: bool
+}
+
+impl Emitter {
+    /// Creates an emitter with the given configuration and an empty state.
+    pub fn new(config: EmitterConfig) -> Emitter {
+        Emitter {
+            config,
+
+            nst: NamespaceStack::empty(),
+
+            indent_level: 0,
+            indent_stack: vec![IndentFlags::WroteNothing],
+
+            element_names: Vec::new(),
+
+            start_document_emitted: false,
+            just_wrote_start_element: false
+        }
+    }
+}
+
+#[derive(Copy, Clone, Eq, PartialEq, Debug)]
+enum IndentFlags {
+    WroteNothing,
+    WroteMarkup,
+    WroteText,
+}
+
+impl Emitter {
+    /// Returns a mutable reference to the current namespace stack.
+    #[inline]
+    pub fn namespace_stack_mut(&mut self) -> &mut NamespaceStack {
+        &mut self.nst
+    }
+
+    /// Whether the last thing written at the current depth was text.
+    #[inline]
+    fn wrote_text(&self) -> bool {
+        *self.indent_stack.last().unwrap() == IndentFlags::WroteText
+    }
+
+    /// Whether the last thing written at the current depth was markup.
+    #[inline]
+    fn wrote_markup(&self) -> bool {
+        *self.indent_stack.last().unwrap() == IndentFlags::WroteMarkup
+    }
+
+    #[inline]
+    fn set_wrote_text(&mut self) {
+        *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteText;
+    }
+
+    #[inline]
+    fn set_wrote_markup(&mut self) {
+        *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteMarkup;
+    }
+
+    #[inline]
+    fn reset_state(&mut self) {
+        *self.indent_stack.last_mut().unwrap() = IndentFlags::WroteNothing;
+    }
+
+    /// Writes the configured line separator followed by `level` indentation strings.
+    fn write_newline<W: Write>(&mut self, target: &mut W, level: usize) -> Result<()> {
+        target.write_all(self.config.line_separator.as_bytes())?;
+        for _ in 0..level {
+            target.write_all(self.config.indent_string.as_bytes())?;
+        }
+        Ok(())
+    }
+
+    /// Starts a new indented line before a piece of markup, if indentation is enabled
+    /// and the previous output at this depth was not text.
+    fn before_markup<W: Write>(&mut self, target: &mut W) -> Result<()> {
+        if self.config.perform_indent && !self.wrote_text() &&
+           (self.indent_level > 0 || self.wrote_markup()) {
+            let indent_level = self.indent_level;
+            self.write_newline(target, indent_level)?;
+            if self.indent_level > 0 && !self.config.indent_string.is_empty() {
+                self.after_markup();
+            }
+        }
+        Ok(())
+    }
+
+    fn after_markup(&mut self) {
+        self.set_wrote_markup();
+    }
+
+    fn before_start_element<W: Write>(&mut self, target: &mut W) -> Result<()> {
+        self.before_markup(target)?;
+        // every open element gets its own indentation state
+        self.indent_stack.push(IndentFlags::WroteNothing);
+        Ok(())
+    }
+
+    fn after_start_element(&mut self) {
+        self.after_markup();
+        self.indent_level += 1;
+    }
+
+    /// Puts the closing tag on its own line, one level up, when the element content
+    /// ended with markup.
+    fn before_end_element<W: Write>(&mut self, target: &mut W) -> Result<()> {
+        if self.config.perform_indent && self.indent_level > 0 && self.wrote_markup() &&
+           !self.wrote_text() {
+            let indent_level = self.indent_level;
+            self.write_newline(target, indent_level - 1)
+        } else {
+            Ok(())
+        }
+    }
+
+    fn after_end_element(&mut self) {
+        if self.indent_level > 0 {
+            self.indent_level -= 1;
+ self.indent_stack.pop(); + } + self.set_wrote_markup(); + } + + fn after_text(&mut self) { + self.set_wrote_text(); + } + + pub fn emit_start_document(&mut self, target: &mut W, + version: XmlVersion, + encoding: &str, + standalone: Option) -> Result<()> { + if self.start_document_emitted { + return Err(EmitterError::DocumentStartAlreadyEmitted); + } + self.start_document_emitted = true; + + self.before_markup(target)?; + let result = { + let mut write = move || { + write!(target, "")?; + + Ok(()) + }; + write() + }; + self.after_markup(); + + result + } + + fn check_document_started(&mut self, target: &mut W) -> Result<()> { + if !self.start_document_emitted && self.config.write_document_declaration { + self.emit_start_document(target, common::XmlVersion::Version10, "utf-8", None) + } else { + Ok(()) + } + } + + fn fix_non_empty_element(&mut self, target: &mut W) -> Result<()> { + if self.config.normalize_empty_elements && self.just_wrote_start_element { + self.just_wrote_start_element = false; + target.write_all(b">").map_err(From::from) + } else { + Ok(()) + } + } + + pub fn emit_processing_instruction(&mut self, + target: &mut W, + name: &str, + data: Option<&str>) -> Result<()> { + self.check_document_started(target)?; + self.fix_non_empty_element(target)?; + + self.before_markup(target)?; + + let result = { + let mut write = || { + write!(target, "")?; + + Ok(()) + }; + write() + }; + + self.after_markup(); + + result + } + + fn emit_start_element_initial(&mut self, target: &mut W, + name: Name, + attributes: &[Attribute]) -> Result<()> + where W: Write + { + self.check_document_started(target)?; + self.fix_non_empty_element(target)?; + self.before_start_element(target)?; + write!(target, "<{}", name.repr_display())?; + self.emit_current_namespace_attributes(target)?; + self.emit_attributes(target, attributes)?; + self.after_start_element(); + Ok(()) + } + + pub fn emit_start_element(&mut self, target: &mut W, + name: Name, + attributes: &[Attribute]) -> 
Result<()> + where W: Write + { + if self.config.keep_element_names_stack { + self.element_names.push(name.to_owned()); + } + + self.emit_start_element_initial(target, name, attributes)?; + self.just_wrote_start_element = true; + + if !self.config.normalize_empty_elements { + write!(target, ">")?; + } + + Ok(()) + } + + pub fn emit_current_namespace_attributes(&mut self, target: &mut W) -> Result<()> + where W: Write + { + for (prefix, uri) in self.nst.peek() { + match prefix { + // internal namespaces are not emitted + NS_XMLNS_PREFIX | NS_XML_PREFIX => Ok(()), + //// there is already a namespace binding with this prefix in scope + //prefix if self.nst.get(prefix) == Some(uri) => Ok(()), + // emit xmlns only if it is overridden + NS_NO_PREFIX => if uri != NS_EMPTY_URI { + write!(target, " xmlns=\"{}\"", uri) + } else { Ok(()) }, + // everything else + prefix => write!(target, " xmlns:{}=\"{}\"", prefix, uri) + }?; + } + Ok(()) + } + + pub fn emit_attributes(&mut self, target: &mut W, + attributes: &[Attribute]) -> Result<()> { + for attr in attributes.iter() { + write!( + target, " {}=\"{}\"", + attr.name.repr_display(), + if self.config.perform_escaping { escape_str_attribute(attr.value) } else { Cow::Borrowed(attr.value) } + )? + } + Ok(()) + } + + pub fn emit_end_element(&mut self, target: &mut W, + name: Option) -> Result<()> { + let owned_name = if self.config.keep_element_names_stack { + Some(self.element_names.pop().ok_or(EmitterError::LastElementNameNotAvailable)?) 
+ } else { + None + }; + + // Check that last started element name equals to the provided name, if there are both + if let Some(ref last_name) = owned_name { + if let Some(ref name) = name { + if last_name.borrow() != *name { + return Err(EmitterError::EndElementNameIsNotEqualToLastStartElementName); + } + } + } + + if let Some(name) = owned_name.as_ref().map(|n| n.borrow()).or(name) { + if self.config.normalize_empty_elements && self.just_wrote_start_element { + self.just_wrote_start_element = false; + let termination = if self.config.pad_self_closing { " />" } else { "/>" }; + let result = target.write_all(termination.as_bytes()).map_err(From::from); + self.after_end_element(); + result + } else { + self.just_wrote_start_element = false; + + self.before_end_element(target)?; + let result = write!(target, "", name.repr_display()).map_err(From::from); + self.after_end_element(); + + result + } + } else { + Err(EmitterError::EndElementNameIsNotSpecified) + } + } + + pub fn emit_cdata(&mut self, target: &mut W, content: &str) -> Result<()> { + self.fix_non_empty_element(target)?; + if self.config.cdata_to_characters { + self.emit_characters(target, content) + } else { + // TODO: escape ']]>' characters in CDATA as two adjacent CDATA blocks + target.write_all(b"")?; + + self.after_text(); + + Ok(()) + } + } + + pub fn emit_characters(&mut self, target: &mut W, + content: &str) -> Result<()> { + self.check_document_started(target)?; + self.fix_non_empty_element(target)?; + target.write_all( + (if self.config.perform_escaping { + escape_str_pcdata(content) + } else { + Cow::Borrowed(content) + }).as_bytes() + )?; + self.after_text(); + Ok(()) + } + + pub fn emit_comment(&mut self, target: &mut W, content: &str) -> Result<()> { + self.fix_non_empty_element(target)?; + + // TODO: add escaping dashes at the end of the comment + + let autopad_comments = self.config.autopad_comments; + let write = |target: &mut W| -> Result<()> { + target.write_all(b"")?; + + Ok(()) + }; + + 
self.before_markup(target)?; + let result = write(target); + self.after_markup(); + + result + } +} diff --git a/src/writer/events.rs b/src/writer/events.rs new file mode 100644 index 0000000..1f7040f --- /dev/null +++ b/src/writer/events.rs @@ -0,0 +1,241 @@ +//! Contains `XmlEvent` datatype, instances of which are consumed by the writer. + +use std::borrow::Cow; + +use name::Name; +use attribute::Attribute; +use common::XmlVersion; +use namespace::{Namespace, NS_NO_PREFIX}; + +/// A part of an XML output stream. +/// +/// Objects of this enum are consumed by `EventWriter`. They correspond to different parts of +/// an XML document. +#[derive(Debug)] +pub enum XmlEvent<'a> { + /// Corresponds to XML document declaration. + /// + /// This event should always be written before any other event. If it is not written + /// at all, a default XML declaration will be outputted if the corresponding option + /// is set in the configuration. Otherwise an error will be returned. + StartDocument { + /// XML version. + /// + /// Defaults to `XmlVersion::Version10`. + version: XmlVersion, + + /// XML document encoding. + /// + /// Defaults to `Some("UTF-8")`. + encoding: Option<&'a str>, + + /// XML standalone declaration. + /// + /// Defaults to `None`. + standalone: Option + }, + + /// Denotes an XML processing instruction. + ProcessingInstruction { + /// Processing instruction target. + name: &'a str, + + /// Processing instruction content. + data: Option<&'a str> + }, + + /// Denotes a beginning of an XML element. + StartElement { + /// Qualified name of the element. + name: Name<'a>, + + /// A list of attributes associated with the element. + /// + /// Currently attributes are not checked for duplicates (TODO). Attribute values + /// will be escaped, and all characters invalid for attribute values like `"` or `<` + /// will be changed into character entities. + attributes: Cow<'a, [Attribute<'a>]>, + + /// Contents of the namespace mapping at this point of the document. 
+ /// + /// This mapping will be inspected for "new" entries, and if at this point of the document + /// a particular pair of prefix and namespace URI is already defined, no namespace + /// attributes will be emitted. + namespace: Cow<'a, Namespace>, + }, + + /// Denotes an end of an XML element. + EndElement { + /// Optional qualified name of the element. + /// + /// If `None`, then it is assumed that the element name should be the last valid one. + /// If `Some` and element names tracking is enabled, then the writer will check it for + /// correctness. + name: Option> + }, + + /// Denotes CDATA content. + /// + /// This event contains unparsed data, and no escaping will be performed when writing it + /// to the output stream. + CData(&'a str), + + /// Denotes a comment. + /// + /// The string will be checked for invalid sequences and error will be returned by the + /// write operation + Comment(&'a str), + + /// Denotes character data outside of tags. + /// + /// Contents of this event will be escaped if `perform_escaping` option is enabled, + /// that is, every character invalid for PCDATA will appear as a character entity. + Characters(&'a str) +} + +impl<'a> XmlEvent<'a> { + /// Returns an writer event for a processing instruction. + #[inline] + pub fn processing_instruction(name: &'a str, data: Option<&'a str>) -> XmlEvent<'a> { + XmlEvent::ProcessingInstruction { name: name, data: data } + } + + /// Returns a builder for a starting element. + /// + /// This builder can then be used to tweak attributes and namespace starting at + /// this element. + #[inline] + pub fn start_element(name: S) -> StartElementBuilder<'a> where S: Into> { + StartElementBuilder { + name: name.into(), + attributes: Vec::new(), + namespace: Namespace::empty().into() + } + } + + /// Returns a builder for an closing element. + /// + /// This method, unline `start_element()`, does not accept a name because by default + /// the writer is able to determine it automatically. 
However, when this functionality + /// is disabled, it is possible to specify the name with `name()` method on the builder. + #[inline] + pub fn end_element() -> EndElementBuilder<'a> { + EndElementBuilder { name: None } + } + + /// Returns a CDATA event. + /// + /// Naturally, the provided string won't be escaped, except for closing CDATA token `]]>` + /// (depending on the configuration). + #[inline] + pub fn cdata(data: &'a str) -> XmlEvent<'a> { XmlEvent::CData(data) } + + /// Returns a regular characters (PCDATA) event. + /// + /// All offending symbols, in particular, `&` and `<`, will be escaped by the writer. + #[inline] + pub fn characters(data: &'a str) -> XmlEvent<'a> { XmlEvent::Characters(data) } + + /// Returns a comment event. + #[inline] + pub fn comment(data: &'a str) -> XmlEvent<'a> { XmlEvent::Comment(data) } +} + +impl<'a> From<&'a str> for XmlEvent<'a> { + #[inline] + fn from(s: &'a str) -> XmlEvent<'a> { XmlEvent::Characters(s) } +} + +pub struct EndElementBuilder<'a> { + name: Option> +} + +/// A builder for a closing element event. +impl<'a> EndElementBuilder<'a> { + /// Sets the name of this closing element. + /// + /// Usually the writer is able to determine closing element names automatically. If + /// this functionality is enabled (by default it is), then this name is checked for correctness. + /// It is possible, however, to disable such behavior; then the user must ensure that + /// closing element name is correct manually. + #[inline] + pub fn name(mut self, name: N) -> EndElementBuilder<'a> where N: Into> { + self.name = Some(name.into()); + self + } +} + +impl<'a> From> for XmlEvent<'a> { + fn from(b: EndElementBuilder<'a>) -> XmlEvent<'a> { + XmlEvent::EndElement { name: b.name } + } +} + +/// A builder for a starting element event. 
+pub struct StartElementBuilder<'a> { + name: Name<'a>, + attributes: Vec>, + namespace: Namespace +} + +impl<'a> StartElementBuilder<'a> { + /// Sets an attribute value of this element to the given string. + /// + /// This method can be used to add attributes to the starting element. Name is a qualified + /// name; its namespace is ignored, but its prefix is checked for correctness, that is, + /// it is checked that the prefix is bound to some namespace in the current context. + /// + /// Currently attributes are not checked for duplicates. Note that duplicate attributes + /// are a violation of XML document well-formedness. + /// + /// The writer checks that you don't specify reserved prefix names, for example `xmlns`. + #[inline] + pub fn attr(mut self, name: N, value: &'a str) -> StartElementBuilder<'a> + where N: Into> + { + self.attributes.push(Attribute::new(name.into(), value)); + self + } + + /// Adds a namespace to the current namespace context. + /// + /// If no namespace URI was bound to the provided prefix at this point of the document, + /// then the mapping from the prefix to the provided namespace URI will be written as + /// a part of this element attribute set. + /// + /// If the same namespace URI was bound to the provided prefix at this point of the document, + /// then no namespace attributes will be emitted. + /// + /// If some other namespace URI was bound to the provided prefix at this point of the document, + /// then another binding will be added as a part of this element attribute set, shadowing + /// the outer binding. + #[inline] + pub fn ns(mut self, prefix: S1, uri: S2) -> StartElementBuilder<'a> + where S1: Into, S2: Into + { + self.namespace.put(prefix, uri); + self + } + + /// Adds a default namespace mapping to the current namespace context. + /// + /// Same rules as for `ns()` are also valid for the default namespace mapping. 
+ #[inline] + pub fn default_ns(mut self, uri: S) -> StartElementBuilder<'a> + where S: Into + { + self.namespace.put(NS_NO_PREFIX, uri); + self + } +} + +impl<'a> From> for XmlEvent<'a> { + #[inline] + fn from(b: StartElementBuilder<'a>) -> XmlEvent<'a> { + XmlEvent::StartElement { + name: b.name, + attributes: Cow::Owned(b.attributes), + namespace: Cow::Owned(b.namespace) + } + } +} diff --git a/src/writer/mod.rs b/src/writer/mod.rs new file mode 100644 index 0000000..ea1b242 --- /dev/null +++ b/src/writer/mod.rs @@ -0,0 +1,93 @@ +//! Contains high-level interface for an events-based XML emitter. +//! +//! The most important type in this module is `EventWriter` which allows writing an XML document +//! to some output stream. + +pub use self::emitter::Result; +pub use self::emitter::EmitterError as Error; +pub use self::config::EmitterConfig; +pub use self::events::XmlEvent; + +use self::emitter::Emitter; + +use std::io::prelude::*; + +mod emitter; +mod config; +pub mod events; + +/// A wrapper around an `std::io::Write` instance which emits XML document according to provided +/// events. +pub struct EventWriter { + sink: W, + emitter: Emitter +} + +impl EventWriter { + /// Creates a new `EventWriter` out of an `std::io::Write` instance using the default + /// configuration. + #[inline] + pub fn new(sink: W) -> EventWriter { + EventWriter::new_with_config(sink, EmitterConfig::new()) + } + + /// Creates a new `EventWriter` out of an `std::io::Write` instance using the provided + /// configuration. + #[inline] + pub fn new_with_config(sink: W, config: EmitterConfig) -> EventWriter { + EventWriter { + sink, + emitter: Emitter::new(config) + } + } + + /// Writes the next piece of XML document according to the provided event. + /// + /// Note that output data may not exactly correspond to the written event because + /// of various configuration options. 
For example, `XmlEvent::EndElement` may + /// correspond to a separate closing element or it may cause writing an empty element. + /// Another example is that `XmlEvent::CData` may be represented as characters in + /// the output stream. + pub fn write<'a, E>(&mut self, event: E) -> Result<()> where E: Into> { + match event.into() { + XmlEvent::StartDocument { version, encoding, standalone } => + self.emitter.emit_start_document(&mut self.sink, version, encoding.unwrap_or("UTF-8"), standalone), + XmlEvent::ProcessingInstruction { name, data } => + self.emitter.emit_processing_instruction(&mut self.sink, name, data), + XmlEvent::StartElement { name, attributes, namespace } => { + self.emitter.namespace_stack_mut().push_empty().checked_target().extend(namespace.as_ref()); + self.emitter.emit_start_element(&mut self.sink, name, &attributes) + } + XmlEvent::EndElement { name } => { + let r = self.emitter.emit_end_element(&mut self.sink, name); + self.emitter.namespace_stack_mut().try_pop(); + r + } + XmlEvent::Comment(content) => + self.emitter.emit_comment(&mut self.sink, content), + XmlEvent::CData(content) => + self.emitter.emit_cdata(&mut self.sink, content), + XmlEvent::Characters(content) => + self.emitter.emit_characters(&mut self.sink, content) + } + } + + /// Returns a mutable reference to the underlying `Writer`. + /// + /// Note that having a reference to the underlying sink makes it very easy to emit invalid XML + /// documents. Use this method with care. Valid use cases for this method include accessing + /// methods like `Write::flush`, which do not emit new data but rather change the state + /// of the stream itself. + pub fn inner_mut(&mut self) -> &mut W { + &mut self.sink + } + + /// Unwraps this `EventWriter`, returning the underlying writer. 
+ /// + /// Note that this is a destructive operation: unwrapping a writer and then wrapping + /// it again with `EventWriter::new()` will create a fresh writer whose state will be + /// blank; for example, accumulated namespaces will be reset. + pub fn into_inner(self) -> W { + self.sink + } +} diff --git a/tests/documents/sample_1.xml b/tests/documents/sample_1.xml new file mode 100644 index 0000000..4d1cbc0 --- /dev/null +++ b/tests/documents/sample_1.xml @@ -0,0 +1,34 @@ + + + + + + + + + + Some <java> class + + + Another "java" class + + + Weird 'XML' config + + + + + + + + + + JavaScript & program + + + Cascading style sheet: © - ҉ + + + + + diff --git a/tests/documents/sample_1_full.txt b/tests/documents/sample_1_full.txt new file mode 100644 index 0000000..a8d64d0 --- /dev/null +++ b/tests/documents/sample_1_full.txt @@ -0,0 +1,58 @@ +StartDocument(1.0, utf-8) +StartElement(project [name="project-name"]) +Whitespace("\n ") +StartElement(libraries) +Whitespace("\n ") +StartElement(library [groupId="org.example", artifactId="", version="0.1"]) +EndElement(library) +Whitespace("\n ") +StartElement(library [groupId="com.example", artifactId="\"cool-lib&", version="999"]) +EndElement(library) +Whitespace("\n ") +EndElement(libraries) +Whitespace("\n ") +StartElement(module [name="module-1"]) +Whitespace("\n ") +StartElement(files) +Whitespace("\n ") +StartElement(file [name="somefile.java", type="java"]) +Characters("\n Some class\n ") +EndElement(file) +Whitespace("\n ") +StartElement(file [name="another_file.java", type="java"]) +Characters("\n Another \"java\" class\n ") +EndElement(file) +Whitespace("\n ") +StartElement(file [name="config.xml", type="xml"]) +Characters("\n Weird \'XML\' config\n ") +EndElement(file) +Whitespace("\n ") +EndElement(files) +Whitespace("\n ") +StartElement(libraries) +Whitespace("\n ") +StartElement(library [groupId="junit", artifactId="junit", version="1.9.5"]) +EndElement(library) +Whitespace("\n ") +EndElement(libraries) 
+Whitespace("\n ") +EndElement(module) +Whitespace("\n ") +StartElement(module [name="module-2"]) +Whitespace("\n ") +StartElement(files) +Whitespace("\n ") +StartElement(file [name="program.js", type="javascript"]) +Characters("\n JavaScript & program\n ") +EndElement(file) +Whitespace("\n ") +StartElement(file [name="style.css", type="css"]) +Characters("\n Cascading style sheet: © - ҉\n ") +EndElement(file) +Whitespace("\n ") +EndElement(files) +Whitespace("\n ") +EndElement(module) +Whitespace("\n") +EndElement(project) +EndDocument diff --git a/tests/documents/sample_1_short.txt b/tests/documents/sample_1_short.txt new file mode 100644 index 0000000..4dbe285 --- /dev/null +++ b/tests/documents/sample_1_short.txt @@ -0,0 +1,37 @@ +StartDocument(1.0, utf-8) +StartElement(project [name="project-name"]) +StartElement(libraries) +StartElement(library [groupId="org.example", artifactId="", version="0.1"]) +EndElement(library) +StartElement(library [groupId="com.example", artifactId="\"cool-lib&", version="999"]) +EndElement(library) +EndElement(libraries) +StartElement(module [name="module-1"]) +StartElement(files) +StartElement(file [name="somefile.java", type="java"]) +Characters("Some class") +EndElement(file) +StartElement(file [name="another_file.java", type="java"]) +Characters("Another \"java\" class") +EndElement(file) +StartElement(file [name="config.xml", type="xml"]) +Characters("Weird \'XML\' config") +EndElement(file) +EndElement(files) +StartElement(libraries) +StartElement(library [groupId="junit", artifactId="junit", version="1.9.5"]) +EndElement(library) +EndElement(libraries) +EndElement(module) +StartElement(module [name="module-2"]) +StartElement(files) +StartElement(file [name="program.js", type="javascript"]) +Characters("JavaScript & program") +EndElement(file) +StartElement(file [name="style.css", type="css"]) +Characters("Cascading style sheet: © - ҉") +EndElement(file) +EndElement(files) +EndElement(module) +EndElement(project) +EndDocument 
diff --git a/tests/documents/sample_2.xml b/tests/documents/sample_2.xml new file mode 100644 index 0000000..f9543ac --- /dev/null +++ b/tests/documents/sample_2.xml @@ -0,0 +1,15 @@ + + + + Name + Another name + 0.3 + 0.2 + 0.1 + 0.01 + header 1 value + + Some bigger value + + + diff --git a/tests/documents/sample_2_full.txt b/tests/documents/sample_2_full.txt new file mode 100644 index 0000000..75075cd --- /dev/null +++ b/tests/documents/sample_2_full.txt @@ -0,0 +1,41 @@ +StartDocument(1.0, utf-8) +StartElement({urn:example:namespace}p:data) +Whitespace("\n ") +StartElement({urn:example:namespace}p:datum [id="34"]) +Whitespace("\n ") +StartElement({urn:example:namespace}p:name) +Characters("Name") +EndElement({urn:example:namespace}p:name) +Whitespace("\n ") +StartElement({urn:example:double}d:name) +Characters("Another name") +EndElement({urn:example:double}d:name) +Whitespace("\n ") +StartElement({urn:example:double}d:arg) +Characters("0.3") +EndElement({urn:example:double}d:arg) +Whitespace("\n ") +StartElement({urn:example:double}d:arg) +Characters("0.2") +EndElement({urn:example:double}d:arg) +Whitespace("\n ") +StartElement({urn:example:namespace}p:arg) +Characters("0.1") +EndElement({urn:example:namespace}p:arg) +Whitespace("\n ") +StartElement({urn:example:namespace}p:arg) +Characters("0.01") +EndElement({urn:example:namespace}p:arg) +Whitespace("\n ") +StartElement({urn:example:header}h:header [name="Header-1"]) +Characters("header 1 value") +EndElement({urn:example:header}h:header) +Whitespace("\n ") +StartElement({urn:example:header}h:header [name="Header-2"]) +Characters("\n Some bigger value\n ") +EndElement({urn:example:header}h:header) +Whitespace("\n ") +EndElement({urn:example:namespace}p:datum) +Whitespace("\n") +EndElement({urn:example:namespace}p:data) +EndDocument diff --git a/tests/documents/sample_2_short.txt b/tests/documents/sample_2_short.txt new file mode 100644 index 0000000..2368025 --- /dev/null +++ 
b/tests/documents/sample_2_short.txt @@ -0,0 +1,30 @@ +StartDocument(1.0, utf-8) +StartElement({urn:example:namespace}p:data) +StartElement({urn:example:namespace}p:datum [id="34"]) +StartElement({urn:example:namespace}p:name) +Characters("Name") +EndElement({urn:example:namespace}p:name) +StartElement({urn:example:double}d:name) +Characters("Another name") +EndElement({urn:example:double}d:name) +StartElement({urn:example:double}d:arg) +Characters("0.3") +EndElement({urn:example:double}d:arg) +StartElement({urn:example:double}d:arg) +Characters("0.2") +EndElement({urn:example:double}d:arg) +StartElement({urn:example:namespace}p:arg) +Characters("0.1") +EndElement({urn:example:namespace}p:arg) +StartElement({urn:example:namespace}p:arg) +Characters("0.01") +EndElement({urn:example:namespace}p:arg) +StartElement({urn:example:header}h:header [name="Header-1"]) +Characters("header 1 value") +EndElement({urn:example:header}h:header) +StartElement({urn:example:header}h:header [name="Header-2"]) +Characters("Some bigger value") +EndElement({urn:example:header}h:header) +EndElement({urn:example:namespace}p:datum) +EndElement({urn:example:namespace}p:data) +EndDocument diff --git a/tests/documents/sample_3.xml b/tests/documents/sample_3.xml new file mode 100644 index 0000000..657e37d --- /dev/null +++ b/tests/documents/sample_3.xml @@ -0,0 +1,13 @@ + + + + test + kkss" = ddd' > + ddddd!e3--> + test + kkss" = ddd' > + ddddd!e3-->"#, + br#" + |1:14 Unexpected token '--' before ' ' + "#, + ParserConfig::new(), + false + ); + + test( + br#""#, + br#" + |1:14 Unexpected token '--' before '-' + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn tabs_1() { + test( + b"\t\t", + br#" + |1:2 StartDocument(1.0, UTF-8) + |1:2 StartElement(a) + |1:6 StartElement(b) + |1:6 EndElement(b) + |1:10 EndElement(a) + |1:14 EndDocument + "#, + ParserConfig::new() + .trim_whitespace(true), + true + ); +} + +#[test] +fn issue_32_unescaped_cdata_end() { + test( + br#"]]>"#, + br#" + 
|StartDocument(1.0, UTF-8) + |StartElement(hello) + |Characters("]]>") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn issue_unescaped_processing_instruction_end() { + test( + br#"?>"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |Characters("?>") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn issue_unescaped_empty_tag_end() { + test( + br#"/>"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |Characters("/>") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn issue_83_duplicate_attributes() { + test( + br#""#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |1:30 Attribute 'a' is redefined + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn issue_93_large_characters_in_entity_references() { + test( + r#"&𤶼;"#.as_bytes(), + r#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |1:10 Unexpected entity: 𤶼 + "#.as_bytes(), // FIXME: it shouldn't be 10, looks like indices are off slightly + ParserConfig::new(), + false + ) +} + +#[test] +fn issue_98_cdata_ending_with_right_bracket() { + test( + br#""#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |CData("Foo [Bar]") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ) +} + +#[test] +fn issue_105_unexpected_double_dash() { + test( + br#"-- "#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |Characters("-- ") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); + + test( + br#"--"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |Characters("--") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); + + test( + br#"-->"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(hello) + |Characters("-->") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); + + test( + br#""#, + br#" + 
|StartDocument(1.0, UTF-8) + |StartElement(hello) + |CData("--") + |EndElement(hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn issue_attribues_have_no_default_namespace () { + test( + br#""#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement({urn:foo}hello [x="y"]) + |EndElement({urn:foo}hello) + |EndDocument + "#, + ParserConfig::new(), + false + ); +} + +#[test] +fn issue_replacement_character_entity_reference() { + test( + br#"��"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(doc) + |1:13 Invalid decimal character number in an entity: #55357 + "#, + ParserConfig::new(), + false, + ); + + test( + br#"��"#, + br#" + |StartDocument(1.0, UTF-8) + |StartElement(doc) + |1:13 Invalid hexadecimal character number in an entity: #xd83d + "#, + ParserConfig::new(), + false, + ); + + test( + br#"��"#, + format!( + r#" + |StartDocument(1.0, UTF-8) + |StartElement(doc) + |Characters("{replacement_character}{replacement_character}") + |EndElement(doc) + |EndDocument + "#, + replacement_character = "\u{fffd}" + ) + .as_bytes(), + ParserConfig::new() + .replace_unknown_entity_references(true), + false, + ); + + test( + br#"��"#, + format!( + r#" + |StartDocument(1.0, UTF-8) + |StartElement(doc) + |Characters("{replacement_character}{replacement_character}") + |EndElement(doc) + |EndDocument + "#, + replacement_character = "\u{fffd}" + ) + .as_bytes(), + ParserConfig::new() + .replace_unknown_entity_references(true), + false, + ); +} + +lazy_static! 
{ + // If PRINT_SPEC env variable is set, print the lines + // to stderr instead of comparing with the output + // it can be used like this: + // PRINT_SPEC=1 cargo test --test event_reader sample_1_full 2> sample_1_full.txt + static ref PRINT: bool = { + for (key, value) in env::vars() { + if key == "PRINT_SPEC" && value == "1" { + return true; + } + } + false + }; +} + +// clones a lot but that's fine +fn trim_until_bar(s: String) -> String { + match s.trim() { + ts if ts.starts_with('|') => return ts[1..].to_owned(), + _ => {} + } + s +} + +fn test(input: &[u8], output: &[u8], config: ParserConfig, test_position: bool) { + let mut reader = config.create_reader(input); + let mut spec_lines = BufReader::new(output).lines() + .map(|line| line.unwrap()) + .enumerate() + .map(|(i, line)| (i, trim_until_bar(line))) + .filter(|&(_, ref line)| !line.trim().is_empty()); + + loop { + let e = reader.next(); + let line = + if test_position { + format!("{} {}", reader.position(), Event(&e)) + } else { + format!("{}", Event(&e)) + }; + + if *PRINT { + writeln!(&mut stderr(), "{}", line).unwrap(); + } else { + if let Some((n, spec)) = spec_lines.next() { + if line != spec { + const SPLITTER: &'static str = "-------------------"; + panic!("\n{}\nUnexpected event at line {}:\nExpected: {}\nFound: {}\n{}\n", + SPLITTER, n + 1, spec, line, std::str::from_utf8(output).unwrap()); + } + } else { + panic!("Unexpected event: {}", line); + } + } + + match e { + Ok(XmlEvent::EndDocument) | Err(_) => break, + _ => {}, + } + } +} + +// Here we define our own string representation of events so we don't depend +// on the specifics of Display implementation for XmlEvent and OwnedName. + +struct Name<'a>(&'a OwnedName); + +impl <'a> fmt::Display for Name<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if let Some(ref namespace) = self.0.namespace { + try! { write!(f, "{{{}}}", namespace) } + } + + if let Some(ref prefix) = self.0.prefix { + try! 
{ write!(f, "{}:", prefix) } + } + + write!(f, "{}", self.0.local_name) + } +} + +struct Event<'a>(&'a Result); + +impl<'a> fmt::Display for Event<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let empty = String::new(); + match *self.0 { + Ok(ref e) => match *e { + XmlEvent::StartDocument { ref version, ref encoding, .. } => + write!(f, "StartDocument({}, {})", version, encoding), + XmlEvent::EndDocument => + write!(f, "EndDocument"), + XmlEvent::ProcessingInstruction { ref name, ref data } => + write!(f, "ProcessingInstruction({}={:?})", name, + data.as_ref().unwrap_or(&empty)), + XmlEvent::StartElement { ref name, ref attributes, .. } => { + if attributes.is_empty() { + write!(f, "StartElement({})", Name(name)) + } + else { + let attrs: Vec<_> = attributes.iter() + .map(|a| format!("{}={:?}", Name(&a.name), a.value)) .collect(); + write!(f, "StartElement({} [{}])", Name(name), attrs.join(", ")) + } + }, + XmlEvent::EndElement { ref name } => + write!(f, "EndElement({})", Name(name)), + XmlEvent::Comment(ref data) => + write!(f, r#"Comment("{}")"#, data.escape_debug()), + XmlEvent::CData(ref data) => + write!(f, r#"CData("{}")"#, data.escape_debug()), + XmlEvent::Characters(ref data) => + write!(f, r#"Characters("{}")"#, data.escape_debug()), + XmlEvent::Whitespace(ref data) => + write!(f, r#"Whitespace("{}")"#, data.escape_debug()), + }, + Err(ref e) => e.fmt(f), + } + } +} diff --git a/tests/event_writer.rs b/tests/event_writer.rs new file mode 100644 index 0000000..dd64a43 --- /dev/null +++ b/tests/event_writer.rs @@ -0,0 +1,269 @@ +#![forbid(unsafe_code)] + +extern crate xml; + +use std::io::{BufReader, SeekFrom}; +use std::io::prelude::*; +use std::fs::File; +use std::str; + +use xml::reader::EventReader; +use xml::writer::EmitterConfig; + +macro_rules! 
unwrap_all { + ($($e:expr);+) => {{ + $($e.unwrap();)+ + }} +} + +#[test] +fn reading_writing_equal_with_namespaces() { + let mut f = File::open("tests/documents/sample_2.xml").unwrap(); + let mut b = Vec::new(); + + { + let r = EventReader::new(BufReader::new(&mut f)); + let mut w = EmitterConfig::default().perform_indent(true).create_writer(&mut b); + + for e in r { + match e { + Ok(e) => if let Some(e) = e.as_writer_event() { + match w.write(e) { + Ok(_) => {}, + Err(e) => panic!("Writer error: {:?}", e) + } + }, + Err(e) => panic!("Error: {}", e) + } + } + } + + f.seek(SeekFrom::Start(0)).unwrap(); + let mut fs = String::new(); + f.read_to_string(&mut fs).unwrap(); + + let bs = String::from_utf8(b).unwrap(); + + assert_eq!(fs.trim(), bs.trim()); +} + +#[test] +fn writing_simple() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new().write_document_declaration(false).create_writer(&mut b); + + w.write(XmlEvent::start_element("h:hello").ns("h", "urn:hello-world")).unwrap(); + w.write("hello world").unwrap(); + w.write(XmlEvent::end_element()).unwrap(); + } + + assert_eq!( + str::from_utf8(&b).unwrap(), + r#"hello world"# + ); +} + +#[test] +fn writing_empty_elements_with_normalizing() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new().write_document_declaration(false).create_writer(&mut b); + + unwrap_all! { + w.write(XmlEvent::start_element("hello")); + w.write(XmlEvent::start_element("world")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::end_element()) + } + } + + assert_eq!(str::from_utf8(&b).unwrap(), r#""#); +} + +#[test] +fn writing_empty_elements_without_normalizing() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new() + .write_document_declaration(false) + .normalize_empty_elements(false) + .create_writer(&mut b); + + unwrap_all! 
{ + w.write(XmlEvent::start_element("hello")); + w.write(XmlEvent::start_element("world")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::end_element()) + } + } + + assert_eq!(str::from_utf8(&b).unwrap(), r#""#); +} + +#[test] +fn writing_empty_elements_without_pad_self_closing() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new() + .write_document_declaration(false) + .pad_self_closing(false) + .create_writer(&mut b); + + unwrap_all! { + w.write(XmlEvent::start_element("hello")); + w.write(XmlEvent::start_element("world")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::end_element()) + } + } + + assert_eq!(str::from_utf8(&b).unwrap(), r#""#); +} +#[test] +fn writing_empty_elements_pad_self_closing_explicit() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new() + .write_document_declaration(false) + .pad_self_closing(true) + .create_writer(&mut b); + + unwrap_all! { + w.write(XmlEvent::start_element("hello")); + w.write(XmlEvent::start_element("world")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::end_element()) + } + } + + assert_eq!(str::from_utf8(&b).unwrap(), r#""#); +} + +#[test] +fn writing_comments_with_indentation() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new() + .write_document_declaration(false) + .perform_indent(true) + .create_writer(&mut b); + + unwrap_all! 
{ + w.write(XmlEvent::start_element("hello")); + w.write(XmlEvent::start_element("world")); + w.write(XmlEvent::comment(" this is a manually padded comment\t")); + w.write(XmlEvent::comment("this is an unpadded comment")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::end_element()) + } + } + + assert_eq!( + str::from_utf8(&b).unwrap(), + " + + + + +"); +} + +#[test] +fn issue_112_overriding_namepace_prefix() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new() + .write_document_declaration(false) + .create_writer(&mut b); + + unwrap_all! { + w.write(XmlEvent::start_element("iq").ns("", "jabber:client").ns("a", "urn:A")); + w.write(XmlEvent::start_element("bind").ns("", "urn:ietf:params:xml:ns:xmpp-bind")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::start_element("whatever").ns("a", "urn:X")); + w.write(XmlEvent::end_element()); + w.write(XmlEvent::end_element()) + } + } + + assert_eq!( + str::from_utf8(&b).unwrap(), + r#""# + ) +} + +#[test] +fn attribute_escaping() { + use xml::writer::XmlEvent; + + let mut b = Vec::new(); + + { + let mut w = EmitterConfig::new() + .write_document_declaration(false) + .perform_indent(true) + .create_writer(&mut b); + + unwrap_all! 
{ + w.write( + XmlEvent::start_element("hello") + .attr("testLt", "<") + .attr("testGt", ">") + ); + w.write(XmlEvent::end_element()); + w.write( + XmlEvent::start_element("hello") + .attr("testQuot", "\"") + .attr("testApos", "\'") + ); + w.write(XmlEvent::end_element()); + w.write( + XmlEvent::start_element("hello") + .attr("testAmp", "&") + ); + w.write(XmlEvent::end_element()); + w.write( + XmlEvent::start_element("hello") + .attr("testNl", "\n") + .attr("testCr", "\r") + ); + w.write(XmlEvent::end_element()); + w.write( + XmlEvent::start_element("hello") + .attr("testNl", "\\n") + .attr("testCr", "\\r") + ); + w.write(XmlEvent::end_element()) + } + } + assert_eq!( + str::from_utf8(&b).unwrap(), + " + + + +" + ); +} \ No newline at end of file diff --git a/tests/streaming.rs b/tests/streaming.rs new file mode 100644 index 0000000..a577a00 --- /dev/null +++ b/tests/streaming.rs @@ -0,0 +1,103 @@ +#![forbid(unsafe_code)] + +extern crate xml; + +use std::io::{Cursor, Write}; + +use xml::EventReader; +use xml::reader::ParserConfig; +use xml::reader::XmlEvent; + +macro_rules! assert_match { + ($actual:expr, $expected:pat) => { + match $actual { + $expected => {}, + _ => panic!("assertion failed: `(left matches right)` \ + (left: `{:?}`, right: `{}`", $actual, stringify!($expected)) + } + }; + ($actual:expr, $expected:pat if $guard:expr) => { + match $actual { + $expected if $guard => {}, + _ => panic!("assertion failed: `(left matches right)` \ + (left: `{:?}`, right: `{} if {}`", + $actual, stringify!($expected), stringify!($guard)) + } + } +} + +fn write_and_reset_position(c: &mut Cursor, data: &[u8]) where Cursor: Write { + let p = c.position(); + c.write_all(data).unwrap(); + c.set_position(p); +} + +#[test] +fn reading_streamed_content() { + let buf = Cursor::new(b"".to_vec()); + let reader = EventReader::new(buf); + + let mut it = reader.into_iter(); + + assert_match!(it.next(), Some(Ok(XmlEvent::StartDocument { .. 
}))); + assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "root"); + + write_and_reset_position(it.source_mut(), b"content"); + assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1"); + assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); + assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1"); + + write_and_reset_position(it.source_mut(), b""); + assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2"); + assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2"); + + write_and_reset_position(it.source_mut(), b""); + assert_match!(it.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3"); + assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-3"); + // doesn't seem to work because of how tags parsing is done +// write_and_reset_position(it.source_mut(), b"some text"); + // assert_match!(it.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "some text"); + + write_and_reset_position(it.source_mut(), b""); + assert_match!(it.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "root"); + assert_match!(it.next(), Some(Ok(XmlEvent::EndDocument))); + assert_match!(it.next(), None); +} + +#[test] +fn reading_streamed_content2() { + let buf = Cursor::new(b"".to_vec()); + let mut config = ParserConfig::new(); + config.ignore_end_of_stream = true; + let readerb = EventReader::new_with_config(buf, config); + + let mut reader = readerb.into_iter(); + + assert_match!(reader.next(), Some(Ok(XmlEvent::StartDocument { .. }))); + assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. 
})) if name.local_name == "root"); + + write_and_reset_position(reader.source_mut(), b"content"); + assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-1"); + assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); + assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-1"); + + write_and_reset_position(reader.source_mut(), b"content"); + + assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-2"); + assert_match!(reader.next(), Some(Ok(XmlEvent::Characters(ref c))) if c == "content"); + assert_match!(reader.next(), Some(Ok(XmlEvent::EndElement { ref name })) if name.local_name == "child-2"); + assert_match!(reader.next(), Some(Err(_))); + write_and_reset_position(reader.source_mut(), b""); + assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-3"); + write_and_reset_position(reader.source_mut(), b" { + panic!("At this point, parser must not detect something."); + }, + Some(Err(_)) => {} + }; + write_and_reset_position(reader.source_mut(), b" />"); + assert_match!(reader.next(), Some(Ok(XmlEvent::StartElement { ref name, .. })) if name.local_name == "child-4"); +} + -- 2.7.4