#![deny(missing_docs)]
#[macro_use]
extern crate log;
pub mod error;
pub mod attributes;
pub mod namespace;
mod escape;
#[cfg(test)]
mod test;
use std::fs::File;
use std::io::{self, BufRead, BufReader, Write};
use std::iter::IntoIterator;
use std::ops::Range;
use std::path::Path;
use std::fmt;
use std::str::from_utf8;
use std::borrow::Cow;
use error::{Error, Result, ResultPos};
use attributes::{Attributes, UnescapedAttributes};
use namespace::XmlnsReader;
use escape::unescape;
/// Where the reader currently stands relative to markup delimiters.
///
/// `read_until_open` switches the state to `Opened` and `read_until_close`
/// switches it back to `Closed`; `Iterator::next` uses it to pick which of
/// the two runs next.
#[derive(Clone)]
enum TagState {
    /// The next read scans up to the closing `>` of a tag.
    Opened,
    /// The next read scans text up to the next `<`.
    Closed,
}
/// Helper trait for borrowing a value as a `&str`.
pub trait AsStr {
    /// Borrows `self` as a string slice, or returns an error when that is
    /// not possible.
    fn as_str(&self) -> Result<&str>;
}
impl AsStr for [u8] {
fn as_str(&self) -> Result<&str> {
from_utf8(self).map_err(Error::Utf8)
}
}
/// Pull-based XML reader: an `Iterator` yielding `ResultPos<Event>` items
/// read from an underlying `BufRead` source.
#[derive(Clone)]
pub struct XmlReader<B: BufRead> {
    /// Underlying buffered byte source.
    reader: B,
    /// Set once an error has been returned; `next` then yields `None`.
    exit: bool,
    /// Set after a self-closing start tag so that the following call to
    /// `next` emits the matching `End` event.
    next_close: bool,
    /// Stack of the start elements that have not been closed yet.
    opened: Vec<Element>,
    /// Whether the next read scans a tag or text.
    tag_state: TagState,
    /// When `true`, whitespace around text is excluded from `Text` events and
    /// whitespace-only text is skipped.
    trim_text: bool,
    /// When `true`, end tag names are compared with the last opened element.
    with_check: bool,
    /// When `true`, comments containing `--` are rejected.
    check_comments: bool,
    /// Number of bytes consumed from `reader` so far.
    buf_position: usize,
}
/// Builds a reader directly over the bytes of an in-memory string.
impl<'a> ::std::convert::From<&'a str> for XmlReader<&'a [u8]> {
    /// Hands the UTF-8 bytes of `reader` to `XmlReader::from_reader`.
    fn from(reader: &'a str) -> XmlReader<&'a [u8]> {
        let bytes: &'a [u8] = reader.as_bytes();
        XmlReader::from_reader(bytes)
    }
}
impl<B: BufRead> XmlReader<B> {
pub fn from_reader(reader: B) -> XmlReader<B> {
XmlReader {
reader: reader,
exit: false,
next_close: false,
opened: Vec::new(),
tag_state: TagState::Closed,
trim_text: false,
with_check: true,
buf_position: 0,
check_comments: false,
}
}
pub fn namespaced(self) -> XmlnsReader<B> {
XmlnsReader::new(self)
}
pub fn trim_text(mut self, val: bool) -> XmlReader<B> {
self.trim_text = val;
self
}
pub fn with_check(mut self, val: bool) -> XmlReader<B> {
self.with_check = val;
self
}
pub fn check_comments(mut self, val: bool) -> XmlReader<B> {
self.check_comments = val;
self
}
pub fn read_to_end<K: AsRef<[u8]>>(&mut self, end: K) -> ResultPos<()> {
let mut depth = 0;
let end = end.as_ref();
loop {
match self.next() {
Some(Ok(Event::End(ref e))) if e.name() == end => {
if depth == 0 {
return Ok(());
}
depth -= 1;
}
Some(Ok(Event::Start(ref e))) if e.name() == end => depth += 1,
Some(Err(e)) => return Err(e),
None => {
warn!("EOF instead of {:?}", from_utf8(end));
return Err((Error::Unexpected(format!(
"Reached EOF, expecting {:?} end tag",
from_utf8(end))),
self.buf_position));
}
_ => (),
}
}
}
pub fn read_text<K: AsRef<[u8]>>(&mut self, end: K) -> ResultPos<String> {
match self.next() {
Some(Ok(Event::Text(e))) => {
self.read_to_end(end)
.and_then(|_| e.into_string().map_err(|e| (e, self.buf_position)))
}
Some(Ok(Event::End(ref e))) if e.name() == end.as_ref() => {
Ok("".to_string())
},
Some(Err(e)) => Err(e),
None => {
Err((Error::Unexpected("Reached EOF while reading text".to_string()),
self.buf_position))
}
_ => {
Err((Error::Unexpected("Cannot read text, expecting Event::Text".to_string()),
self.buf_position))
}
}
}
pub fn buffer_position(&self) -> usize {
self.buf_position
}
fn read_until_open(&mut self) -> Option<ResultPos<Event>> {
self.tag_state = TagState::Opened;
let mut buf = Vec::new();
match read_until(&mut self.reader, b'<', &mut buf) {
Ok(0) => None,
Ok(n) => {
self.buf_position += n;
let (start, len) = if self.trim_text {
match buf.iter().position(|&b| !is_whitespace(b)) {
Some(start) => {
(start, buf.len() - buf.iter().rev()
.position(|&b| !is_whitespace(b))
.unwrap_or(0))
}
None => return self.next(),
}
} else {
(0, buf.len())
};
Some(Ok(Event::Text(Element::from_buffer(buf, start, len, len))))
}
Err(e) => Some(self.error(e, 0)),
}
}
fn read_until_close(&mut self) -> Option<ResultPos<Event>> {
self.tag_state = TagState::Closed;
let mut buf = Vec::new();
match read_until(&mut self.reader, b'>', &mut buf) {
Ok(0) => None,
Ok(n) => {
self.buf_position += n;
match buf[0] {
b'/' => Some(self.read_end(buf)),
b'!' => Some(self.read_bang(buf)),
b'?' => Some(self.read_question_mark(buf)),
_ => Some(self.read_start(buf)),
}
}
Err(e) => Some(self.error(e, 0)),
}
}
fn read_end(&mut self, buf: Vec<u8>) -> ResultPos<Event> {
let len = buf.len();
if self.with_check {
let e = match self.opened.pop() {
Some(e) => e,
None => return self.error(
Error::Malformed(format!("Cannot close {:?} element, \
there is no opened element",
buf[1..].as_str())), len),
};
if &buf[1..] != e.name() {
let m = format!("End event {:?} doesn't match last \
opened element {:?}, opened: {:?}",
Element::from_buffer(buf, 1, len, len), e, &self.opened);
return self.error(Error::Malformed(m), len);
}
}
Ok(Event::End(Element::from_buffer(buf, 1, len, len)))
}
fn read_bang(&mut self, mut buf: Vec<u8>) -> ResultPos<Event> {
let len = buf.len();
if len >= 3 && &buf[1..3] == b"--" {
let mut len = buf.len();
while len < 5 || &buf[(len - 2)..] != b"--" {
buf.push(b'>');
match read_until(&mut self.reader, b'>', &mut buf) {
Ok(0) => return self.error(
Error::Malformed("Unescaped Comment event".to_string()), len),
Ok(n) => self.buf_position += n,
Err(e) => return self.error(e, 0),
}
len = buf.len();
}
if self.check_comments {
let mut offset = len - 3;
for w in buf[3..(len - 1)].windows(2) {
if &*w == b"--" {
return self.error(
Error::Malformed("Unexpected token '--'".to_string()), offset);
}
offset -= 1;
}
}
Ok(Event::Comment(Element::from_buffer(buf, 3, len - 2, len - 2)))
} else if len >= 8 {
match &buf[1..8] {
b"[CDATA[" => {
let mut len = buf.len();
while len < 10 || &buf[(len - 2)..] != b"]]" {
buf.push(b'>');
match read_until(&mut self.reader, b'>', &mut buf) {
Ok(0) => return self.error(
Error::Malformed("Unescaped CDATA event".to_string()), len),
Ok(n) => self.buf_position += n,
Err(e) => return self.error(e, 0),
}
len = buf.len();
}
Ok(Event::CData(Element::from_buffer(buf, 8, len - 2, len - 2)))
}
b"DOCTYPE" => {
let mut count = buf.iter().filter(|&&b| b == b'<').count();
while count > 0 {
buf.push(b'>');
match read_until(&mut self.reader, b'>', &mut buf) {
Ok(0) => return self.error(
Error::Malformed("Unescaped DOCTYPE node".to_string()), buf.len()),
Ok(n) => {
self.buf_position += n;
let start = buf.len() - n;
count += buf[start..].iter().filter(|&&b| b == b'<').count() - 1;
}
Err(e) => return self.error(e, 0),
}
}
let len = buf.len();
Ok(Event::DocType(Element::from_buffer(buf, 1, len, 8)))
}
_ => self.error(Error::Malformed("Only Comment, CDATA and DOCTYPE nodes \
can start with a '!'".to_string()), 0),
}
} else {
self.error(Error::Malformed("Only Comment, CDATA and DOCTYPE nodes can start \
with a '!'".to_string()), buf.len())
}
}
fn read_question_mark(&mut self, buf: Vec<u8>) -> ResultPos<Event> {
let len = buf.len();
if len > 2 && buf[len - 1] == b'?' {
if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) {
Ok(Event::Decl(XmlDecl { element: Element::from_buffer(buf, 1, len - 1, 3) }))
} else {
Ok(Event::PI(Element::from_buffer(buf, 1, len - 1, 3)))
}
} else {
self.error(Error::Malformed("Unescaped XmlDecl event".to_string()), len)
}
}
fn read_start(&mut self, buf: Vec<u8>) -> ResultPos<Event> {
let len = buf.len();
let name_end = buf.iter().position(|&b| is_whitespace(b)).unwrap_or(len);
if buf[len - 1] == b'/' {
self.next_close = true;
let end = if name_end < len { name_end } else { len - 1 };
let element = Element::from_buffer(buf, 0, len - 1, end);
self.opened.push(element.clone());
Ok(Event::Start(element))
} else {
let element = Element::from_buffer(buf, 0, len, name_end);
if self.with_check { self.opened.push(element.clone()); }
Ok(Event::Start(element))
}
}
fn error(&mut self, e: Error, offset: usize) -> ResultPos<Event> {
self.exit = true;
Err((e, self.buf_position - offset))
}
}
impl XmlReader<BufReader<File>> {
    /// Opens the file at `path` and reads it through a `BufReader`.
    ///
    /// # Errors
    ///
    /// Returns the error converted from the `io::Error` of `File::open`.
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<XmlReader<BufReader<File>>> {
        match File::open(path) {
            Ok(file) => Ok(XmlReader::from_reader(BufReader::new(file))),
            Err(cause) => Err(From::from(cause)),
        }
    }
}
/// Iterating over the reader yields the events of the document in order.
impl<B: BufRead> Iterator for XmlReader<B> {
    type Item = ResultPos<Event>;

    /// Produces the next event.
    ///
    /// Yields `None` once an error has been reported or when the underlying
    /// read returns no bytes.
    fn next(&mut self) -> Option<Self::Item> {
        if self.exit {
            return None;
        }
        if !self.next_close {
            return match self.tag_state {
                TagState::Closed => self.read_until_open(),
                TagState::Opened => self.read_until_close(),
            };
        }
        // A self-closing start tag was just emitted: synthesize its `End`
        // event from the element that `read_start` pushed on the stack.
        self.next_close = false;
        let closed = self.opened.pop().unwrap();
        Some(Ok(Event::End(closed)))
    }
}
/// Owned payload of an event: a byte buffer plus the ranges locating its
/// content and its name inside that buffer.
#[derive(Clone)]
pub struct Element {
    /// Raw bytes backing the element.
    buf: Vec<u8>,
    /// Range of `buf` returned by `content`.
    content: Range<usize>,
    /// Range of `buf` returned by `name`.
    name: Range<usize>,
}
impl Element {
    /// Creates an element whose name, and whole content, is `name`.
    pub fn new<A>(name: A) -> Element
        where A: AsRef<[u8]>
    {
        let bytes = Vec::from(name.as_ref());
        let end = bytes.len();
        Element::from_buffer(bytes, 0, end, end)
    }

    /// Builds an element over `buf`: the content is `start..end` and the
    /// name is `start..name_end`.
    fn from_buffer(buf: Vec<u8>, start: usize, end: usize, name_end: usize)
        -> Element
    {
        Element {
            buf: buf,
            content: Range { start: start, end: end },
            name: Range { start: start, end: name_end },
        }
    }

    /// Consumes the element and returns it extended with the given
    /// `(key, value)` attributes (see `extend_attributes`).
    pub fn with_attributes<K, V, I>(mut self, attributes: I) -> Self
        where K: AsRef<[u8]>,
              V: AsRef<[u8]>,
              I: IntoIterator<Item = (K, V)>
    {
        self.extend_attributes(attributes);
        self
    }

    /// Returns the bytes of the name.
    pub fn name(&self) -> &[u8] {
        &self.buf[self.name.clone()]
    }

    /// Returns the bytes of the content. For elements extended with
    /// `push_attribute` this includes the appended attributes.
    pub fn content(&self) -> &[u8] {
        &self.buf[self.content.clone()]
    }

    /// Returns the content passed through `unescape`.
    pub fn unescaped_content(&self) -> ResultPos<Cow<[u8]>> {
        unescape(self.content())
    }

    /// Returns an `Attributes` value built from the content and the end
    /// index of the name.
    pub fn attributes(&self) -> Attributes {
        Attributes::new(self.content(), self.name.end)
    }

    /// Returns the unescaped variant of `attributes`.
    pub fn unescaped_attributes(&self) -> UnescapedAttributes {
        self.attributes().unescaped()
    }

    /// Appends every `(key, value)` pair of `attributes` with
    /// `push_attribute` and returns `self` for chaining.
    pub fn extend_attributes<K, V, I>(&mut self, attributes: I) -> &mut Element
        where K: AsRef<[u8]>,
              V: AsRef<[u8]>,
              I: IntoIterator<Item = (K, V)>
    {
        for attr in attributes {
            self.push_attribute(attr.0, attr.1);
        }
        self
    }

    /// Consumes the element and returns its content as a `String`.
    ///
    /// Only the content range is converted, so bytes kept in the buffer
    /// around it (trimmed whitespace, comment or CDATA markers, ...) are not
    /// part of the result.
    ///
    /// # Errors
    ///
    /// Returns `Error::Utf8` when the content is not valid UTF-8.
    pub fn into_string(self) -> Result<String> {
        let Element { mut buf, content, .. } = self;
        buf.truncate(content.end);
        // Reuse the allocation when the content starts at the buffer start.
        let text = if content.start == 0 {
            buf
        } else {
            buf.split_off(content.start)
        };
        ::std::string::String::from_utf8(text)
            .map_err(|e| Error::Utf8(e.utf8_error()))
    }

    /// Appends ` key="value"` to the content.
    ///
    /// `key` and `value` are copied as given; no escaping is applied here.
    pub fn push_attribute<K, V>(&mut self, key: K, value: V)
        where K: AsRef<[u8]>,
              V: AsRef<[u8]>
    {
        let bytes = &mut self.buf;
        // Drop bytes stored after the content (e.g. the `/` of a self-closing
        // tag) so the attribute lands right after the existing content.
        bytes.truncate(self.content.end);
        bytes.push(b' ');
        bytes.extend_from_slice(key.as_ref());
        bytes.extend_from_slice(b"=\"");
        bytes.extend_from_slice(value.as_ref());
        bytes.push(b'"');
        self.content.end = bytes.len();
    }
}
/// Debug output shows the content decoded as text rather than raw bytes.
impl fmt::Debug for Element {
    /// Prints the content (as the result of `as_str`) followed by the end
    /// index of the name and the end index of the content.
    fn fmt(&self, f: &mut fmt::Formatter) -> ::std::result::Result<(), fmt::Error> {
        let text = self.content().as_str();
        let name_end = self.name.end;
        let content_end = self.content.end;
        write!(f,
               "Element {{ buf: {:?}, name_end: {}, end: {} }}",
               text,
               name_end,
               content_end)
    }
}
/// Payload of a `Decl` event, giving access to the `version`, `encoding`
/// and `standalone` attributes of the declaration.
#[derive(Clone, Debug)]
pub struct XmlDecl {
    /// Element holding the bytes of the declaration.
    element: Element,
}
impl XmlDecl {
pub fn version(&self) -> ResultPos<&[u8]> {
match self.element.attributes().next() {
Some(Err(e)) => Err(e),
Some(Ok((b"version", v))) => Ok(v),
Some(Ok((k, _))) => {
let m = format!("XmlDecl must start with 'version' attribute, found {:?}",
k.as_str());
Err((Error::Malformed(m), 0))
}
None => {
let m = "XmlDecl must start with 'version' attribute, found none".to_string();
Err((Error::Malformed(m), 0))
}
}
}
pub fn encoding(&self) -> Option<ResultPos<&[u8]>> {
for a in self.element.attributes() {
match a {
Err(e) => return Some(Err(e)),
Ok((b"encoding", v)) => return Some(Ok(v)),
_ => (),
}
}
None
}
pub fn standalone(&self) -> Option<ResultPos<&[u8]>> {
for a in self.element.attributes() {
match a {
Err(e) => return Some(Err(e)),
Ok((b"standalone", v)) => return Some(Ok(v)),
_ => (),
}
}
None
}
}
/// Event produced by `XmlReader` and consumed by `XmlWriter`.
#[derive(Clone, Debug)]
pub enum Event {
    /// Start tag; the content is the tag name followed by its attributes.
    Start(Element),
    /// End tag. Also emitted right after the `Start` of a self-closing tag.
    End(Element),
    /// Text found between two tags.
    Text(Element),
    /// Comment `<!-- ... -->`.
    Comment(Element),
    /// CDATA section `<![CDATA[ ... ]]>`.
    CData(Element),
    /// XML declaration `<?xml ... ?>`.
    Decl(XmlDecl),
    /// Processing instruction `<? ... ?>` other than the XML declaration.
    PI(Element),
    /// Document type declaration `<!DOCTYPE ... >`.
    DocType(Element),
}
impl Event {
    /// Returns the `Element` carried by the event, whatever its kind.
    ///
    /// For `Decl` this is the element wrapped by the `XmlDecl`.
    pub fn element(&self) -> &Element {
        match *self {
            Event::Decl(ref decl) => &decl.element,
            Event::Start(ref inner) |
            Event::End(ref inner) |
            Event::Text(ref inner) |
            Event::Comment(ref inner) |
            Event::CData(ref inner) |
            Event::PI(ref inner) |
            Event::DocType(ref inner) => inner,
        }
    }
}
/// Tells whether `b` is a space, a tab, a line feed or a carriage return.
fn is_whitespace(b: u8) -> bool {
    b == b' ' || b == b'\t' || b == b'\n' || b == b'\r'
}
/// Reads bytes from `r` into `buf` until `byte` is found or the input ends.
///
/// The delimiter is consumed but not copied into `buf`. The returned count
/// is the number of bytes consumed from `r`, delimiter included, so `0`
/// means that the input was already exhausted.
///
/// Reads failing with `ErrorKind::Interrupted` are retried; any other I/O
/// error is returned as `Error::Io`.
#[inline]
fn read_until<R: BufRead>(r: &mut R, byte: u8, buf: &mut Vec<u8>)
    -> Result<usize>
{
    let mut total = 0;
    loop {
        let (found, consumed) = {
            let chunk = match r.fill_buf() {
                Ok(chunk) => chunk,
                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                Err(e) => return Err(Error::Io(e)),
            };
            if chunk.is_empty() {
                return Ok(total);
            }
            match chunk.iter().position(|&b| b == byte) {
                Some(at) => {
                    buf.extend_from_slice(&chunk[..at]);
                    (true, at + 1)
                }
                None => {
                    buf.extend_from_slice(chunk);
                    (false, chunk.len())
                }
            }
        };
        r.consume(consumed);
        total += consumed;
        if found {
            return Ok(total);
        }
    }
}
/// Writes `Event`s as XML bytes to an underlying `Write` destination.
#[derive(Clone)]
pub struct XmlWriter<W: Write> {
    /// Destination receiving the generated bytes.
    writer: W,
}
impl<W: Write> XmlWriter<W> {
    /// Creates a writer on top of `inner`.
    pub fn new(inner: W) -> XmlWriter<W> {
        XmlWriter { writer: inner }
    }

    /// Consumes the writer and returns the underlying destination.
    pub fn into_inner(self) -> W {
        self.writer
    }

    /// Writes `event` surrounded by the markup matching its kind.
    ///
    /// The content of the element is written as stored; nothing is escaped
    /// here.
    ///
    /// # Errors
    ///
    /// Returns the error converted from the `io::Error` of the destination.
    pub fn write(&mut self, event: Event) -> Result<()> {
        match event {
            Event::Start(ref e) => self.write_wrapped_element(b"<", e, b">"),
            Event::End(ref e) => self.write_wrapped_bytes(b"</", e.name(), b">"),
            Event::Text(ref e) => self.write_bytes(e.content()),
            Event::Comment(ref e) => self.write_wrapped_element(b"<!--", e, b"-->"),
            Event::CData(ref e) => self.write_wrapped_element(b"<![CDATA[", e, b"]]>"),
            Event::Decl(ref e) => self.write_wrapped_element(b"<?", &e.element, b"?>"),
            Event::PI(ref e) => self.write_wrapped_element(b"<?", e, b"?>"),
            Event::DocType(ref e) => {
                // The content of a DocType event produced by the reader
                // already starts with the `DOCTYPE` keyword; do not repeat it.
                let before = if e.content().starts_with(b"DOCTYPE") {
                    &b"<!"[..]
                } else {
                    &b"<!DOCTYPE"[..]
                };
                self.write_wrapped_element(before, e, b">")
            }
        }
    }

    /// Writes all of `value` to the destination.
    #[inline]
    fn write_bytes(&mut self, value: &[u8]) -> Result<()> {
        // `write_all` keeps writing after a short write, where a single
        // `write` call could silently drop the rest of `value`.
        self.writer.write_all(value).map_err(From::from)
    }

    /// Writes `before`, `value` and `after` in sequence, stopping at the
    /// first I/O error.
    fn write_wrapped_bytes(&mut self, before: &[u8], value: &[u8], after: &[u8])
        -> Result<()>
    {
        let written = self.writer.write_all(before)
            .and_then(|_| self.writer.write_all(value))
            .and_then(|_| self.writer.write_all(after));
        written.map_err(From::from)
    }

    /// Writes the content of `element` between `before` and `after`.
    #[inline]
    fn write_wrapped_element(&mut self, before: &[u8], element: &Element, after: &[u8])
        -> Result<()>
    {
        self.write_wrapped_bytes(before, element.content(), after)
    }
}