use std::borrow::Cow;
use std::fs::File;
use std::io::{self, BufRead, BufReader};
use std::path::Path;
use std::str::from_utf8;
use encoding_rs::Encoding;
use errors::{Result, ErrorKind};
use events::{Event, BytesStart, BytesEnd, BytesText, BytesDecl};
use events::attributes::Attribute;
/// Where the reader stands relative to the markup it has just consumed.
///
/// `Reader::read_event` dispatches on this value to choose the next parsing
/// routine.
enum TagState {
/// Set by `read_until_open`; the next `read_event` call runs
/// `read_until_close` to parse the markup up to `>`.
Opened,
/// Initial state, also set by `read_until_close` and `close_expanded_empty`;
/// the next `read_event` call runs `read_until_open` to read text up to `<`.
Closed,
/// Set by `read_start` for a self-closing tag when `expand_empty_elements`
/// is on; the next `read_event` call runs `close_expanded_empty`.
Empty,
}
/// Pull-based XML reader over any `BufRead` source.
///
/// Events are produced one at a time by `read_event` (or
/// `read_namespaced_event`) and borrow from a caller-supplied buffer.
pub struct Reader<B: BufRead> {
/// Underlying byte source.
reader: B,
/// Set by `read_event` once an error has been returned; every later call
/// then yields `Event::Eof`.
exit: bool,
/// Running count of bytes consumed from `reader`; some error paths move it
/// backwards. Exposed through `buffer_position`.
buf_position: usize,
/// Selects the parsing routine used by the next `read_event` call.
tag_state: TagState,
/// When true, a self-closing tag is reported as `Start` followed by a
/// synthesized `End` instead of a single `Empty` event.
expand_empty_elements: bool,
/// When true, text events are stripped of leading/trailing whitespace and
/// whitespace-only text is skipped.
trim_text: bool,
/// When true, each end tag is compared with the most recently opened start
/// tag name.
check_end_names: bool,
/// When true, comments whose body contains `--` are rejected.
check_comments: bool,
/// Concatenated names of the elements that are currently open.
opened_buffer: Vec<u8>,
/// Offset in `opened_buffer` at which each open element's name starts.
opened_starts: Vec<usize>,
/// Index of the namespace declarations stored in the caller's namespace
/// buffer.
ns_buffer: NamespaceBufferIndex,
/// Encoding used by `decode`; starts as UTF-8 and is replaced when an XML
/// declaration reports an encoding.
encoding: &'static Encoding,
}
impl<B: BufRead> Reader<B> {
/// Builds a `Reader` around `reader` with the default configuration.
///
/// Defaults: empty elements are not expanded, text is not trimmed, end names
/// are checked, comments are not checked and the encoding is UTF-8.
pub fn from_reader(reader: B) -> Reader<B> {
    Reader {
        // Input and bookkeeping.
        reader: reader,
        exit: false,
        buf_position: 0,
        tag_state: TagState::Closed,
        // Configuration flags.
        expand_empty_elements: false,
        trim_text: false,
        check_end_names: true,
        check_comments: false,
        // Scratch state, filled while parsing.
        opened_buffer: Vec::new(),
        opened_starts: Vec::new(),
        ns_buffer: NamespaceBufferIndex::default(),
        encoding: ::encoding_rs::UTF_8,
    }
}
/// Chooses how self-closing tags such as `<a/>` are reported.
///
/// With `val == true`, `read_start` emits a `Start` event and the following
/// `read_event` call returns a synthesized `End`; otherwise a single `Empty`
/// event is emitted. `from_reader` initialises this to `false`.
///
/// Returns `self` so that configuration calls can be chained.
pub fn expand_empty_elements(&mut self, val: bool) -> &mut Reader<B> {
self.expand_empty_elements = val;
self
}
/// Chooses whether text events are trimmed.
///
/// With `val == true`, `read_until_open` strips leading and trailing
/// whitespace (space, CR, LF, tab) from text and skips text made only of
/// whitespace. `from_reader` initialises this to `false`.
///
/// Returns `self` so that configuration calls can be chained.
pub fn trim_text(&mut self, val: bool) -> &mut Reader<B> {
self.trim_text = val;
self
}
/// Chooses whether end tags are matched against start tags.
///
/// With `val == true`, `read_start` records every opened name and `read_end`
/// fails with `ErrorKind::EndEventMismatch` when an end tag differs from the
/// most recently opened one. `from_reader` initialises this to `true`.
///
/// Returns `self` so that configuration calls can be chained.
pub fn check_end_names(&mut self, val: bool) -> &mut Reader<B> {
self.check_end_names = val;
self
}
/// Chooses whether comment bodies are validated.
///
/// With `val == true`, `read_bang` fails with "Unexpected token '--'" when a
/// comment body contains `--`. `from_reader` initialises this to `false`.
///
/// Returns `self` so that configuration calls can be chained.
pub fn check_comments(&mut self, val: bool) -> &mut Reader<B> {
self.check_comments = val;
self
}
/// Returns the reader's byte counter.
///
/// The counter grows by the number of bytes consumed from the underlying
/// reader and is moved backwards on some error paths.
pub fn buffer_position(&self) -> usize {
self.buf_position
}
/// Reads character data up to (and consuming) the next `<`.
///
/// The bytes are appended to `buf` and returned as `Event::Text`. Returns
/// `Event::Eof` when the input is exhausted. With `trim_text` enabled the
/// text is stripped of surrounding whitespace, and whitespace-only text is
/// skipped by moving straight on to the next event.
fn read_until_open<'a, 'b>(&'a mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
    self.tag_state = TagState::Opened;
    let text_start = buf.len();

    let consumed = read_until(&mut self.reader, b'<', buf)?;
    if consumed == 0 {
        return Ok(Event::Eof);
    }
    self.buf_position += consumed;

    if !self.trim_text {
        return Ok(Event::Text(BytesText::borrowed(&buf[text_start..])));
    }

    // First significant byte of the text that was just appended.
    let first = match buf[text_start..].iter().position(|&b| !is_whitespace(b)) {
        Some(offset) => text_start + offset,
        // Nothing but whitespace: report whatever comes next instead.
        None => return self.read_event(buf),
    };
    // One past the last significant byte; `first` guarantees that one exists.
    let last = buf.iter()
        .rposition(|&b| !is_whitespace(b))
        .map(|p| p + 1)
        .unwrap_or_else(|| buf.len());
    Ok(Event::Text(BytesText::borrowed(&buf[first..last])))
}
/// Reads the markup that follows an already consumed `<`, up to `>`.
///
/// The first byte is peeked to select the parser: `/` is an end tag, `!` a
/// comment/CDATA/DOCTYPE, `?` a declaration or processing instruction, and
/// anything else a start tag (scanned with quote awareness). Returns
/// `Event::Eof` when the input is exhausted.
fn read_until_close<'a, 'b>(&'a mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
    self.tag_state = TagState::Closed;
    let buf_start = buf.len();

    // Peek at the first byte without consuming it, retrying interrupted reads.
    let first_byte;
    loop {
        match self.reader.fill_buf() {
            Ok(available) => {
                if available.is_empty() {
                    return Ok(Event::Eof);
                }
                first_byte = available[0];
                break;
            }
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e.into()),
        }
    }

    let is_special = first_byte == b'/' || first_byte == b'!' || first_byte == b'?';
    if !is_special {
        // Start tag: a `>` inside a quoted attribute value does not end it.
        let consumed = read_elem_until(&mut self.reader, b'>', buf)?;
        if consumed == 0 {
            return Ok(Event::Eof);
        }
        self.buf_position += consumed;
        return self.read_start(&buf[buf_start..]);
    }

    let consumed = read_until(&mut self.reader, b'>', buf)?;
    if consumed == 0 {
        return Ok(Event::Eof);
    }
    self.buf_position += consumed;
    match first_byte {
        b'/' => self.read_end(&buf[buf_start..]),
        b'!' => self.read_bang(buf_start, buf),
        b'?' => self.read_question_mark(&buf[buf_start..]),
        _ => {
            unreachable!("We checked that `start` must be one of [/!?], was {:?} instead.",
                         first_byte)
        }
    }
}
fn read_end<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result<Event<'b>> {
let len = buf.len();
if self.check_end_names {
match self.opened_starts.pop() {
Some(start) => {
if buf[1..] != self.opened_buffer[start..] {
self.buf_position -= len;
bail!(ErrorKind::EndEventMismatch(from_utf8(&self.opened_buffer[start..])
.unwrap_or("")
.to_owned(),
from_utf8(&buf[1..])
.unwrap_or("")
.to_owned()));
}
self.opened_buffer.truncate(start);
}
None => {
self.buf_position -= len;
bail!(ErrorKind::EndEventMismatch("".to_owned(),
from_utf8(&buf[1..])
.unwrap_or("")
.to_owned()));
}
}
}
Ok(Event::End(BytesEnd::borrowed(&buf[1..])))
}
/// Parses markup that starts with `<!`: a comment, a CDATA section or a
/// DOCTYPE declaration.
///
/// `buf[buf_start..]` holds what the caller read between `<` and the first
/// `>`. Because comments, CDATA sections and DOCTYPE subsets may themselves
/// contain `>`, this keeps reading until the real terminator is found.
///
/// All length checks and position rewinds are relative to `buf_start`, so
/// the function behaves the same whether or not the caller cleared `buf`
/// between events.
///
/// # Errors
///
/// * an unexpected-EOF error when the input ends inside the construct;
/// * "Unexpected token '--'" when `check_comments` is on and a comment body
///   contains `--`;
/// * "Only Comment, CDATA and DOCTYPE nodes can start with a '!'" otherwise.
fn read_bang<'a, 'b>(&'a mut self,
                     buf_start: usize,
                     buf: &'b mut Vec<u8>)
                     -> Result<Event<'b>> {
    // Length of the current element only; bytes of earlier events that are
    // still in `buf` must not count.
    let elem_len = buf.len() - buf_start;
    if elem_len >= 3 && &buf[buf_start + 1..buf_start + 3] == b"--" {
        // `!--` and the closing `--` may not overlap, hence at least 5 bytes.
        while buf.len() - buf_start < 5 || !buf.ends_with(b"--") {
            let read_so_far = buf.len() - buf_start;
            // The `>` that stopped the previous read is part of the comment.
            buf.push(b'>');
            match read_until(&mut self.reader, b'>', buf) {
                Ok(0) => {
                    self.buf_position -= read_so_far;
                    bail!(io_eof("Comment"));
                }
                Ok(n) => self.buf_position += n,
                Err(e) => return Err(e.into()),
            }
        }
        let len = buf.len();
        if self.check_comments {
            // Includes the first `-` of the terminator so `--->` is caught.
            let mut offset = len - buf_start - 3;
            for w in buf[buf_start + 3..len - 1].windows(2) {
                if &*w == b"--" {
                    self.buf_position -= offset;
                    bail!("Unexpected token '--'");
                }
                offset -= 1;
            }
        }
        Ok(Event::Comment(BytesText::borrowed(&buf[buf_start + 3..len - 2])))
    } else if elem_len >= 8 {
        match &buf[buf_start + 1..buf_start + 8] {
            b"[CDATA[" => {
                // `![CDATA[` plus `]]` is 10 bytes at the very least.
                while buf.len() - buf_start < 10 || !buf.ends_with(b"]]") {
                    let read_so_far = buf.len() - buf_start;
                    buf.push(b'>');
                    match read_until(&mut self.reader, b'>', buf) {
                        Ok(0) => {
                            self.buf_position -= read_so_far;
                            bail!(io_eof("CData"));
                        }
                        Ok(n) => self.buf_position += n,
                        Err(e) => return Err(e.into()),
                    }
                }
                let len = buf.len();
                Ok(Event::CData(BytesText::borrowed(&buf[buf_start + 8..len - 2])))
            }
            b"DOCTYPE" => {
                // Each `<` seen inside the DOCTYPE needs its own `>`.
                let mut count = buf.iter().skip(buf_start).filter(|&&b| b == b'<').count();
                while count > 0 {
                    buf.push(b'>');
                    match read_until(&mut self.reader, b'>', buf) {
                        Ok(0) => {
                            self.buf_position -= buf.len() - buf_start;
                            bail!(io_eof("DOCTYPE"));
                        }
                        Ok(n) => {
                            self.buf_position += n;
                            let start = buf.len() - n;
                            count += buf.iter().skip(start).filter(|&&b| b == b'<').count();
                            count -= 1;
                        }
                        Err(e) => return Err(e.into()),
                    }
                }
                let len = buf.len();
                Ok(Event::DocType(BytesText::borrowed(&buf[buf_start + 8..len])))
            }
            _ => {
                // Rewind exactly like the short-markup branch below does.
                self.buf_position -= buf.len() - buf_start;
                bail!("Only Comment, CDATA and DOCTYPE nodes can start with a '!'");
            }
        }
    } else {
        self.buf_position -= buf.len() - buf_start;
        bail!("Only Comment, CDATA and DOCTYPE nodes can start with a '!'");
    }
}
/// Parses markup that starts with `<?`.
///
/// `buf` is the content between `<` and `>`. It must end with `?`, otherwise
/// the position counter is moved back by the content length and an
/// unexpected-EOF error is returned. Content that begins with `xml` followed
/// by whitespace yields `Event::Decl` (and updates the reader's encoding when
/// the declaration reports one); anything else yields `Event::PI`.
fn read_question_mark<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result<Event<'b>> {
    let len = buf.len();
    if len <= 2 || buf[len - 1] != b'?' {
        self.buf_position -= len;
        bail!(io_eof("XmlDecl"));
    }

    // Content without the leading and trailing `?`.
    let body = &buf[1..len - 1];
    let is_declaration = len > 5 && buf[1..].starts_with(b"xml") && is_whitespace(buf[4]);
    if !is_declaration {
        return Ok(Event::PI(BytesText::borrowed(body)));
    }

    let event = BytesDecl::from_start(BytesStart::borrowed(body, 3));
    if let Some(enc) = event.encoder() {
        self.encoding = enc;
    }
    Ok(Event::Decl(event))
}
/// Emits the synthesized `End` event that follows an expanded empty element.
///
/// Pops the name recorded by `read_start` off the open-element stack and
/// returns it as an owned `End` event.
///
/// # Panics
///
/// Panics if no element name was recorded.
#[inline]
fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
    self.tag_state = TagState::Closed;
    let name_start = self.opened_starts.pop().unwrap();
    let name = self.opened_buffer.split_off(name_start);
    Ok(Event::End(BytesEnd::owned(name)))
}
/// Builds the event for an opening tag.
///
/// `buf` is the content between `<` and `>`. The element name ends at the
/// first whitespace byte. A trailing `/` marks a self-closing tag, reported
/// as `Empty`, or as `Start` (with the name recorded for the synthesized
/// `End`) when `expand_empty_elements` is on. A regular start tag has its
/// name recorded only when `check_end_names` is on.
fn read_start<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result<Event<'b>> {
    let len = buf.len();
    let name_end = buf.iter().position(|&b| is_whitespace(b)).unwrap_or(len);

    if buf.last() != Some(&b'/') {
        if self.check_end_names {
            self.opened_starts.push(self.opened_buffer.len());
            self.opened_buffer.extend_from_slice(&buf[..name_end]);
        }
        return Ok(Event::Start(BytesStart::borrowed(buf, name_end)));
    }

    // Self-closing: drop the final `/`, which also bounds the name when the
    // tag has no attributes.
    let content = &buf[..len - 1];
    let name_len = ::std::cmp::min(name_end, len - 1);
    if self.expand_empty_elements {
        self.tag_state = TagState::Empty;
        self.opened_starts.push(self.opened_buffer.len());
        self.opened_buffer.extend_from_slice(&buf[..name_len]);
        Ok(Event::Start(BytesStart::borrowed(content, name_len)))
    } else {
        Ok(Event::Empty(BytesStart::borrowed(content, name_len)))
    }
}
/// Reads the next event, appending its bytes to `buf`.
///
/// The parsing routine is chosen from the current tag state. After the first
/// error the reader is finished: every later call returns `Event::Eof`.
pub fn read_event<'a, 'b>(&'a mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
    if self.exit {
        return Ok(Event::Eof);
    }
    let outcome = match self.tag_state {
        TagState::Closed => self.read_until_open(buf),
        TagState::Opened => self.read_until_close(buf),
        TagState::Empty => self.close_expanded_empty(),
    };
    // Remember the failure so that parsing does not resume after an error.
    outcome.map_err(|err| {
        self.exit = true;
        err
    })
}
/// Splits `qname` into its namespace value and local name.
///
/// Delegates to the reader's namespace index, looking declarations up in
/// `namespace_buffer` (the buffer passed to `read_namespaced_event`).
/// Returns the namespace value, if any, and the remaining name.
#[inline]
pub fn resolve_namespace<'a, 'b, 'c>(&'a self,
qname: &'b [u8],
namespace_buffer: &'c [u8])
-> (Option<&'c [u8]>, &'b [u8]) {
self.ns_buffer.resolve_namespace(qname, namespace_buffer)
}
/// Reads the next event together with the namespace value of its element.
///
/// Namespace declarations are stored in `namespace_buffer`, which the caller
/// must pass unchanged to every call. For `Start`, `Empty` and `End` events
/// the first tuple member is the namespace value bound to the element name,
/// if any; for every other event it is `None`.
pub fn read_namespaced_event<'a, 'b, 'c>(&'a mut self,
                                         buf: &'b mut Vec<u8>,
                                         namespace_buffer: &'c mut Vec<u8>)
                                         -> Result<(Option<&'c [u8]>, Event<'b>)> {
    // Drop the declarations of the element closed by the previous call.
    self.ns_buffer.pop_empty_namespaces(namespace_buffer);

    let event = self.read_event(buf)?;
    match event {
        Event::Start(e) => {
            self.ns_buffer.push_new_namespaces(&e, namespace_buffer);
            let ns = self.ns_buffer.find_namespace_value(e.name(), &**namespace_buffer);
            Ok((ns, Event::Start(e)))
        }
        Event::Empty(e) => {
            self.ns_buffer.push_new_namespaces(&e, namespace_buffer);
            // Opens and closes at once: its declarations go away next call.
            self.ns_buffer.pending_pop = true;
            let ns = self.ns_buffer.find_namespace_value(e.name(), &**namespace_buffer);
            Ok((ns, Event::Empty(e)))
        }
        Event::End(e) => {
            // Popped lazily so the lookup below still sees the declarations.
            self.ns_buffer.pending_pop = true;
            let ns = self.ns_buffer.find_namespace_value(e.name(), &**namespace_buffer);
            Ok((ns, Event::End(e)))
        }
        other => Ok((None, other)),
    }
}
/// Decodes `bytes` with the reader's current encoding.
///
/// Only the decoded text is returned; the other values reported by the
/// encoder are discarded.
#[inline]
pub fn decode<'b, 'c>(&'b self, bytes: &'c [u8]) -> Cow<'c, str> {
    let (text, _, _) = self.encoding.decode(bytes);
    text
}
/// Skips events until the `End` event that closes the element named `end`.
///
/// Nested elements with the same name are tracked, so the matching end tag
/// is the one at the current depth. `buf` is cleared after every skipped
/// event.
///
/// # Errors
///
/// Returns any parse error, or an unexpected-EOF error when the input ends
/// before the matching end tag.
pub fn read_to_end<K: AsRef<[u8]>>(&mut self, end: K, buf: &mut Vec<u8>) -> Result<()> {
    let target = end.as_ref();
    // Number of same-named elements opened since the call started.
    let mut nested = 0;
    loop {
        match self.read_event(buf)? {
            Event::Start(ref e) if e.name() == target => nested += 1,
            Event::End(ref e) if e.name() == target => {
                if nested == 0 {
                    return Ok(());
                }
                nested -= 1;
            }
            Event::Eof => {
                let msg = format!("Expecting {:?} end", from_utf8(target));
                return Err(io_eof(&msg).into());
            }
            _ => {}
        }
        buf.clear();
    }
}
/// Reads the text content of the current element and skips to its end tag.
///
/// The next event must be `Text`, which is unescaped and decoded, or the end
/// tag named `end`, in which case an empty string is returned. After a
/// `Text` event the reader is advanced past the matching end tag.
///
/// # Errors
///
/// Returns parse errors, an unexpected-EOF error at end of input, and
/// "Cannot read text, expecting Event::Text" for any other event.
pub fn read_text<K: AsRef<[u8]>>(&mut self, end: K, buf: &mut Vec<u8>) -> Result<String> {
    let text = match self.read_event(buf)? {
        Event::Text(content) => content.unescape_and_decode(self),
        Event::End(ref e) if e.name() == end.as_ref() => return Ok(String::new()),
        Event::Eof => return Err(io_eof("Text").into()),
        _ => return Err("Cannot read text, expecting Event::Text".into()),
    };
    // Skip to the end tag first; a failure there takes precedence.
    self.read_to_end(end, buf)?;
    text
}
}
impl Reader<BufReader<File>> {
    /// Opens the file at `path` and wraps it in a buffered `Reader`.
    ///
    /// # Errors
    ///
    /// Returns the error reported while opening the file.
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Reader<BufReader<File>>> {
        let file = File::open(path)?;
        Ok(Reader::from_reader(BufReader::new(file)))
    }
}
impl<'a> Reader<&'a [u8]> {
    /// Creates a `Reader` that parses the bytes of the string `s`.
    pub fn from_str(s: &'a str) -> Reader<&'a [u8]> {
        let bytes: &'a [u8] = s.as_bytes();
        Reader::from_reader(bytes)
    }
}
/// Reads bytes from `r` into `buf` until `byte` or end of input is reached.
///
/// The delimiter is consumed but not stored in `buf`. Interrupted reads are
/// retried. Returns the number of bytes consumed from `r`, delimiter
/// included; `0` means the input was already exhausted.
#[inline]
fn read_until<R: BufRead>(r: &mut R, byte: u8, buf: &mut Vec<u8>) -> Result<usize> {
    let mut total = 0;
    loop {
        let (consumed, found) = {
            let available = match r.fill_buf() {
                Ok(chunk) if chunk.is_empty() => return Ok(total),
                Ok(chunk) => chunk,
                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                Err(e) => bail!(e),
            };
            match available.iter().position(|&b| b == byte) {
                Some(idx) => {
                    buf.extend_from_slice(&available[..idx]);
                    // Consume the delimiter too.
                    (idx + 1, true)
                }
                None => {
                    buf.extend_from_slice(available);
                    (available.len(), false)
                }
            }
        };
        r.consume(consumed);
        total += consumed;
        if found {
            return Ok(total);
        }
    }
}
/// Like `read_until`, but ignores `end_byte` inside quoted sections.
///
/// A `'` or `"` opens a quoted section that lasts until the same quote
/// character; `end_byte` only terminates the read outside such a section.
/// The terminator is consumed but not stored in `buf`. Returns the number of
/// bytes consumed from `r`.
#[inline]
fn read_elem_until<R: BufRead>(r: &mut R, end_byte: u8, buf: &mut Vec<u8>) -> Result<usize> {
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum ElemReadState {
        /// Outside any quoted section.
        Elem,
        /// Inside a section opened by `'`.
        SingleQ,
        /// Inside a section opened by `"`.
        DoubleQ,
    }

    // The quoting state survives across refills of the reader's buffer.
    let mut state = ElemReadState::Elem;
    let mut total = 0;
    loop {
        let (consumed, found) = {
            let available = match r.fill_buf() {
                Ok(chunk) if chunk.is_empty() => return Ok(total),
                Ok(chunk) => chunk,
                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                Err(e) => bail!(e),
            };

            let mut terminator = None;
            for (i, &b) in available.iter().enumerate() {
                state = match state {
                    ElemReadState::Elem => {
                        if b == end_byte {
                            terminator = Some(i);
                            break;
                        }
                        match b {
                            b'\'' => ElemReadState::SingleQ,
                            b'"' => ElemReadState::DoubleQ,
                            _ => ElemReadState::Elem,
                        }
                    }
                    ElemReadState::SingleQ => {
                        if b == b'\'' {
                            ElemReadState::Elem
                        } else {
                            ElemReadState::SingleQ
                        }
                    }
                    ElemReadState::DoubleQ => {
                        if b == b'"' {
                            ElemReadState::Elem
                        } else {
                            ElemReadState::DoubleQ
                        }
                    }
                };
            }

            match terminator {
                Some(i) => {
                    buf.extend_from_slice(&available[..i]);
                    (i + 1, true)
                }
                None => {
                    buf.extend_from_slice(available);
                    (available.len(), false)
                }
            }
        };
        r.consume(consumed);
        total += consumed;
        if found {
            return Ok(total);
        }
    }
}
/// Returns `true` when `b` is a space, carriage return, line feed or tab.
#[inline]
fn is_whitespace(b: u8) -> bool {
    b == b' ' || b == b'\r' || b == b'\n' || b == b'\t'
}
/// One namespace declaration stored in a shared byte buffer.
///
/// The prefix bytes are stored first, immediately followed by the value
/// bytes.
#[derive(Debug)]
struct Namespace {
    /// Offset of the prefix in the buffer.
    start: usize,
    /// Length of the prefix; `0` for a declaration without prefix.
    prefix_len: usize,
    /// Length of the namespace value; `0` means no value.
    value_len: usize,
    /// Element nesting level at which the declaration was made.
    level: i32,
}

impl Namespace {
    /// Returns the prefix bytes of this declaration within `ns_buffer`.
    #[inline]
    fn prefix<'a, 'b>(&'a self, ns_buffer: &'b [u8]) -> &'b [u8] {
        let prefix_end = self.start + self.prefix_len;
        &ns_buffer[self.start..prefix_end]
    }

    /// Returns the value bytes within `ns_buffer`, or `None` when the
    /// declaration has an empty value.
    #[inline]
    fn opt_value<'a, 'b>(&'a self, ns_buffer: &'b [u8]) -> Option<&'b [u8]> {
        if self.value_len == 0 {
            return None;
        }
        let value_start = self.start + self.prefix_len;
        Some(&ns_buffer[value_start..value_start + self.value_len])
    }
}
/// Index of the namespace declarations held in a caller-owned byte buffer.
///
/// The bytes live in the buffer passed to each method; this struct only
/// records where each declaration is and at which nesting level it was made.
#[derive(Debug, Default)]
struct NamespaceBufferIndex {
/// Declarations in the order they were pushed.
slices: Vec<Namespace>,
/// Incremented by `push_new_namespaces`, decremented by
/// `pop_empty_namespaces`.
nesting_level: i32,
/// When true, the next `pop_empty_namespaces` call leaves one nesting level
/// and discards the declarations made below the new level.
pending_pop: bool,
}
impl NamespaceBufferIndex {
/// Looks up the namespace value that applies to `element_name`.
///
/// A name containing `:` is matched on the part before the first colon; any
/// other name is matched against the most recent declaration without a
/// prefix. The most recently pushed matching declaration wins. Returns
/// `None` when nothing matches or the match has an empty value.
#[inline]
fn find_namespace_value<'a, 'b, 'c>(&'a self,
                                    element_name: &'b [u8],
                                    buffer: &'c [u8])
                                    -> Option<&'c [u8]> {
    let prefix = element_name
        .iter()
        .position(|&b| b == b':')
        .map(|colon| &element_name[..colon]);

    // Newest declarations first, so inner scopes shadow outer ones.
    let mut newest_first = self.slices.iter().rev();
    let declaration = match prefix {
        Some(prefix) => newest_first.find(|ns| ns.prefix(buffer) == prefix),
        None => newest_first.find(|ns| ns.prefix_len == 0),
    };
    declaration.and_then(|ns| ns.opt_value(buffer))
}
/// Leaves one nesting level if a pop is pending.
///
/// Does nothing unless `pending_pop` is set. Otherwise clears the flag,
/// decrements the nesting level and drops every declaration made at a deeper
/// level, truncating `buffer` accordingly.
fn pop_empty_namespaces(&mut self, buffer: &mut Vec<u8>) {
    if !self.pending_pop {
        return;
    }
    self.pending_pop = false;
    self.nesting_level -= 1;

    let level = self.nesting_level;
    let last_kept = self.slices.iter().rposition(|ns| ns.level <= level);
    match last_kept {
        Some(pos) => {
            let keep = pos + 1;
            // Only something to drop if declarations follow the kept ones.
            if keep < self.slices.len() {
                let cut = self.slices[keep].start;
                buffer.truncate(cut);
                self.slices.truncate(keep);
            }
        }
        None => {
            // No declaration survives at this level.
            buffer.clear();
            self.slices.clear();
        }
    }
}
/// Enters a new nesting level and records the namespaces declared on `e`.
///
/// `xmlns="…"` is stored with an empty prefix and `xmlns:p="…"` with prefix
/// `p`; prefix and value bytes are appended to `buffer`. Scanning stops at
/// the first attribute that fails to parse, or at an attribute whose name
/// starts with `xmlns` followed by anything other than `:`.
fn push_new_namespaces(&mut self, e: &BytesStart, buffer: &mut Vec<u8>) {
    self.nesting_level += 1;
    let level = self.nesting_level;

    for attr in e.attributes().with_checks(false) {
        let (k, v) = match attr {
            Ok(Attribute { key, value }) => (key, value),
            Err(_) => break,
        };
        if !k.starts_with(b"xmlns") {
            continue;
        }
        let prefix: &[u8] = match k.get(5) {
            // Plain `xmlns`: declaration without prefix.
            None => &b""[..],
            // `xmlns:prefix`.
            Some(&b':') => &k[6..],
            Some(_) => break,
        };

        let start = buffer.len();
        buffer.extend_from_slice(prefix);
        buffer.extend_from_slice(v);
        self.slices.push(Namespace {
            start: start,
            prefix_len: prefix.len(),
            value_len: v.len(),
            level: level,
        });
    }
}
/// Splits a qualified name into its namespace value and local name.
///
/// When `qname` has a prefix that matches a recorded declaration, returns
/// that declaration's value (if any) and the part after the first `:`. When
/// there is no `:` or the prefix is unknown, returns `(None, qname)`.
#[inline]
fn resolve_namespace<'a, 'b, 'c>(&'a self,
                                 qname: &'b [u8],
                                 buffer: &'c [u8])
                                 -> (Option<&'c [u8]>, &'b [u8]) {
    let colon = match qname.iter().position(|&b| b == b':') {
        Some(idx) => idx,
        None => return (None, qname),
    };
    let prefix = &qname[..colon];
    let local_name = &qname[colon + 1..];

    // Newest declarations first, so inner scopes shadow outer ones.
    match self.slices.iter().rev().find(|ns| ns.prefix(buffer) == prefix) {
        Some(ns) => (ns.opt_value(buffer), local_name),
        None => (None, qname),
    }
}
}
/// Builds an `UnexpectedEof` I/O error carrying `msg` as its message.
fn io_eof(msg: &str) -> ::std::io::Error {
    let kind = ::std::io::ErrorKind::UnexpectedEof;
    ::std::io::Error::new(kind, msg.to_owned())
}