[go: up one dir, main page]

xml/
reader.rs

//! Contains a high-level interface for a pull-based XML parser.
//!
//! The most important type in this module is `EventReader`, which provides an iterator
//! view over the events in an XML document.
5
6use std::io::Read;
7use std::iter::FusedIterator;
8use std::result;
9
10use crate::common::{Position, TextPosition};
11
12pub use self::config::ParserConfig;
13pub use self::error::{Error, ErrorKind};
14pub use events::{XmlEvent, DoctypeRef};
15
// Backwards compatibility: keeps the old `ParserConfig2` name usable as a
// deprecated alias of `ParserConfig`.
#[doc(hidden)]
#[deprecated(note = "Merged into ParserConfig")]
pub type ParserConfig2 = ParserConfig;
20
21use self::parser::PullParser;
22
23mod config;
24mod error;
25mod events;
26mod indexset;
27mod lexer;
28mod parser;
29
/// A result type yielded by `EventReader`.
///
/// The error type defaults to this module's [`Error`].
pub type Result<T, E = Error> = result::Result<T, E>;
32
33/// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing.
34///
35/// The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow.
36pub struct EventReader<R: Read> {
37    source: R,
38    parser: PullParser,
39}
40
41impl<R: Read> EventReader<R> {
42    /// Creates a new reader, consuming the given stream. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow.
43    #[inline]
44    pub fn new(source: R) -> Self {
45        Self::new_with_config(source, ParserConfig::new())
46    }
47
48    /// Creates a new reader with the provded configuration, consuming the given stream. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow.
49    #[inline]
50    pub fn new_with_config(source: R, config: impl Into<ParserConfig>) -> Self {
51        Self {
52            source,
53            parser: PullParser::new(config),
54        }
55    }
56
57    /// Pulls and returns next XML event from the stream.
58    ///
59    /// If this returns [Err] or [`XmlEvent::EndDocument`] then further calls to
60    /// this method will return this event again.
61    #[inline]
62    #[allow(clippy::should_implement_trait)]
63    pub fn next(&mut self) -> Result<XmlEvent> {
64        self.parser.next(&mut self.source)
65    }
66
67    /// Skips all XML events until the next end tag at the current level.
68    ///
69    /// Convenience function that is useful for the case where you have
70    /// encountered a start tag that is of no interest and want to
71    /// skip the entire XML subtree until the corresponding end tag.
72    #[inline]
73    pub fn skip(&mut self) -> Result<()> {
74        let mut depth = 1;
75
76        while depth > 0 {
77            match self.next()? {
78                XmlEvent::StartElement { .. } => depth += 1,
79                XmlEvent::EndElement { .. } => depth -= 1,
80                XmlEvent::EndDocument => return Err(Error {
81                    kind: ErrorKind::UnexpectedEof,
82                    pos: self.parser.position(),
83                }),
84                _ => {},
85            }
86        }
87
88        Ok(())
89    }
90
91    /// Access underlying reader
92    ///
93    /// Using it directly while the event reader is parsing is not recommended
94    pub fn source(&self) -> &R { &self.source }
95
96    /// Access underlying reader
97    ///
98    /// Using it directly while the event reader is parsing is not recommended
99    pub fn source_mut(&mut self) -> &mut R { &mut self.source }
100
101    /// Unwraps this `EventReader`, returning the underlying reader.
102    ///
103    /// Note that this operation is destructive; unwrapping the reader and wrapping it
104    /// again with `EventReader::new()` will create a fresh reader which will attempt
105    /// to parse an XML document from the beginning.
106    pub fn into_inner(self) -> R {
107        self.source
108    }
109
110    /// Returns the DOCTYPE of the document if it has already been seen
111    ///
112    /// Available only after the `Doctype` event
113    #[inline]
114    #[deprecated(note = "there is `XmlEvent::Doctype` now")]
115    #[allow(deprecated)]
116    pub fn doctype(&self) -> Option<&str> {
117        self.parser.doctype()
118    }
119
120    /// Returns PUBLIC/SYSTEM DOCTYPE IDs if it has already been seen
121    ///
122    /// Available only after the `Doctype` event
123    #[inline]
124    pub fn doctype_ids(&self) -> Option<DoctypeRef<'_>> {
125        self.parser.doctype_ids()
126    }
127
128    /// Add new entity definitions **before any XML elements have been parsed**.
129    ///
130    /// ## Errors
131    ///
132    /// It's valid to call this after DOCTYPE, but not later. It won't be possible to add entities to a document without either XML decl or DOCTYPE.
133    ///
134    /// It will fail if the document is declared as _standalone_.
135    #[inline]
136    pub fn add_entities<S: Into<String>, T: Into<String>>(&mut self, entities: impl IntoIterator<Item=(S, T)>) -> std::result::Result<(), crate::reader::error::ImmutableEntitiesError> {
137        self.parser.add_entities(entities)
138    }
139}
140
141impl<B: Read> Position for EventReader<B> {
142    /// Returns the position of the last event produced by the reader.
143    #[inline]
144    fn position(&self) -> TextPosition {
145        self.parser.position()
146    }
147}
148
149impl<R: Read> IntoIterator for EventReader<R> {
150    type IntoIter = Events<R>;
151    type Item = Result<XmlEvent>;
152
153    fn into_iter(self) -> Events<R> {
154        Events { reader: self, finished: false }
155    }
156}
157
158/// An iterator over XML events created from some type implementing `Read`.
159///
160/// When the next event is `xml::event::Error` or `xml::event::EndDocument`, then
161/// it will be returned by the iterator once, and then it will stop producing events.
162pub struct Events<R: Read> {
163    reader: EventReader<R>,
164    finished: bool,
165}
166
167impl<R: Read> Events<R> {
168    /// Unwraps the iterator, returning the internal `EventReader`.
169    #[inline]
170    pub fn into_inner(self) -> EventReader<R> {
171        self.reader
172    }
173
174    /// Access the underlying reader
175    ///
176    /// It's not recommended to use it while the events are still being parsed
177    pub fn source(&self) -> &R { &self.reader.source }
178
179    /// Access the underlying reader
180    ///
181    /// It's not recommended to use it while the events are still being parsed
182    pub fn source_mut(&mut self) -> &mut R { &mut self.reader.source }
183}
184
185impl<R: Read> std::ops::Deref for Events<R> {
186    type Target = EventReader<R>;
187
188    fn deref(&self) -> &Self::Target {
189        &self.reader
190    }
191}
192
193impl<R: Read> std::ops::DerefMut for Events<R> {
194    fn deref_mut(&mut self) -> &mut Self::Target {
195        &mut self.reader
196    }
197}
198
199impl<R: Read> FusedIterator for Events<R> {
200}
201
202impl<R: Read> Iterator for Events<R> {
203    type Item = Result<XmlEvent>;
204
205    #[inline]
206    fn next(&mut self) -> Option<Result<XmlEvent>> {
207        if self.finished && !self.reader.parser.is_ignoring_end_of_stream() {
208            None
209        } else {
210            let ev = self.reader.next();
211            if let Ok(XmlEvent::EndDocument) | Err(_) = ev {
212                self.finished = true;
213            }
214            Some(ev)
215        }
216    }
217}
218
219impl<'r> EventReader<&'r [u8]> {
220    /// A convenience method to create an `XmlReader` from a string slice.
221    #[inline]
222    #[must_use]
223    #[allow(clippy::should_implement_trait)]
224    pub fn from_str(source: &'r str) -> Self {
225        EventReader::new(source.as_bytes())
226    }
227}