extern crate encoding_rs;
use std::fmt;
use std::io::{self, Read};
use encoding_rs::{Decoder, Encoding, UTF_8};
use util::{BomPeeker, TinyTranscoder};
mod util;
/// Builder that configures and constructs a `DecodeReaderBytes`.
///
/// The configuration is only consulted when one of the `build*` methods is
/// called; the builder itself can be cloned and reused.
#[derive(Clone, Debug)]
pub struct DecodeReaderBytesBuilder {
/// Encoding explicitly requested by the caller, if any. At build time it is
/// turned into a decoder via `new_decoder_with_bom_removal`.
encoding: Option<&'static Encoding>,
/// When true and BOM sniffing identifies UTF-8, no decoder is installed, so
/// reads are forwarded to the wrapped reader instead of being transcoded.
utf8_passthru: bool,
/// When true, BOM detection still runs even though `encoding` is set, and an
/// encoding identified from the BOM replaces the explicit one.
bom_override: bool,
/// Selects `BomPeeker::without_bom` (true) or `BomPeeker::with_bom` (false)
/// as the wrapper around the underlying reader.
strip_bom: bool,
/// When false, the built reader is marked as "already detected", so BOM
/// detection never runs.
bom_sniffing: bool,
}
impl Default for DecodeReaderBytesBuilder {
fn default() -> DecodeReaderBytesBuilder {
DecodeReaderBytesBuilder::new()
}
}
impl DecodeReaderBytesBuilder {
/// Creates a builder with the default configuration.
///
/// By default no explicit encoding is set and BOM sniffing is the only
/// option that is switched on.
pub fn new() -> DecodeReaderBytesBuilder {
    DecodeReaderBytesBuilder {
        // The single option that defaults to "on".
        bom_sniffing: true,
        // Everything else starts out disabled / unset.
        encoding: None,
        utf8_passthru: false,
        bom_override: false,
        strip_bom: false,
    }
}
/// Builds a reader around `rdr` that owns a freshly allocated, zeroed
/// 8 KiB (`8 * (1 << 10)` bytes) internal buffer.
///
/// # Panics
///
/// Panics if `build_with_buffer` returns an error. It only does so for
/// buffers shorter than 4 bytes, which the buffer allocated here is not.
pub fn build<R: io::Read>(&self, rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
    let scratch = vec![0u8; 8 * (1 << 10)];
    let built = self.build_with_buffer(rdr, scratch);
    built.unwrap()
}
/// Builds a reader around `rdr` that uses the caller-supplied `buffer` as
/// its internal staging area for raw, not-yet-decoded bytes.
///
/// # Errors
///
/// Returns an `io::ErrorKind::Other` error when `buffer` exposes fewer than
/// 4 bytes.
pub fn build_with_buffer<R: io::Read, B: AsMut<[u8]>>(
    &self,
    rdr: R,
    mut buffer: B,
) -> io::Result<DecodeReaderBytes<R, B>> {
    let capacity = buffer.as_mut().len();
    if capacity < 4 {
        return Err(io::Error::new(
            io::ErrorKind::Other,
            format!(
                "DecodeReaderBytesBuilder: buffer of size {} is too small",
                capacity,
            ),
        ));
    }

    let decoder = self
        .encoding
        .map(|requested| requested.new_decoder_with_bom_removal());

    // Sniffing is only needed when it is enabled and either no explicit
    // encoding was given or the BOM is allowed to override that encoding.
    let needs_sniffing =
        self.bom_sniffing && (self.bom_override || decoder.is_none());

    let wrapped = match self.strip_bom {
        true => BomPeeker::without_bom(rdr),
        false => BomPeeker::with_bom(rdr),
    };

    Ok(DecodeReaderBytes {
        rdr: wrapped,
        decoder,
        tiny: TinyTranscoder::new(),
        utf8_passthru: self.utf8_passthru,
        buf: buffer,
        pos: 0,
        buflen: 0,
        has_detected: !needs_sniffing,
        exhausted: false,
    })
}
/// Sets (or, with `None`, clears) the explicit source encoding.
///
/// With an explicit encoding, BOM detection is skipped at build time unless
/// `bom_override` is enabled.
pub fn encoding(
    &mut self,
    encoding: Option<&'static Encoding>,
) -> &mut DecodeReaderBytesBuilder {
    self.encoding = encoding;
    self
}
/// Controls what happens when BOM sniffing identifies UTF-8: with `yes`
/// set, no decoder is installed and the bytes are not transcoded.
pub fn utf8_passthru(
    &mut self,
    yes: bool,
) -> &mut DecodeReaderBytesBuilder {
    self.utf8_passthru = yes;
    self
}
/// Chooses how the underlying reader is wrapped at build time: with `yes`
/// set, `BomPeeker::without_bom` is used instead of `BomPeeker::with_bom`.
pub fn strip_bom(
    &mut self,
    yes: bool,
) -> &mut DecodeReaderBytesBuilder {
    self.strip_bom = yes;
    self
}
/// When `yes` is set, BOM detection runs even if an explicit encoding was
/// configured, and an encoding identified from the BOM takes its place.
pub fn bom_override(
    &mut self,
    yes: bool,
) -> &mut DecodeReaderBytesBuilder {
    self.bom_override = yes;
    self
}
/// Enables or disables BOM detection altogether. When disabled, the built
/// reader never inspects the stream for a BOM to pick a decoder.
pub fn bom_sniffing(
    &mut self,
    yes: bool,
) -> &mut DecodeReaderBytesBuilder {
    self.bom_sniffing = yes;
    self
}
}
/// A reader that transcodes the bytes of an underlying reader to UTF-8.
///
/// `R` is the wrapped reader type and `B` is the type of the internal buffer
/// that stages raw bytes before they are decoded. While no decoder is active,
/// reads are forwarded to the wrapped reader without transcoding.
pub struct DecodeReaderBytes<R, B> {
/// The underlying reader, wrapped so its BOM can be peeked at.
rdr: BomPeeker<R>,
/// The active decoder; `None` means reads bypass transcoding.
decoder: Option<Decoder>,
/// Side transcoder used when the caller's buffer is shorter than 4 bytes;
/// its pending output is handed out at the start of each transcoding read.
tiny: TinyTranscoder,
/// When true, a sniffed UTF-8 BOM does not install a decoder.
utf8_passthru: bool,
/// Internal staging buffer for raw bytes read from `rdr`.
buf: B,
/// Index into `buf` of the first byte the decoder has not consumed yet.
pos: usize,
/// End of the valid region of `buf`; `buf[pos..buflen]` is pending input.
buflen: usize,
/// True once BOM detection has run (or was skipped at build time).
has_detected: bool,
/// Set when a refill of the empty staging buffer read zero bytes.
exhausted: bool,
}
impl<R: io::Read, B: AsMut<[u8]>> io::Read for DecodeReaderBytes<R, B> {
    /// Reads into `buf`, transcoding when a decoder is active and forwarding
    /// to the wrapped reader otherwise.
    ///
    /// BOM detection is attempted first; it is a no-op once it has already
    /// run or was disabled at build time.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.detect()?;
        if self.decoder.is_some() {
            return self.transcode(buf);
        }
        // No decoder: hand the bytes through untouched.
        self.rdr.read(buf)
    }
}
impl<R: io::Read> DecodeReaderBytes<R, Vec<u8>> {
    /// Wraps `rdr` using a default-configured `DecodeReaderBytesBuilder`
    /// and the builder's own heap-allocated internal buffer.
    pub fn new(rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
        let builder = DecodeReaderBytesBuilder::new();
        builder.build(rdr)
    }
}
impl<R: io::Read, B: AsMut<[u8]>> DecodeReaderBytes<R, B> {
/// Fills `buf` with UTF-8 produced by the active decoder and returns the
/// number of bytes written. `Ok(0)` is returned for an empty `buf` or once
/// the stream has been fully drained.
///
/// Must only be called while `self.decoder` is `Some` (the `read` entry
/// point guarantees this); otherwise the `unwrap` calls below panic.
///
/// # Errors
///
/// Propagates any I/O error raised while reading from the wrapped reader.
fn transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
    if buf.is_empty() {
        return Ok(0);
    }
    // Drain the side buffer BEFORE consulting `exhausted`. The final flush
    // at EOF in `tiny_transcode` can leave more bytes in `tiny` than a
    // small caller buffer can take in one call (e.g. a 3-byte U+FFFD read
    // one byte at a time). Checking `exhausted` first would silently drop
    // those trailing bytes.
    let nwrite = self.tiny.read(buf)?;
    if nwrite > 0 {
        // Satisfy the read with whatever was immediately available.
        return Ok(nwrite);
    }
    if self.exhausted {
        return Ok(0);
    }
    if self.pos >= self.buflen {
        self.fill()?;
    }
    // Caller buffers shorter than 4 bytes are served through the side
    // transcoder rather than by decoding into them directly.
    if buf.len() < 4 {
        return self.tiny_transcode(buf);
    }
    loop {
        let (_, nin, nout, _) =
            self.decoder.as_mut().unwrap().decode_to_utf8(
                &self.buf.as_mut()[self.pos..self.buflen],
                buf,
                false,
            );
        self.pos += nin;
        // At least one byte reached the caller: the read is satisfied.
        if nout > 0 {
            return Ok(nout);
        }
        // Nothing was produced, so try to pull in more input.
        self.fill()?;
        // An empty staging buffer after a refill means EOF: flush the
        // decoder by signalling that this is the last chunk.
        if self.buflen == 0 {
            let (_, _, nout, _) = self
                .decoder
                .as_mut()
                .unwrap()
                .decode_to_utf8(&[], buf, true);
            return Ok(nout);
        }
    }
}
/// Serves a read into a caller buffer shorter than 4 bytes by decoding into
/// the `tiny` side transcoder and copying out of it.
///
/// # Panics
///
/// Panics if `buf` is 4 bytes or longer, or if no decoder is installed.
fn tiny_transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
    assert!(buf.len() < 4, "have a small caller buffer");
    loop {
        let produced = {
            let decoder = self.decoder.as_mut().unwrap();
            let pending = &self.buf.as_mut()[self.pos..self.buflen];
            let (consumed, produced) =
                self.tiny.transcode(decoder, pending, false);
            self.pos += consumed;
            produced
        };
        if produced == 0 {
            // No output yet: pull in more input and try again, unless the
            // refill shows the stream has ended.
            self.fill()?;
            if self.buflen != 0 {
                continue;
            }
            // EOF: flush the decoder into the side transcoder.
            self.tiny.transcode(self.decoder.as_mut().unwrap(), &[], true);
        }
        return self.tiny.read(buf);
    }
}
/// Runs BOM detection at most once and installs a matching decoder.
///
/// The "already detected" flag is raised before the BOM is peeked, so a
/// failed peek is not retried on later calls. A UTF-8 BOM installs no
/// decoder when `utf8_passthru` is set.
///
/// # Errors
///
/// Propagates any I/O error raised while peeking at the BOM.
fn detect(&mut self) -> io::Result<()> {
    if !self.has_detected {
        self.has_detected = true;
        let bom = self.rdr.peek_bom()?;
        match bom.encoding() {
            Some(found) if !(found == UTF_8 && self.utf8_passthru) => {
                self.decoder = Some(found.new_decoder_with_bom_removal());
            }
            // Either no BOM was recognised or UTF-8 is passed through.
            _ => {}
        }
    }
    Ok(())
}
/// Refills the staging buffer from the wrapped reader.
///
/// Any unconsumed bytes in `buf[pos..buflen]` are first moved to the front
/// of the buffer; new bytes are then read in behind them. `exhausted` is
/// set when the buffer ends up empty afterwards.
///
/// # Panics
///
/// Panics if unconsumed bytes remain while the valid region already reaches
/// the end of the buffer.
///
/// # Errors
///
/// Propagates any I/O error from the wrapped reader. The buffer indices
/// have already been rebased to the front by then.
fn fill(&mut self) -> io::Result<()> {
    let (start, end) = (self.pos, self.buflen);
    let kept = if start < end {
        let storage = self.buf.as_mut();
        assert!(
            end < storage.len(),
            "internal buffer should never be exhausted"
        );
        // Slide the unread tail down to index 0 (ranges may overlap).
        storage.copy_within(start..end, 0);
        end - start
    } else {
        0
    };
    // Rebase the indices before reading so they stay consistent even if
    // the read below fails.
    self.buflen = kept;
    self.pos = 0;
    let nread = self.rdr.read(&mut self.buf.as_mut()[kept..])?;
    self.buflen += nread;
    if self.buflen == 0 {
        self.exhausted = true;
    }
    Ok(())
}
}
impl<R: fmt::Debug, B: fmt::Debug> fmt::Debug for DecodeReaderBytes<R, B> {
    /// Formats every field of the reader. The decoder is rendered as a
    /// string naming its encoding, and is emitted as the last field.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Rendered up front so the whole struct can be written as one chain.
        let decoder_desc = match self.decoder {
            Some(ref d) => {
                format!("Some(<Decoder for {}>)", d.encoding().name())
            }
            None => String::from("None"),
        };
        f.debug_struct("DecodeReaderBytes")
            .field("rdr", &self.rdr)
            .field("tiny", &self.tiny)
            .field("utf8_passthru", &self.utf8_passthru)
            .field("buf", &self.buf)
            .field("pos", &self.pos)
            .field("buflen", &self.buflen)
            .field("has_detected", &self.has_detected)
            .field("exhausted", &self.exhausted)
            .field("decoder", &decoder_desc)
            .finish()
    }
}
// Unit tests for `DecodeReaderBytes` and `DecodeReaderBytesBuilder`. Most
// fixtures are UTF-16 byte sequences prefixed with a BOM (`FF FE` or
// `FE FF`).
#[cfg(test)]
mod tests {
use std::io::Read;
use encoding_rs::{self, Encoding};
use super::{DecodeReaderBytes, DecodeReaderBytesBuilder};
// Drains `rdr` into a `String`, panicking on any read or UTF-8 error.
fn read_to_string<R: Read>(mut rdr: R) -> String {
let mut s = String::new();
rdr.read_to_string(&mut s).unwrap();
s
}
// Inputs that consist of nothing but a BOM.
#[test]
fn trans_utf16_bom() {
// A lone 2-byte `FF FE` BOM is expected back unchanged.
let srcbuf = vec![0xFF, 0xFE];
let mut dstbuf = vec![0; 8 * (1 << 10)];
let mut rdr = DecodeReaderBytes::new(&*srcbuf);
let n = rdr.read(&mut dstbuf).unwrap();
assert_eq!(&*srcbuf, &dstbuf[..n]);
// Same expectation for a lone `FE FF` BOM.
let srcbuf = vec![0xFE, 0xFF];
let mut rdr = DecodeReaderBytes::new(&*srcbuf);
let n = rdr.read(&mut dstbuf).unwrap();
assert_eq!(&*srcbuf, &dstbuf[..n]);
// A lone UTF-8 BOM yields no output with the default configuration...
let srcbuf = vec![0xEF, 0xBB, 0xBF];
let mut rdr = DecodeReaderBytes::new(&*srcbuf);
let n = rdr.read(&mut dstbuf).unwrap();
assert_eq!(n, 0);
// ...but is returned verbatim when UTF-8 passthrough is enabled.
let srcbuf = vec![0xEF, 0xBB, 0xBF];
let mut rdr = DecodeReaderBytesBuilder::new()
.utf8_passthru(true)
.build(&*srcbuf);
let n = rdr.read(&mut dstbuf).unwrap();
assert_eq!(&*srcbuf, &dstbuf[..n]);
}
// BOM followed by a single code unit for 'a', in both byte orders.
#[test]
fn trans_utf16_basic() {
let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
let mut rdr = DecodeReaderBytes::new(&*srcbuf);
assert_eq!("a", read_to_string(&mut rdr));
let srcbuf = vec![0xFE, 0xFF, 0x00, 0x61];
let mut rdr = DecodeReaderBytes::new(&*srcbuf);
assert_eq!("a", read_to_string(&mut rdr));
}
// Same inputs as above with `strip_bom(true)`; decoding must still work.
#[test]
fn trans_utf16_basic_without_bom() {
let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
let mut rdr =
DecodeReaderBytesBuilder::new().strip_bom(true).build(&*srcbuf);
assert_eq!("a", read_to_string(&mut rdr));
let srcbuf = vec![0xFE, 0xFF, 0x00, 0x61];
let mut rdr =
DecodeReaderBytesBuilder::new().strip_bom(true).build(&*srcbuf);
assert_eq!("a", read_to_string(&mut rdr));
}
// With `bom_override(true)`, the BOM in the input wins over the explicitly
// configured UTF-8 encoding, so the bytes decode to "a".
#[test]
fn trans_utf16_bom_override() {
let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
let mut rdr = DecodeReaderBytesBuilder::new()
.bom_override(true)
.encoding(Some(encoding_rs::UTF_8))
.build(&*srcbuf);
assert_eq!("a", read_to_string(&mut rdr));
}
// Reads through a 1-byte caller buffer: one output byte per call, then 0.
#[test]
fn trans_utf16_smallbuf() {
let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00];
let mut rdr = DecodeReaderBytes::new(&*srcbuf);
let mut tmp = [0u8; 1];
let nread = rdr.read(&mut tmp).unwrap();
assert_eq!(nread, 1);
assert_eq!(tmp, [b'a'; 1]);
let nread = rdr.read(&mut tmp).unwrap();
assert_eq!(nread, 1);
assert_eq!(tmp, [b'b'; 1]);
let nread = rdr.read(&mut tmp).unwrap();
assert_eq!(nread, 1);
assert_eq!(tmp, [b'c'; 1]);
let nread = rdr.read(&mut tmp).unwrap();
assert_eq!(nread, 0);
}
// A trailing lone byte (half a code unit) must surface as U+FFFD.
#[test]
fn trans_utf16_incomplete() {
let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x00];
let mut rdr = DecodeReaderBytes::new(&*srcbuf);
assert_eq!("a\u{FFFD}", read_to_string(&mut rdr));
}
// Smallest accepted internal buffer (4 bytes) with a normal caller buffer.
#[test]
fn trans_utf16_minimal_buffer_normal_caller_buffer() {
#[rustfmt::skip]
let srcbuf = vec![
0xFF, 0xFE,
0x61, 0x00,
0x62, 0x00,
0x63, 0x00,
0x64, 0x00,
0x65, 0x00,
0x66, 0x00,
0x67, 0x00,
0x68, 0x00,
];
let mut rdr = DecodeReaderBytesBuilder::new()
.build_with_buffer(&*srcbuf, vec![0; 4])
.unwrap();
let got = read_to_string(&mut rdr);
assert_eq!(got, "abcdefgh");
}
// Smallest accepted internal buffer combined with a 1-byte caller buffer.
#[test]
fn trans_utf16_minimal_buffers() {
let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00];
let mut rdr = DecodeReaderBytesBuilder::new()
.build_with_buffer(&*srcbuf, vec![0; 4])
.unwrap();
let mut tmp = [0u8; 1];
let nread = rdr.read(&mut tmp).unwrap();
assert_eq!(nread, 1);
assert_eq!(tmp, [b'a'; 1]);
let nread = rdr.read(&mut tmp).unwrap();
assert_eq!(nread, 1);
assert_eq!(tmp, [b'b'; 1]);
let nread = rdr.read(&mut tmp).unwrap();
assert_eq!(nread, 1);
assert_eq!(tmp, [b'c'; 1]);
let nread = rdr.read(&mut tmp).unwrap();
assert_eq!(nread, 0);
}
// Consumes the reader through the `Read::bytes` iterator.
#[test]
fn trans_utf16_byte_api() {
#[rustfmt::skip]
let srcbuf = vec![
0xFF, 0xFE,
0x61, 0x00,
0x62, 0x00,
0x63, 0x00,
0x64, 0x00,
0x65, 0x00,
0x66, 0x00,
0x67, 0x00,
0x68, 0x00,
];
let rdr = DecodeReaderBytes::new(&*srcbuf);
let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
assert_eq!(got, b"abcdefgh");
}
// Sniffing disabled and no explicit encoding: the input, BOM included,
// comes back byte-for-byte.
#[test]
fn trans_utf16_no_sniffing() {
#[rustfmt::skip]
let srcbuf = vec![
0xFF, 0xFE,
0x61, 0x00,
];
let rdr = DecodeReaderBytesBuilder::new()
.bom_sniffing(false)
.build(&*srcbuf);
let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
assert_eq!(got, srcbuf);
}
// Sniffing disabled plus `strip_bom(true)`: the BOM bytes are dropped and
// the remaining bytes are returned untranscoded.
#[test]
fn trans_utf16_no_sniffing_strip_bom() {
#[rustfmt::skip]
let srcbuf = vec![
0xFF, 0xFE,
0x61, 0x00,
];
let rdr = DecodeReaderBytesBuilder::new()
.bom_sniffing(false)
.strip_bom(true)
.build(&*srcbuf);
let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
assert_eq!(got, &[0x61, 0x00]);
}
// Sniffing disabled with an explicit UTF-16LE encoding: the input is
// decoded to "a" and the BOM does not appear in the output.
#[test]
fn trans_utf16_no_sniffing_encoding_override() {
#[rustfmt::skip]
let srcbuf = vec![
0xFF, 0xFE,
0x61, 0x00,
];
let rdr = DecodeReaderBytesBuilder::new()
.bom_sniffing(false)
.encoding(Some(encoding_rs::UTF_16LE))
.build(&*srcbuf);
let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
assert_eq!(got, b"a");
}
// As above, additionally with `strip_bom(true)`; the result is unchanged.
#[test]
fn trans_utf16_no_sniffing_encoding_override_strip_bom() {
#[rustfmt::skip]
let srcbuf = vec![
0xFF, 0xFE,
0x61, 0x00,
];
let rdr = DecodeReaderBytesBuilder::new()
.bom_sniffing(false)
.strip_bom(true)
.encoding(Some(encoding_rs::UTF_16LE))
.build(&*srcbuf);
let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
assert_eq!(got, b"a");
}
// Smallest accepted internal buffer consumed through `Read::bytes`.
#[test]
fn trans_utf16_minimal_buffer_byte_api() {
#[rustfmt::skip]
let srcbuf = vec![
0xFF, 0xFE,
0x61, 0x00,
0x62, 0x00,
0x63, 0x00,
0x64, 0x00,
0x65, 0x00,
0x66, 0x00,
0x67, 0x00,
0x68, 0x00,
];
let rdr = DecodeReaderBytesBuilder::new()
.build_with_buffer(&*srcbuf, vec![0; 4])
.unwrap();
let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
assert_eq!(got, b"abcdefgh");
}
// A 3-byte internal buffer is below the 4-byte minimum and is rejected.
#[test]
fn buffer_too_small() {
let res = DecodeReaderBytesBuilder::new()
.build_with_buffer(&[][..], vec![0; 3]);
assert!(res.is_err());
}
// Generates a test that looks up the encoding for label `$enc`, decodes
// `$srcbytes` with it and compares the result against `$dst`.
macro_rules! test_trans_simple {
($name:ident, $enc:expr, $srcbytes:expr, $dst:expr) => {
#[test]
fn $name() {
let srcbuf = &$srcbytes[..];
let enc = Encoding::for_label($enc.as_bytes());
let mut rdr = DecodeReaderBytesBuilder::new()
.encoding(enc)
.build(&*srcbuf);
assert_eq!($dst, read_to_string(&mut rdr));
}
};
}
// NOTE(review): "does not exist" is presumably an unknown label, so
// `for_label` would yield `None` and no explicit encoding is configured;
// the expected output equals the UTF-8 input bytes.
test_trans_simple!(trans_simple_auto, "does not exist", b"\xD0\x96", "Ж");
test_trans_simple!(trans_simple_utf8, "utf-8", b"\xD0\x96", "Ж");
test_trans_simple!(trans_simple_utf16le, "utf-16le", b"\x16\x04", "Ж");
test_trans_simple!(trans_simple_utf16be, "utf-16be", b"\x04\x16", "Ж");
test_trans_simple!(trans_simple_chinese, "chinese", b"\xA7\xA8", "Ж");
test_trans_simple!(trans_simple_korean, "korean", b"\xAC\xA8", "Ж");
test_trans_simple!(
trans_simple_big5_hkscs,
"big5-hkscs",
b"\xC7\xFA",
"Ж"
);
test_trans_simple!(trans_simple_gbk, "gbk", b"\xA7\xA8", "Ж");
test_trans_simple!(trans_simple_sjis, "sjis", b"\x84\x47", "Ж");
test_trans_simple!(trans_simple_eucjp, "euc-jp", b"\xA7\xA8", "Ж");
test_trans_simple!(trans_simple_latin1, "latin1", b"\xA9", "©");
}