#![feature(macro_rules)]
#![warn(missing_docs)]
use std::borrow::Cow;
use std::error::Error;
use std::result::Result;
use std::slice;
use std::str::CowString;
use std::str::{from_utf8, is_utf8, utf8_char_width, from_utf8_unchecked};
use std::vec::CowVec;
/// Mask selecting the low six bits of a byte, i.e. the payload carried by a
/// UTF-8 continuation byte. Its complement (`!CONT_MASK`) selects the two
/// tag bits at the top of the byte.
const CONT_MASK: u8 = 0b0011_1111u8;
/// Value of the two tag bits (`10xxxxxx`) that mark a UTF-8 continuation
/// byte; a byte `b` is a continuation byte when `b & !CONT_MASK == TAG_CONT_U8`.
const TAG_CONT_U8: u8 = 0b1000_0000u8;
/// Error returned by `from_cesu8` when the input bytes cannot be decoded
/// into UTF-8. It carries no further information about where or why the
/// decoding failed.
#[deriving(Copy, Show)]
pub struct Cesu8DecodingError;
impl Error for Cesu8DecodingError {
fn description(&self) -> &str { "decoding error" }
fn detail(&self) -> Option<String> {
Some("could not convert CESU-8 data to UTF-8".to_string())
}
fn cause(&self) -> Option<&Error> { None }
}
/// Convert CESU-8 data to a Rust string, re-encoding only when necessary.
///
/// If `bytes` is already valid UTF-8 it is returned borrowed, without
/// copying. Otherwise the bytes are run through `decode_from_iter` and the
/// re-encoded text is returned as an owned string.
///
/// # Errors
///
/// Returns `Cesu8DecodingError` when `decode_from_iter` rejects the input.
///
/// # Panics
///
/// Panics if the decoder reports success but its output is not valid UTF-8.
pub fn from_cesu8(bytes: &[u8]) -> Result<CowString, Cesu8DecodingError> {
    // Fast path: the input is usable as-is.
    match from_utf8(bytes) {
        Some(text) => return Ok(Cow::Borrowed(text)),
        None => {}
    }

    // Slow path: re-encode into a fresh buffer.
    let mut utf8 = Vec::with_capacity(bytes.len());
    if !decode_from_iter(&mut utf8, &mut bytes.iter()) {
        return Err(Cesu8DecodingError);
    }

    // The decoder is expected to emit only valid UTF-8; verify that before
    // skipping the check in the unchecked conversion below.
    assert!(is_utf8(utf8.as_slice()));
    // SAFETY: the assertion above has just confirmed `utf8` is valid UTF-8.
    Ok(Cow::Owned(unsafe { String::from_utf8_unchecked(utf8) }))
}
#[test]
fn test_from_cesu8() {
    // 'M' (ASCII), U+65E5 as an ordinary three-byte sequence, then U+10401
    // written as two three-byte surrogates (0xD801, 0xDC01).
    let cesu8_bytes = &[0x4D, 0xE6, 0x97, 0xA5, 0xED, 0xA0, 0x81, 0xED, 0xB0, 0x81];
    let decoded = from_cesu8(cesu8_bytes).unwrap();
    assert_eq!(Cow::Borrowed("M日\u{10401}"), decoded);
}
fn decode_from_iter(decoded: &mut Vec<u8>, iter: &mut slice::Items<u8>) -> bool {
macro_rules! err {
() => { return false }
}
macro_rules! next {
() => {
match iter.next() {
Some(a) => *a,
None => err!()
}
}
}
macro_rules! next_cont {
() => {
{
let byte = next!();
if (byte) & !CONT_MASK == TAG_CONT_U8 { byte } else { err!() }
}
}
}
loop {
let first = match iter.next() {
Some(&b) => b,
None => return true
};
if first < 127 {
decoded.push(first);
} else {
let w = utf8_char_width(first);
let second = next_cont!();
match w {
2 => { decoded.push_all(&[first, second]); }
3 => {
let third = next_cont!();
match (first, second) {
(0xE0 , 0xA0 ... 0xBF) |
(0xE1 ... 0xEC, 0x80 ... 0xBF) |
(0xED , 0x80 ... 0x9F) |
(0xEE ... 0xEF, 0x80 ... 0xBF) => {
decoded.push_all(&[first, second, third])
}
(0xED , 0xA0 ... 0xAF) => {
if next!() != 0xED { err!() }
let fifth = next_cont!();
if fifth < 0xB0 || 0xBF < fifth { err!() }
let sixth = next_cont!();
let s = dec_surrogates(second, third, fifth, sixth);
decoded.push_all(&s);
}
_ => err!()
}
}
_ => err!()
}
}
}
}
/// Rebuild a UTF-16 surrogate code unit from the second and third bytes of
/// its three-byte encoding.
///
/// The fixed top bits `0xD000` are supplied here; `second` and `third` each
/// contribute their low six bits.
fn dec_surrogate(second: u8, third: u8) -> u32 {
    // The cast is parenthesized before shifting so that `<<` cannot be read
    // as the start of generic arguments on `u32`.
    let upper_bits = ((second & CONT_MASK) as u32) << 6;
    let lower_bits = (third & CONT_MASK) as u32;
    0xD000u32 | upper_bits | lower_bits
}
/// Combine the payload bytes of a CESU-8 surrogate pair into the four-byte
/// UTF-8 encoding of the code point the pair represents.
///
/// `second`/`third` come from the first three-byte sequence of the pair and
/// `fifth`/`sixth` from the second; the leading byte of each sequence is not
/// needed.
///
/// # Panics
///
/// Panics if the combined code point falls outside `0x10000 ... 0x10FFFF`.
fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8, ..4] {
    let high = dec_surrogate(second, third);
    let low = dec_surrogate(fifth, sixth);
    // Standard UTF-16 pair arithmetic: ten bits from each half, offset by 0x10000.
    let code_point = 0x10000 + (((high - 0xD800) << 10) | (low - 0xDC00));
    assert!(0x010000 <= code_point && code_point <= 0x10FFFF);

    // Split into 3 + 6 + 6 + 6 bits, most significant group first.
    let lead = 0b1111_0000u8 | ((code_point >> 18) & 0b111) as u8;
    let cont1 = TAG_CONT_U8 | ((code_point >> 12) & 0b11_1111) as u8;
    let cont2 = TAG_CONT_U8 | ((code_point >> 6) & 0b11_1111) as u8;
    let cont3 = TAG_CONT_U8 | (code_point & 0b11_1111) as u8;
    [lead, cont1, cont2, cont3]
}
/// Convert a Rust `&str` to CESU-8 bytes.
///
/// When `is_valid_cesu8(text)` holds, the string's own bytes are returned
/// borrowed. Otherwise a new buffer is built in which every four-byte UTF-8
/// sequence is replaced by its two UTF-16 surrogates, each written as a
/// three-byte sequence; all other bytes are copied through unchanged.
pub fn to_cesu8(text: &str) -> CowVec<u8> {
    if is_valid_cesu8(text) {
        Cow::Borrowed(text.as_bytes())
    } else {
        let bytes = text.as_bytes();
        // Reserve the input length plus a quarter as headroom for the
        // four-byte sequences that grow to six bytes. The parentheses are
        // required: `+` binds tighter than `>>`, so without them the
        // capacity would be half the input length.
        let mut encoded = Vec::with_capacity(bytes.len() + (bytes.len() >> 2));
        let mut i = 0;
        while i < bytes.len() {
            let b = bytes[i];
            if b < 128 {
                // ASCII: copy through.
                encoded.push(b);
                i += 1;
            } else {
                // `b` is the lead byte of a multi-byte character because `i`
                // only ever advances by whole characters.
                let w = utf8_char_width(b);
                assert!(w <= 4);
                assert!(i + w <= bytes.len());
                if w != 4 {
                    // Two- and three-byte sequences are copied unchanged.
                    encoded.push_all(bytes.slice(i, i+w));
                } else {
                    // SAFETY: `bytes` comes from a `&str` and `i..i+w` is
                    // taken to span exactly one character (lead byte plus
                    // the width reported for it), so the slice is valid UTF-8.
                    let s = unsafe { from_utf8_unchecked(bytes.slice(i, i+w)) };
                    // Emit each UTF-16 surrogate as its own three-byte sequence.
                    for u in s.utf16_units() {
                        encoded.push_all(&enc_surrogate(u))
                    }
                }
                i += w;
            }
        }
        Cow::Owned(encoded)
    }
}
/// Check whether a Rust string is already valid CESU-8 as it stands.
///
/// Returns `false` as soon as a non-continuation byte is found whose
/// `utf8_char_width` exceeds 3 (the lead byte of a four-byte sequence), and
/// `true` if no such byte exists.
pub fn is_valid_cesu8(text: &str) -> bool {
    for byte in text.bytes() {
        // Continuation bytes never start a character, so only the remaining
        // bytes are inspected for their sequence width.
        let is_continuation = (byte & !CONT_MASK) == TAG_CONT_U8;
        if !is_continuation && utf8_char_width(byte) > 3 {
            return false;
        }
    }
    true
}
/// Encode a single UTF-16 surrogate code unit as a three-byte sequence.
///
/// # Panics
///
/// Panics if `surrogate` is outside `0xD800 ... 0xDFFF`.
fn enc_surrogate(surrogate: u16) -> [u8, ..3] {
    assert!(0xD800 <= surrogate && surrogate <= 0xDFFF);
    // Split the 16 bits into 4 + 6 + 6, most significant group first.
    let lead = 0b11100000 | ((surrogate >> 12) & 0b1111) as u8;
    let middle = TAG_CONT_U8 | ((surrogate >> 6) & 0b11_1111) as u8;
    let last = TAG_CONT_U8 | (surrogate & 0b11_1111) as u8;
    [lead, middle, last]
}