use ascii::validate_ascii;
/// Error value returned by `run_utf8_validation` when its input is not
/// well-formed UTF-8.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Utf8Error {
    /// Byte offset into the validated input; every byte before this offset
    /// belongs to well-formed UTF-8.
    valid_up_to: usize,
}

impl Utf8Error {
    /// Returns the byte offset recorded when validation failed.
    ///
    /// All input bytes that precede the returned offset are valid UTF-8.
    pub fn valid_up_to(&self) -> usize {
        let Utf8Error { valid_up_to } = *self;
        valid_up_to
    }
}
/// Validates that `v` is well-formed UTF-8.
///
/// Runs of ASCII are skipped via `validate_ascii` (defined elsewhere; from its
/// use here it is expected to return `None` when the rest of the slice is
/// ASCII, or the first non-ASCII byte together with its offset in the slice).
/// Every non-ASCII lead byte is then classified through `UTF8_CHAR_WIDTH` and
/// its second byte is restricted to the range permitted for that lead byte,
/// which rejects overlong encodings, surrogates (U+D800..=U+DFFF) and code
/// points above U+10FFFF. Remaining bytes must be plain continuation bytes.
///
/// # Errors
///
/// Returns a [`Utf8Error`] whose `valid_up_to()` is the index of the first
/// byte of the first malformed or truncated sequence, so `v[..valid_up_to]`
/// is valid UTF-8.
// `next!()` advances `index` inside larger expressions such as
// `match (first, next!())`, which clippy reports as `eval_order_dependence`.
// `first` is never touched by `next!()`, so the result does not depend on the
// evaluation order and the lint is silenced.
#[cfg_attr(feature = "cargo-clippy", allow(eval_order_dependence))]
#[inline(always)]
pub fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
    let mut index = 0;
    let len = v.len();

    'outer: loop {
        // Fast path: skip the leading ASCII run. On return `index` points at
        // the first non-ASCII byte, which is also handed back as `first`.
        let mut first = match validate_ascii(&v[index..]) {
            None => break 'outer,
            Some((non_ascii, consumed)) => {
                index += consumed;
                non_ascii
            }
        };

        // Start of the sequence currently being validated. It must be
        // refreshed for every sequence (see the end of `'inner`), otherwise an
        // error in the second or later character of a non-ASCII run would be
        // reported at the start of the run.
        let mut old_offset = index;

        // Bails out of the function, reporting the start of the current
        // sequence. The argument is named `error_len` (presumably the length of
        // the malformed sequence) and is unused because `Utf8Error` only
        // stores `valid_up_to`.
        macro_rules! err {
            ($error_len: expr) => {
                return Err(Utf8Error {
                    valid_up_to: old_offset,
                })
            };
        }

        // Advances to the next byte and yields it; a sequence that runs off
        // the end of the input is an error.
        macro_rules! next {
            () => {{
                index += 1;
                if index >= len {
                    err!(None)
                }
                v[index]
            }};
        }

        'inner: loop {
            match UTF8_CHAR_WIDTH[first as usize] {
                2 => {
                    if next!() & !CONT_MASK != TAG_CONT_U8 {
                        err!(Some(1))
                    }
                }
                3 => {
                    // The second byte's range depends on the lead byte:
                    // 0xE0 excludes overlong forms, 0xED excludes surrogates.
                    match (first, next!()) {
                        (0xE0, 0xA0..=0xBF)
                        | (0xE1..=0xEC, 0x80..=0xBF)
                        | (0xED, 0x80..=0x9F)
                        | (0xEE..=0xEF, 0x80..=0xBF) => {}
                        _ => err!(Some(1)),
                    }
                    if next!() & !CONT_MASK != TAG_CONT_U8 {
                        err!(Some(2))
                    }
                }
                4 => {
                    // 0xF0 excludes overlong forms, 0xF4 caps at U+10FFFF.
                    match (first, next!()) {
                        (0xF0, 0x90..=0xBF)
                        | (0xF1..=0xF3, 0x80..=0xBF)
                        | (0xF4, 0x80..=0x8F) => {}
                        _ => err!(Some(1)),
                    }
                    if next!() & !CONT_MASK != TAG_CONT_U8 {
                        err!(Some(2))
                    }
                    if next!() & !CONT_MASK != TAG_CONT_U8 {
                        err!(Some(3))
                    }
                }
                // Width 0 marks bytes that can never start a sequence; width 1
                // (ASCII) is never seen here because ASCII is handled above.
                _ => err!(Some(1)),
            }

            // Step past the last byte of the sequence just validated.
            index += 1;
            if index == len {
                break 'outer;
            }
            first = v[index];
            if first < 0x80 {
                // Back to the ASCII fast path, starting after this byte.
                index += 1;
                continue 'outer;
            }
            // Another non-ASCII sequence follows immediately: everything up to
            // here is valid, so errors must now be reported from this offset.
            old_offset = index;
            continue 'inner;
        }
    }

    Ok(())
}
/// Sequence length implied by each possible first byte of a UTF-8 sequence,
/// indexed by the byte value.
///
/// `1` marks ASCII, `2`/`3`/`4` mark lead bytes of multi-byte sequences, and
/// `0` marks bytes that can never start a sequence (continuation bytes, the
/// overlong lead bytes 0xC0/0xC1, and 0xF5..=0xFF).
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..=0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..=0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..=0x2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..=0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..=0x4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..=0x5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..=0x6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..=0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..=0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..=0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..=0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..=0xBF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0..=0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..=0xDF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..=0xEF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0..=0xFF
];
/// Mask selecting the low six bits of a byte (`0b0011_1111`); its complement
/// isolates the two tag bits at the top.
const CONT_MASK: u8 = 0x3F;
/// Tag bits of a UTF-8 continuation byte (`0b1000_0000`): a byte `b` is a
/// continuation byte when `b & !CONT_MASK == TAG_CONT_U8`.
const TAG_CONT_U8: u8 = 0x80;