//! Converts inclusive ranges of Unicode scalar values into sequences of UTF-8
//! byte ranges.
//!
//! `Utf8Sequences` is the iterator that performs the conversion. Each item it
//! yields is a `Utf8Sequence` made of one to four `Utf8Range`s, one range per
//! encoded byte.
#![deny(missing_docs)]
// Test-only dependencies.
#[cfg(test)]
extern crate quickcheck;
#[cfg(test)]
#[macro_use]
extern crate doc_comment;
// Pulls the README into test builds via `doc_comment`'s `doctest!` macro.
#[cfg(test)]
doctest!("../README.md");
use std::char;
use std::fmt;
use std::slice;
use char_utf8::encode_utf8;
// Size of the scratch buffers used when encoding a scalar value, and the
// exclusive upper bound of the per-length loops in the iterator.
const MAX_UTF8_BYTES: usize = 4;
// Private module that supplies `encode_utf8`.
mod char_utf8;
/// A sequence of one to four byte ranges, applied to consecutive bytes.
///
/// The variant fixes how many bytes the sequence covers; the ranges are
/// stored in the order in which they are matched against input bytes.
#[derive(Copy, Clone, Eq, PartialEq)]
pub enum Utf8Sequence {
    /// A sequence covering exactly one byte.
    One(Utf8Range),
    /// A sequence covering exactly two bytes.
    Two([Utf8Range; 2]),
    /// A sequence covering exactly three bytes.
    Three([Utf8Range; 3]),
    /// A sequence covering exactly four bytes.
    Four([Utf8Range; 4]),
}

impl Utf8Sequence {
    /// Builds a sequence whose `i`-th range runs from `start[i]` to `end[i]`.
    ///
    /// # Panics
    ///
    /// Panics if the two slices differ in length, or if that length is not
    /// 2, 3 or 4 (single-byte sequences are constructed directly through
    /// `Utf8Sequence::One`).
    fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
        assert_eq!(start.len(), end.len());
        let pair = |i: usize| Utf8Range::new(start[i], end[i]);
        match start.len() {
            2 => Utf8Sequence::Two([pair(0), pair(1)]),
            3 => Utf8Sequence::Three([pair(0), pair(1), pair(2)]),
            4 => Utf8Sequence::Four([pair(0), pair(1), pair(2), pair(3)]),
            n => unreachable!("invalid encoded length: {}", n),
        }
    }

    /// Returns the byte ranges of this sequence as a slice, in match order.
    pub fn as_slice(&self) -> &[Utf8Range] {
        use self::Utf8Sequence::*;
        match *self {
            // `slice::from_ref` is the safe way to view one value as a
            // one-element slice; no raw-pointer construction is needed.
            One(ref r) => slice::from_ref(r),
            Two(ref r) => &r[..],
            Three(ref r) => &r[..],
            Four(ref r) => &r[..],
        }
    }

    /// Returns the number of byte ranges in this sequence (between 1 and 4).
    pub fn len(&self) -> usize {
        self.as_slice().len()
    }

    /// Returns true if and only if a prefix of `bytes` matches this sequence.
    ///
    /// `bytes` must be at least `self.len()` bytes long; any bytes after the
    /// first `self.len()` are ignored.
    pub fn matches(&self, bytes: &[u8]) -> bool {
        if bytes.len() < self.len() {
            return false;
        }
        // `zip` stops after the last range, so only the prefix is inspected.
        bytes.iter().zip(self).all(|(&b, r)| r.matches(b))
    }
}

impl<'a> IntoIterator for &'a Utf8Sequence {
    type IntoIter = slice::Iter<'a, Utf8Range>;
    type Item = &'a Utf8Range;

    /// Iterates over the byte ranges of the sequence in match order.
    fn into_iter(self) -> Self::IntoIter {
        self.as_slice().iter()
    }
}

impl fmt::Debug for Utf8Sequence {
    /// Writes the `Debug` form of every range back to back, e.g.
    /// `[C2-DF][80-BF]`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for r in self {
            write!(f, "{:?}", r)?;
        }
        Ok(())
    }
}
/// A single inclusive range of byte values.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Utf8Range {
    /// Lowest byte value in the range (inclusive).
    pub start: u8,
    /// Highest byte value in the range (inclusive).
    pub end: u8,
}

impl Utf8Range {
    /// Creates the range covering `start` through `end`, both included.
    fn new(start: u8, end: u8) -> Self {
        Utf8Range { start, end }
    }

    /// Returns true if and only if `b` lies between `start` and `end`,
    /// end points included.
    pub fn matches(&self, b: u8) -> bool {
        !(b < self.start || self.end < b)
    }
}

impl fmt::Debug for Utf8Range {
    /// Prints `[XX]` for a single-byte range and `[XX-YY]` otherwise, using
    /// upper-case hexadecimal without padding.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match (self.start, self.end) {
            (lo, hi) if lo == hi => write!(f, "[{:X}]", lo),
            (lo, hi) => write!(f, "[{:X}-{:X}]", lo, hi),
        }
    }
}
/// An iterator that turns an inclusive range of `char`s into `Utf8Sequence`s.
///
/// Ranges that still have to be processed are kept on an internal stack.
pub struct Utf8Sequences {
    range_stack: Vec<ScalarRange>,
}

impl Utf8Sequences {
    /// Creates an iterator over the sequences for the scalar values from
    /// `start` through `end`, both included.
    pub fn new(start: char, end: char) -> Self {
        let mut sequences = Utf8Sequences { range_stack: Vec::new() };
        sequences.push(u32::from(start), u32::from(end));
        sequences
    }

    /// Discards any pending work and restarts the iterator on the range
    /// `start` through `end`, keeping the stack's allocation.
    #[doc(hidden)]
    pub fn reset(&mut self, start: char, end: char) {
        self.range_stack.clear();
        self.push(u32::from(start), u32::from(end));
    }

    /// Pushes the range `start` through `end` onto the stack of pending work.
    fn push(&mut self, start: u32, end: u32) {
        let pending = ScalarRange { start, end };
        self.range_stack.push(pending);
    }
}
/// A pair of `u32` end points, `start` and `end`, describing a range of
/// code point values.
struct ScalarRange {
    start: u32,
    end: u32,
}

impl fmt::Debug for ScalarRange {
    /// Prints both end points in upper-case hexadecimal.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let ScalarRange { start, end } = *self;
        write!(f, "ScalarRange({:X}, {:X})", start, end)
    }
}
// Iteration works on a stack of pending scalar ranges. Each call pops a
// range, repeatedly narrows it (pushing the cut-off remainder back on the
// stack) until what is left can be expressed as one `Utf8Sequence`, and
// returns that sequence.
impl Iterator for Utf8Sequences {
type Item = Utf8Sequence;
/// Returns the next sequence, or `None` once no pending ranges remain.
fn next(&mut self) -> Option<Self::Item> {
// 'TOP: take the next pending range off the stack.
'TOP:
while let Some(mut r) = self.range_stack.pop() {
// 'INNER: keep narrowing `r`; every `continue 'INNER` re-runs all checks
// on the narrowed range.
'INNER:
loop {
// If `r` overlaps 0xD800-0xDFFF, keep the part below that gap and defer
// the part above it.
if let Some((r1, r2)) = r.split() {
self.push(r2.start, r2.end);
r.start = r1.start;
r.end = r1.end;
continue 'INNER;
}
// A range with start > end contains nothing; drop it.
if !r.is_valid() {
continue 'TOP;
}
// Cut `r` at the largest value of each 1-, 2- and 3-byte encoding so that
// both end points encode to the same number of bytes.
for i in 1..MAX_UTF8_BYTES {
let max = max_scalar_value(i);
if r.start <= max && max < r.end {
self.push(max + 1, r.end);
r.end = max;
continue 'INNER;
}
}
// A range that lies entirely at or below 0x7F is a single byte range.
if let Some(ascii_range) = r.as_ascii() {
return Some(Utf8Sequence::One(ascii_range));
}
// `m` masks the low 6*i bits (each UTF-8 continuation byte carries 6
// bits). When the end points differ above those bits, trim `r` so that
// `start` has those low bits all clear and `end` has them all set,
// deferring whatever is cut off.
for i in 1..MAX_UTF8_BYTES {
let m = (1 << (6 * i)) - 1;
if (r.start & !m) != (r.end & !m) {
if (r.start & m) != 0 {
// Keep start..=(start | m); defer everything above it.
self.push((r.start | m) + 1, r.end);
r.end = r.start | m;
continue 'INNER;
}
if (r.end & m) != m {
// Defer (end & !m)..=end; keep everything below it.
self.push(r.end & !m, r.end);
r.end = (r.end & !m) - 1;
continue 'INNER;
}
}
}
// No further splitting applies: encode both end points and pair their
// bytes position by position.
let mut start = [0; MAX_UTF8_BYTES];
let mut end = [0; MAX_UTF8_BYTES];
let n = r.encode(&mut start, &mut end);
return Some(Utf8Sequence::from_encoded_range(
&start[0..n], &end[0..n]));
}
}
None
}
}
impl ScalarRange {
    /// If this range overlaps the surrogate gap 0xD800-0xDFFF, returns the
    /// part ending at 0xD7FF and the part starting at 0xE000.
    ///
    /// Either returned part may be empty (start greater than end) when the
    /// original range begins or ends inside the gap. Returns `None` when the
    /// range does not touch the gap.
    fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
        let touches_gap = self.start < 0xE000 && self.end > 0xD7FF;
        if !touches_gap {
            return None;
        }
        let below = ScalarRange { start: self.start, end: 0xD7FF };
        let above = ScalarRange { start: 0xE000, end: self.end };
        Some((below, above))
    }

    /// Returns true if and only if `start` does not exceed `end`.
    fn is_valid(&self) -> bool {
        self.end >= self.start
    }

    /// Returns this range as a byte range when it is valid and lies entirely
    /// at or below 0x7F; otherwise returns `None`.
    fn as_ascii(&self) -> Option<Utf8Range> {
        if !self.is_ascii() {
            return None;
        }
        // Both end points are at most 0x7F here, so the casts are lossless.
        Some(Utf8Range::new(self.start as u8, self.end as u8))
    }

    /// Returns true if and only if the range is valid and `end` is at most
    /// 0x7F.
    fn is_ascii(&self) -> bool {
        self.is_valid() && self.end <= 0x7f
    }

    /// Writes the UTF-8 encoding of `self.start` into `start` and of
    /// `self.end` into `end`, returning the shared encoded length.
    ///
    /// # Panics
    ///
    /// Panics if either end point is not a Unicode scalar value, if either
    /// `encode_utf8` call returns `None`, or if the two encodings differ in
    /// length.
    fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
        let first = char::from_u32(self.start).unwrap();
        let last = char::from_u32(self.end).unwrap();
        let first_len = encode_utf8(first, start).unwrap();
        let last_len = encode_utf8(last, end).unwrap();
        assert_eq!(first_len, last_len);
        first_len
    }
}
/// Returns the largest scalar value whose UTF-8 encoding fits in `nbytes`
/// bytes.
///
/// # Panics
///
/// Panics if `nbytes` is not 1, 2, 3 or 4.
fn max_scalar_value(nbytes: usize) -> u32 {
    // Entry `k` holds the maximum for an encoding of `k + 1` bytes.
    const MAXIMA: [u32; 4] = [0x007F, 0x07FF, 0xFFFF, 0x10FFFF];
    if nbytes == 0 || nbytes > MAXIMA.len() {
        unreachable!("invalid UTF-8 byte sequence size");
    }
    MAXIMA[nbytes - 1]
}
#[cfg(test)]
mod tests {
    use std::char;

    use quickcheck::{TestResult, quickcheck};

    use {Utf8Range, Utf8Sequences};

    /// Shorthand for building a `Utf8Range` in expected values.
    fn rutf8(s: u8, e: u8) -> Utf8Range {
        Utf8Range::new(s, e)
    }

    /// Produces the three bytes that a UTF-8 encoder would emit for the
    /// surrogate code point `cp` if surrogates were encodable.
    ///
    /// The bytes are assembled by hand because a `char` may never hold a
    /// surrogate value: creating one (for instance with `transmute`) is
    /// undefined behaviour, so the standard encoders cannot be used here.
    fn encode_surrogate(cp: u32) -> [u8; 3] {
        const TAG_CONT: u8 = 0b1000_0000;
        const TAG_THREE_B: u8 = 0b1110_0000;

        assert!(0xD800 <= cp && cp < 0xE000);
        [
            ((cp >> 12) & 0x0F) as u8 | TAG_THREE_B,
            ((cp >> 6) & 0x3F) as u8 | TAG_CONT,
            (cp & 0x3F) as u8 | TAG_CONT,
        ]
    }

    /// Panics if any sequence generated for `start..=end` matches the
    /// encoded form of a surrogate code point.
    fn never_accepts_surrogate_codepoints(start: char, end: char) {
        for cp in 0xD800..0xE000 {
            let buf = encode_surrogate(cp);
            for r in Utf8Sequences::new(start, end) {
                if r.matches(&buf) {
                    panic!("Sequence ({:X}, {:X}) contains range {:?}, \
                            which matches surrogate code point {:X} \
                            with encoded bytes {:?}",
                           start as u32, end as u32, r, cp, &buf[..]);
                }
            }
        }
    }

    #[test]
    fn codepoints_no_surrogates() {
        never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
        never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
        never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
    }

    /// A range holding a single scalar value must yield exactly one sequence.
    #[test]
    fn single_codepoint_one_sequence() {
        for i in 0x0..(0x10FFFF + 1) {
            let c = match char::from_u32(i) {
                None => continue,
                Some(c) => c,
            };
            let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
            assert_eq!(seqs.len(), 1);
        }
    }

    /// Property test: no randomly chosen range accepts a surrogate.
    #[test]
    fn qc_codepoints_no_surrogate() {
        fn p(s: char, e: char) -> TestResult {
            if s > e {
                return TestResult::discard();
            }
            never_accepts_surrogate_codepoints(s, e);
            TestResult::passed()
        }
        quickcheck(p as fn(char, char) -> TestResult);
    }

    /// Pins the exact sequences produced for the Basic Multilingual Plane.
    #[test]
    fn bmp() {
        use Utf8Sequence::*;

        let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}')
            .collect::<Vec<_>>();
        assert_eq!(seqs, vec![
            One(rutf8(0x0, 0x7F)),
            Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
            Three([rutf8(0xE0, 0xE0), rutf8(0xA0, 0xBF), rutf8(0x80, 0xBF)]),
            Three([rutf8(0xE1, 0xEC), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]),
            Three([rutf8(0xED, 0xED), rutf8(0x80, 0x9F), rutf8(0x80, 0xBF)]),
            Three([rutf8(0xEE, 0xEF), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]),
        ]);
    }

    /// Prints the BMP sequences; useful with `cargo test -- --nocapture`.
    #[test]
    fn scratch() {
        for range in Utf8Sequences::new('\u{0}', '\u{FFFF}') {
            println!("{:?}", range);
        }
    }
}