#![cfg_attr(feature = "alloc", doc = "```rust")]
#![cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
#![doc = "```"]
#![no_std]
#[cfg(any(test, feature = "alloc"))]
extern crate alloc;
#[cfg(feature = "alloc")]
use alloc::borrow::Cow;
#[cfg(feature = "alloc")]
use alloc::string::String;
use core::iter::FusedIterator;
use core::str::Chars;
/// Replacement text embedded from `mapping.txt` at compile time.
/// `deunicode_char` returns sub-slices of this string, addressed by byte offset and length.
const MAPPING: &str = include_str!("mapping.txt");
/// One lookup-table entry describing the replacement text for a single code point.
///
/// `deunicode_char` reinterprets the raw bytes of `POINTERS` as a slice of this struct,
/// so the field order and sizes here must match the layout of that data.
#[repr(C)]
#[derive(Copy, Clone)]
struct Ptr {
// When `len <= 2`: the replacement bytes themselves (only the first `len` are used).
// Otherwise: a little-endian `u16` byte offset into `MAPPING`.
chr: [u8; 2],
// Length in bytes of the replacement text.
len: u8,
}
/// Raw lookup table embedded from `pointers.bin` at compile time.
/// `deunicode_char` views these bytes as a `[Ptr]` indexed by the `char` value.
const POINTERS: &[u8] = include_bytes!("pointers.bin");
/// Transliterates `s` and returns the result as an owned `String`.
///
/// Characters without a known replacement are rendered as `"[?]"`.
/// This is a shorthand for [`deunicode_with_tofu`] with that placeholder.
#[inline(always)]
#[cfg(feature = "alloc")]
#[must_use]
pub fn deunicode(s: &str) -> String {
    let placeholder = "[?]";
    deunicode_with_tofu(s, placeholder)
}
/// Transliterates `s`, writing `custom_placeholder` wherever a character has no
/// known replacement, and returns the result as an owned `String`.
///
/// Delegates to [`deunicode_with_tofu_cow`] and copies the input only when that
/// function handed it back unchanged.
#[inline]
#[cfg(feature = "alloc")]
#[must_use]
pub fn deunicode_with_tofu(s: &str, custom_placeholder: &str) -> String {
    match deunicode_with_tofu_cow(s, custom_placeholder) {
        Cow::Borrowed(unchanged) => String::from(unchanged),
        Cow::Owned(converted) => converted,
    }
}
/// Transliterates `s`, borrowing the input when no work is needed.
///
/// If every byte of `s` is below `0x7F` the input is returned as
/// `Cow::Borrowed` without allocating. Otherwise the leading run of such bytes
/// is copied verbatim and the remainder is converted character by character,
/// with `custom_placeholder` standing in for characters that have no known
/// replacement.
///
/// # Panics
///
/// Panics if the output buffer cannot be allocated.
#[cfg(feature = "alloc")]
#[must_use]
pub fn deunicode_with_tofu_cow<'input>(s: &'input str, custom_placeholder: &str) -> Cow<'input, str> {
    let bytes = s.as_bytes();

    // Length of the leading run of bytes below 0x7F; these are copied as-is.
    let prefix_len = bytes.iter().take_while(|&&b| b < 0x7F).count();
    if prefix_len >= bytes.len() {
        return Cow::Borrowed(s);
    }

    let (head, tail) = bytes.split_at(prefix_len);

    debug_assert!(core::str::from_utf8(head).is_ok());
    // SAFETY: every byte of `head` is below 0x7F, i.e. ASCII, which is valid UTF-8.
    let head = unsafe { core::str::from_utf8_unchecked(head) };

    debug_assert!(core::str::from_utf8(tail).is_ok());
    // SAFETY: `s` is valid UTF-8 and the split point directly follows an ASCII byte
    // (or is the start of the string), so it lies on a character boundary.
    let tail = unsafe { core::str::from_utf8_unchecked(tail) };

    // Reserve slightly more than the input length up front.
    let mut out = String::new();
    if out.try_reserve_exact(s.len() | 15).is_err() {
        panic!()
    }

    // The reservation above covers at least `s.len()` bytes, so the prefix fits.
    // NOTE(review): presumably this guard only exists to help the optimizer drop
    // the growth path of `push_str` — confirm before removing.
    let spare = out.capacity().wrapping_sub(out.len());
    if head.as_bytes().len() <= spare {
        out.push_str(head);
    }

    for piece in tail.ascii_chars() {
        out.push_str(piece.unwrap_or(custom_placeholder));
    }
    Cow::Owned(out)
}
/// Looks up the replacement text for a single character.
///
/// Returns `None` when `ch` lies beyond the end of the lookup table, or when
/// its table entry does not describe a valid range inside `MAPPING`.
#[inline]
#[must_use]
pub fn deunicode_char(ch: char) -> Option<&'static str> {
    const ENTRY_SIZE: usize = core::mem::size_of::<Ptr>();

    // SAFETY: `Ptr` is `#[repr(C)]` and consists solely of `u8` fields, so it has
    // alignment 1, no padding, and accepts any bit pattern. The element count is
    // rounded down, keeping the slice inside the `'static` `POINTERS` data.
    let table: &'static [Ptr] = unsafe {
        core::slice::from_raw_parts(POINTERS.as_ptr().cast::<Ptr>(), POINTERS.len() / ENTRY_SIZE)
    };

    let entry = table.get(ch as usize)?;
    let len = usize::from(entry.len);

    if entry.len > 2 {
        // Longer replacements live in `MAPPING`; `chr` holds the little-endian offset.
        // An offset/length pair that falls outside `MAPPING` yields `None`.
        let start = usize::from(u16::from_le_bytes(entry.chr));
        return MAPPING.get(start..start + len);
    }

    // Replacements of up to two bytes are stored inline in the entry itself.
    let inline = entry.chr.get(..len)?;
    debug_assert!(core::str::from_utf8(inline).is_ok());
    // SAFETY: relies on the embedded table storing only valid UTF-8 in inline
    // entries (checked by the `debug_assert!` above in debug builds).
    // NOTE(review): the table contents are not visible here — confirm.
    Some(unsafe { core::str::from_utf8_unchecked(inline) })
}
/// Extension trait that adds transliteration methods to string types.
///
/// This file implements it for `str` and, with the `alloc` feature, for `String`.
pub trait AsciiChars {
/// Returns an [`AsciiCharsIter`] over this value.
///
#[cfg_attr(feature = "alloc", doc = "```rust")]
#[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
#[doc = "```"]
fn ascii_chars(&self) -> AsciiCharsIter<'_>;
/// Returns an owned, transliterated `String`.
///
/// The implementations in this file delegate to `deunicode`.
/// Only available with the `alloc` feature.
#[cfg(feature = "alloc")]
fn to_ascii_lossy(&self) -> String;
}
/// `String` support for [`AsciiChars`]; both methods forward to the `str` implementation.
#[cfg(feature = "alloc")]
impl AsciiChars for String {
    /// Iterates over the replacement text of each character in this string.
    #[inline(always)]
    fn ascii_chars(&self) -> AsciiCharsIter<'_> {
        self.as_str().ascii_chars()
    }

    /// Returns the transliterated copy of this string.
    #[inline(always)]
    fn to_ascii_lossy(&self) -> String {
        self.as_str().to_ascii_lossy()
    }
}
/// `str` support for [`AsciiChars`].
impl AsciiChars for str {
    /// Iterates over the replacement text of each character in this string slice.
    #[inline(always)]
    fn ascii_chars(&self) -> AsciiCharsIter<'_> {
        let text: &str = self;
        AsciiCharsIter::new(text)
    }

    /// Returns the transliterated copy of this string slice, as produced by `deunicode`.
    #[cfg(feature = "alloc")]
    #[inline(always)]
    fn to_ascii_lossy(&self) -> String {
        let text: &str = self;
        deunicode(text)
    }
}
/// Iterator that yields, for every `char` of the source string, the result of
/// `deunicode_char`: `Some(text)` for a known replacement, `None` otherwise.
///
/// It looks one character ahead so that a trailing space of a multi-byte
/// replacement can be dropped when it would be last or be followed by another space.
///
#[cfg_attr(feature = "alloc", doc = "```rust")]
#[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
#[doc = "```"]
#[derive(Clone)]
pub struct AsciiCharsIter<'a> {
// Look-ahead slot: lookup result for the next item to yield.
// Outer `None` means the iterator is exhausted; inner `None` means the
// character has no replacement.
next_char: Option<Option<&'static str>>,
// Characters that have not been looked up yet.
chars: Chars<'a>,
}
impl<'a> AsciiCharsIter<'a> {
    /// Creates an iterator over `unicode_string`.
    ///
    /// The first character is looked up immediately so the iterator always
    /// holds one item of look-ahead.
    #[inline]
    pub fn new(unicode_string: &'a str) -> Self {
        let mut remaining = unicode_string.chars();
        let first = remaining.next().map(deunicode_char);
        Self {
            next_char: first,
            chars: remaining,
        }
    }
}
// Once `next_char` is `None`, `next` returns `None` before touching `chars` again,
// so the iterator keeps returning `None` after exhaustion.
impl<'a> FusedIterator for AsciiCharsIter<'a> {}
impl<'a> Iterator for AsciiCharsIter<'a> {
    /// `Some(text)` for a character with a known replacement, `None` for one without.
    type Item = Option<&'static str>;

    /// Yields the lookup result for the next character of the source string.
    ///
    /// A replacement longer than one byte that ends with a space has that
    /// space removed when it is the final item, or when the following
    /// replacement itself starts with a space.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let dch = self.next_char?;
        // Refill the look-ahead before deciding whether to trim.
        self.next_char = self.chars.next().map(deunicode_char);
        let dch = match dch {
            None => return Some(None),
            Some(dch) => dch,
        };

        let ends_with_space = dch.len() > 1 && dch.ends_with(' ');
        let trim_last_char = ends_with_space
            && match self.next_char {
                // End of input: nothing follows that the space could separate.
                None => true,
                // An unmapped successor never triggers trimming.
                Some(next) => next.map_or(false, |next| next.starts_with(' ')),
            };

        Some(if trim_last_char {
            dch.get(..dch.len() - 1)
        } else {
            Some(dch)
        })
    }

    /// Counts the remaining items: one per unread character plus the buffered look-ahead.
    #[inline]
    fn count(self) -> usize {
        self.chars.count() + usize::from(self.next_char.is_some())
    }

    /// Reports both bounds of the remaining length.
    ///
    /// Exactly one item is produced per remaining character, so the bounds of
    /// the underlying `Chars` iterator apply, shifted by the buffered item.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let buffered = usize::from(self.next_char.is_some());
        let (lower, upper) = self.chars.size_hint();
        (lower + buffered, upper.and_then(|n| n.checked_add(buffered)))
    }
}
/// Writes the transliteration of the remaining characters without consuming the iterator.
///
/// Characters without a known replacement are written as U+FFFD.
///
#[cfg_attr(feature = "alloc", doc = "```rust")]
#[cfg_attr(not(feature = "alloc"), doc = "```rust,ignore")]
#[doc = "```"]
impl core::fmt::Display for AsciiCharsIter<'_> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Iterate over a clone so formatting leaves `self` untouched.
        for piece in self.clone() {
            f.write_str(piece.unwrap_or("\u{FFFD}"))?;
        }
        Ok(())
    }
}
/// Checks how trailing spaces of multi-byte replacements are kept or trimmed.
#[test]
fn iter_test() {
    use alloc::vec::Vec;

    // Last item: its trailing space is dropped.
    let from_ctor: Vec<&str> = AsciiCharsIter::new("中国").flatten().collect();
    assert_eq!(from_ctor, ["Zhong ", "Guo"]);

    // Followed by a non-space replacement: the trailing space is kept.
    let with_suffix: Vec<&str> = "中国x".ascii_chars().flatten().collect();
    assert_eq!(with_suffix, ["Zhong ", "Guo ", "x"]);

    // Followed by a literal space: the trailing space is dropped to avoid doubling.
    let with_space: Vec<&str> = "中 国".ascii_chars().flatten().collect();
    assert_eq!(with_space, ["Zhong", " ", "Guo"]);
}