use core::cmp;
use core::iter::Filter;
use crate::tables::word::WordCat;
/// Iterator over the word-bound segments of a string that pass a
/// `fn(&&str) -> bool` predicate.
///
/// It is a [`Filter`] around [`UWordBounds`]; `new_unicode_words` in this file
/// builds it with a predicate that keeps only segments containing at least one
/// alphanumeric character.
#[derive(Debug)]
pub struct UnicodeWords<'a> {
/// The underlying segment iterator together with the filtering predicate.
inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
// Forward iteration: every method delegates to the wrapped filtered iterator.
impl<'a> Iterator for UnicodeWords<'a> {
type Item = &'a str;
// Returns the next segment accepted by the filter, or `None` once exhausted.
#[inline]
fn next(&mut self) -> Option<&'a str> {
self.inner.next()
}
// Forwards the wrapped iterator's size hint unchanged.
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}
// Backward iteration: delegates to the wrapped filtered iterator.
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
// Returns the last remaining segment accepted by the filter, or `None`.
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
self.inner.next_back()
}
}
/// Iterator over `(byte offset, segment)` pairs for the word-bound segments of
/// a string that pass a `fn(&(usize, &str)) -> bool` predicate.
///
/// It is a [`Filter`] around [`UWordBoundIndices`]; `new_unicode_word_indices`
/// in this file builds it with a predicate that keeps only segments containing
/// at least one alphanumeric character.
#[derive(Debug)]
pub struct UnicodeWordIndices<'a> {
/// The underlying indexed segment iterator together with the filtering predicate.
// The lint is silenced because the predicate is spelled out as a concrete
// function-pointer type inside `Filter`, which clippy reports as complex.
#[allow(clippy::type_complexity)]
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
}
// Forward iteration: every method delegates to the wrapped filtered iterator.
impl<'a> Iterator for UnicodeWordIndices<'a> {
type Item = (usize, &'a str);
// Returns the next `(offset, segment)` pair accepted by the filter, or `None`.
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.inner.next()
}
// Forwards the wrapped iterator's size hint unchanged.
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}
// Backward iteration: delegates to the wrapped filtered iterator.
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
// Returns the last remaining `(offset, segment)` pair accepted by the filter, or `None`.
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.inner.next_back()
}
}
/// Double-ended iterator that splits a string into consecutive word-bound
/// segments, consuming it from either end.
#[derive(Debug, Clone)]
pub struct UWordBounds<'a> {
/// The part of the input that has not been yielded yet; `next` trims it from
/// the front and `next_back` trims it from the back.
string: &'a str,
/// Word category cached by `next` for the first character of `string`, so the
/// following `next` call can reuse it instead of looking it up again.
cat: Option<WordCat>,
/// Word category cached by `next_back` for the last character of `string`,
/// reused by the following `next_back` call.
catb: Option<WordCat>,
}
/// Double-ended iterator yielding each word-bound segment together with its
/// byte offset from the start of the original string.
#[derive(Debug, Clone)]
pub struct UWordBoundIndices<'a> {
/// Address of the first byte of the original string, stored as an integer;
/// each segment's offset is its own address minus this value.
start_offset: usize,
/// The segment iterator that does the actual splitting.
iter: UWordBounds<'a>,
}
impl<'a> UWordBoundIndices<'a> {
/// Returns the part of the original string that has not been yielded yet,
/// as reported by the wrapped [`UWordBounds`].
#[inline]
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
// Forward iteration: pairs every segment from the inner iterator with its
// byte offset in the original string.
impl<'a> Iterator for UWordBoundIndices<'a> {
    type Item = (usize, &'a str);

    /// Yields the next segment from the front along with its byte offset.
    #[inline]
    fn next(&mut self) -> Option<(usize, &'a str)> {
        let segment = self.iter.next()?;
        // The segment is a sub-slice of the original string, so the distance
        // between the two start addresses is the byte offset.
        let offset = segment.as_ptr() as usize - self.start_offset;
        Some((offset, segment))
    }

    /// Same bounds as the inner segment iterator.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
// Backward iteration: pairs every segment taken from the back with its byte
// offset in the original string.
impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
    /// Yields the last remaining segment along with its byte offset.
    #[inline]
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        match self.iter.next_back() {
            Some(segment) => {
                // Offset is the pointer distance from the start of the original string.
                let offset = segment.as_ptr() as usize - self.start_offset;
                Some((offset, segment))
            }
            None => None,
        }
    }
}
/// State of the segment scanner used by `UWordBounds::next` and
/// `UWordBounds::next_back`. In general it records what kind of character the
/// scanner last acted upon within the segment being built.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
/// No character of the current segment has been processed yet.
Start,
/// Entered on a `WC_ALetter` character.
Letter,
/// Entered on a `WC_Hebrew_Letter` character.
HLetter,
/// Entered on a `WC_Numeric` character.
Numeric,
/// Entered on a `WC_Katakana` character.
Katakana,
/// Entered on a `WC_ExtendNumLet` character.
ExtendNumLet,
/// Inside a run of `WC_Regional_Indicator` characters; the payload tracks
/// how the indicators pair up.
Regional(RegionalState),
/// A tentative state whose outcome depends on a later character; the payload
/// says which characters may continue the segment.
FormatExtend(FormatExtendType),
/// Forward scan: the segment began with a `WC_ZWJ` character.
/// Backward scan: the last character acted upon was an emoji, so a `WC_ZWJ`
/// found before it keeps the segment open.
Zwj,
/// Forward scan: an emoji directly following a `WC_ZWJ` was absorbed.
Emoji,
/// Inside a run of `WC_WSegSpace` characters.
WSegSpace,
}
/// Payload of `UWordBoundsState::FormatExtend`: which character, if any, is
/// allowed to continue the segment from this tentative state.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
/// Backward scan only: treated exactly like `Start` when the next character
/// is classified.
AcceptAny,
/// Nothing may continue the segment. The forward scan uses it when a
/// character with no dedicated state is followed by Format/Extend/ZWJ; the
/// backward scan uses it while skipping Format/Extend/ZWJ characters.
AcceptNone,
/// Only `WC_ALetter` or `WC_Hebrew_Letter` continues the segment; otherwise
/// the scanner rewinds to the saved position.
RequireLetter,
/// Only `WC_Hebrew_Letter` continues the segment; otherwise the scanner
/// rewinds to the saved position.
RequireHLetter,
/// Set on a single quote. In the forward scan a following letter continues
/// the segment and anything else ends it after the quote; in the backward
/// scan only a preceding `WC_Hebrew_Letter` continues it.
AcceptQLetter,
/// Only `WC_Numeric` continues the segment; otherwise the scanner rewinds to
/// the saved position.
RequireNumeric,
}
/// Pairing progress inside a run of `WC_Regional_Indicator` characters.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
/// One indicator has been taken; one more may join it.
Half,
/// The segment may not take another indicator; the next one ends it.
Full,
/// Backward scan only: the pairing has not been computed yet. It is resolved
/// by counting the indicators that precede the current position.
Unknown,
}
fn is_emoji(ch: char) -> bool {
use crate::tables::emoji;
emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
}
// Forward iteration: each call to `next` splits one segment off the front of
// the remaining string.
// NOTE(review): the category names (ALetter, MidNumLet, WSegSpace, ...) suggest
// this follows the Unicode UAX #29 word-boundary rules - confirm against the
// crate documentation before citing rule numbers.
impl<'a> Iterator for UWordBounds<'a> {
type Item = &'a str;
// Lower bound: 1 if any input remains, otherwise 0.
// Upper bound: the remaining length in bytes.
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
(cmp::min(slen, 1), Some(slen))
}
// Scans characters from the front, driving a small state machine until the
// segment ends, then returns that segment and shrinks `self.string`.
// Returns `None` once the remaining string is empty.
#[inline]
fn next(&mut self) -> Option<&'a str> {
use self::FormatExtendType::*;
use self::UWordBoundsState::*;
use crate::tables::word as wd;
if self.string.is_empty() {
return None;
}
// take_curr: whether the character at `idx` (where the scan stopped) belongs
// to the returned segment.
let mut take_curr = true;
// take_cat: when the character at `idx` is NOT taken, whether its category
// is cached in `self.cat` for the next call.
let mut take_cat = true;
// idx: byte index of the character currently being examined.
let mut idx = 0;
// saveidx / savecat: position and category of a tentatively accepted
// punctuation character, used to rewind if its required follower is missing.
let mut saveidx = 0;
let mut state = Start;
let mut cat = wd::WC_Any;
let mut savecat = wd::WC_Any;
// Set once any Extend/Format/ZWJ character has been skipped in this segment;
// a WSegSpace run is not continued after that.
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices() {
idx = curr;
// `cat` still holds the previous character's category at this point.
let prev_zwj = cat == wd::WC_ZWJ;
// Reuse the category cached by the previous call for the first character,
// otherwise look it up in the table.
cat = match self.cat {
None => wd::word_category(ch).2,
_ => self.cat.take().unwrap(),
};
take_cat = true;
// Once the segment has started, Extend/Format/ZWJ characters are absorbed
// without changing the state.
if state != Start {
match cat {
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
skipped_format_extend = true;
continue;
}
_ => {}
}
}
// An emoji directly after a ZWJ stays in the current segment.
if prev_zwj && is_emoji(ch) {
state = Emoji;
continue;
}
state = match state {
// Segment starts with CR: if LF follows, move `idx` onto it (CR is one
// byte) so that both are returned together; the segment ends here.
Start if cat == wd::WC_CR => {
idx += match self.get_next_cat(idx) {
Some(wd::WC_LF) => 1, _ => 0,
};
break; }
// First character of the segment: pick the state from its category.
// LF and Newline form a segment on their own. For any other category the
// segment is this single character, unless Format/Extend/ZWJ follows: then
// the scan continues in `FormatExtend(AcceptNone)` so those get attached,
// and the looked-up category is cached for the next loop iteration.
Start => match cat {
wd::WC_ALetter => Letter, wd::WC_Hebrew_Letter => HLetter, wd::WC_Numeric => Numeric, wd::WC_Katakana => Katakana, wd::WC_ExtendNumLet => ExtendNumLet, wd::WC_Regional_Indicator => Regional(RegionalState::Half), wd::WC_LF | wd::WC_Newline => break, wd::WC_ZWJ => Zwj, wd::WC_WSegSpace => WSegSpace, _ => {
if let Some(ncat) = self.get_next_cat(idx) {
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
{
state = FormatExtend(AcceptNone);
self.cat = Some(ncat);
continue;
}
}
break; }
},
// A WSegSpace run continues only with more WSegSpace and only if nothing
// was skipped in between; otherwise the segment ends before this character.
WSegSpace => match cat {
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
_ => {
take_curr = false;
break;
}
},
// Segment that began with ZWJ: any character reaching this point ends it.
Zwj => {
take_curr = false;
break;
}
// After a letter: letters, digits and ExtendNumLet continue the segment.
// Quotes and mid-word punctuation are accepted tentatively; for the
// `Require*` cases the position and category are saved so the segment can
// be cut before the punctuation if the required follower never shows up.
Letter | HLetter => match cat {
wd::WC_ALetter => Letter, wd::WC_Hebrew_Letter => HLetter, wd::WC_Numeric => Numeric, wd::WC_ExtendNumLet => ExtendNumLet, wd::WC_Double_Quote if state == HLetter => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireHLetter) }
wd::WC_Single_Quote if state == HLetter => {
FormatExtend(AcceptQLetter) }
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireLetter) }
_ => {
take_curr = false;
break;
}
},
// After a digit: digits, letters and ExtendNumLet continue; numeric
// punctuation is accepted tentatively and must be followed by a digit.
Numeric => match cat {
wd::WC_Numeric => Numeric, wd::WC_ALetter => Letter, wd::WC_Hebrew_Letter => HLetter, wd::WC_ExtendNumLet => ExtendNumLet, wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireNumeric) }
_ => {
take_curr = false;
break;
}
},
// After Katakana: only Katakana or ExtendNumLet continue.
Katakana => match cat {
wd::WC_Katakana => Katakana, wd::WC_ExtendNumLet => ExtendNumLet, _ => {
take_curr = false;
break;
}
},
// After ExtendNumLet: letters, digits, Katakana or more ExtendNumLet continue.
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet, wd::WC_ALetter => Letter, wd::WC_Hebrew_Letter => HLetter, wd::WC_Numeric => Numeric, wd::WC_Katakana => Katakana, _ => {
take_curr = false;
break;
}
},
// Two regional indicators already taken: the segment ends before this one.
Regional(RegionalState::Full) => {
take_curr = false;
break;
}
// One regional indicator taken: a second one completes the pair.
Regional(RegionalState::Half) => match cat {
wd::WC_Regional_Indicator => Regional(RegionalState::Full), _ => {
take_curr = false;
break;
}
},
// `RegionalState::Unknown` is only assigned by `next_back`.
Regional(_) => {
unreachable!("RegionalState::Unknown should not occur on forward iteration")
}
// After an absorbed emoji, any character reaching this point ends the segment.
Emoji => {
take_curr = false;
break;
}
// Resolve a tentative state. A matching follower continues the segment.
// `AcceptNone` / `AcceptQLetter` end the segment before the current
// character without caching its category. Every other combination breaks
// with the state unchanged, which triggers the rewind after the loop.
FormatExtend(t) => match t {
RequireNumeric if cat == wd::WC_Numeric => Numeric, RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, AcceptNone | AcceptQLetter => {
take_curr = false; take_cat = false;
break;
}
_ => break, },
}
}
// Still waiting for a required follower (either the `_ => break` above or the
// input ran out): rewind so the saved punctuation starts the next segment.
if let FormatExtend(t) = state {
if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
idx = saveidx;
cat = savecat;
take_curr = false;
}
}
// If the character at `idx` is taken, advance past it and cache nothing;
// otherwise cut before it and optionally remember its category.
self.cat = if take_curr {
idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
None
} else if take_cat {
Some(cat)
} else {
None
};
// Split: everything before `idx` is returned, the rest stays for later calls.
let retstr = &self.string[..idx];
self.string = &self.string[idx..];
Some(retstr)
}
}
// Backward iteration: each call to `next_back` splits one segment off the end
// of the remaining string.
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
// Scans characters from the back, driving the same kind of state machine as
// `next` in mirror image, then returns the trailing segment and shrinks
// `self.string`. Returns `None` once the remaining string is empty.
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
use self::FormatExtendType::*;
use self::UWordBoundsState::*;
use crate::tables::word as wd;
if self.string.is_empty() {
return None;
}
// take_curr: whether the character at `idx` (where the scan stopped) belongs
// to the returned segment.
let mut take_curr = true;
// take_cat: when the character at `idx` is NOT taken, whether its category
// is cached in `self.catb` for the next call.
let mut take_cat = true;
// idx starts at the byte index of the last character.
let mut idx = self.string.len();
idx -= self.string.chars().next_back().unwrap().len_utf8();
// previdx: index of the character examined in the previous loop iteration,
// i.e. the one just after the current character in string order.
let mut previdx = idx;
// saveidx / savestate: where to cut, and which state to restore, after a
// tentative stretch (skipped Format/Extend/ZWJ or pending punctuation).
let mut saveidx = idx;
let mut state = Start;
let mut savestate = Start;
let mut cat = wd::WC_Any;
// Set once Format/Extend/ZWJ characters have been skipped and the saved state
// was restored; a WSegSpace run is not continued after that.
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices().rev() {
previdx = idx;
idx = curr;
// Reuse the category cached by the previous call for the last character,
// otherwise look it up in the table.
cat = match self.catb {
None => wd::word_category(ch).2,
_ => self.catb.take().unwrap(),
};
take_cat = true;
// Extend/Format (and ZWJ, unless the state is `Zwj`) are skipped. The first
// one met in a "real" state saves that state and the cut position, then
// switches to `FormatExtend(AcceptNone)`. While the state is `Start`, control
// falls through so the match below can classify the character.
if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
if !matches!(state, FormatExtend(_) | Start) {
saveidx = previdx;
savestate = state;
state = FormatExtend(AcceptNone);
}
if state != Start {
continue;
}
} else if state == FormatExtend(AcceptNone) {
// A regular character ends the skipped stretch: restore the saved state and
// cut position before classifying this character.
state = savestate;
previdx = saveidx;
take_cat = false;
skipped_format_extend = true;
}
state = match state {
// First classified character of the segment (`AcceptAny` is handled like
// `Start`). Emoji is checked before the category table.
Start | FormatExtend(AcceptAny) => match cat {
_ if is_emoji(ch) => Zwj,
wd::WC_ALetter => Letter, wd::WC_Hebrew_Letter => HLetter, wd::WC_Numeric => Numeric, wd::WC_Katakana => Katakana, wd::WC_ExtendNumLet => ExtendNumLet, wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
// A single quote is kept only if a Hebrew letter precedes it; remember
// where it is so the segment can be cut there otherwise.
wd::WC_Single_Quote => {
saveidx = idx;
FormatExtend(AcceptQLetter) }
wd::WC_WSegSpace => WSegSpace,
// Line breaks: from `Start` the line break is the segment, and an LF
// preceded by CR pulls the one-byte CR in as well. From `AcceptAny` the
// line break is left out of the segment.
wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
if state == Start {
if cat == wd::WC_LF {
idx -= match self.get_prev_cat(idx) {
Some(wd::WC_CR) => 1, _ => 0,
};
}
} else {
take_curr = false;
}
break; }
_ => break, },
// The last classified character was an emoji: a ZWJ before it keeps the
// segment open, anything else ends it.
Zwj => match cat {
wd::WC_ZWJ => FormatExtend(AcceptAny),
_ => {
take_curr = false;
break;
}
},
// A WSegSpace run continues only with more WSegSpace and only if nothing
// was skipped in between.
WSegSpace => match cat {
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
_ => {
take_curr = false;
break;
}
},
// Before a letter: letters, digits and ExtendNumLet continue the segment.
// Quotes and mid-word punctuation are accepted tentatively, saving the cut
// position in case the required preceding character is missing.
Letter | HLetter => match cat {
wd::WC_ALetter => Letter, wd::WC_Hebrew_Letter => HLetter, wd::WC_Numeric => Numeric, wd::WC_ExtendNumLet => ExtendNumLet, wd::WC_Double_Quote if state == HLetter => {
saveidx = previdx;
FormatExtend(RequireHLetter) }
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireLetter) }
_ => {
take_curr = false;
break;
}
},
// Before a digit: digits, letters and ExtendNumLet continue; numeric
// punctuation is accepted tentatively and must be preceded by a digit.
Numeric => match cat {
wd::WC_Numeric => Numeric, wd::WC_ALetter => Letter, wd::WC_Hebrew_Letter => HLetter, wd::WC_ExtendNumLet => ExtendNumLet, wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireNumeric) }
_ => {
take_curr = false;
break;
}
},
// Before Katakana: only Katakana or ExtendNumLet continue.
Katakana => match cat {
wd::WC_Katakana => Katakana, wd::WC_ExtendNumLet => ExtendNumLet, _ => {
take_curr = false;
break;
}
},
// Before ExtendNumLet: letters, digits, Katakana or more ExtendNumLet continue.
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet, wd::WC_ALetter => Letter, wd::WC_Hebrew_Letter => HLetter, wd::WC_Numeric => Numeric, wd::WC_Katakana => Katakana, _ => {
take_curr = false;
break;
}
},
// Regional indicators pair up from the front of a run, so when scanning
// backward the pairing is unknown at first. It is resolved by counting the
// consecutive indicators in `string[..previdx]` (which includes the current
// character), ignoring ZWJ/Extend/Format. An even count means the current
// indicator is not taken into this segment; an odd count means it joins the
// one already taken.
Regional(mut regional_state) => match cat {
wd::WC_Regional_Indicator => {
if regional_state == RegionalState::Unknown {
let count = self.string[..previdx]
.chars()
.rev()
.map(|c| wd::word_category(c).2)
.filter(|&c| {
!(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
})
.take_while(|&c| c == wd::WC_Regional_Indicator)
.count();
regional_state = if count % 2 == 0 {
RegionalState::Full
} else {
RegionalState::Half
};
}
if regional_state == RegionalState::Full {
take_curr = false;
break;
} else {
Regional(RegionalState::Full)
}
}
_ => {
take_curr = false;
break;
}
},
// NOTE(review): nothing in this method assigns `Emoji`, so this arm looks
// unreachable during backward iteration - confirm.
Emoji => {
if is_emoji(ch) {
Zwj
} else {
take_curr = false;
break;
}
}
// Resolve a tentative state: a matching preceding character continues the
// segment; anything else breaks with the state unchanged, which triggers the
// rewind after the loop.
FormatExtend(t) => match t {
RequireNumeric if cat == wd::WC_Numeric => Numeric, RequireLetter if cat == wd::WC_ALetter => Letter, RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, _ => break, },
}
}
// The scan ended in a tentative state other than `AcceptAny` (via a break or
// by running out of input): cut at the saved position and cache nothing.
if let FormatExtend(t) = state {
if t == RequireLetter
|| t == RequireHLetter
|| t == RequireNumeric
|| t == AcceptNone
|| t == AcceptQLetter
{
previdx = saveidx;
take_cat = false;
take_curr = false;
}
}
// If the character at `idx` is taken, the segment starts there; otherwise it
// starts at `previdx`, and the category of the excluded character may be cached.
self.catb = if take_curr {
None
} else {
idx = previdx;
if take_cat {
Some(cat)
} else {
None
}
};
// Split: everything from `idx` on is returned, the rest stays for later calls.
let retstr = &self.string[idx..];
self.string = &self.string[..idx];
Some(retstr)
}
}
impl<'a> UWordBounds<'a> {
    /// Returns the part of the original string that has not been yielded yet.
    #[inline]
    pub fn as_str(&self) -> &'a str {
        self.string
    }

    /// Word category of the character that follows the one starting at byte
    /// index `idx`, or `None` when that character is the last one.
    ///
    /// Panics if no character starts at `idx`.
    #[inline]
    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
        use crate::tables::word as wd;
        let mut following = self.string[idx..].chars();
        // Step over the character at `idx`; it must exist.
        following.next().unwrap();
        following.next().map(|next_ch| wd::word_category(next_ch).2)
    }

    /// Word category of the character that ends right before byte index
    /// `idx`, or `None` when `idx` is the start of the string.
    #[inline]
    fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
        use crate::tables::word as wd;
        self.string[..idx]
            .chars()
            .next_back()
            .map(|prev_ch| wd::word_category(prev_ch).2)
    }
}
/// Creates a [`UWordBounds`] covering all of `s`, with no category cached for
/// either end.
#[inline]
pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
    let (cat, catb) = (None, None);
    UWordBounds {
        string: s,
        cat,
        catb,
    }
}
/// Creates a [`UWordBoundIndices`] covering all of `s`; offsets it yields are
/// measured from the first byte of `s`.
#[inline]
pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> {
    // The base address is kept as an integer so segment offsets can be
    // computed later by subtraction.
    let start_offset = s.as_ptr() as usize;
    let iter = new_word_bounds(s);
    UWordBoundIndices { start_offset, iter }
}
#[inline]
fn has_alphanumeric(s: &&str) -> bool {
use crate::tables::util::is_alphanumeric;
s.chars().any(is_alphanumeric)
}
/// Creates a [`UnicodeWords`] over `s`: the word-bound segments of `s`,
/// restricted to those containing an alphanumeric character.
#[inline]
pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
    use super::UnicodeSegmentation;
    // Spell out the function-pointer type so the resulting `Filter` matches
    // the type of the struct field.
    let keep: fn(&&str) -> bool = has_alphanumeric;
    let inner = s.split_word_bounds().filter(keep);
    UnicodeWords { inner }
}
/// Creates a [`UnicodeWordIndices`] over `s`: `(byte offset, segment)` pairs
/// for the word-bound segments of `s` that contain an alphanumeric character.
#[inline]
pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
    use super::UnicodeSegmentation;
    // The predicate looks only at the segment text, never at the offset.
    let keep: fn(&(usize, &str)) -> bool = |(_, segment)| has_alphanumeric(segment);
    let inner = s.split_word_bound_indices().filter(keep);
    UnicodeWordIndices { inner }
}
#[cfg(test)]
mod tests {
    use crate::tables::word as wd;

    // U+070F must be reported with the ALetter word category.
    #[test]
    fn test_syriac_abbr_mark() {
        assert_eq!(wd::word_category('\u{70f}').2, wd::WC_ALetter);
    }

    // U+06DD must be reported with the Numeric word category.
    #[test]
    fn test_end_of_ayah_cat() {
        assert_eq!(wd::word_category('\u{6dd}').2, wd::WC_Numeric);
    }
}