#[cfg(feature = "unicode-linebreak")]
use crate::core::skip_ansi_escape_sequence;
use crate::core::Word;
#[derive(Debug, Clone, Copy)]
pub enum WordSeparator {
AsciiSpace,
#[cfg(feature = "unicode-linebreak")]
UnicodeBreakProperties,
Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
}
impl PartialEq for WordSeparator {
fn eq(&self, other: &Self) -> bool {
match (self, other) {
(WordSeparator::AsciiSpace, WordSeparator::AsciiSpace) => true,
#[cfg(feature = "unicode-linebreak")]
(WordSeparator::UnicodeBreakProperties, WordSeparator::UnicodeBreakProperties) => true,
(_, _) => false,
}
}
}
impl WordSeparator {
pub const fn new() -> Self {
#[cfg(feature = "unicode-linebreak")]
{
WordSeparator::UnicodeBreakProperties
}
#[cfg(not(feature = "unicode-linebreak"))]
{
WordSeparator::AsciiSpace
}
}
pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
match self {
WordSeparator::AsciiSpace => find_words_ascii_space(line),
#[cfg(feature = "unicode-linebreak")]
WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line),
WordSeparator::Custom(func) => func(line),
}
}
}
fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
let mut start = 0;
let mut in_whitespace = false;
let mut char_indices = line.char_indices();
Box::new(std::iter::from_fn(move || {
for (idx, ch) in char_indices.by_ref() {
if in_whitespace && ch != ' ' {
let word = Word::from(&line[start..idx]);
start = idx;
in_whitespace = ch == ' ';
return Some(word);
}
in_whitespace = ch == ' ';
}
if start < line.len() {
let word = Word::from(&line[start..]);
start = line.len();
return Some(word);
}
None
}))
}
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
let mut result = String::with_capacity(text.len());
let mut chars = text.chars();
while let Some(ch) = chars.next() {
if skip_ansi_escape_sequence(ch, &mut chars) {
continue;
}
result.push(ch);
}
result
}
#[cfg(feature = "unicode-linebreak")]
const SHY: char = '\u{00ad}';
#[cfg(feature = "unicode-linebreak")]
fn find_words_unicode_break_properties<'a>(
line: &'a str,
) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
let mut last_stripped_idx = 0;
let mut char_indices = line.char_indices();
let mut idx_map = std::iter::from_fn(move || match char_indices.next() {
Some((orig_idx, ch)) => {
let stripped_idx = last_stripped_idx;
if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
last_stripped_idx += ch.len_utf8();
}
Some((orig_idx, stripped_idx))
}
None => None,
});
let stripped = strip_ansi_escape_sequences(line);
let mut opportunities = unicode_linebreak::linebreaks(&stripped)
.filter(|(idx, _)| {
#[allow(clippy::match_like_matches_macro)]
match &stripped[..*idx].chars().next_back() {
Some('-') => false,
Some(SHY) => false,
_ => true,
}
})
.collect::<Vec<_>>()
.into_iter();
opportunities.next_back();
let mut start = 0;
Box::new(std::iter::from_fn(move || {
for (idx, _) in opportunities.by_ref() {
if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
let word = Word::from(&line[start..orig_idx]);
start = orig_idx;
return Some(word);
}
}
if start < line.len() {
let word = Word::from(&line[start..]);
start = line.len();
return Some(word);
}
None
}))
}
#[cfg(test)]
mod tests {
use super::WordSeparator::*;
use super::*;
macro_rules! assert_iter_eq {
($left:expr, $right:expr) => {
assert_eq!($left.collect::<Vec<_>>(), $right);
};
}
fn to_words(words: Vec<&str>) -> Vec<Word<'_>> {
words.into_iter().map(Word::from).collect()
}
macro_rules! test_find_words {
($ascii_name:ident,
$unicode_name:ident,
$([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
#[test]
fn $ascii_name() {
$(
let expected_words = to_words($ascii_words.to_vec());
let actual_words = WordSeparator::AsciiSpace
.find_words($line)
.collect::<Vec<_>>();
assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
)+
}
#[test]
#[cfg(feature = "unicode-linebreak")]
fn $unicode_name() {
$(
let expected_words = to_words($unicode_words.to_vec());
let actual_words = WordSeparator::UnicodeBreakProperties
.find_words($line)
.collect::<Vec<_>>();
assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
)+
}
};
}
test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);
test_find_words!(
ascii_single_word,
unicode_single_word,
["foo", ["foo"], ["foo"]]
);
test_find_words!(
ascii_two_words,
unicode_two_words,
["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
);
test_find_words!(
ascii_multiple_words,
unicode_multiple_words,
["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
);
test_find_words!(
ascii_only_whitespace,
unicode_only_whitespace,
[" ", [" "], [" "]],
[" ", [" "], [" "]]
);
test_find_words!(
ascii_inter_word_whitespace,
unicode_inter_word_whitespace,
["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
);
test_find_words!(
ascii_trailing_whitespace,
unicode_trailing_whitespace,
["foo ", ["foo "], ["foo "]]
);
test_find_words!(
ascii_leading_whitespace,
unicode_leading_whitespace,
[" foo", [" ", "foo"], [" ", "foo"]]
);
test_find_words!(
ascii_multi_column_char,
unicode_multi_column_char,
["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] );
test_find_words!(
ascii_hyphens,
unicode_hyphens,
["foo-bar", ["foo-bar"], ["foo-bar"]],
["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
);
test_find_words!(
ascii_newline,
unicode_newline,
["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
);
test_find_words!(
ascii_tab,
unicode_tab,
["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
);
test_find_words!(
ascii_non_breaking_space,
unicode_non_breaking_space,
["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
);
#[test]
#[cfg(unix)]
fn find_words_colored_text() {
use termion::color::{Blue, Fg, Green, Reset};
let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
assert_iter_eq!(
AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
vec![Word::from(&green_hello), Word::from(&blue_world)]
);
#[cfg(feature = "unicode-linebreak")]
assert_iter_eq!(
UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
vec![Word::from(&green_hello), Word::from(&blue_world)]
);
}
#[test]
fn find_words_color_inside_word() {
let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
assert_iter_eq!(AsciiSpace.find_words(text), vec![Word::from(text)]);
#[cfg(feature = "unicode-linebreak")]
assert_iter_eq!(
UnicodeBreakProperties.find_words(text),
vec![Word::from(text)]
);
}
#[test]
fn word_separator_new() {
#[cfg(feature = "unicode-linebreak")]
assert!(matches!(WordSeparator::new(), UnicodeBreakProperties));
#[cfg(not(feature = "unicode-linebreak"))]
assert!(matches!(WordSeparator::new(), AsciiSpace));
}
}