#![doc(html_root_url = "https://docs.rs/lipsum/0.8.1")]
#![forbid(unsafe_code)]
#![deny(missing_docs)]
use rand::rngs::ThreadRng;
use rand::seq::SliceRandom;
use rand::thread_rng;
use rand::{Rng, SeedableRng};
use rand_chacha::ChaCha20Rng;
use std::collections::HashMap;
/// A bigram: a pair of string slices. [`MarkovChain`] uses it to hold two
/// consecutive words, which together form the key for looking up the
/// words that may follow them.
pub type Bigram<'a> = (&'a str, &'a str);
/// Simple order-two Markov chain over borrowed words.
///
/// The chain records, for every pair of consecutive words it has been
/// taught, the words that were seen directly after that pair. All words
/// are string slices borrowed for the lifetime `'a`; nothing is copied.
#[derive(Debug, Clone, Default)]
pub struct MarkovChain<'a> {
    // Maps each learned bigram to every word observed right after it.
    // Followers are pushed once per occurrence, so duplicates are kept.
    map: HashMap<Bigram<'a>, Vec<&'a str>>,
    // Sorted copy of the keys of `map`, rebuilt on every call to `learn`.
    // Random starting points are picked from this list, so the choice
    // does not depend on the iteration order of the `HashMap`.
    keys: Vec<Bigram<'a>>,
}
impl<'a> MarkovChain<'a> {
pub fn new() -> MarkovChain<'a> {
Default::default()
}
pub fn learn(&mut self, sentence: &'a str) {
let words = sentence.split_whitespace().collect::<Vec<&str>>();
for window in words.windows(3) {
let (a, b, c) = (window[0], window[1], window[2]);
self.map.entry((a, b)).or_insert_with(Vec::new).push(c);
}
self.keys = self.map.keys().cloned().collect();
self.keys.sort();
}
#[inline]
pub fn len(&self) -> usize {
self.map.len()
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn words(&self, state: Bigram<'a>) -> Option<&Vec<&str>> {
self.map.get(&state)
}
pub fn generate_with_rng<R: Rng>(&self, rng: R, n: usize) -> String {
join_words(self.iter_with_rng(rng).take(n))
}
pub fn generate(&self, n: usize) -> String {
self.generate_with_rng(thread_rng(), n)
}
pub fn generate_with_rng_from<R: Rng>(&self, rng: R, n: usize, from: Bigram<'a>) -> String {
join_words(self.iter_with_rng_from(rng, from).take(n))
}
pub fn generate_from(&self, n: usize, from: Bigram<'a>) -> String {
self.generate_with_rng_from(thread_rng(), n, from)
}
pub fn iter_with_rng<R: Rng>(&self, mut rng: R) -> Words<'_, R> {
let initial_bigram = if self.is_empty() {
("", "")
} else {
*self.keys.choose(&mut rng).unwrap()
};
self.iter_with_rng_from(rng, initial_bigram)
}
pub fn iter(&self) -> Words<'_, ThreadRng> {
self.iter_with_rng(thread_rng())
}
pub fn iter_with_rng_from<R: Rng>(&self, rng: R, from: Bigram<'a>) -> Words<'_, R> {
Words {
map: &self.map,
rng,
keys: &self.keys,
state: from,
}
}
pub fn iter_from(&self, from: Bigram<'a>) -> Words<'_, ThreadRng> {
self.iter_with_rng_from(thread_rng(), from)
}
}
/// Iterator over words generated by a [`MarkovChain`].
///
/// The iterator yields nothing when the chain is empty. Otherwise it
/// keeps producing words for as long as it is polled, so callers are
/// expected to bound it, for example with `take`.
pub struct Words<'a, R: Rng> {
    // Bigram-to-followers table borrowed from the chain.
    map: &'a HashMap<Bigram<'a>, Vec<&'a str>>,
    // Source of randomness for picking followers and restart points.
    rng: R,
    // Keys of `map`, borrowed from the chain as a slice; used to pick a
    // new state when the current one has no recorded followers.
    keys: &'a [Bigram<'a>],
    // The current bigram; its first word is what `next` emits.
    state: Bigram<'a>,
}
impl<'a, R: Rng> Iterator for Words<'a, R> {
    type Item = &'a str;

    /// Emit the first word of the current bigram and advance the state.
    ///
    /// Returns `None` only when the underlying chain is empty.
    fn next(&mut self) -> Option<&'a str> {
        if self.map.is_empty() {
            return None;
        }

        // The emitted word is taken before any restart below, so a state
        // without followers still contributes its first word.
        let emitted = self.state.0;

        // Find the followers of the current state, jumping to randomly
        // chosen known bigrams until one with followers is reached.
        let followers = loop {
            match self.map.get(&self.state) {
                Some(followers) => break followers,
                None => self.state = *self.keys.choose(&mut self.rng).unwrap(),
            }
        };

        // Slide the window: the second word becomes the first, and a
        // randomly picked follower becomes the second.
        let follower = *followers.choose(&mut self.rng).unwrap();
        self.state = (self.state.1, follower);

        Some(emitted)
    }
}
/// Check whether `c` is an ASCII punctuation character.
///
/// This is a free-function wrapper around `char::is_ascii_punctuation`,
/// which takes `&self`, so that it can be passed directly as a `char`
/// predicate to string methods such as `trim_matches`.
fn is_ascii_punctuation(c: char) -> bool {
    char::is_ascii_punctuation(&c)
}
/// Return `word` with its first character converted to uppercase.
///
/// Only the first `char` is changed; the rest of the word is copied
/// verbatim. The uppercase form of a character may consist of several
/// characters (for example `ß` becomes `SS`), all of which are emitted.
/// An empty input yields an empty string.
fn capitalize(word: &str) -> String {
    let mut chars = word.chars();
    let mut result = String::with_capacity(word.len());
    if let Some(first) = chars.next() {
        // `char::to_uppercase` yields the mapped characters directly, so
        // no temporary `String` is needed for the first character.
        result.extend(first.to_uppercase());
        // `as_str` is the untouched remainder after the first character.
        result.push_str(chars.as_str());
    }
    result
}
/// Join `words` into a single space-separated sentence.
///
/// The first word is capitalized, as is every word that follows a word
/// ending in `.`, `!` or `?`. If the joined text does not already end in
/// one of those three characters, any trailing ASCII punctuation is
/// removed and a `.` is appended. An empty iterator gives an empty string.
fn join_words<'a, I: Iterator<Item = &'a str>>(mut words: I) -> String {
    // True when `text` ends with sentence-terminating punctuation.
    let ends_sentence = |text: &str| text.ends_with(&['.', '!', '?'][..]);

    let first = match words.next() {
        Some(word) => word,
        None => return String::new(),
    };

    let mut sentence = capitalize(first);
    let mut capitalize_next = ends_sentence(&sentence);

    for word in words {
        sentence.push(' ');
        if capitalize_next {
            sentence += &capitalize(word);
        } else {
            sentence += word;
        }
        capitalize_next = ends_sentence(word);
    }

    if !ends_sentence(&sentence) {
        // Drop dangling punctuation such as a trailing comma before
        // closing the sentence with a full stop.
        let keep = sentence.trim_end_matches(is_ascii_punctuation).len();
        sentence.truncate(keep);
        sentence.push('.');
    }

    sentence
}
/// Contents of the file `lorem-ipsum.txt`, embedded at compile time.
///
/// This is the first of the two texts used to train the built-in chain
/// behind [`lipsum`] and the related functions.
// NOTE(review): presumably the classic "Lorem ipsum" placeholder text;
// the file itself is not visible from here, so confirm before stating
// its origin in the public docs.
pub const LOREM_IPSUM: &'static str = include_str!("lorem-ipsum.txt");
/// Contents of the file `liber-primus.txt`, embedded at compile time.
///
/// This is the second of the two texts used to train the built-in chain
/// behind [`lipsum`] and the related functions.
// NOTE(review): the name suggests this is the first book ("liber primus")
// of a longer Latin work; verify against the bundled file.
pub const LIBER_PRIMUS: &'static str = include_str!("liber-primus.txt");
// Markov chain trained on both bundled texts and shared by the `lipsum*`
// convenience functions. Being a `thread_local!`, each thread builds its
// own copy the first time it touches the chain.
thread_local! {
    static LOREM_IPSUM_CHAIN: MarkovChain<'static> = {
        let mut chain = MarkovChain::new();
        // Learn from both texts; the transitions of the two are merged
        // into one table.
        chain.learn(LOREM_IPSUM);
        chain.learn(LIBER_PRIMUS);
        chain
    }
}
pub fn lipsum(n: usize) -> String {
LOREM_IPSUM_CHAIN.with(|chain| chain.generate_from(n, ("Lorem", "ipsum")))
}
pub fn lipsum_from_seed(n: usize, seed: u64) -> String {
let rng = ChaCha20Rng::seed_from_u64(seed);
LOREM_IPSUM_CHAIN.with(|chain| chain.generate_with_rng_from(rng, n, ("Lorem", "ipsum")))
}
pub fn lipsum_words(n: usize) -> String {
LOREM_IPSUM_CHAIN.with(|chain| chain.generate(n))
}
pub fn lipsum_words_from_seed(n: usize, seed: u64) -> String {
let rng = ChaCha20Rng::seed_from_u64(seed);
LOREM_IPSUM_CHAIN.with(|chain| chain.generate_with_rng(rng, n))
}
// Smallest number of words `lipsum_title` may produce (inclusive bound
// of the range it samples from).
const TITLE_MIN_WORDS: usize = 3;
// Upper bound of the range `lipsum_title` samples its word count from.
// The range is half-open, so this value itself is never chosen.
const TITLE_MAX_WORDS: usize = 8;
// Words whose byte length is at most this are treated as "small" and are
// left uncapitalized in titles, unless they are the first word.
const TITLE_SMALL_WORD: usize = 3;
/// Generate a short lorem ipsum title.
///
/// The title has a random number of words, at least `TITLE_MIN_WORDS`
/// and fewer than `TITLE_MAX_WORDS`. Leading and trailing ASCII
/// punctuation is stripped from every word, and words that become empty
/// are skipped. The first word and every word longer than
/// `TITLE_SMALL_WORD` bytes are capitalized. Randomness comes from the
/// thread-local random number generator.
pub fn lipsum_title() -> String {
    LOREM_IPSUM_CHAIN.with(|chain| {
        let n = thread_rng().gen_range(TITLE_MIN_WORDS..TITLE_MAX_WORDS);
        chain
            .iter()
            .map(|word| word.trim_matches(is_ascii_punctuation))
            .filter(|word| !word.is_empty())
            .take(n)
            .enumerate()
            // Rough pre-allocation: assume about eight bytes per word.
            .fold(String::with_capacity(8 * n), |mut title, (i, word)| {
                let is_first = i == 0;
                if !is_first {
                    title.push(' ');
                }
                let is_small = word.len() <= TITLE_SMALL_WORD;
                if !is_first && is_small {
                    title.push_str(word);
                } else {
                    title.push_str(&capitalize(word));
                }
                title
            })
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha20Rng;

    // `lipsum` always starts from the ("Lorem", "ipsum") bigram.
    #[test]
    fn starts_with_lorem_ipsum() {
        assert_eq!(&lipsum(10)[..11], "Lorem ipsum");
    }

    #[test]
    fn generate_zero_words() {
        assert_eq!(lipsum(0).split_whitespace().count(), 0);
    }

    #[test]
    fn generate_one_word() {
        assert_eq!(lipsum(1).split_whitespace().count(), 1);
    }

    #[test]
    fn generate_two_words() {
        assert_eq!(lipsum(2).split_whitespace().count(), 2);
    }

    // `lipsum_words` starts at a random bigram, so two calls should not
    // begin with the same text. This is probabilistic: the two random
    // starting points could coincide, though that is unlikely.
    #[test]
    fn starts_differently() {
        let idx = "Lorem ipsum".len();
        assert_ne!(&lipsum_words(5)[..idx], &lipsum_words(5)[..idx]);
    }

    #[test]
    fn generate_title() {
        for word in lipsum_title().split_whitespace() {
            assert!(
                !word.starts_with(is_ascii_punctuation) && !word.ends_with(is_ascii_punctuation),
                "Unexpected punctuation: {:?}",
                word
            );
            // Only words longer than TITLE_SMALL_WORD are guaranteed to
            // be capitalized; small words stay as they are unless first.
            if word.len() > TITLE_SMALL_WORD {
                assert!(
                    word.starts_with(char::is_uppercase),
                    "Expected long word to be capitalized: {:?}",
                    word
                );
            }
        }
    }

    // The word after a sentence-ending "." must be capitalized.
    #[test]
    fn capitalize_after_punctuation() {
        assert_eq!(
            lipsum_words_from_seed(9, 5),
            "Nullam habuit. Voluptatem cum summum bonum in voluptate est."
        );
    }

    #[test]
    fn empty_chain() {
        let chain = MarkovChain::new();
        assert_eq!(chain.generate(10), "");
    }

    #[test]
    fn generate_from() {
        let mut chain = MarkovChain::new();
        chain.learn("red orange yellow green blue indigo violet");
        assert_eq!(
            chain.generate_from(5, ("orange", "yellow")),
            "Orange yellow green blue indigo."
        );
    }

    // The output is capitalized and terminated, so it never equals the
    // raw training text.
    #[test]
    fn generate_last_bigram() {
        let mut chain = MarkovChain::new();
        chain.learn("xxx yyy zzz");
        assert_ne!(chain.generate_from(3, ("xxx", "yyy")), "xxx yyy zzz");
    }

    // Starting from a bigram the chain has never seen must not panic.
    #[test]
    fn generate_from_no_panic() {
        let mut chain = MarkovChain::new();
        chain.learn("foo bar baz");
        chain.generate_from(3, ("xxx", "yyy"));
    }

    #[test]
    fn chain_map() {
        let mut chain = MarkovChain::new();
        chain.learn("foo bar baz quuz");
        let map = &chain.map;
        assert_eq!(map.len(), 2);
        assert_eq!(map[&("foo", "bar")], vec!["baz"]);
        assert_eq!(map[&("bar", "baz")], vec!["quuz"]);
    }

    // A fixed seed must give a fixed text.
    #[test]
    fn new_with_rng() {
        let rng = ChaCha20Rng::seed_from_u64(1234);
        let mut chain = MarkovChain::new();
        chain.learn("foo bar x y z");
        chain.learn("foo bar a b c");
        assert_eq!(
            chain.generate_with_rng(rng, 15),
            "A b bar a b a b bar a b x y b y x."
        );
    }

    // Equal seeds give equal output; different seeds give different output.
    #[test]
    fn seed_works() {
        assert_eq!(
            lipsum_words_from_seed(10, 100_000),
            lipsum_words_from_seed(10, 100_000)
        );
        assert_eq!(lipsum_from_seed(30, 100_000), lipsum_from_seed(30, 100_000));
        assert_ne!(
            lipsum_words_from_seed(10, 100_000),
            lipsum_words_from_seed(10, 100_001)
        );
        assert_ne!(lipsum_from_seed(30, 100_000), lipsum_from_seed(30, 100_001));
    }
}