#![deny(missing_docs)]
extern crate memchr;
#[cfg(test)] extern crate quickcheck;
#[cfg(test)] extern crate rand;
use std::collections::VecDeque;
use std::fmt;
use std::iter::FromIterator;
use memchr::memchr;
pub use self::autiter::{
Automaton, Match,
Matches, MatchesOverlapping, StreamMatches, StreamMatchesOverlapping,
};
pub use self::full::FullAcAutomaton;
#[path = "autiter.rs"]
mod autiter;
#[path = "full.rs"]
mod full;
pub type StateIdx = u32;
type PatIdx = usize;
const FAIL_STATE: u32 = 0;
const ROOT_STATE: u32 = 1;
const DENSE_DEPTH_THRESHOLD: u32 = 3;
#[derive(Clone)]
pub struct AcAutomaton<P, T=Dense> {
pats: Vec<P>,
states: Vec<State<T>>,
start_bytes: Vec<u8>,
}
#[derive(Clone)]
struct State<T> {
out: Vec<PatIdx>,
fail: StateIdx,
goto: T,
depth: u32,
}
impl<P: AsRef<[u8]>> AcAutomaton<P> {
pub fn new<I>(pats: I) -> AcAutomaton<P, Dense>
where I: IntoIterator<Item=P> {
AcAutomaton::with_transitions(pats)
}
}
impl<P: AsRef<[u8]>, T: Transitions> AcAutomaton<P, T> {
pub fn with_transitions<I>(pats: I) -> AcAutomaton<P, T>
where I: IntoIterator<Item=P> {
AcAutomaton {
pats: vec![], states: vec![State::new(0), State::new(0)], start_bytes: vec![], }.build(pats.into_iter().collect())
}
pub fn into_full(self) -> FullAcAutomaton<P> {
FullAcAutomaton::new(self)
}
}
impl<P: AsRef<[u8]>, T: Transitions> Automaton<P> for AcAutomaton<P, T> {
#[inline]
fn next_state(&self, mut si: StateIdx, b: u8) -> StateIdx {
loop {
let maybe_si = self.states[si as usize].goto(b);
if maybe_si != FAIL_STATE {
si = maybe_si;
break;
} else {
si = self.states[si as usize].fail;
}
}
si
}
#[inline]
fn get_match(&self, si: StateIdx, outi: usize, texti: usize) -> Match {
let pati = self.states[si as usize].out[outi];
let patlen = self.pats[pati].as_ref().len();
let start = texti + 1 - patlen;
Match {
pati: pati,
start: start,
end: start + patlen,
}
}
#[inline]
fn has_match(&self, si: StateIdx, outi: usize) -> bool {
outi < self.states[si as usize].out.len()
}
#[inline]
fn skip_to(&self, si: StateIdx, text: &[u8], at: usize) -> usize {
if si != ROOT_STATE || !self.is_skippable() {
return at;
}
let b = self.start_bytes[0];
match memchr(b, &text[at..]) {
None => text.len(),
Some(i) => at + i,
}
}
#[inline]
fn is_skippable(&self) -> bool {
self.start_bytes.len() == 1
}
#[inline]
fn patterns(&self) -> &[P] {
&self.pats
}
#[inline]
fn pattern(&self, i: usize) -> &P {
&self.pats[i]
}
}
impl<P: AsRef<[u8]>, T: Transitions> AcAutomaton<P, T> {
fn build(mut self, pats: Vec<P>) -> AcAutomaton<P, T> {
for (pati, pat) in pats.iter().enumerate() {
if pat.as_ref().is_empty() {
continue;
}
let mut previ = ROOT_STATE;
for &b in pat.as_ref() {
if self.states[previ as usize].goto(b) != FAIL_STATE {
previ = self.states[previ as usize].goto(b);
} else {
let depth = self.states[previ as usize].depth + 1;
let nexti = self.add_state(State::new(depth));
self.states[previ as usize].set_goto(b, nexti);
previ = nexti;
}
}
self.states[previ as usize].out.push(pati);
}
for c in (0..256).into_iter().map(|c| c as u8) {
if self.states[ROOT_STATE as usize].goto(c) == FAIL_STATE {
self.states[ROOT_STATE as usize].set_goto(c, ROOT_STATE);
} else {
self.start_bytes.push(c);
}
}
self.pats = pats;
self.fill()
}
fn fill(mut self) -> AcAutomaton<P, T> {
let mut q = VecDeque::new();
for c in (0..256).into_iter().map(|c| c as u8) {
let si = self.states[ROOT_STATE as usize].goto(c);
if si != ROOT_STATE {
q.push_front(si);
}
}
while let Some(si) = q.pop_back() {
for c in (0..256).into_iter().map(|c| c as u8) {
let u = self.states[si as usize].goto(c);
if u != FAIL_STATE {
q.push_front(u);
let mut v = self.states[si as usize].fail;
while self.states[v as usize].goto(c) == FAIL_STATE {
v = self.states[v as usize].fail;
}
let ufail = self.states[v as usize].goto(c);
self.states[u as usize].fail = ufail;
let ufail_out = self.states[ufail as usize].out.clone();
self.states[u as usize].out.extend(ufail_out);
}
}
}
self
}
fn add_state(&mut self, state: State<T>) -> StateIdx {
let i = self.states.len();
self.states.push(state);
i as StateIdx
}
}
impl<T: Transitions> State<T> {
fn new(depth: u32) -> State<T> {
State {
out: vec![],
fail: 1,
goto: Transitions::new(depth),
depth: depth,
}
}
fn goto(&self, b: u8) -> StateIdx { self.goto.goto(b) }
fn set_goto(&mut self, b: u8, si: StateIdx) { self.goto.set_goto(b, si); }
}
pub trait Transitions {
fn new(depth: u32) -> Self;
fn goto(&self, alpha: u8) -> StateIdx;
fn set_goto(&mut self, alpha: u8, si: StateIdx);
}
#[derive(Clone, Debug)]
pub struct Dense(DenseChoice);
#[derive(Clone, Debug)]
enum DenseChoice {
Sparse(Vec<StateIdx>), Dense(Vec<(u8, StateIdx)>),
}
impl Transitions for Dense {
fn new(depth: u32) -> Dense {
if depth <= DENSE_DEPTH_THRESHOLD {
Dense(DenseChoice::Sparse(vec![0; 256]))
} else {
Dense(DenseChoice::Dense(vec![]))
}
}
fn goto(&self, b1: u8) -> StateIdx {
match self.0 {
DenseChoice::Sparse(ref m) => m[b1 as usize],
DenseChoice::Dense(ref m) => {
for &(b2, si) in m {
if b1 == b2 {
return si;
}
}
FAIL_STATE
}
}
}
fn set_goto(&mut self, b: u8, si: StateIdx) {
match self.0 {
DenseChoice::Sparse(ref mut m) => m[b as usize] = si,
DenseChoice::Dense(ref mut m) => m.push((b, si)),
}
}
}
#[derive(Clone, Debug)]
pub struct Sparse(Vec<StateIdx>);
impl Transitions for Sparse {
fn new(_: u32) -> Sparse {
Sparse(vec![0; 256])
}
#[inline]
fn goto(&self, b: u8) -> StateIdx {
self.0[b as usize]
}
fn set_goto(&mut self, b: u8, si: StateIdx) {
self.0[b as usize] = si;
}
}
impl<S: AsRef<[u8]>> FromIterator<S> for AcAutomaton<S> {
fn from_iter<T>(it: T) -> AcAutomaton<S> where T: IntoIterator<Item=S> {
AcAutomaton::new(it)
}
}
impl<P: AsRef<[u8]> + fmt::Debug, T: Transitions> fmt::Debug for AcAutomaton<P, T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use std::iter::repeat;
try!(writeln!(f, "{}", repeat('-').take(79).collect::<String>()));
try!(writeln!(f, "Patterns: {:?}", self.pats));
for (i, state) in self.states.iter().enumerate().skip(1) {
try!(writeln!(f, "{:3}: {}", i, state.debug(i == 1)));
}
write!(f, "{}", repeat('-').take(79).collect::<String>())
}
}
impl<T: Transitions> State<T> {
fn debug(&self, root: bool) -> String {
format!("State {{ depth: {:?}, out: {:?}, fail: {:?}, goto: {{{}}} }}",
self.depth, self.out, self.fail, self.goto_string(root))
}
fn goto_string(&self, root: bool) -> String {
use std::char::from_u32;
let mut goto = vec![];
for b in (0..256).map(|b| b as u8) {
let si = self.goto(b);
if (!root && si == FAIL_STATE) || (root && si == ROOT_STATE) {
continue;
}
goto.push(format!("{} => {}", from_u32(b as u32).unwrap(), si));
}
goto.connect(", ")
}
}
impl<T: Transitions> fmt::Debug for State<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.debug(false))
}
}
impl<T: Transitions> AcAutomaton<String, T> {
#[doc(hidden)]
pub fn dot(&self) -> String {
use std::fmt::Write;
let mut out = String::new();
macro_rules! w {
($w:expr, $($tt:tt)*) => { {write!($w, $($tt)*)}.unwrap() }
}
w!(out, r#"
digraph automaton {{
label=<<FONT POINT-SIZE="20">{}</FONT>>;
labelloc="l";
labeljust="l";
rankdir="LR";
"#, self.pats.connect(", "));
for (i, s) in self.states.iter().enumerate().skip(1) {
let i = i as u32;
if s.out.len() == 0 {
w!(out, " {};\n", i);
} else {
w!(out, " {} [peripheries=2];\n", i);
}
w!(out, " {} -> {} [style=dashed];\n", i, s.fail);
for b in (0..256).map(|b| b as u8) {
let si = s.goto(b);
if si == FAIL_STATE || (i == ROOT_STATE && si == ROOT_STATE) {
continue;
}
w!(out, " {} -> {} [label={}];\n", i, si, b as char);
}
}
w!(out, "}}");
out
}
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use std::io;
use quickcheck::{Arbitrary, Gen, quickcheck};
use rand::Rng;
use super::{Automaton, AcAutomaton, Match};
fn aut_find<S>(xs: &[S], haystack: &str) -> Vec<Match>
where S: Clone + AsRef<[u8]> {
AcAutomaton::new(xs.to_vec()).find(&haystack).collect()
}
fn aut_finds<S>(xs: &[S], haystack: &str) -> Vec<Match>
where S: Clone + AsRef<[u8]> {
let cur = io::Cursor::new(haystack.as_bytes());
AcAutomaton::new(xs.to_vec())
.stream_find(cur).map(|r| r.unwrap()).collect()
}
fn aut_findf<S>(xs: &[S], haystack: &str) -> Vec<Match>
where S: Clone + AsRef<[u8]> {
AcAutomaton::new(xs.to_vec()).into_full().find(haystack).collect()
}
fn aut_findfs<S>(xs: &[S], haystack: &str) -> Vec<Match>
where S: Clone + AsRef<[u8]> {
let cur = io::Cursor::new(haystack.as_bytes());
AcAutomaton::new(xs.to_vec())
.into_full()
.stream_find(cur).map(|r| r.unwrap()).collect()
}
fn aut_findo<S>(xs: &[S], haystack: &str) -> Vec<Match>
where S: Clone + AsRef<[u8]> {
AcAutomaton::new(xs.to_vec()).find_overlapping(haystack).collect()
}
fn aut_findos<S>(xs: &[S], haystack: &str) -> Vec<Match>
where S: Clone + AsRef<[u8]> {
let cur = io::Cursor::new(haystack.as_bytes());
AcAutomaton::new(xs.to_vec())
.stream_find_overlapping(cur).map(|r| r.unwrap()).collect()
}
fn aut_findfo<S>(xs: &[S], haystack: &str) -> Vec<Match>
where S: Clone + AsRef<[u8]> {
AcAutomaton::new(xs.to_vec())
.into_full().find_overlapping(haystack).collect()
}
fn aut_findfos<S>(xs: &[S], haystack: &str) -> Vec<Match>
where S: Clone + AsRef<[u8]> {
let cur = io::Cursor::new(haystack.as_bytes());
AcAutomaton::new(xs.to_vec())
.into_full()
.stream_find_overlapping(cur).map(|r| r.unwrap()).collect()
}
#[test]
fn one_pattern_one_match() {
let ns = vec!["a"];
let hay = "za";
let matches = vec![
Match { pati: 0, start: 1, end: 2 },
];
assert_eq!(&aut_find(&ns, hay), &matches);
assert_eq!(&aut_finds(&ns, hay), &matches);
assert_eq!(&aut_findf(&ns, hay), &matches);
assert_eq!(&aut_findfs(&ns, hay), &matches);
}
#[test]
fn one_pattern_many_match() {
let ns = vec!["a"];
let hay = "zazazzzza";
let matches = vec![
Match { pati: 0, start: 1, end: 2 },
Match { pati: 0, start: 3, end: 4 },
Match { pati: 0, start: 8, end: 9 },
];
assert_eq!(&aut_find(&ns, hay), &matches);
assert_eq!(&aut_finds(&ns, hay), &matches);
assert_eq!(&aut_findf(&ns, hay), &matches);
assert_eq!(&aut_findfs(&ns, hay), &matches);
}
#[test]
fn one_longer_pattern_one_match() {
let ns = vec!["abc"];
let hay = "zazabcz";
let matches = vec![ Match { pati: 0, start: 3, end: 6 } ];
assert_eq!(&aut_find(&ns, hay), &matches);
assert_eq!(&aut_finds(&ns, hay), &matches);
assert_eq!(&aut_findf(&ns, hay), &matches);
assert_eq!(&aut_findfs(&ns, hay), &matches);
}
#[test]
fn one_longer_pattern_many_match() {
let ns = vec!["abc"];
let hay = "zazabczzzzazzzabc";
let matches = vec![
Match { pati: 0, start: 3, end: 6 },
Match { pati: 0, start: 14, end: 17 },
];
assert_eq!(&aut_find(&ns, hay), &matches);
assert_eq!(&aut_finds(&ns, hay), &matches);
assert_eq!(&aut_findf(&ns, hay), &matches);
assert_eq!(&aut_findfs(&ns, hay), &matches);
}
#[test]
fn many_pattern_one_match() {
let ns = vec!["a", "b"];
let hay = "zb";
let matches = vec![ Match { pati: 1, start: 1, end: 2 } ];
assert_eq!(&aut_find(&ns, hay), &matches);
assert_eq!(&aut_finds(&ns, hay), &matches);
assert_eq!(&aut_findf(&ns, hay), &matches);
assert_eq!(&aut_findfs(&ns, hay), &matches);
}
#[test]
fn many_pattern_many_match() {
let ns = vec!["a", "b"];
let hay = "zbzazzzzb";
let matches = vec![
Match { pati: 1, start: 1, end: 2 },
Match { pati: 0, start: 3, end: 4 },
Match { pati: 1, start: 8, end: 9 },
];
assert_eq!(&aut_find(&ns, hay), &matches);
assert_eq!(&aut_finds(&ns, hay), &matches);
assert_eq!(&aut_findf(&ns, hay), &matches);
assert_eq!(&aut_findfs(&ns, hay), &matches);
}
#[test]
fn many_longer_pattern_one_match() {
let ns = vec!["abc", "xyz"];
let hay = "zazxyzz";
let matches = vec![ Match { pati: 1, start: 3, end: 6 } ];
assert_eq!(&aut_find(&ns, hay), &matches);
assert_eq!(&aut_finds(&ns, hay), &matches);
assert_eq!(&aut_findf(&ns, hay), &matches);
assert_eq!(&aut_findfs(&ns, hay), &matches);
}
#[test]
fn many_longer_pattern_many_match() {
let ns = vec!["abc", "xyz"];
let hay = "zazxyzzzzzazzzabcxyz";
let matches = vec![
Match { pati: 1, start: 3, end: 6 },
Match { pati: 0, start: 14, end: 17 },
Match { pati: 1, start: 17, end: 20 },
];
assert_eq!(&aut_find(&ns, hay), &matches);
assert_eq!(&aut_finds(&ns, hay), &matches);
assert_eq!(&aut_findf(&ns, hay), &matches);
assert_eq!(&aut_findfs(&ns, hay), &matches);
}
#[test]
fn many_longer_pattern_overlap_one_match() {
let ns = vec!["abc", "bc"];
let hay = "zazabcz";
let matches = vec![
Match { pati: 0, start: 3, end: 6 },
Match { pati: 1, start: 4, end: 6 },
];
assert_eq!(&aut_findo(&ns, hay), &matches);
assert_eq!(&aut_findos(&ns, hay), &matches);
assert_eq!(&aut_findfo(&ns, hay), &matches);
assert_eq!(&aut_findfos(&ns, hay), &matches);
}
#[test]
fn many_longer_pattern_overlap_one_match_reverse() {
let ns = vec!["abc", "bc"];
let hay = "xbc";
let matches = vec![ Match { pati: 1, start: 1, end: 3 } ];
assert_eq!(&aut_findo(&ns, hay), &matches);
assert_eq!(&aut_findos(&ns, hay), &matches);
assert_eq!(&aut_findfo(&ns, hay), &matches);
assert_eq!(&aut_findfos(&ns, hay), &matches);
}
#[test]
fn many_longer_pattern_overlap_many_match() {
let ns = vec!["abc", "bc", "c"];
let hay = "zzzabczzzbczzzc";
let matches = vec![
Match { pati: 0, start: 3, end: 6 },
Match { pati: 1, start: 4, end: 6 },
Match { pati: 2, start: 5, end: 6 },
Match { pati: 1, start: 9, end: 11 },
Match { pati: 2, start: 10, end: 11 },
Match { pati: 2, start: 14, end: 15 },
];
assert_eq!(&aut_findo(&ns, hay), &matches);
assert_eq!(&aut_findos(&ns, hay), &matches);
assert_eq!(&aut_findfo(&ns, hay), &matches);
assert_eq!(&aut_findfos(&ns, hay), &matches);
}
#[test]
fn many_longer_pattern_overlap_many_match_reverse() {
let ns = vec!["abc", "bc", "c"];
let hay = "zzzczzzbczzzabc";
let matches = vec![
Match { pati: 2, start: 3, end: 4 },
Match { pati: 1, start: 7, end: 9 },
Match { pati: 2, start: 8, end: 9 },
Match { pati: 0, start: 12, end: 15 },
Match { pati: 1, start: 13, end: 15 },
Match { pati: 2, start: 14, end: 15 },
];
assert_eq!(&aut_findo(&ns, hay), &matches);
assert_eq!(&aut_findos(&ns, hay), &matches);
assert_eq!(&aut_findfo(&ns, hay), &matches);
assert_eq!(&aut_findfos(&ns, hay), &matches);
}
#[test]
fn pattern_returns_original_type() {
let aut = AcAutomaton::new(vec!["apple", "maple"]);
let pat: &str = aut.pattern(0);
assert_eq!(pat, "apple");
let pats: &[&str] = aut.patterns();
assert_eq!(pats, &["apple", "maple"]);
}
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct SmallAscii(String);
impl Arbitrary for SmallAscii {
fn arbitrary<G: Gen>(g: &mut G) -> SmallAscii {
use std::char::from_u32;
SmallAscii((0..2)
.map(|_| from_u32(g.gen_range(97, 123)).unwrap())
.collect())
}
fn shrink(&self) -> Box<Iterator<Item=SmallAscii>> {
Box::new(self.0.shrink().map(SmallAscii))
}
}
impl From<SmallAscii> for String {
fn from(s: SmallAscii) -> String { s.0 }
}
impl AsRef<[u8]> for SmallAscii {
fn as_ref(&self) -> &[u8] { self.0.as_ref() }
}
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct BiasAscii(String);
impl Arbitrary for BiasAscii {
fn arbitrary<G: Gen>(g: &mut G) -> BiasAscii {
use std::char::from_u32;
let size = { let s = g.size(); g.gen_range(0, s) };
let mut s = String::with_capacity(size);
for _ in 0..size {
if g.gen_weighted_bool(3) {
s.push(char::arbitrary(g));
} else {
for _ in 0..5 {
s.push(from_u32(g.gen_range(97, 123)).unwrap());
}
}
}
BiasAscii(s)
}
fn shrink(&self) -> Box<Iterator<Item=BiasAscii>> {
Box::new(self.0.shrink().map(BiasAscii))
}
}
fn naive_find<S>(xs: &[S], haystack: &str) -> Vec<Match>
where S: Clone + Into<String> {
let needles: Vec<String> =
xs.to_vec().into_iter().map(Into::into).collect();
let mut matches = vec![];
for hi in 0..haystack.len() {
for (pati, needle) in needles.iter().enumerate() {
let needle = needle.as_bytes();
if needle.len() == 0 || needle.len() > haystack.len() - hi {
continue;
}
if needle == &haystack.as_bytes()[hi..hi+needle.len()] {
matches.push(Match {
pati: pati,
start: hi,
end: hi + needle.len(),
});
}
}
}
matches
}
#[test]
fn qc_ac_equals_naive() {
fn prop(needles: Vec<SmallAscii>, haystack: BiasAscii) -> bool {
let aut_matches = aut_findo(&needles, &haystack.0);
let naive_matches = naive_find(&needles, &haystack.0);
let aset: HashSet<Match> = aut_matches.iter().cloned().collect();
let nset: HashSet<Match> = naive_matches.iter().cloned().collect();
aset == nset
}
quickcheck(prop as fn(Vec<SmallAscii>, BiasAscii) -> bool);
}
}