[go: up one dir, main page]

twoway 0.1.1

Fast substring search for strings and byte strings. Optional SSE4.2 acceleration (requires nightly and cargo feature flag pcmp) using pcmpestri. Memchr is the only mandatory dependency. The two way algorithm is also used by rust's libstd itself, but here it is exposed both for byte strings, using memchr, and optionally using a SSE4.2 accelerated version.
Documentation
//! 
//!
//! Two-way string matching
//!
//! http://monge.univ-mlv.fr/~mac/Articles-PDF/CP-1991-jacm.pdf
#![allow(dead_code)]

use std::str;
use std::cmp::max;
use std::ops::Index;

#[derive(Copy, Clone, Debug, PartialEq, PartialOrd)]
pub struct Str<'a>(pub &'a [u8]);

impl<'a> Str<'a> {
    fn len(&self) -> usize { self.0.len() }
    fn as_str(&self) -> &str { str::from_utf8(self.0).unwrap() }
}

impl<'a> Index<usize> for Str<'a> {
    type Output = u8;
    fn index(&self, ix: usize) -> &u8 {
        &self.0[ix - 1]
    }
}

// A note on a simple computation of the maximal suffix of a string
//  simpler, slower version of duval's algorithm
fn compute_max_suf_pos(w: &[u8]) -> usize {
    // using 0-based indexing throughout
    let n = w.len();
    let mut i = 0;
    let mut j = 1;
    while j < n {
        let mut k = 0;
        while j + k < n - 1 && w[i + k] == w[j + k] {
            k += 1;
        }
        if w[i + k] < w[j + k] {
            i += k + 1;
        } else {
            j += k + 1;
        }
        if i == j {
            j += 1;
        }
    }
    i
}

/// Jewels of Stringology: Text Algorithms
/// Maxsuf-and-Period(x)
fn maxsuf_and_period(x: &[u8]) -> (usize, usize) {
    let n = x.len();

    let mut s = 1;
    let mut i = 2;
    let mut p = 1;

    while i <= n {
        let r = (i - s) % p;
        if x[i - 1] == x[s + r - 1] {
            i += 1;
        } else if x[i - 1] < x[s + r - 1] {
            i += 1;
            p = i - s;
        } else {
            s = i - r;
            i = s;
            p = 1;
        }
    }
    (s - 1, p)
}

/// From paper on Two-way string matching
/// Return index, period
/// The returned index is zero-based!
fn maximal_suffix(x: Str, rev: bool) -> (usize, usize) {
    let n = x.len();

    let mut i = 0;
    let mut j = 1;
    let mut k = 1;
    let mut p = 1;

    while j + k <= n {
        let ap = x[i + k];
        let a = x[j + k];
        /*
        println!("trace: x={}, i={}, j={}, k={}, ap={}/{}, a={}/{}",
                 x.as_str(), i, j, k, ap as char, ap, a as char, a);
        */
        if (a < ap && !rev) || (a > ap && rev) {
            j += k;
            k = 1;
            p = j - i;
        } else if a == ap {
            if k == p {
                j += p;
                k = 1;
            } else {
                k += 1;
            }
        } else {
            i = j;
            j = i + 1;
            k = 1;
            p = 1;
        }
    }
    //println!("trace: x={}, i={}, p={}", x.as_str(), i, p);

    (i, p)
}

/// Return critical position, period.
/// critical position is zero-based
pub fn crit_period(x: Str) -> (usize, usize) {
    let (i, p) = maximal_suffix(x, false);
    let (j, q) = maximal_suffix(x, true);
    if i >= j {
        (i, p)
    } else {
        (j, q)
    }
}

/// From paper on Two-way string matching
///
/// **x**: pattern,
/// **t**: text,
pub fn find(x: Str, t: Str) {
    let n = x.len();
    // `l` critical position, zero-based
    // `p` period
    let (l, p) = crit_period(x);
    println!("crit, period = {:?}", (l, p));
    // x[1..l] is a suffix of x[l + 1..l + p]
    if l < n / 2 && x.0[..l] == x.0[p..p + l] {
        println!("short period case");
        let mut pos = 0;
        let mut s = 0; // `s` is the memory
        while pos + n <= t.len() {
            let mut i = max(l, s) + 1;
            while i <= n && x[i] == t[pos + i] {
                i += 1;
            }
            if i <= n {
                println!("vars i={}, l={}, s={}, p={}", i, l, s, p);
                pos += max(i - l, 1 + max(s, p) - p);
                s = 0;
            } else {
                let mut j = l;
                while j > s && x[j] == t[pos + j] {
                    j -= 1;
                }
                if j <= s {
                    println!("pos={} is a match!", pos);
                }
                pos += p;
                s = n - p;
            }
        }
    } else {
        println!("long period case");
        let q = max(l, n - l) + 1;
        let mut pos = 0;
        while pos + n <= t.len() {
            let mut i = l + 1;
            println!("vars i={}, l={}, p={}, pos={}", i, l, p, pos);
            while i <= n && x[i] == t[pos + i] {
                i += 1;
            }
            if i <= n {
                pos += i - l;
            } else {
                let mut j = l;
                while j > 0 && x[j] == t[pos + j] {
                    j -= 1;
                }
                if j == 0 {
                    // pos is a zero-based index..
                    println!("pos={} is a match", pos);
                }
                pos += q;
            }
        }
    }
}

pub fn find_(x: &str, y: &str) {
    find(Str(x.as_bytes()),
         Str(y.as_bytes()))
}

/// **x**: pattern,
/// **t**: text,
///
/// return the index of the first match (0-based index)
pub fn find_first(x: Str, t: Str) -> Option<usize> {
    let n = x.len();
    let (l, p) = crit_period(x);
    // x[1..l] is a suffix of x[l + 1..l + p]
    if l < n / 2 && x.0[..l] == x.0[p..p + l] {
        // short period case
        let mut pos = 0;
        let mut s = 0; // `s` is the memory
        while pos + n <= t.len() {
            let mut i = max(l, s) + 1;
            while i <= n && x[i] == t[pos + i] {
                i += 1;
            }
            if i <= n {
                pos += max(i - l, 1 + max(s, p) - p);
                s = 0;
            } else {
                let mut j = l;
                while j > s && x[j] == t[pos + j] {
                    j -= 1;
                }
                if j <= s {
                    return Some(pos);
                }
                pos += p;
                s = n - p;
            }
        }
    } else {
        // long period case
        let q = max(l, n - l) + 1;
        let mut pos = 0;
        while pos + n <= t.len() {
            let mut i = l + 1;
            while i <= n && x[i] == t[pos + i] {
                i += 1;
            }
            if i <= n {
                pos += i - l;
            } else {
                let mut j = l;
                while j > 0 && x[j] == t[pos + j] {
                    j -= 1;
                }
                if j == 0 {
                    return Some(pos);
                }
                pos += q;
            }
        }
    }
    None
}

#[test]
fn test_max() {
    assert_eq!((2, 1), maximal_suffix(Str(b"aab"), false));
    assert_eq!((0, 3), maximal_suffix(Str(b"aab"), true));

    assert_eq!((0, 3), maximal_suffix(Str(b"aabaa"), true));
    assert_eq!((2, 3), maximal_suffix(Str(b"aabaa"), false));

    assert_eq!((0, 7), maximal_suffix(Str(b"gcagagag"), false));
    assert_eq!((2, 2), maximal_suffix(Str(b"gcagagag"), true));
    assert_eq!((2, 2), crit_period(Str(b"gcagagag")));

    assert_eq!((2, 1), crit_period(Str(b"aab")));
    assert_eq!((2, 3), crit_period(Str(b"aabaa")));

    assert_eq!((2, 4), crit_period(Str(b"abaaaba")));

    // both of these factorizations are critial factorizations
    assert_eq!((2, 2), crit_period(Str(b"banana")));
    assert_eq!((1, 2), crit_period(Str(b"zanana")));

    assert_eq!((10, 1), crit_period(Str(b"caaaaaaaaacc")));

    assert_eq!((2, 3), crit_period(Str(b"aabaab")));
    assert_eq!((1, 3), crit_period(Str(b"baabaa")));

    assert_eq!((2, 3), crit_period(Str(b"babbab")));

    // NOTE: returns "long period" case per = 2, which is an approximation
    assert_eq!((2, 2), crit_period(Str(b"abca")));
    assert_eq!((1, 3), crit_period(Str(b"acba")));
}
#[test]
fn test_find() {
    find_("aab", "aaable");
    find_("aab", "aaabaabe");
    find_("aaaa", "boyaaarateaaaade");
    assert_eq!(Some(10), find_first(Str(b"aaaa"), Str(b"boyaaarateaaaade")));

    find_("aab", "bbbbbbbbb");
    find_("aabaa", "aabaaabaaabaa");

    assert_eq!(find_first(Str(b"ababab"), Str(b"cbcbabababab")), Some(4));
    // this was a bug (typo) in the short period case
    assert_eq!(find_first(Str(b"aabaab"), Str(b"abbaab")), None);
}

#[test]
fn test_max_suf_pos() {
    assert_eq!(2, compute_max_suf_pos((b"aab")));

    assert_eq!(2, compute_max_suf_pos((b"aabaa")));

    assert_eq!(0, compute_max_suf_pos((b"gcagagag")));
    assert_eq!(2, compute_max_suf_pos((b"banana")));
}

#[test]
fn test_maxsuf_and_period() {
    assert_eq!((2, 1), maxsuf_and_period((b"aab")));
    assert_eq!((2, 3), maxsuf_and_period((b"aabaa")));
    assert_eq!((0, 7), maxsuf_and_period((b"gcagagag")));
}

/*
#[test]
fn slow() {
    let needle   = (0..100).map(|_| "b").collect::<String>();
    let heystack = (0..100_000).map(|_| "a").collect::<String>();

    println!("Data ready.");

    for _ in 0..100 {
        if find_first(Str(heystack.as_bytes()), Str(needle.as_bytes())).is_none() {
            // Stuff...
        }
    }
}
*/