[go: up one dir, main page]

twoway 0.1.1

Fast substring search for strings and byte strings. Optional SSE4.2 acceleration (requires nightly and cargo feature flag pcmp) using pcmpestri. Memchr is the only mandatory dependency. The two way algorithm is also used by rust's libstd itself, but here it is exposed both for byte strings, using memchr, and optionally using a SSE4.2 accelerated version.
Documentation
//! Boyer-Moore-Horspool
//!

extern crate memchr;
use std::cmp;

fn bmh_skip(pat: &[u8], skip: &mut [u16; 256]) {
    let pat_skip = cmp::min(pat.len(), u16::max_value() as usize) as u16;
    for entry in skip.iter_mut() {
        *entry = pat_skip;
    }

    for (index, &byte) in pat[..pat.len() - 1].iter().enumerate() {
        skip[byte as usize] = cmp::min(pat.len() - index - 1, u16::max_value() as usize) as u16;
    }
}

/// Boyer-Moore-Horspool substring search
pub fn find(text: &[u8], pat: &[u8]) -> Option<usize> {
    let mut skip = [0; 256];
    bmh_skip(pat, &mut skip);

    let pat_len = pat.len();

    if pat_len == 0 {
        return Some(0);
    }

    let pat_len_m1 = pat_len - 1;
    let pat_last = pat[pat_len - 1];

    // initial search by memchr
    let mut j = match memchr::memchr(pat[0], text) {
        Some(x) => x,
        None => return None,
    };

    while let Some(&c) = text.get(j + pat_len_m1) {
        // check the back character of the pattern
        if c == pat_last && &text[j..j + pat_len] == pat {
            return Some(j);
        }
        j += skip[c as usize] as usize;
    }
    None
}

#[test]
fn bmh_preprocess() {
    let mut skip = [0; 256];
    let needle = b"gcagagag";
    bmh_skip(needle, &mut skip);
    assert_eq!(skip[b'g' as usize], 2);
    assert_eq!(skip[b'c' as usize], 6);
    assert_eq!(skip[b'a' as usize], 1);
    assert_eq!(skip[b't' as usize], 8);
}

#[test]
fn bmh_find() {
    let text = b"abc";
    assert_eq!(find(text, b"d"), None);
    assert_eq!(find(text, b"c"), Some(2));

    let longer = "longer text and so on";

    // test all windows
    for wsz in 1..17 {
        for window in longer.as_bytes().windows(wsz) {
            let str_find = longer.find(::std::str::from_utf8(window).unwrap());
            assert!(str_find.is_some());
            assert_eq!(find(longer.as_bytes(), window), str_find);
        }
    }

    let pat = b"ger text and so on";
    assert!(pat.len() > 16);
    assert_eq!(Some(3), find(longer.as_bytes(), pat));
}