[go: up one dir, main page]

twoway 0.1.1

Fast substring search for strings and byte strings. Optional SSE4.2 acceleration (requires nightly and cargo feature flag pcmp) using pcmpestri. Memchr is the only mandatory dependency. The two way algorithm is also used by rust's libstd itself, but here it is exposed both for byte strings, using memchr, and optionally using a SSE4.2 accelerated version.
Documentation
use std::cmp;

/*
/// Find one out of a set of bytes, max 4.
pub fn set(text: &[u8], set: &[u8]) -> Option<usize> {
    None
}

/// Find two consecutive bytes
pub fn find2(text: &[u8], a: u8, b: u8) -> Option<usize> {
    None
}
*/

const LO_U64: u64 = 0x0101010101010101;
const HI_U64: u64 = 0x8080808080808080;

// use truncation
const LO_USIZE: usize = LO_U64 as usize;
const HI_USIZE: usize = HI_U64 as usize;

#[cfg(target_pointer_width = "32")]
const USIZE_BYTES: usize = 4;
#[cfg(target_pointer_width = "64")]
const USIZE_BYTES: usize = 8;

/// Return `true` if `x` contains any zero byte.
///
/// From *Matters Computational*, J. Arndt
///
/// "The idea is to subtract one from each of the bytes and then look for
/// bytes where the borrow propagated all the way to the most significant
/// bit."
#[inline]
fn contains_zero_byte(x: usize) -> bool {
    x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0
}

#[cfg(target_pointer_width = "32")]
#[inline]
fn repeat_byte(b: u8) -> usize {
    let mut rep = (b as usize) << 8 | b as usize;
    rep = rep << 16 | rep;
    rep
}

#[cfg(target_pointer_width = "64")]
#[inline]
fn repeat_byte(b: u8) -> usize {
    let mut rep = (b as usize) << 8 | b as usize;
    rep = rep << 16 | rep;
    rep = rep << 32 | rep;
    rep
}

pub fn find_byte(x: u8, text: &[u8]) -> Option<usize> {
    let len = text.len();
    let ptr = text.as_ptr();

    // search up to an aligned boundary
    let align = (ptr as usize) & (USIZE_BYTES - 1);
    let mut offset;
    if align > 0 {
        offset = cmp::min(USIZE_BYTES - align, len);
        if let Some(index) = text[..offset].iter().position(|elt| *elt == x) {
            return Some(index);
        }
    } else {
        offset = 0;
    }

    let repeated_x = repeat_byte(x);

    if len >= 2 * USIZE_BYTES {
        while offset <= len - 2 * USIZE_BYTES {
            unsafe {
                let u = *(ptr.offset(offset as isize) as *const usize);
                let v = *(ptr.offset((offset + USIZE_BYTES) as isize) as *const usize);

                // break if there is a matching byte
                let zu = contains_zero_byte(u ^ repeated_x);
                let zv = contains_zero_byte(v ^ repeated_x);
                if zu || zv {
                    break;
                }
            }
            offset += USIZE_BYTES * 2;
        }
    }

    // find the byte after the point the body loop stopped
    text[offset..].iter().position(|elt| *elt == x).map(|i| offset + i)
}

#[test]
fn test_find_byte() {
    let text = b"abcb";
    assert_eq!(find_byte(b'd', text), None);
    assert_eq!(find_byte(b'a', text), Some(0));
    assert_eq!(find_byte(b'c', text), Some(2));
    assert_eq!(find_byte(b'b', text), Some(1));

    let longer = "longer text and so on, a bit zebras and zw";
    assert_eq!(find_byte(b'z', longer.as_bytes()), longer.find('z'));
    assert_eq!(find_byte(b'w', longer.as_bytes()), longer.find('w'));
}

pub fn rfind_byte(x: u8, text: &[u8]) -> Option<usize> {
    // Scan for a single byte value by reading two `usize` words at a time.
    //
    // Split `text` in three parts
    // - unaligned tail, after the last word aligned address in text
    // - body, scan by 2 words at a time
    // - the first remaining bytes, < 2 word size
    let len = text.len();
    let ptr = text.as_ptr();

    // search up to a 16 byte aligned boundary
    let end_align = (ptr as usize + len) & (USIZE_BYTES - 1);
    let mut offset;
    if end_align > 0 {
        offset = len - cmp::min(USIZE_BYTES - end_align, len);
        if let Some(index) = text[offset..].iter().rposition(|elt| *elt == x) {
            return Some(offset + index);
        }
    } else {
        offset = len;
    }

    // search the body of the text
    let repeated_x = repeat_byte(x);

    while offset >= 2 * USIZE_BYTES {
        unsafe {
            let u = *(ptr.offset(offset as isize - 2 * USIZE_BYTES as isize) as *const usize);
            let v = *(ptr.offset(offset as isize - USIZE_BYTES as isize) as *const usize);

            // break if there is a matching byte
            let zu = contains_zero_byte(u ^ repeated_x);
            let zv = contains_zero_byte(v ^ repeated_x);
            if zu || zv {
                break;
            }
        }
        offset -= 2 * USIZE_BYTES;
    }

    // find the byte after the point the body loop stopped
    text[..offset].iter().rposition(|elt| *elt == x)
}

#[test]
fn test_rfind_byte() {
    assert_eq!(rfind_byte(b'x', b""), None);

    let text = b"abcb";
    assert_eq!(rfind_byte(b'd', text), None);
    assert_eq!(rfind_byte(b'a', text), Some(0));
    assert_eq!(rfind_byte(b'c', text), Some(2));
    assert_eq!(rfind_byte(b'b', text), Some(3));

    let longer = "loAAer text and yo on, a bit zebras and zw";
    assert_eq!(rfind_byte(b'z', longer.as_bytes()), longer.rfind('z'));
    assert_eq!(rfind_byte(b'w', longer.as_bytes()), longer.rfind('w'));
    assert_eq!(rfind_byte(b'y', longer.as_bytes()), longer.rfind('y'));
    assert_eq!(rfind_byte(b'A', longer.as_bytes()), longer.rfind('A'));
}