use std::cmp;
use std::fmt;
use std::panic::{RefUnwindSafe, UnwindSafe};
use std::u8;
use memchr::{memchr, memchr2, memchr3};
use ahocorasick::MatchKind;
use packed;
use Match;
/// A candidate position reported by a prefilter scan of the haystack.
#[derive(Clone, Debug)]
pub enum Candidate {
    /// No candidate was found; the remainder of the haystack cannot
    /// participate in any match.
    None,
    /// A confirmed match. Only prefilters whose `reports_false_positives`
    /// returns `false` may produce this variant.
    Match(Match),
    /// A position at which a match *might* start; the caller must verify it
    /// with the full automaton.
    PossibleStartOfMatch(usize),
}

impl Candidate {
    /// Convert this candidate into the start offset it refers to, or `None`
    /// when there is no candidate.
    pub fn into_option(self) -> Option<usize> {
        match self {
            Candidate::None => None,
            Candidate::Match(ref m) => Some(m.start()),
            Candidate::PossibleStartOfMatch(start) => Some(start),
        }
    }
}
/// A prefilter describes a fast literal scanner used to quickly skip over
/// positions in the haystack that cannot possibly lead to a match.
pub trait Prefilter:
    Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug
{
    /// Return the next candidate at or after `at`, or `Candidate::None` if
    /// the rest of the haystack cannot match. Implementations may record
    /// progress in `state`.
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate;

    /// Clone this prefilter into a boxed trait object (an object-safe
    /// substitute for `Clone`).
    fn clone_prefilter(&self) -> Box<dyn Prefilter>;

    /// The amount of heap memory used by this prefilter, in bytes.
    fn heap_bytes(&self) -> usize;

    /// Whether this prefilter may report positions that turn out not to be
    /// matches. When this returns `false`, the prefilter only ever returns
    /// `Candidate::None` or `Candidate::Match`.
    fn reports_false_positives(&self) -> bool {
        true
    }
}
// Forward every trait method through a reference, so that a `&P` can be
// used anywhere a `Prefilter` value is expected.
impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
    #[inline]
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        (**self).next_candidate(state, haystack, at)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        (**self).clone_prefilter()
    }

    fn heap_bytes(&self) -> usize {
        (**self).heap_bytes()
    }

    fn reports_false_positives(&self) -> bool {
        (**self).reports_false_positives()
    }
}
/// A clonable, owning wrapper around a boxed prefilter trait object.
#[derive(Debug)]
pub struct PrefilterObj(Box<dyn Prefilter>);

impl Clone for PrefilterObj {
    // `Box<dyn Prefilter>` cannot derive `Clone`, so clone through the
    // object-safe `clone_prefilter` method instead.
    fn clone(&self) -> Self {
        PrefilterObj(self.0.clone_prefilter())
    }
}

impl PrefilterObj {
    /// Wrap any concrete prefilter in a `PrefilterObj`.
    pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj {
        PrefilterObj(Box::new(t))
    }

    /// Borrow the underlying prefilter as a trait object.
    pub fn as_ref(&self) -> &dyn Prefilter {
        &*self.0
    }
}
/// Tracks how effective a prefilter has been so far, so that a prefilter
/// which is not actually skipping many bytes can be permanently disabled.
#[derive(Clone, Debug)]
pub struct PrefilterState {
    /// The number of prefilter scans executed so far.
    skips: usize,
    /// The total number of bytes those scans skipped over.
    skipped: usize,
    /// The length of the longest pattern; used to judge whether the average
    /// skip length is worthwhile.
    max_match_len: usize,
    /// Once set, the prefilter is considered ineffective and is never
    /// consulted again.
    inert: bool,
    /// The furthest position any scan has reached; used to avoid re-running
    /// the prefilter over ground it has already covered.
    last_scan_at: usize,
}

impl PrefilterState {
    /// The minimum number of skips to record before judging effectiveness.
    const MIN_SKIPS: usize = 40;

    /// The minimum acceptable average skip length, as a multiple of
    /// `max_match_len`.
    const MIN_AVG_FACTOR: usize = 2;

    /// Create a fresh state for patterns whose longest length is
    /// `max_match_len`.
    pub fn new(max_match_len: usize) -> PrefilterState {
        PrefilterState {
            skips: 0,
            skipped: 0,
            max_match_len,
            inert: false,
            last_scan_at: 0,
        }
    }

    /// Record one more scan that skipped `skipped` bytes.
    #[inline]
    fn update_skipped_bytes(&mut self, skipped: usize) {
        self.skips += 1;
        self.skipped += skipped;
    }

    /// Advance the recorded scan position; never moves it backwards.
    #[inline]
    fn update_at(&mut self, at: usize) {
        if at > self.last_scan_at {
            self.last_scan_at = at;
        }
    }

    /// Return true if and only if the prefilter should be used at position
    /// `at`. Once enough scans have been recorded with a poor average skip
    /// length, this permanently returns false.
    #[inline]
    pub fn is_effective(&mut self, at: usize) -> bool {
        if self.inert {
            return false;
        }
        // Don't re-scan ground the prefilter already covered.
        if at < self.last_scan_at {
            return false;
        }
        // Too little data to judge; assume effective.
        if self.skips < PrefilterState::MIN_SKIPS {
            return true;
        }
        let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len;
        if self.skipped >= min_avg * self.skips {
            return true;
        }
        // Average skip is too short: disable the prefilter for good.
        self.inert = true;
        false
    }
}
/// A builder that heuristically selects the best available prefilter for a
/// set of patterns.
#[derive(Debug)]
pub struct Builder {
    /// The number of patterns added so far.
    count: usize,
    /// Whether bytes should also be registered in their opposite ASCII case.
    ascii_case_insensitive: bool,
    /// Candidate: a prefilter on the first byte of each pattern.
    start_bytes: StartBytesBuilder,
    /// Candidate: a prefilter on the rarest byte of each pattern.
    rare_bytes: RareBytesBuilder,
    /// Fallback: a packed multi-substring searcher, when the match kind
    /// supports it.
    packed: Option<packed::Builder>,
}

impl Builder {
    /// Create a new prefilter builder. A packed fallback is only prepared
    /// for match kinds that `as_packed` supports.
    pub fn new(kind: MatchKind) -> Builder {
        let pbuilder = kind
            .as_packed()
            .map(|kind| packed::Config::new().match_kind(kind).builder());
        Builder {
            count: 0,
            ascii_case_insensitive: false,
            start_bytes: StartBytesBuilder::new(),
            rare_bytes: RareBytesBuilder::new(),
            packed: pbuilder,
        }
    }

    /// Enable ASCII case insensitivity; forwarded to both byte-based
    /// candidate builders.
    pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
        self.ascii_case_insensitive = yes;
        self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
        self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
        self
    }

    /// Build the best prefilter available for the patterns added so far,
    /// if any.
    pub fn build(&self) -> Option<PrefilterObj> {
        match (self.start_bytes.build(), self.rare_bytes.build()) {
            // Both byte-based prefilters are available: prefer the
            // start-byte one when it watches fewer bytes, or when its bytes
            // are not much more common (rank within a fudge factor of 50).
            (prestart @ Some(_), prerare @ Some(_)) => {
                let has_fewer_bytes =
                    self.start_bytes.count < self.rare_bytes.count;
                let has_rarer_bytes =
                    self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
                if has_fewer_bytes || has_rarer_bytes {
                    prestart
                } else {
                    prerare
                }
            }
            (prestart @ Some(_), None) => prestart,
            (None, prerare @ Some(_)) => prerare,
            // NOTE(review): the packed fallback is skipped under ASCII case
            // insensitivity — presumably because the packed searcher does
            // not support it; confirm against the `packed` module.
            (None, None) if self.ascii_case_insensitive => None,
            (None, None) => self
                .packed
                .as_ref()
                .and_then(|b| b.build())
                .map(|s| PrefilterObj::new(Packed(s))),
        }
    }

    /// Add a pattern, feeding it to every candidate prefilter builder.
    pub fn add(&mut self, bytes: &[u8]) {
        self.count += 1;
        self.start_bytes.add(bytes);
        self.rare_bytes.add(bytes);
        if let Some(ref mut pbuilder) = self.packed {
            pbuilder.add(bytes);
        }
    }
}
/// A prefilter backed by a packed multi-substring searcher. Unlike the
/// byte-oriented prefilters, a hit from this searcher is a real match.
#[derive(Clone, Debug)]
struct Packed(packed::Searcher);

impl Prefilter for Packed {
    fn next_candidate(
        &self,
        _state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        self.0.find_at(haystack, at).map_or(Candidate::None, Candidate::Match)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        self.0.heap_bytes()
    }

    // Hits from the packed searcher are confirmed matches, never guesses.
    fn reports_false_positives(&self) -> bool {
        false
    }
}
/// A builder for a prefilter that scans for the "rarest" byte of each
/// pattern (at most 3 distinct bytes in total).
#[derive(Clone, Debug)]
struct RareBytesBuilder {
    /// Whether to also register the opposite ASCII case of each rare byte.
    ascii_case_insensitive: bool,
    /// For each byte value, the maximum offset at which it was chosen as a
    /// rare byte in any pattern; used to rewind from a hit to a candidate
    /// match start.
    byte_offsets: RareByteOffsets,
    /// Whether building is still possible; cleared when the byte budget or
    /// representable offset range is exceeded.
    available: bool,
    /// The number of distinct rare bytes registered so far.
    count: usize,
    /// The sum of the frequency ranks of registered bytes; lower is rarer.
    rank_sum: u16,
}
/// A table of per-byte offsets, indexed by byte value.
#[derive(Clone, Copy)]
struct RareByteOffsets {
    /// Each entry holds the maximum offset at which the corresponding byte
    /// occurs across all patterns in which it was chosen as a rare byte.
    set: [RareByteOffset; 256],
}

impl RareByteOffsets {
    /// Create an empty table (every entry inactive).
    pub fn empty() -> RareByteOffsets {
        RareByteOffsets { set: [RareByteOffset::default(); 256] }
    }

    /// Register `byte` at offset `off`. Returns true if and only if the
    /// byte was not previously registered. When it was, the stored offset
    /// is raised to `off` if larger, and false is returned.
    pub fn apply(&mut self, byte: u8, off: RareByteOffset) -> bool {
        // Callers must never pass the inactive sentinel.
        assert!(off.is_active());
        let existing = &mut self.set[byte as usize];
        if !existing.is_active() {
            *existing = off;
            true
        } else {
            // Keep the maximum offset seen, so rewinding from a hit never
            // lands after the true start of a match.
            if existing.max < off.max {
                *existing = off;
            }
            false
        }
    }
}
impl fmt::Debug for RareByteOffsets {
    // Show only the active entries; printing all 256 slots would be noise.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let active: Vec<_> =
            self.set.iter().filter(|off| off.is_active()).collect();
        f.debug_struct("RareByteOffsets").field("set", &active).finish()
    }
}
/// The offset information recorded for a single rare byte.
#[derive(Clone, Copy, Debug)]
struct RareByteOffset {
    /// The maximum offset at which this byte occurs from the start of any
    /// pattern. The value `u8::MAX` is a sentinel meaning "inactive", so
    /// representable offsets are 0..=254.
    max: u8,
}

impl Default for RareByteOffset {
    // The default entry is the inactive sentinel.
    fn default() -> RareByteOffset {
        RareByteOffset { max: u8::MAX }
    }
}

impl RareByteOffset {
    /// Create a new offset. If `max` is too large to represent (it would
    /// collide with the sentinel), an inactive offset is returned, which
    /// callers treat as "give up on this prefilter".
    fn new(max: usize) -> RareByteOffset {
        if max > (u8::MAX - 1) as usize {
            RareByteOffset::default()
        } else {
            RareByteOffset { max: max as u8 }
        }
    }

    /// Whether this entry holds a real offset rather than the sentinel.
    fn is_active(&self) -> bool {
        self.max < u8::MAX
    }
}
impl RareBytesBuilder {
    /// Create a new builder for a rare byte prefilter.
    fn new() -> RareBytesBuilder {
        RareBytesBuilder {
            ascii_case_insensitive: false,
            byte_offsets: RareByteOffsets::empty(),
            available: true,
            count: 0,
            rank_sum: 0,
        }
    }

    /// Enable ASCII case insensitivity. When enabled, the opposite ASCII
    /// case of each chosen rare byte is registered as well.
    fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
        self.ascii_case_insensitive = yes;
        self
    }

    /// Build the rare byte prefilter, if possible: building must not have
    /// been abandoned and at most 3 distinct bytes may be registered.
    fn build(&self) -> Option<PrefilterObj> {
        if !self.available || self.count > 3 {
            return None;
        }
        // Collect the (at most 3) active bytes from the offset table.
        let (mut bytes, mut len) = ([0; 3], 0);
        for b in 0..256 {
            if self.byte_offsets.set[b].is_active() {
                bytes[len] = b as u8;
                len += 1;
            }
        }
        match len {
            0 => None,
            1 => Some(PrefilterObj::new(RareBytesOne {
                byte1: bytes[0],
                offset: self.byte_offsets.set[bytes[0] as usize],
            })),
            2 => Some(PrefilterObj::new(RareBytesTwo {
                offsets: self.byte_offsets,
                byte1: bytes[0],
                byte2: bytes[1],
            })),
            3 => Some(PrefilterObj::new(RareBytesThree {
                offsets: self.byte_offsets,
                byte1: bytes[0],
                byte2: bytes[1],
                byte3: bytes[2],
            })),
            // `count > 3` was rejected above, so len cannot exceed 3.
            _ => unreachable!(),
        }
    }

    /// Add a pattern, choosing a single rare byte (and its offset within
    /// the pattern) to represent it.
    fn add(&mut self, bytes: &[u8]) {
        // The budget of distinct bytes is blown: stop building entirely.
        if self.count > 3 {
            self.available = false;
            return;
        }
        // Track the rarest byte seen so far as (byte, position, rank).
        // An empty pattern contributes nothing.
        let mut rarest = match bytes.get(0) {
            None => return,
            Some(&b) => (b, 0, freq_rank(b)),
        };
        for (pos, &b) in bytes.iter().enumerate() {
            // If some byte of this pattern is already registered, reuse it
            // (possibly raising its offset) rather than spending budget on
            // a new byte.
            if self.byte_offsets.set[b as usize].is_active() {
                self.add_rare_byte(b, pos);
                return;
            }
            let rank = freq_rank(b);
            if rank < rarest.2 {
                rarest = (b, pos, rank);
            }
        }
        self.add_rare_byte(rarest.0, rarest.1);
    }

    /// Register `byte` at offset `pos`, along with its opposite ASCII case
    /// when case insensitivity is enabled.
    fn add_rare_byte(&mut self, byte: u8, pos: usize) {
        self.add_one_byte(byte, pos);
        if self.ascii_case_insensitive {
            self.add_one_byte(opposite_ascii_case(byte), pos);
        }
    }

    /// Register one byte/offset pair, updating the budget accounting when
    /// the byte is new.
    fn add_one_byte(&mut self, byte: u8, pos: usize) {
        let off = RareByteOffset::new(pos);
        // An offset too large to represent makes the table unusable.
        if !off.is_active() {
            self.available = false;
            return;
        }
        if self.byte_offsets.apply(byte, off) {
            self.count += 1;
            self.rank_sum += freq_rank(byte) as u16;
        }
    }
}
/// A prefilter that scans for a single "rare" byte.
#[derive(Clone, Debug)]
struct RareBytesOne {
    byte1: u8,
    /// The maximum offset of `byte1` within any pattern; a hit is rewound
    /// by this much to produce a conservative candidate start.
    offset: RareByteOffset,
}

impl Prefilter for RareBytesOne {
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        memchr(self.byte1, &haystack[at..])
            .map(|i| {
                let pos = at + i;
                // Use the guarded `update_at` (as RareBytesTwo/Three do)
                // instead of assigning `state.last_scan_at` directly: a
                // direct store could move the recorded scan position
                // backwards if this is ever called with an earlier `at`.
                state.update_at(pos);
                // Rewind by the byte's maximum in-pattern offset, clamped
                // so we never report a candidate before `at`.
                cmp::max(at, pos.saturating_sub(self.offset.max as usize))
            })
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}
/// A prefilter that scans for either of two "rare" bytes.
#[derive(Clone, Debug)]
struct RareBytesTwo {
    offsets: RareByteOffsets,
    byte1: u8,
    byte2: u8,
}

impl Prefilter for RareBytesTwo {
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        // Locate the next occurrence of either byte, then rewind by that
        // byte's maximum in-pattern offset (clamped to `at`) to obtain a
        // conservative candidate start.
        match memchr2(self.byte1, self.byte2, &haystack[at..]) {
            None => Candidate::None,
            Some(i) => {
                let found = at + i;
                state.update_at(found);
                let back = self.offsets.set[haystack[found] as usize].max;
                let start = cmp::max(at, found.saturating_sub(back as usize));
                Candidate::PossibleStartOfMatch(start)
            }
        }
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}
/// A prefilter that scans for any of three "rare" bytes.
#[derive(Clone, Debug)]
struct RareBytesThree {
    offsets: RareByteOffsets,
    byte1: u8,
    byte2: u8,
    byte3: u8,
}

impl Prefilter for RareBytesThree {
    fn next_candidate(
        &self,
        state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        // Locate the next occurrence of any of the three bytes, then rewind
        // by that byte's maximum in-pattern offset (clamped to `at`) to
        // obtain a conservative candidate start.
        match memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..]) {
            None => Candidate::None,
            Some(i) => {
                let found = at + i;
                state.update_at(found);
                let back = self.offsets.set[haystack[found] as usize].max;
                let start = cmp::max(at, found.saturating_sub(back as usize));
                Candidate::PossibleStartOfMatch(start)
            }
        }
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}
/// A builder for a prefilter that scans for the first byte of each pattern
/// (at most 3 distinct bytes, all ASCII).
#[derive(Clone, Debug)]
struct StartBytesBuilder {
    /// Whether to also register the opposite ASCII case of each start byte.
    ascii_case_insensitive: bool,
    /// For each byte value, whether it begins some pattern.
    byteset: Vec<bool>,
    /// The number of distinct start bytes registered so far.
    count: usize,
    /// The sum of the frequency ranks of registered bytes; lower is rarer.
    rank_sum: u16,
}

impl StartBytesBuilder {
    /// Create a new builder for a start byte prefilter.
    fn new() -> StartBytesBuilder {
        StartBytesBuilder {
            ascii_case_insensitive: false,
            byteset: vec![false; 256],
            count: 0,
            rank_sum: 0,
        }
    }

    /// Enable ASCII case insensitivity. When enabled, the opposite ASCII
    /// case of each start byte is registered as well.
    fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
        self.ascii_case_insensitive = yes;
        self
    }

    /// Build the start byte prefilter, if at most 3 distinct (ASCII) start
    /// bytes were registered.
    fn build(&self) -> Option<PrefilterObj> {
        if self.count > 3 {
            return None;
        }
        let (mut bytes, mut len) = ([0; 3], 0);
        for b in 0..256 {
            if !self.byteset[b] {
                continue;
            }
            // Refuse non-ASCII start bytes entirely.
            // NOTE(review): presumably because a lone byte >= 0x80 is a
            // UTF-8 code unit shared by many codepoints, making it a poor
            // discriminator — confirm rationale.
            if b > 0x7F {
                return None;
            }
            bytes[len] = b as u8;
            len += 1;
        }
        match len {
            0 => None,
            1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })),
            2 => Some(PrefilterObj::new(StartBytesTwo {
                byte1: bytes[0],
                byte2: bytes[1],
            })),
            3 => Some(PrefilterObj::new(StartBytesThree {
                byte1: bytes[0],
                byte2: bytes[1],
                byte3: bytes[2],
            })),
            // `count > 3` was rejected above, so len cannot exceed 3.
            _ => unreachable!(),
        }
    }

    /// Add a pattern, recording its first byte (and its opposite ASCII
    /// case when enabled). Empty patterns contribute nothing.
    fn add(&mut self, bytes: &[u8]) {
        // Already over budget; nothing more can help.
        if self.count > 3 {
            return;
        }
        if let Some(&byte) = bytes.get(0) {
            self.add_one_byte(byte);
            if self.ascii_case_insensitive {
                self.add_one_byte(opposite_ascii_case(byte));
            }
        }
    }

    /// Record a single start byte, updating budget accounting the first
    /// time it is seen.
    fn add_one_byte(&mut self, byte: u8) {
        if !self.byteset[byte as usize] {
            self.byteset[byte as usize] = true;
            self.count += 1;
            self.rank_sum += freq_rank(byte) as u16;
        }
    }
}
/// A prefilter that scans for a single starting byte.
#[derive(Clone, Debug)]
struct StartBytesOne {
    byte1: u8,
}

impl Prefilter for StartBytesOne {
    fn next_candidate(
        &self,
        _state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        // Every hit is itself a possible match start; no rewinding needed.
        match memchr(self.byte1, &haystack[at..]) {
            Some(i) => Candidate::PossibleStartOfMatch(at + i),
            None => Candidate::None,
        }
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}
/// A prefilter that scans for either of two starting bytes.
#[derive(Clone, Debug)]
struct StartBytesTwo {
    byte1: u8,
    byte2: u8,
}

impl Prefilter for StartBytesTwo {
    fn next_candidate(
        &self,
        _state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        // Every hit is itself a possible match start; no rewinding needed.
        match memchr2(self.byte1, self.byte2, &haystack[at..]) {
            Some(i) => Candidate::PossibleStartOfMatch(at + i),
            None => Candidate::None,
        }
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}
/// A prefilter that scans for any of three starting bytes.
#[derive(Clone, Debug)]
struct StartBytesThree {
    byte1: u8,
    byte2: u8,
    byte3: u8,
}

impl Prefilter for StartBytesThree {
    fn next_candidate(
        &self,
        _state: &mut PrefilterState,
        haystack: &[u8],
        at: usize,
    ) -> Candidate {
        // Every hit is itself a possible match start; no rewinding needed.
        match memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..]) {
            Some(i) => Candidate::PossibleStartOfMatch(at + i),
            None => Candidate::None,
        }
    }

    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
        Box::new(self.clone())
    }

    fn heap_bytes(&self) -> usize {
        0
    }
}
/// Run the given prefilter from position `at`, crediting the prefilter
/// state with however many bytes the scan allowed us to skip.
#[inline]
pub fn next<P: Prefilter>(
    prestate: &mut PrefilterState,
    prefilter: P,
    haystack: &[u8],
    at: usize,
) -> Candidate {
    let cand = prefilter.next_candidate(prestate, haystack, at);
    // Everything between `at` and the reported position — or the end of
    // the haystack when there is no candidate — was skipped.
    let skipped = match cand {
        Candidate::None => haystack.len() - at,
        Candidate::Match(ref m) => m.start() - at,
        Candidate::PossibleStartOfMatch(i) => i - at,
    };
    prestate.update_skipped_bytes(skipped);
    cand
}
/// Map an ASCII letter to its opposite-case counterpart; every other byte
/// is returned unchanged.
pub fn opposite_ascii_case(b: u8) -> u8 {
    match b {
        b'A'..=b'Z' => b.to_ascii_lowercase(),
        b'a'..=b'z' => b.to_ascii_uppercase(),
        _ => b,
    }
}
/// Return the heuristic frequency rank of the given byte from the crate's
/// precomputed table. A lower rank means the byte is expected to occur
/// less frequently in typical haystacks.
fn freq_rank(b: u8) -> u8 {
    use byte_frequencies::BYTE_FREQUENCIES;
    BYTE_FREQUENCIES[b as usize]
}
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test: building a prefilter for two patterns that share bytes
    // must succeed (the `unwrap` panics otherwise). The println output is
    // only useful when running tests with --nocapture.
    #[test]
    fn scratch() {
        let mut b = Builder::new(MatchKind::LeftmostFirst);
        b.add(b"Sherlock");
        b.add(b"locjaw");
        let s = b.build().unwrap();
        println!("{:?}", s);
    }
}