use std::collections::HashMap;
use std::fmt;
use std::mem;
use exec::Search;
use prog::{Inst, Program};
use sparse::SparseSet;
const CACHE_LIMIT: usize = 2 * (1<<20);
pub fn can_exec(insts: &Program) -> bool {
use prog::Inst::*;
use prog::EmptyLook::*;
if insts.len() > ::std::u32::MAX as usize {
return false;
}
for inst in insts {
match *inst {
Char(_) | Ranges(_) => return false,
EmptyLook(ref inst) => {
match inst.look {
WordBoundary | NotWordBoundary => return false,
StartLine | EndLine | StartText | EndText => {}
}
}
Match(_) | Save(_) | Split(_) | Bytes(_) => {}
}
}
true
}
#[derive(Debug)]
pub struct DfaCache {
compiled: HashMap<StateKey, StatePtr>,
states: Vec<State>,
start_states: Vec<StatePtr>,
stack: Vec<InstPtr>,
qcur: SparseSet,
qnext: SparseSet,
}
#[derive(Debug)]
pub struct Dfa<'a, 'b, 'c: 'b, 'm: 'b> {
prog: &'a Program,
start: StatePtr,
search: &'b mut Search<'c, 'm>,
compiled: &'a mut HashMap<StateKey, StatePtr>,
states: &'a mut Vec<State>,
start_states: &'a mut Vec<StatePtr>,
stack: &'a mut Vec<InstPtr>,
}
#[derive(Clone)]
struct State {
next: Vec<StatePtr>,
insts: Vec<InstPtr>,
is_match: bool,
inst_flags: Flags,
}
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
struct StateKey {
insts: Vec<InstPtr>,
is_match: bool,
}
type InstPtr = u32;
type StatePtr = u32;
const STATE_UNKNOWN: StatePtr = 0;
const STATE_DEAD: StatePtr = 1;
#[derive(Copy, Clone, Debug)]
struct Byte(u16);
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
struct Flags(u8);
impl DfaCache {
pub fn new() -> Self {
DfaCache {
compiled: HashMap::new(),
states: vec![State::invalid(), State::invalid()],
start_states: vec![STATE_UNKNOWN; 256],
stack: vec![],
qcur: SparseSet::new(0),
qnext: SparseSet::new(0),
}
}
fn resize(&mut self, num_insts: usize) {
if num_insts == self.qcur.capacity() {
return;
}
self.qcur = SparseSet::new(num_insts);
self.qnext = SparseSet::new(num_insts);
}
}
impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> {
pub fn exec(
prog: &'a Program,
search: &'b mut Search<'c, 'm>,
text: &[u8],
at: usize,
) -> bool {
let mut _cache = prog.cache_dfa();
let mut cache = &mut **_cache;
cache.resize(prog.len());
let mut dfa = Dfa {
prog: prog,
start: 0, search: search,
compiled: &mut cache.compiled,
states: &mut cache.states,
start_states: &mut cache.start_states,
stack: &mut cache.stack,
};
dfa.start = match dfa.start_state(&mut cache.qcur, text, at) {
STATE_DEAD => return false,
si => si,
};
debug_assert!(dfa.start != STATE_UNKNOWN);
let matched = if prog.is_reverse {
dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text, at)
} else {
dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text, at)
};
if matched && dfa.search.matches.len() <= 1 {
dfa.search.set_match(0);
}
matched
}
fn exec_at(
&mut self,
qcur: &mut SparseSet,
qnext: &mut SparseSet,
text: &[u8],
at: usize,
) -> bool {
debug_assert!(!self.prog.is_reverse);
let (mut si, mut i, mut matched) = (self.start, at, false);
while i < text.len() {
if !self.prog.prefixes.is_empty() && si == self.start {
i = match self.prefix_at(text, i) {
None => return false,
Some(i) => i,
};
}
let cls = self.prog.byte_classes[text[i] as usize];
let mut next_si = self.states[si as usize].next[cls as usize];
if next_si <= STATE_DEAD {
if next_si == STATE_DEAD {
return matched;
}
next_si = self.exec_byte(qcur, qnext, si, Byte::byte(text[i]));
debug_assert!(next_si != STATE_UNKNOWN);
if next_si == STATE_DEAD {
return matched;
}
}
si = next_si;
if self.states[si as usize].is_match {
if self.search.quit_after_first_match() {
return true;
}
matched = true;
self.search.set_end(Some(i));
}
i += 1;
}
si = self.next_state(qcur, qnext, si, Byte::eof());
debug_assert!(si != STATE_UNKNOWN);
if si == STATE_DEAD {
return matched;
}
if self.states[si as usize].is_match {
if self.search.quit_after_first_match() {
return true;
}
matched = true;
self.search.set_end(Some(text.len()));
}
if matched && self.search.matches.len() != 1 {
for &ip in &self.states[si as usize].insts {
if let Inst::Match(slot) = self.prog[ip as usize] {
self.search.set_match(slot);
}
}
}
matched
}
fn exec_at_reverse(
&mut self,
qcur: &mut SparseSet,
qnext: &mut SparseSet,
text: &[u8],
at: usize,
) -> bool {
debug_assert!(self.prog.is_reverse);
let (mut si, mut i, mut matched) = (self.start, at, false);
while i > 0 {
i -= 1;
let cls = self.prog.byte_classes[text[i] as usize];
let mut next_si = self.states[si as usize].next[cls as usize];
if next_si <= STATE_DEAD {
if next_si == STATE_DEAD {
return matched;
}
next_si = self.exec_byte(qcur, qnext, si, Byte::byte(text[i]));
debug_assert!(next_si != STATE_UNKNOWN);
if next_si == STATE_DEAD {
return matched;
}
}
si = next_si;
if self.states[si as usize].is_match {
if self.search.quit_after_first_match() {
return true;
}
matched = true;
self.search.set_start(Some(i+1));
}
}
si = self.next_state(qcur, qnext, si, Byte::eof());
debug_assert!(si != STATE_UNKNOWN);
if si == STATE_DEAD {
return matched;
}
if self.states[si as usize].is_match {
if self.search.quit_after_first_match() {
return true;
}
matched = true;
self.search.set_start(Some(0));
}
matched
}
fn exec_byte(
&mut self,
qcur: &mut SparseSet,
qnext: &mut SparseSet,
mut si: StatePtr,
b: Byte,
) -> StatePtr {
use prog::Inst::*;
qcur.clear();
for &ip in &self.states[si as usize].insts {
qcur.add(ip as usize);
}
if self.states[si as usize].inst_flags.has_non_match_flags() {
let mut flags = Flags::new();
if b.is_eof() {
flags.set_end(true).set_end_line(true);
} else if b.as_byte().map_or(false, |b| b == b'\n') {
flags.set_end_line(true);
}
qnext.clear();
for &ip in &*qcur {
self.follow_epsilons(usize_to_u32(ip), qnext, flags);
}
mem::swap(qcur, qnext);
}
let mut flags = Flags::new();
if b.as_byte().map_or(false, |b| b == b'\n') {
flags.set_start_line(true);
}
qnext.clear();
for &ip in &*qcur {
match self.prog[ip as usize] {
Char(_) | Ranges(_) => unreachable!(),
Save(_) | Split(_) | EmptyLook(_) => {}
Match(_) => {
flags.set_match(true);
if !self.continue_past_first_match() {
break;
} else if self.search.matches.len() != 1 {
if !qnext.contains_ip(ip as usize) {
qnext.add(ip);
}
}
}
Bytes(ref inst) => {
if b.as_byte().map_or(false, |b| inst.matches(b)) {
self.follow_epsilons(
inst.goto as InstPtr, qnext, flags);
}
}
}
}
let mut cache = true;
if b.is_eof() && self.search.matches.len() != 1 {
mem::swap(qcur, qnext);
cache = false;
}
let next = self.cached_state(qnext, flags.is_match(), Some(&mut si));
debug_assert!(next != STATE_UNKNOWN);
let cls = self.byte_class(b);
if cache {
self.states[si as usize].next[cls] = next;
}
next
}
fn follow_epsilons(
&mut self,
ip: InstPtr,
q: &mut SparseSet,
flags: Flags,
) {
use prog::Inst::*;
use prog::EmptyLook::*;
self.stack.push(ip);
while let Some(ip) = self.stack.pop() {
if q.contains_ip(ip as usize) {
continue;
}
q.add(ip as usize);
match self.prog[ip as usize] {
Char(_) | Ranges(_) => unreachable!(),
Match(_) | Bytes(_) => {}
EmptyLook(ref inst) => {
match inst.look {
StartLine if flags.is_start_line() => {
self.stack.push(inst.goto as InstPtr);
}
EndLine if flags.is_end_line() => {
self.stack.push(inst.goto as InstPtr);
}
StartText if flags.is_start() => {
self.stack.push(inst.goto as InstPtr);
}
EndText if flags.is_end() => {
self.stack.push(inst.goto as InstPtr);
}
StartLine | EndLine | StartText | EndText => {}
WordBoundary | NotWordBoundary => unreachable!(),
}
}
Save(ref inst) => self.stack.push(inst.goto as InstPtr),
Split(ref inst) => {
self.stack.push(inst.goto2 as InstPtr);
self.stack.push(inst.goto1 as InstPtr);
}
}
}
}
fn cached_state(
&mut self,
q: &SparseSet,
is_match: bool,
current_state: Option<&mut StatePtr>,
) -> StatePtr {
let (key, inst_flags) = match self.cached_state_key(q, is_match) {
None => return STATE_DEAD,
Some(v) => v,
};
if let Some(&si) = self.compiled.get(&key) {
return si;
}
if self.approximate_size() > CACHE_LIMIT {
self.clear_cache_and_save(current_state);
}
let next = vec![STATE_UNKNOWN; self.num_byte_classes()];
self.states.push(State {
next: next,
insts: key.insts.clone(),
is_match: is_match,
inst_flags: inst_flags,
});
let si = usize_to_u32(self.states.len().checked_sub(1).unwrap());
self.compiled.insert(key, si);
si
}
fn cached_state_key(
&mut self,
q: &SparseSet,
is_match: bool,
) -> Option<(StateKey, Flags)> {
use prog::Inst::*;
use prog::EmptyLook::*;
let mut inst_flags = Flags::new();
let mut insts = vec![];
for &ip in q {
let ip = usize_to_u32(ip);
match self.prog[ip as usize] {
Char(_) | Ranges(_) => unreachable!(),
Save(_) => {}
Split(_) => {}
Bytes(_) => insts.push(ip),
EmptyLook(ref inst) => {
match inst.look {
StartLine => {
inst_flags.set_start_line(true);
insts.push(ip);
}
EndLine => {
inst_flags.set_end_line(true);
insts.push(ip);
}
StartText => {
inst_flags.set_start(true);
insts.push(ip);
}
EndText => {
inst_flags.set_end(true);
insts.push(ip);
}
WordBoundary | NotWordBoundary => unreachable!(),
}
}
Match(_) => {
insts.push(ip);
if !self.continue_past_first_match() {
break;
}
}
}
}
if insts.len() == 0 && !is_match {
None
} else {
let key = StateKey { insts: insts, is_match: is_match };
Some((key, inst_flags))
}
}
fn clear_cache_and_save(&mut self, current_state: Option<&mut StatePtr>) {
if self.states.len() <= 2 {
return;
}
match current_state {
None => self.clear_cache(),
Some(si) => {
let cur = self.copy_state(*si);
self.clear_cache();
*si = self.restore_state(cur);
}
}
}
fn clear_cache(&mut self) {
let start = self.copy_state(self.start);
self.states.clear();
self.compiled.clear();
for start in self.start_states.iter_mut() {
*start = STATE_UNKNOWN;
}
self.states.push(State::invalid());
self.states.push(State::invalid());
self.start = self.restore_state(start);
}
fn copy_state(&self, si: StatePtr) -> State {
let mut state = self.states[si as usize].clone();
state.next = vec![STATE_UNKNOWN; self.num_byte_classes()];
state
}
fn restore_state(&mut self, state: State) -> StatePtr {
let key = StateKey {
insts: state.insts.clone(),
is_match: state.is_match,
};
if let Some(&si) = self.compiled.get(&key) {
return si;
}
let si = usize_to_u32(self.states.len());
self.states.push(state);
self.compiled.insert(key, si);
si
}
fn next_state(
&mut self,
qcur: &mut SparseSet,
qnext: &mut SparseSet,
si: StatePtr,
b: Byte,
) -> StatePtr {
let cls = self.byte_class(b);
match self.states[si as usize].next[cls] {
STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b),
STATE_DEAD => return STATE_DEAD,
nsi => return nsi,
}
}
fn start_state(
&mut self,
q: &mut SparseSet,
text: &[u8],
at: usize,
) -> StatePtr {
let start_flags = if self.prog.is_reverse {
self.start_flags_reverse(text, at)
} else {
self.start_flags(text, at)
};
let flagi = start_flags.0 as usize;
match self.start_states[flagi] {
STATE_UNKNOWN => {}
STATE_DEAD => return STATE_DEAD,
si => return si,
}
q.clear();
self.follow_epsilons(0, q, start_flags);
let sp = self.cached_state(q, false, None);
self.start_states[flagi] = sp;
sp
}
fn start_flags(&self, text: &[u8], at: usize) -> Flags {
let mut flags = Flags::new();
flags.set_start(at == 0).set_end(text.len() == 0);
flags.set_start_line(at == 0 || text[at - 1] == b'\n');
flags.set_end_line(text.len() == 0);
flags
}
fn start_flags_reverse(&self, text: &[u8], at: usize) -> Flags {
let mut flags = Flags::new();
flags.set_start(at == text.len()).set_end(text.len() == 0);
flags.set_start_line(at == text.len() || text[at] == b'\n');
flags.set_end_line(text.len() == 0);
flags
}
fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> {
self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s)
}
fn num_byte_classes(&self) -> usize {
(self.prog.byte_classes[255] as usize + 1) + 1
}
fn byte_class(&self, b: Byte) -> usize {
if b.is_eof() {
self.num_byte_classes() - 1
} else {
self.prog.byte_classes[b.0 as usize] as usize
}
}
fn continue_past_first_match(&self) -> bool {
self.prog.is_reverse || self.search.matches.len() != 1
}
fn approximate_size(&self) -> usize {
use std::mem::size_of as size;
let compiled =
(self.compiled.len() * (size::<StateKey>() + 128))
+ (self.compiled.len() * size::<StatePtr>());
let states =
self.states.len()
* (size::<State>()
+ 128
+ (self.num_byte_classes() * size::<StatePtr>()));
let start_states = self.start_states.len() * size::<StatePtr>();
self.prog.approximate_size() + compiled + states + start_states
}
}
impl State {
fn invalid() -> State {
State {
next: vec![],
insts: vec![],
is_match: false,
inst_flags: Flags::new(),
}
}
}
impl Flags {
#[inline]
fn new() -> Self {
Flags(0)
}
#[inline]
fn has_non_match_flags(&self) -> bool {
self.0 & 0b0_1111111 > 0
}
#[inline]
fn set(&mut self, yes: bool, bit: u8) {
if yes {
self.0 = self.0 | bit;
} else {
self.0 = self.0 & !bit;
}
}
#[inline]
fn is_match(&self) -> bool { self.0 & 0b1_0000000 > 0 }
#[inline]
fn set_match(&mut self, yes: bool) -> &mut Self {
self.set(yes, 0b1_0000000);
self
}
#[inline]
fn is_start(&self) -> bool { self.0 & 0b0_1_000000 > 0 }
#[inline]
fn set_start(&mut self, yes: bool) -> &mut Self {
self.set(yes, 0b0_1_000000);
self
}
#[inline]
fn is_end(&self) -> bool { self.0 & 0b00_1_00000 > 0 }
#[inline]
fn set_end(&mut self, yes: bool) -> &mut Self {
self.set(yes, 0b00_1_00000);
self
}
#[inline]
fn is_start_line(&self) -> bool { self.0 & 0b000_1_0000 > 0 }
#[inline]
fn set_start_line(&mut self, yes: bool) -> &mut Self {
self.set(yes, 0b000_1_0000);
self
}
#[inline]
fn is_end_line(&self) -> bool { self.0 & 0b0000_1_000 > 0 }
#[inline]
fn set_end_line(&mut self, yes: bool) -> &mut Self {
self.set(yes, 0b0000_1_000);
self
}
}
impl Byte {
#[inline] fn byte(b: u8) -> Self { Byte(b as u16) }
#[inline] fn eof() -> Self { Byte(256) }
#[inline] fn is_eof(&self) -> bool { self.0 == 256 }
#[inline]
fn as_byte(&self) -> Option<u8> {
if self.is_eof() {
None
} else {
Some(self.0 as u8)
}
}
}
impl fmt::Debug for State {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut next = vec![];
for (b, next_sp) in self.next.iter().enumerate() {
match *next_sp {
STATE_UNKNOWN => {}
STATE_DEAD => next.push((vb(b as usize), "DEAD".to_string())),
si => next.push((vb(b as usize), si.to_string())),
}
}
f.debug_struct("State")
.field("is_match", &self.is_match)
.field("insts", &self.insts)
.field("next", &next)
.finish()
}
}
impl fmt::Debug for Flags {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("Flags")
.field("match", &if self.is_match() { 1 } else { 0 })
.field("start", &if self.is_start() { 1 } else { 0 })
.field("end", &if self.is_end() { 1 } else { 0 })
.field("start_line", &if self.is_start_line() { 1 } else { 0 })
.field("end_line", &if self.is_end_line() { 1 } else { 0 })
.finish()
}
}
fn vb(b: usize) -> String {
use std::ascii::escape_default;
if b > ::std::u8::MAX as usize {
"EOF".to_owned()
} else {
let escaped = escape_default(b as u8).collect::<Vec<u8>>();
String::from_utf8_lossy(&escaped).into_owned()
}
}
fn usize_to_u32(n: usize) -> u32 {
if (n as u64) > (::std::u32::MAX as u64) {
panic!("BUG: {} is too big to fit into u32", n)
}
n as u32
}