Bug 1716518 - Upgrade aho-corasick to v0.7.18. r=emilio

Differential Revision: https://phabricator.services.mozilla.com/D117814
This commit is contained in:
Mike Hommey 2021-06-15 22:04:51 +00:00
parent 38e4a9776f
commit c23265234d
24 changed files with 843 additions and 462 deletions

4
Cargo.lock generated
View File

@ -26,9 +26,9 @@ checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e"
[[package]]
name = "aho-corasick"
version = "0.7.6"
version = "0.7.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58fb5e95d83b38284460a5fda7d6470aa0b8844d283a0b614b8535e880800d2d"
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
dependencies = [
"memchr",
]

View File

@ -1 +1 @@
{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"a2f9a1263aa35a92af4ffc1935b264f062738bc25761aa62b3d582031d6bf5f0","DESIGN.md":"44d4516ef38d60e9638f756baf40bcd9eff1b8e8ce7538a1d8549e02d6605d48","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"626d74e4bdac78d2446c75c722a7e46d0eaa4e506a1068ff693b5abc338a384f","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/ahocorasick.rs":"46c57a83a75a8f25fdf19a15deae10748d12b8af9445ae74700a546a92024608","src/automaton.rs":"85e79ceb964f824fcceca026abd255980840116704834d70a1b9c44833df299f","src/buffer.rs":"c40992e7d1ba0bac6d1c268d41069aad81f2226686c64192ed888a60f66db8cd","src/byte_frequencies.rs":"2fb85b381c038c1e44ce94294531cdcd339dca48b1e61f41455666e802cbbc9e","src/classes.rs":"590f2e257bf7c630bea3a28d4a1f75c78db7a0802f5921aced017a056146b4e6","src/dfa.rs":"2fb1077edfefd2b7f7e9c0d9df55df1441d4571500a2c45aa5b41960a36441e4","src/error.rs":"36dbf2cefbfaa8a69186551320dbff023d3e82780a6c925e87c3e3997b967e66","src/lib.rs":"028ab998e8f0d1a98650b139bcca83681cbb52545060b9253b76d7e19117b53d","src/nfa.rs":"6bc3479ad37c576bba4bbdc9e3d0c6e69a4b7f0d9a88fcbbf727bf4a9b288494","src/packed/api.rs":"aa89627c7114c057c98ad1c7ab9ce18c6ed55267a6bcf7bc8efb917b6cfe5532","src/packed/mod.rs":"29c76ad3cbb1f831140cefac7a27fb504ac4af4f454975a571965b48aad417eb","src/packed/pattern.rs":"b88c57af057997da0a5a06f4c5604a7e598c20acfc11c15cd8977727f6e1cf9c","src/packed/rabinkarp.rs":"b3242a8631ea5607163dcbb641e4ac9c6da26774378da1e51651b0ab5656b390","src/packed/teddy/README.md":"5819f40d221af93288e705eadef5393a41d7a0900881b4d676e01fd65d5adf15","src/packed/teddy/compile.rs":"21b18cbee9bc33918b85b1dc51a0faed57acb426f61e6b72aeaf69faa7595701","src/packed/teddy/mod.rs":"f63db3419b1d378929bf0bc1f0e3b909ff3c38b9f2b6e86ba4546b8f39907cd3","src/packed/teddy/runtime.rs":"0a
1250ea73159b3be6e0fa9a3f55ecedbb2cb90cb798d1709e9f5ee48f8855d5","src/packed/tests.rs":"0b52ab9eef73a1a4f141f475a9fa98e54d447104aa69acba3a7f8248ce7164b2","src/packed/vector.rs":"ab3c0535fca5f09198d58cbfae44c292aeb3ce44bc92bca36d30dc72963639fc","src/prefilter.rs":"3dbe93d85c6fb985a9aea0b5eab003fe81a228e02adba00c8f63a35c3fd246b8","src/state_id.rs":"ebecd7046760e6bd72303f288be93342b446e7fe95f20b5ce23653d802c48b09","src/tests.rs":"9201cc0662bc9a1e8fa15c59e33a18a55ec6b3bd6bbea294d9cace0053bb8d24"},"package":"58fb5e95d83b38284460a5fda7d6470aa0b8844d283a0b614b8535e880800d2d"}
{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"f61283fd900435313b9ba8c1b87a4b5b31d442f9b554222136ec8d1d3d1e39d8","DESIGN.md":"9065f33d818d1562244d36dc4781e2a351108030cee17f11c2ba512ca7b4c27e","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"741e7249c8d1d6a7ba9341d68253dbf4952477c5620ff37c5325f2e894b148b6","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/ahocorasick.rs":"6fcbe812eec7af44b104c6b8a27b0a2ea8d67c3d9aec73cb69d802b30be5f005","src/automaton.rs":"610b3e2c104c51bf4f51a6d07626c3972e9d1274ca276e987385a231b284cc8b","src/buffer.rs":"dae7ee7c1f846ca9cf115ba4949484000e1837b4fb7311f8d8c9a35011c9c26f","src/byte_frequencies.rs":"2fb85b381c038c1e44ce94294531cdcd339dca48b1e61f41455666e802cbbc9e","src/classes.rs":"99a53a2ed8eea8c13699def90e31dfdff9d0b90572b1db3cb534e3396e7a0ed0","src/dfa.rs":"25e4455b3e179a7e192108d05f3683993456b36e3ebed99f827558c52525b7e6","src/error.rs":"d34c2c9c815df5d9dedc46b4b3ce109cd2cee07825de643f0c574ec960367beb","src/lib.rs":"f0c48b0ee093dd8b3034d025d052c3667860c5d4a196cb178588012b719acea4","src/nfa.rs":"2f443951c78196126bfd237ed5770a69077e6190daeecd47131339c25e51a3d0","src/packed/api.rs":"ec58ff1b4375dd4ff88fb5859c7ede994fe08d31b7d3677720a086592aa0fe53","src/packed/mod.rs":"d7ee11d487a7f129f16dc8f1473442a7127905933f378504bae83df0f23c5e2a","src/packed/pattern.rs":"3abf3835d4c4f8a43753c52936a894d819f713f233fc046e19de5ef95200dcce","src/packed/rabinkarp.rs":"caf9563b7442c9b75c9cb520fa236c7a6da8173705889b8d79b69ede14a20767","src/packed/teddy/README.md":"5819f40d221af93288e705eadef5393a41d7a0900881b4d676e01fd65d5adf15","src/packed/teddy/compile.rs":"aad40b3f93d2c388b409b31fb2795d414a365237789d5b1a7510d97ceb8ce260","src/packed/teddy/mod.rs":"83b52bd80272970ad17234d0db293d17c1710ec582302bf516b203c8edec037e","src/packed/teddy/runtime.rs":"83
6146e90b320b14fa2c65fe4af7915a41f6fb04408aac5fac731c22ff46adae","src/packed/tests.rs":"b8dc4d3281ecd6d0fa2bf7ef16cf292a467dfdce64e470c7921e983bfa60fee2","src/packed/vector.rs":"ab3c0535fca5f09198d58cbfae44c292aeb3ce44bc92bca36d30dc72963639fc","src/prefilter.rs":"82a3eb6d5c0c3f10bc8d5f57d55d6d14cf4cf21c475bb5253e1921084063b8d7","src/state_id.rs":"519ec8c7bf3fa72103d4c561c193759759f535dca924c9853efe630f406d2029","src/tests.rs":"6522ed1b244513c01de5bbcf0fe35571454fdea2c2a9d8dfe13a04bf57b70eca"},"package":"1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"}

View File

@ -11,8 +11,9 @@
# will likely look very different (and much more reasonable)
[package]
edition = "2018"
name = "aho-corasick"
version = "0.7.6"
version = "0.7.18"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
exclude = ["/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml"]
autotests = false
@ -32,16 +33,11 @@ debug = true
[lib]
name = "aho_corasick"
[dependencies.memchr]
version = "2.2.0"
version = "2.4.0"
default-features = false
[dev-dependencies.doc-comment]
version = "0.3.1"
[dev-dependencies]
[features]
default = ["std"]
std = ["memchr/use_std"]
[badges.appveyor]
repository = "BurntSushi/aho-corasick"
[badges.travis-ci]
repository = "BurntSushi/aho-corasick"
std = ["memchr/std"]

View File

@ -2,7 +2,7 @@ This document describes the internal design of this crate, which is an object
lesson in what happens when you take a fairly simple old algorithm like
Aho-Corasick and make it fast and production ready.
The target audience of this crate is Rust programmers that have some
The target audience of this document is Rust programmers that have some
familiarity with string searching, however, one does not need to know the
Aho-Corasick algorithm in order to read this (it is explained below). One
should, however, know what a trie is. (If you don't, go read its Wikipedia
@ -13,7 +13,7 @@ own, Aho-Corasick isn't that complicated. The complex pieces come from the
different variants of Aho-Corasick implemented in this crate. Specifically,
they are:
* Aho-Corasick as an NFA, using dense transitions near root with sparse
* Aho-Corasick as an NFA, using dense transitions near the root with sparse
transitions elsewhere.
* Aho-Corasick as a DFA. (An NFA is slower to search, but cheaper to construct
and uses less memory.)
@ -74,7 +74,7 @@ one is Aho-Corasick. It's a common solution because it's not too hard to
implement, scales quite well even when searching for thousands of patterns and
is generally pretty fast. Aho-Corasick does well here because, regardless of
the number of patterns you're searching for, it always visits each byte in the
haystack exactly ocne. This means, generally speaking, adding more patterns to
haystack exactly once. This means, generally speaking, adding more patterns to
an Aho-Corasick automaton does not make it slower. (Strictly speaking, however,
this is not true, since a larger automaton will make less effective use of the
CPU's cache.)
@ -277,12 +277,12 @@ there are a small number of patterns.
# More DFA tricks
As described in the previous section, one of the downsides of using a DFA is
that is uses more memory and can take longer to builder. One small way of
mitigating these concerns is to map the alphabet used by the automaton into a
smaller space. Typically, the alphabet of a DFA has 256 elements in it: one
element for each possible value that fits into a byte. However, in many cases,
one does not need the full alphabet. For example, if all patterns in an
As described in the previous section, one of the downsides of using a DFA
is that it uses more memory and can take longer to build. One small way of
mitigating these concerns is to map the alphabet used by the automaton into
a smaller space. Typically, the alphabet of a DFA has 256 elements in it:
one element for each possible value that fits into a byte. However, in many
cases, one does not need the full alphabet. For example, if all patterns in an
Aho-Corasick automaton are ASCII letters, then this only uses up 52 distinct
bytes. As far as the automaton is concerned, the rest of the 204 bytes are
indistinguishable from one another: they will never discriminate between a

View File

@ -5,11 +5,10 @@ acceleration in some cases. This library provides multiple pattern
search principally through an implementation of the
[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
which builds a finite state machine for executing searches in linear time.
Features include case insensitive matching, overlapping matches and search &
replace in streams.
Features include case insensitive matching, overlapping matches, fast searching
via SIMD and optional full DFA construction and search & replace in streams.
[![Linux build status](https://api.travis-ci.org/BurntSushi/aho-corasick.svg)](https://travis-ci.org/BurntSushi/aho-corasick)
[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/aho-corasick?svg=true)](https://ci.appveyor.com/project/BurntSushi/aho-corasick)
[![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions)
[![](http://meritbadge.herokuapp.com/aho-corasick)](https://crates.io/crates/aho-corasick)
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
@ -29,12 +28,6 @@ Add this to your `Cargo.toml`:
aho-corasick = "0.7"
```
and this to your crate root (if you're using Rust 2015):
```rust
extern crate aho_corasick;
```
### Example: basic searching
@ -95,7 +88,6 @@ loading the entire stream into memory first.
```rust
use aho_corasick::AhoCorasick;
# fn example() -> Result<(), ::std::io::Error> {
let patterns = &["fox", "brown", "quick"];
let replace_with = &["sloth", "grey", "slow"];
@ -105,9 +97,9 @@ let rdr = "The quick brown fox.";
let mut wtr = vec![];
let ac = AhoCorasick::new(patterns);
ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?;
ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)
.expect("stream_replace_all failed");
assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
# Ok(()) }; example().unwrap()
```
@ -164,11 +156,16 @@ expression alternation. See `MatchKind` in the docs for more details.
### Minimum Rust version policy
This crate's minimum supported `rustc` version is `1.28.0`.
This crate's minimum supported `rustc` version is `1.41.1`.
The current policy is that the minimum Rust version required to use this crate
can be increased in minor version updates. For example, if `crate 1.0` requires
Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust
1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum
version of Rust.
In general, this crate will be conservative with respect to the minimum
supported version of Rust. In general, it will follow the `regex` crate's
policy, since `regex` is an important dependent.
supported version of Rust.
### Future work

View File

@ -1,14 +1,14 @@
use std::io;
use automaton::Automaton;
use buffer::Buffer;
use dfa::{self, DFA};
use error::Result;
use nfa::{self, NFA};
use packed;
use prefilter::PrefilterState;
use state_id::StateID;
use Match;
use crate::automaton::Automaton;
use crate::buffer::Buffer;
use crate::dfa::{self, DFA};
use crate::error::Result;
use crate::nfa::{self, NFA};
use crate::packed;
use crate::prefilter::{Prefilter, PrefilterState};
use crate::state_id::StateID;
use crate::Match;
/// An automaton for searching multiple strings in linear time.
///
@ -502,7 +502,7 @@ impl<S: StateID> AhoCorasick<S> {
/// The closure accepts three parameters: the match found, the text of
/// the match and a string buffer with which to write the replaced text
/// (if any). If the closure returns `true`, then it continues to the next
/// match. If the closure returns false, then searching is stopped.
/// match. If the closure returns `false`, then searching is stopped.
///
/// # Examples
///
@ -524,6 +524,24 @@ impl<S: StateID> AhoCorasick<S> {
/// });
/// assert_eq!("0 the 2 to the 0age", result);
/// ```
///
/// Stopping the replacement by returning `false` (continued from the
/// example above):
///
/// ```
/// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
/// # let patterns = &["append", "appendage", "app"];
/// # let haystack = "append the app to the appendage";
/// # let ac = AhoCorasickBuilder::new()
/// # .match_kind(MatchKind::LeftmostFirst)
/// # .build(patterns);
/// let mut result = String::new();
/// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
/// dst.push_str(&mat.pattern().to_string());
/// mat.pattern() != 2
/// });
/// assert_eq!("0 the 2 to the appendage", result);
/// ```
pub fn replace_all_with<F>(
&self,
haystack: &str,
@ -536,7 +554,9 @@ impl<S: StateID> AhoCorasick<S> {
for mat in self.find_iter(haystack) {
dst.push_str(&haystack[last_match..mat.start()]);
last_match = mat.end();
replace_with(&mat, &haystack[mat.start()..mat.end()], dst);
if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
break;
};
}
dst.push_str(&haystack[last_match..]);
}
@ -548,7 +568,7 @@ impl<S: StateID> AhoCorasick<S> {
/// The closure accepts three parameters: the match found, the text of
/// the match and a byte buffer with which to write the replaced text
/// (if any). If the closure returns `true`, then it continues to the next
/// match. If the closure returns false, then searching is stopped.
/// match. If the closure returns `false`, then searching is stopped.
///
/// # Examples
///
@ -570,6 +590,24 @@ impl<S: StateID> AhoCorasick<S> {
/// });
/// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
/// ```
///
/// Stopping the replacement by returning `false` (continued from the
/// example above):
///
/// ```
/// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
/// # let patterns = &["append", "appendage", "app"];
/// # let haystack = b"append the app to the appendage";
/// # let ac = AhoCorasickBuilder::new()
/// # .match_kind(MatchKind::LeftmostFirst)
/// # .build(patterns);
/// let mut result = vec![];
/// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
/// dst.extend(mat.pattern().to_string().bytes());
/// mat.pattern() != 2
/// });
/// assert_eq!(b"0 the 2 to the appendage".to_vec(), result);
/// ```
pub fn replace_all_with_bytes<F>(
&self,
haystack: &[u8],
@ -582,7 +620,9 @@ impl<S: StateID> AhoCorasick<S> {
for mat in self.find_iter(haystack) {
dst.extend(&haystack[last_match..mat.start()]);
last_match = mat.end();
replace_with(&mat, &haystack[mat.start()..mat.end()], dst);
if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
break;
};
}
dst.extend(&haystack[last_match..]);
}
@ -735,9 +775,7 @@ impl<S: StateID> AhoCorasick<S> {
/// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
///
/// The closure accepts three parameters: the match found, the text of
/// the match and the writer with which to write the replaced text
/// (if any). If the closure returns `true`, then it continues to the next
/// match. If the closure returns false, then searching is stopped.
/// the match and the writer with which to write the replaced text (if any).
///
/// After all matches are replaced, the writer is _not_ flushed.
///
@ -967,18 +1005,6 @@ impl<S: StateID> AhoCorasick<S> {
///
/// let ac = AhoCorasickBuilder::new()
/// .dfa(true)
/// .byte_classes(false)
/// .build(&["foo", "bar", "baz"]);
/// assert_eq!(20_768, ac.heap_bytes());
///
/// let ac = AhoCorasickBuilder::new()
/// .dfa(true)
/// .byte_classes(true) // default
/// .build(&["foo", "bar", "baz"]);
/// assert_eq!(1_248, ac.heap_bytes());
///
/// let ac = AhoCorasickBuilder::new()
/// .dfa(true)
/// .ascii_case_insensitive(true)
/// .build(&["foo", "bar", "baz"]);
/// assert_eq!(1_248, ac.heap_bytes());
@ -1037,6 +1063,24 @@ impl<S: StateID> Imp<S> {
}
}
/// Returns the prefilter object, if one exists, for the underlying
/// automaton.
///
/// Both the NFA and DFA variants may carry a prefilter; this merely
/// delegates to whichever automaton this `Imp` wraps, without
/// inspecting or altering the prefilter itself.
fn prefilter(&self) -> Option<&dyn Prefilter> {
    match *self {
        Imp::NFA(ref nfa) => nfa.prefilter(),
        Imp::DFA(ref dfa) => dfa.prefilter(),
    }
}
/// Returns true if and only if we should attempt to use a prefilter.
///
/// A prefilter is declined when it may report candidate positions that
/// are not at the start of a match (`looks_for_non_start_of_match`).
/// NOTE(review): the only visible caller is the stream-chunk iterator
/// constructor, which disables prefilter state when this returns false —
/// presumably because non-start candidates are unsafe across chunk
/// boundaries; confirm against the full source.
fn use_prefilter(&self) -> bool {
    let p = match self.prefilter() {
        // No prefilter at all: nothing to use.
        None => return false,
        Some(p) => p,
    };
    !p.looks_for_non_start_of_match()
}
#[inline(always)]
fn overlapping_find_at(
&self,
@ -1113,7 +1157,7 @@ impl<S: StateID> Imp<S> {
///
/// The lifetime `'b` refers to the lifetime of the haystack being searched.
#[derive(Debug)]
pub struct FindIter<'a, 'b, S: 'a + StateID> {
pub struct FindIter<'a, 'b, S: StateID> {
fsm: &'a Imp<S>,
prestate: PrefilterState,
haystack: &'b [u8],
@ -1170,7 +1214,7 @@ impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> {
///
/// The lifetime `'b` refers to the lifetime of the haystack being searched.
#[derive(Debug)]
pub struct FindOverlappingIter<'a, 'b, S: 'a + StateID> {
pub struct FindOverlappingIter<'a, 'b, S: StateID> {
fsm: &'a Imp<S>,
prestate: PrefilterState,
haystack: &'b [u8],
@ -1241,7 +1285,7 @@ impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> {
///
/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
#[derive(Debug)]
pub struct StreamFindIter<'a, R, S: 'a + StateID> {
pub struct StreamFindIter<'a, R, S: StateID> {
it: StreamChunkIter<'a, R, S>,
}
@ -1276,7 +1320,7 @@ impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> {
/// N.B. This does not actually implement Iterator because we need to borrow
/// from the underlying reader. But conceptually, it's still an iterator.
#[derive(Debug)]
struct StreamChunkIter<'a, R, S: 'a + StateID> {
struct StreamChunkIter<'a, R, S: StateID> {
/// The AC automaton.
fsm: &'a Imp<S>,
/// State associated with this automaton's prefilter. It is a heuristic
@ -1325,7 +1369,11 @@ impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
"stream searching is only supported for Standard match semantics"
);
let prestate = PrefilterState::new(ac.max_pattern_len());
let prestate = if ac.imp.use_prefilter() {
PrefilterState::new(ac.max_pattern_len())
} else {
PrefilterState::disabled()
};
let buf = Buffer::new(ac.imp.max_pattern_len());
let state_id = ac.imp.start_state();
StreamChunkIter {
@ -1621,7 +1669,7 @@ impl AhoCorasickBuilder {
// N.B. Using byte classes can actually be faster by improving
// locality, but this only really applies for multi-megabyte
// automata (i.e., automata that don't fit in your CPU's cache).
self.dfa(true).byte_classes(false);
self.dfa(true);
} else if patterns.len() <= 5000 {
self.dfa(true);
}
@ -1809,7 +1857,7 @@ impl AhoCorasickBuilder {
/// finite automaton (NFA) is used instead.
///
/// The main benefit to a DFA is that it can execute searches more quickly
/// than a DFA (perhaps 2-4 times as fast). The main drawback is that the
/// than a NFA (perhaps 2-4 times as fast). The main drawback is that the
/// DFA uses more space and can take much longer to build.
///
/// Enabling this option does not change the time complexity for
@ -1868,6 +1916,10 @@ impl AhoCorasickBuilder {
/// overall performance.
///
/// This option is enabled by default.
#[deprecated(
since = "0.7.16",
note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57"
)]
pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
self.dfa_builder.byte_classes(yes);
self
@ -1896,6 +1948,10 @@ impl AhoCorasickBuilder {
/// non-premultiplied form only requires 8 bits.
///
/// This option is enabled by default.
#[deprecated(
since = "0.7.16",
note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57"
)]
pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
self.dfa_builder.premultiply(yes);
self

View File

@ -1,7 +1,7 @@
use ahocorasick::MatchKind;
use prefilter::{self, Candidate, Prefilter, PrefilterState};
use state_id::{dead_id, fail_id, StateID};
use Match;
use crate::ahocorasick::MatchKind;
use crate::prefilter::{self, Candidate, Prefilter, PrefilterState};
use crate::state_id::{dead_id, fail_id, StateID};
use crate::Match;
// NOTE: This trait essentially started as a copy of the same trait from
// regex-automata, with some wording changed since we use this trait for
@ -28,6 +28,42 @@ use Match;
// for tracking the state ID and one that doesn't. We should ideally do the
// same for standard searching, but my sanity stopped me.
// SAFETY RATIONALE: Previously, the code below went to some length to remove
// all bounds checks. This generally produced tighter assembly and led to
// 20-50% improvements in micro-benchmarks on corpora made up of random
// characters. This somewhat makes sense, since the branch predictor is going
// to be at its worst on random text.
//
// However, using the aho-corasick-debug tool and manually benchmarking
// different inputs, the code *with* bounds checks actually wound up being
// slightly faster:
//
// $ cat input
// Sherlock Holmes
// John Watson
// Professor Moriarty
// Irene Adler
// Mary Watson
//
// $ aho-corasick-debug-safe \
// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
// pattern read time: 32.824µs
// automaton build time: 444.687µs
// automaton heap usage: 72392 bytes
// match count: 639
// count time: 1.809961702s
//
// $ aho-corasick-debug-master \
// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
// pattern read time: 31.425µs
// automaton build time: 317.434µs
// automaton heap usage: 72392 bytes
// match count: 639
// count time: 2.059157705s
//
// I was able to reproduce this result on two different machines (an i5 and
// an i7). Therefore, we go the route of safe code for now.
/// A trait describing the interface of an Aho-Corasick finite state machine.
///
/// Every automaton has exactly one fail state, one dead state and exactly one
@ -39,8 +75,8 @@ use Match;
/// only when at least one match has been observed.
///
/// Every automaton also has one or more match states, such that
/// `Automaton::is_match_state_unchecked(id)` returns `true` if and only if
/// `id` corresponds to a match state.
/// `Automaton::is_match_state(id)` returns `true` if and only if `id`
/// corresponds to a match state.
pub trait Automaton {
/// The representation used for state identifiers in this automaton.
///
@ -123,20 +159,12 @@ pub trait Automaton {
/// must ensure that the given identifier corresponds to a valid automaton
/// state. Implementors must, in turn, ensure that this routine is safe for
/// all valid state identifiers and for all possible `u8` values.
unsafe fn next_state_unchecked(
&self,
current: Self::ID,
input: u8,
) -> Self::ID;
fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
/// Like next_state_unchecked, but debug_asserts that the underlying
/// Like next_state, but debug_asserts that the underlying
/// implementation never returns a `fail_id()` for the next state.
unsafe fn next_state_unchecked_no_fail(
&self,
current: Self::ID,
input: u8,
) -> Self::ID {
let next = self.next_state_unchecked(current, input);
fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID {
let next = self.next_state(current, input);
// We should never see a transition to the failure state.
debug_assert!(
next != fail_id(),
@ -174,7 +202,7 @@ pub trait Automaton {
}
}
// It's important for this to always be inlined. Namely, it's only caller
// It's important for this to always be inlined. Namely, its only caller
// is standard_find_at, and the inlining should remove the case analysis
// for prefilter scanning when there is no prefilter available.
#[inline(always)]
@ -183,66 +211,49 @@ pub trait Automaton {
prestate: &mut PrefilterState,
prefilter: Option<&dyn Prefilter>,
haystack: &[u8],
at: usize,
mut at: usize,
state_id: &mut Self::ID,
) -> Option<Match> {
// This is necessary for guaranteeing a safe API, since we use the
// state ID below in a function that exhibits UB if called with an
// invalid state ID.
assert!(
self.is_valid(*state_id),
"{} is not a valid state ID",
state_id.to_usize()
);
unsafe {
let start = haystack.as_ptr();
let end = haystack[haystack.len()..].as_ptr();
let mut ptr = haystack[at..].as_ptr();
while ptr < end {
if let Some(pre) = prefilter {
let at = ptr as usize - start as usize;
if prestate.is_effective(at)
&& *state_id == self.start_state()
{
let c = prefilter::next(prestate, pre, haystack, at)
.into_option();
match c {
None => return None,
Some(i) => {
ptr = start.offset(i as isize);
}
while at < haystack.len() {
if let Some(pre) = prefilter {
if prestate.is_effective(at) && *state_id == self.start_state()
{
let c = prefilter::next(prestate, pre, haystack, at)
.into_option();
match c {
None => return None,
Some(i) => {
at = i;
}
}
}
// SAFETY: next_state is safe for all possible u8 values,
// so the only thing we're concerned about is the validity
// of `state_id`. `state_id` either comes from the caller
// (in which case, we assert above that it is valid), or it
// comes from the return value of next_state, which is also
// guaranteed to be valid.
*state_id = self.next_state_unchecked_no_fail(*state_id, *ptr);
ptr = ptr.offset(1);
// This routine always quits immediately after seeing a
// match, and since dead states can only come after seeing
// a match, seeing a dead state here is impossible. (Unless
// we have an anchored automaton, in which case, dead states
// are used to stop a search.)
debug_assert!(
*state_id != dead_id() || self.anchored(),
"standard find should never see a dead state"
);
if self.is_match_or_dead_state(*state_id) {
return if *state_id == dead_id() {
None
} else {
let end = ptr as usize - start as usize;
self.get_match(*state_id, 0, end)
};
}
}
None
// CORRECTNESS: next_state is correct for all possible u8 values,
// so the only thing we're concerned about is the validity of
// `state_id`. `state_id` either comes from the caller (in which
// case, we assume it is correct), or it comes from the return
// value of next_state, which is guaranteed to be correct.
*state_id = self.next_state_no_fail(*state_id, haystack[at]);
at += 1;
// This routine always quits immediately after seeing a
// match, and since dead states can only come after seeing
// a match, seeing a dead state here is impossible. (Unless
// we have an anchored automaton, in which case, dead states
// are used to stop a search.)
debug_assert!(
*state_id != dead_id() || self.anchored(),
"standard find should never see a dead state"
);
if self.is_match_or_dead_state(*state_id) {
return if *state_id == dead_id() {
None
} else {
self.get_match(*state_id, 0, at)
};
}
}
None
}
/// Execute a search using leftmost (either first or longest) match
@ -276,7 +287,7 @@ pub trait Automaton {
}
}
// It's important for this to always be inlined. Namely, it's only caller
// It's important for this to always be inlined. Namely, its only caller
// is leftmost_find_at, and the inlining should remove the case analysis
// for prefilter scanning when there is no prefilter available.
#[inline(always)]
@ -285,76 +296,58 @@ pub trait Automaton {
prestate: &mut PrefilterState,
prefilter: Option<&dyn Prefilter>,
haystack: &[u8],
at: usize,
mut at: usize,
state_id: &mut Self::ID,
) -> Option<Match> {
debug_assert!(self.match_kind().is_leftmost());
// This is necessary for guaranteeing a safe API, since we use the
// state ID below in a function that exhibits UB if called with an
// invalid state ID.
assert!(
self.is_valid(*state_id),
"{} is not a valid state ID",
state_id.to_usize()
);
if self.anchored() && at > 0 && *state_id == self.start_state() {
return None;
}
unsafe {
let start = haystack.as_ptr();
let end = haystack[haystack.len()..].as_ptr();
let mut ptr = haystack[at..].as_ptr();
let mut last_match = self.get_match(*state_id, 0, at);
while ptr < end {
if let Some(pre) = prefilter {
let at = ptr as usize - start as usize;
if prestate.is_effective(at)
&& *state_id == self.start_state()
{
let c = prefilter::next(prestate, pre, haystack, at)
.into_option();
match c {
None => return None,
Some(i) => {
ptr = start.offset(i as isize);
}
let mut last_match = self.get_match(*state_id, 0, at);
while at < haystack.len() {
if let Some(pre) = prefilter {
if prestate.is_effective(at) && *state_id == self.start_state()
{
let c = prefilter::next(prestate, pre, haystack, at)
.into_option();
match c {
None => return None,
Some(i) => {
at = i;
}
}
}
// SAFETY: next_state is safe for all possible u8 values,
// so the only thing we're concerned about is the validity
// of `state_id`. `state_id` either comes from the caller
// (in which case, we assert above that it is valid), or it
// comes from the return value of next_state, which is also
// guaranteed to be valid.
*state_id = self.next_state_unchecked_no_fail(*state_id, *ptr);
ptr = ptr.offset(1);
if self.is_match_or_dead_state(*state_id) {
if *state_id == dead_id() {
// The only way to enter into a dead state is if a
// match has been found, so we assert as much. This
// is different from normal automata, where you might
// enter a dead state if you know a subsequent match
// will never be found (regardless of whether a match
// has already been found). For Aho-Corasick, it is
// built so that we can match at any position, so the
// possibility of a match always exists.
//
// (Unless we have an anchored automaton, in which
// case, dead states are used to stop a search.)
debug_assert!(
last_match.is_some() || self.anchored(),
"failure state should only be seen after match"
);
return last_match;
}
let end = ptr as usize - start as usize;
last_match = self.get_match(*state_id, 0, end);
}
}
last_match
// CORRECTNESS: next_state is correct for all possible u8 values,
// so the only thing we're concerned about is the validity of
// `state_id`. `state_id` either comes from the caller (in which
// case, we assume it is correct), or it comes from the return
// value of next_state, which is guaranteed to be correct.
*state_id = self.next_state_no_fail(*state_id, haystack[at]);
at += 1;
if self.is_match_or_dead_state(*state_id) {
if *state_id == dead_id() {
// The only way to enter into a dead state is if a match
// has been found, so we assert as much. This is different
// from normal automata, where you might enter a dead state
// if you know a subsequent match will never be found
// (regardless of whether a match has already been found).
// For Aho-Corasick, it is built so that we can match at
// any position, so the possibility of a match always
// exists.
//
// (Unless we have an anchored automaton, in which case,
// dead states are used to stop a search.)
debug_assert!(
last_match.is_some() || self.anchored(),
"failure state should only be seen after match"
);
return last_match;
}
last_match = self.get_match(*state_id, 0, at);
}
}
last_match
}
/// This is like leftmost_find_at, but does not need to track a caller
@ -393,7 +386,7 @@ pub trait Automaton {
}
}
// It's important for this to always be inlined. Namely, it's only caller
// It's important for this to always be inlined. Namely, its only caller
// is leftmost_find_at_no_state, and the inlining should remove the case
// analysis for prefilter scanning when there is no prefilter available.
#[inline(always)]
@ -402,7 +395,7 @@ pub trait Automaton {
prestate: &mut PrefilterState,
prefilter: Option<&dyn Prefilter>,
haystack: &[u8],
at: usize,
mut at: usize,
) -> Option<Match> {
debug_assert!(self.match_kind().is_leftmost());
if self.anchored() && at > 0 {
@ -422,63 +415,54 @@ pub trait Automaton {
};
}
}
let mut state_id = self.start_state();
unsafe {
let start = haystack.as_ptr();
let end = haystack[haystack.len()..].as_ptr();
let mut ptr = haystack[at..].as_ptr();
let mut last_match = self.get_match(state_id, 0, at);
while ptr < end {
if let Some(pre) = prefilter {
let at = ptr as usize - start as usize;
if prestate.is_effective(at)
&& state_id == self.start_state()
{
match prefilter::next(prestate, pre, haystack, at) {
Candidate::None => return None,
// Since we aren't tracking a state ID, we can
// quit early once we know we have a match.
Candidate::Match(m) => return Some(m),
Candidate::PossibleStartOfMatch(i) => {
ptr = start.offset(i as isize);
}
let mut state_id = self.start_state();
let mut last_match = self.get_match(state_id, 0, at);
while at < haystack.len() {
if let Some(pre) = prefilter {
if prestate.is_effective(at) && state_id == self.start_state()
{
match prefilter::next(prestate, pre, haystack, at) {
Candidate::None => return None,
// Since we aren't tracking a state ID, we can
// quit early once we know we have a match.
Candidate::Match(m) => return Some(m),
Candidate::PossibleStartOfMatch(i) => {
at = i;
}
}
}
// SAFETY: next_state is safe for all possible u8 values,
// so the only thing we're concerned about is the validity
// of `state_id`. `state_id` either comes from the caller
// (in which case, we assert above that it is valid), or it
// comes from the return value of next_state, which is also
// guaranteed to be valid.
state_id = self.next_state_unchecked_no_fail(state_id, *ptr);
ptr = ptr.offset(1);
if self.is_match_or_dead_state(state_id) {
if state_id == dead_id() {
// The only way to enter into a dead state is if a
// match has been found, so we assert as much. This
// is different from normal automata, where you might
// enter a dead state if you know a subsequent match
// will never be found (regardless of whether a match
// has already been found). For Aho-Corasick, it is
// built so that we can match at any position, so the
// possibility of a match always exists.
//
// (Unless we have an anchored automaton, in which
// case, dead states are used to stop a search.)
debug_assert!(
last_match.is_some() || self.anchored(),
"failure state should only be seen after match"
);
return last_match;
}
let end = ptr as usize - start as usize;
last_match = self.get_match(state_id, 0, end);
}
}
last_match
// CORRECTNESS: next_state is correct for all possible u8 values,
// so the only thing we're concerned about is the validity of
// `state_id`. `state_id` either comes from the caller (in which
// case, we assume it is correct), or it comes from the return
// value of next_state, which is guaranteed to be correct.
state_id = self.next_state_no_fail(state_id, haystack[at]);
at += 1;
if self.is_match_or_dead_state(state_id) {
if state_id == dead_id() {
// The only way to enter into a dead state is if a
// match has been found, so we assert as much. This
// is different from normal automata, where you might
// enter a dead state if you know a subsequent match
// will never be found (regardless of whether a match
// has already been found). For Aho-Corasick, it is
// built so that we can match at any position, so the
// possibility of a match always exists.
//
// (Unless we have an anchored automaton, in which
// case, dead states are used to stop a search.)
debug_assert!(
last_match.is_some() || self.anchored(),
"failure state should only be seen after match"
);
return last_match;
}
last_match = self.get_match(state_id, 0, at);
}
}
last_match
}
/// Execute an overlapping search.

View File

@ -50,7 +50,9 @@ impl Buffer {
// reasons, so we set a lower bound of `8 * min`.
//
// TODO: It would be good to find a way to test the streaming
// implementation with the minimal buffer size.
// implementation with the minimal buffer size. For now, we just
// uncomment the next line and comment out the subsequent line.
// let capacity = 1 + min;
let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
Buffer { buf: vec![0; capacity], min, end: 0 }
}
@ -117,6 +119,8 @@ impl Buffer {
// SAFETY: A buffer contains Copy data, so there's no problem
// moving it around. Safety also depends on our indices being in
// bounds, which they always should be, given the assert above.
//
// TODO: Switch to [T]::copy_within once our MSRV is high enough.
ptr::copy(
self.buf[roll_start..].as_ptr(),
self.buf.as_mut_ptr(),

View File

@ -36,7 +36,7 @@ impl ByteClasses {
pub fn get(&self, byte: u8) -> u8 {
// SAFETY: This is safe because all dense transitions have
// exactly 256 elements, so all u8 values are valid indices.
unsafe { *self.0.get_unchecked(byte as usize) }
self.0[byte as usize]
}
/// Return the total number of elements in the alphabet represented by
@ -64,7 +64,7 @@ impl ByteClasses {
/// hasn't been converted to equivalence classes yet. Picking an arbitrary
/// byte from each equivalence class then permits a full exploration of
/// the NFA instead of using every possible byte value.
pub fn representatives(&self) -> ByteClassRepresentatives {
pub fn representatives(&self) -> ByteClassRepresentatives<'_> {
ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
}
@ -85,7 +85,7 @@ impl ByteClasses {
}
impl fmt::Debug for ByteClasses {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.is_singleton() {
write!(f, "ByteClasses({{singletons}})")
} else {

View File

@ -1,13 +1,13 @@
use std::mem::size_of;
use ahocorasick::MatchKind;
use automaton::Automaton;
use classes::ByteClasses;
use error::Result;
use nfa::{PatternID, PatternLength, NFA};
use prefilter::{Prefilter, PrefilterObj, PrefilterState};
use state_id::{dead_id, fail_id, premultiply_overflow_error, StateID};
use Match;
use crate::ahocorasick::MatchKind;
use crate::automaton::Automaton;
use crate::classes::ByteClasses;
use crate::error::Result;
use crate::nfa::{PatternID, PatternLength, NFA};
use crate::prefilter::{Prefilter, PrefilterObj, PrefilterState};
use crate::state_id::{dead_id, fail_id, premultiply_overflow_error, StateID};
use crate::Match;
#[derive(Clone, Debug)]
pub enum DFA<S> {
@ -43,6 +43,10 @@ impl<S: StateID> DFA<S> {
self.repr().pattern_count
}
pub fn prefilter(&self) -> Option<&dyn Prefilter> {
self.repr().prefilter.as_ref().map(|p| p.as_ref())
}
pub fn start_state(&self) -> S {
self.repr().start_id
}
@ -189,9 +193,9 @@ impl<S: StateID> Automaton for Standard<S> {
self.repr().match_count(id)
}
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
fn next_state(&self, current: S, input: u8) -> S {
let o = current.to_usize() * 256 + input as usize;
*self.repr().trans.get_unchecked(o)
self.repr().trans[o]
}
}
@ -248,11 +252,11 @@ impl<S: StateID> Automaton for ByteClass<S> {
self.repr().match_count(id)
}
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
fn next_state(&self, current: S, input: u8) -> S {
let alphabet_len = self.repr().byte_classes.alphabet_len();
let input = self.repr().byte_classes.get(input);
let o = current.to_usize() * alphabet_len + input as usize;
*self.repr().trans.get_unchecked(o)
self.repr().trans[o]
}
}
@ -317,9 +321,9 @@ impl<S: StateID> Automaton for Premultiplied<S> {
self.repr().matches[o].len()
}
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
fn next_state(&self, current: S, input: u8) -> S {
let o = current.to_usize() + input as usize;
*self.repr().trans.get_unchecked(o)
self.repr().trans[o]
}
}
@ -384,10 +388,10 @@ impl<S: StateID> Automaton for PremultipliedByteClass<S> {
self.repr().matches[o].len()
}
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
fn next_state(&self, current: S, input: u8) -> S {
let input = self.repr().byte_classes.get(input);
let o = current.to_usize() + input as usize;
*self.repr().trans.get_unchecked(o)
self.repr().trans[o]
}
}
@ -637,8 +641,8 @@ impl Builder {
heap_bytes: 0,
prefilter: nfa.prefilter_obj().map(|p| p.clone()),
byte_classes: byte_classes.clone(),
trans: trans,
matches: matches,
trans,
matches,
};
for id in (0..nfa.state_len()).map(S::from_usize) {
repr.matches[id.to_usize()].extend_from_slice(nfa.matches(id));

View File

@ -68,7 +68,7 @@ impl error::Error for Error {
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self.kind {
ErrorKind::StateIDOverflow { max } => write!(
f,

View File

@ -168,13 +168,14 @@ naive solutions, it is generally slower than more specialized algorithms that
are accelerated using vector instructions such as SIMD.
For that reason, this library will internally use a "prefilter" to attempt
to accelerate searches when possible. Currently, this library has fairly
limited implementation that only applies when there are 3 or fewer unique
starting bytes among all patterns in an automaton.
to accelerate searches when possible. Currently, this library has several
different algorithms it might use depending on the patterns provided. Once the
number of patterns gets too big, prefilters are no longer used.
While a prefilter is generally good to have on by default since it works well
in the common case, it can lead to less predictable or even sub-optimal
performance in some cases. For that reason, prefilters can be disabled via
While a prefilter is generally good to have on by default since it works
well in the common case, it can lead to less predictable or even sub-optimal
performance in some cases. For that reason, prefilters can be explicitly
disabled via
[`AhoCorasickBuilder::prefilter`](struct.AhoCorasickBuilder.html#method.prefilter).
*/
@ -185,20 +186,19 @@ performance in some cases. For that reason, prefilters can be disabled via
#[cfg(not(feature = "std"))]
compile_error!("`std` feature is currently required to build this crate");
extern crate memchr;
#[cfg(test)]
#[macro_use]
extern crate doc_comment;
// #[cfg(doctest)]
// #[macro_use]
// extern crate doc_comment;
#[cfg(test)]
doctest!("../README.md");
// #[cfg(doctest)]
// doctest!("../README.md");
pub use ahocorasick::{
pub use crate::ahocorasick::{
AhoCorasick, AhoCorasickBuilder, FindIter, FindOverlappingIter, MatchKind,
StreamFindIter,
};
pub use error::{Error, ErrorKind};
pub use state_id::StateID;
pub use crate::error::{Error, ErrorKind};
pub use crate::state_id::StateID;
mod ahocorasick;
mod automaton;
@ -292,6 +292,6 @@ impl Match {
#[inline]
fn from_span(id: usize, start: usize, end: usize) -> Match {
Match { pattern: id, len: end - start, end: end }
Match { pattern: id, len: end - start, end }
}
}

View File

@ -4,13 +4,13 @@ use std::fmt;
use std::mem::size_of;
use std::ops::{Index, IndexMut};
use ahocorasick::MatchKind;
use automaton::Automaton;
use classes::{ByteClassBuilder, ByteClasses};
use error::Result;
use prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
use state_id::{dead_id, fail_id, usize_to_state_id, StateID};
use Match;
use crate::ahocorasick::MatchKind;
use crate::automaton::Automaton;
use crate::classes::{ByteClassBuilder, ByteClasses};
use crate::error::Result;
use crate::prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
use crate::state_id::{dead_id, fail_id, usize_to_state_id, StateID};
use crate::Match;
/// The identifier for a pattern, which is simply the position of the pattern
/// in the sequence of patterns given by the caller.
@ -172,7 +172,7 @@ impl<S: StateID> NFA<S> {
self.state_mut(id)
}
fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<S> {
fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S> {
IterTransitionsMut::new(self, id)
}
@ -194,7 +194,7 @@ impl<S: StateID> NFA<S> {
trans,
// Anchored automatons do not have any failure transitions.
fail: if self.anchored { dead_id() } else { self.start_id },
depth: depth,
depth,
matches: vec![],
});
Ok(id)
@ -207,7 +207,7 @@ impl<S: StateID> NFA<S> {
trans,
// Anchored automatons do not have any failure transitions.
fail: if self.anchored { dead_id() } else { self.start_id },
depth: depth,
depth,
matches: vec![],
});
Ok(id)
@ -262,14 +262,14 @@ impl<S: StateID> Automaton for NFA<S> {
self.states[id.to_usize()].matches.len()
}
unsafe fn next_state_unchecked(&self, mut current: S, input: u8) -> S {
fn next_state(&self, mut current: S, input: u8) -> S {
// This terminates since:
//
// 1. `State.fail` never points to fail_id().
// 2. All `State.fail` values point to a state closer to `start`.
// 3. The start state has no transitions to fail_id().
loop {
let state = self.states.get_unchecked(current.to_usize());
let state = &self.states[current.to_usize()];
let next = state.next_state(input);
if next != fail_id() {
return next;
@ -335,9 +335,9 @@ impl<S: StateID> State<S> {
/// Represents the transitions for a single dense state.
///
/// The primary purpose here is to encapsulate unchecked index access. Namely,
/// since a dense representation always contains 256 elements, all values of
/// `u8` are valid indices.
/// The primary purpose here is to encapsulate index access. Namely, since a
/// dense representation always contains 256 elements, all values of `u8` are
/// valid indices.
#[derive(Clone, Debug)]
struct Dense<S>(Vec<S>);
@ -362,7 +362,7 @@ impl<S> Index<u8> for Dense<S> {
fn index(&self, i: u8) -> &S {
// SAFETY: This is safe because all dense transitions have
// exactly 256 elements, so all u8 values are valid indices.
unsafe { self.0.get_unchecked(i as usize) }
&self.0[i as usize]
}
}
@ -371,7 +371,7 @@ impl<S> IndexMut<u8> for Dense<S> {
fn index_mut(&mut self, i: u8) -> &mut S {
// SAFETY: This is safe because all dense transitions have
// exactly 256 elements, so all u8 values are valid indices.
unsafe { self.0.get_unchecked_mut(i as usize) }
&mut self.0[i as usize]
}
}
@ -497,7 +497,7 @@ impl<S: StateID> Transitions<S> {
/// is iterating over transitions, the caller can still mutate the NFA. This
/// is useful when creating failure transitions.
#[derive(Debug)]
struct IterTransitionsMut<'a, S: StateID + 'a> {
struct IterTransitionsMut<'a, S: StateID> {
nfa: &'a mut NFA<S>,
state_id: S,
cur: usize,
@ -619,7 +619,7 @@ struct Compiler<'a, S: StateID> {
impl<'a, S: StateID> Compiler<'a, S> {
fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> {
Ok(Compiler {
builder: builder,
builder,
prefilter: prefilter::Builder::new(builder.match_kind)
.ascii_case_insensitive(builder.ascii_case_insensitive),
nfa: NFA {
@ -702,6 +702,10 @@ impl<'a, S: StateID> Compiler<'a, S> {
// building a DFA. They would technically be useful for the
// NFA, but it would require a second pass over the patterns.
self.byte_classes.set_range(b, b);
if self.builder.ascii_case_insensitive {
let b = opposite_ascii_case(b);
self.byte_classes.set_range(b, b);
}
// If the transition from prev using the current byte already
// exists, then just move through it. Otherwise, add a new
@ -854,10 +858,17 @@ impl<'a, S: StateID> Compiler<'a, S> {
while let Some(id) = queue.pop_front() {
let mut it = self.nfa.iter_transitions_mut(id);
while let Some((b, next)) = it.next() {
if !seen.contains(next) {
queue.push_back(next);
seen.insert(next);
if seen.contains(next) {
// The only way to visit a duplicate state in a transition
// list is when ASCII case insensitivity is enabled. In
// this case, we want to skip it since it's redundant work.
// But it would also end up duplicating matches, which
// results in reporting duplicate matches in some cases.
// See the 'acasei010' regression test.
continue;
}
queue.push_back(next);
seen.insert(next);
let mut fail = it.nfa().state(id).fail;
while it.nfa().state(fail).next_state(b) == fail_id() {
@ -1008,10 +1019,17 @@ impl<'a, S: StateID> Compiler<'a, S> {
// Queue up the next state.
let next = item.next_queued_state(it.nfa(), next_id);
if !seen.contains(next.id) {
queue.push_back(next);
seen.insert(next.id);
if seen.contains(next.id) {
// The only way to visit a duplicate state in a transition
// list is when ASCII case insensitivity is enabled. In
// this case, we want to skip it since it's redundant work.
// But it would also end up duplicating matches, which
// results in reporting duplicate matches in some cases.
// See the 'acasei010' regression test.
continue;
}
queue.push_back(next);
seen.insert(next.id);
// Find the failure state for next. Same as standard.
let mut fail = it.nfa().state(item.id).fail;
@ -1256,9 +1274,10 @@ impl Iterator for AllBytesIter {
}
impl<S: StateID> fmt::Debug for NFA<S> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "NFA(")?;
writeln!(f, "match_kind: {:?}", self.match_kind)?;
writeln!(f, "prefilter: {:?}", self.prefilter)?;
writeln!(f, "{}", "-".repeat(79))?;
for (id, s) in self.states.iter().enumerate() {
let mut trans = vec![];

View File

@ -1,9 +1,9 @@
use std::u16;
use packed::pattern::Patterns;
use packed::rabinkarp::RabinKarp;
use packed::teddy::{self, Teddy};
use Match;
use crate::packed::pattern::Patterns;
use crate::packed::rabinkarp::RabinKarp;
use crate::packed::teddy::{self, Teddy};
use crate::Match;
/// This is a limit placed on the total number of patterns we're willing to try
/// and match at once. As more sophisticated algorithms are added, this number
@ -269,8 +269,8 @@ impl Builder {
};
Some(Searcher {
config: self.config.clone(),
patterns: patterns,
rabinkarp: rabinkarp,
patterns,
rabinkarp,
search_kind,
minimum_len,
})

View File

@ -105,7 +105,7 @@ common reasons:
no searcher is built.
*/
pub use packed::api::{Builder, Config, FindIter, MatchKind, Searcher};
pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher};
mod api;
mod pattern;

View File

@ -4,7 +4,7 @@ use std::mem;
use std::u16;
use std::usize;
use packed::api::MatchKind;
use crate::packed::api::MatchKind;
/// The type used for representing a pattern identifier.
///
@ -155,7 +155,7 @@ impl Patterns {
/// Return the pattern with the given identifier. If such a pattern does
/// not exist, then this panics.
pub fn get(&self, id: PatternID) -> Pattern {
pub fn get(&self, id: PatternID) -> Pattern<'_> {
Pattern(&self.by_id[id as usize])
}
@ -167,7 +167,7 @@ impl Patterns {
/// Callers must ensure that a pattern with the given identifier exists
/// before using this method.
#[cfg(target_arch = "x86_64")]
pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern {
pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> {
Pattern(self.by_id.get_unchecked(id as usize))
}
@ -189,7 +189,7 @@ impl Patterns {
/// the order provided by this iterator, then the result is guaranteed
/// to satisfy the correct match semantics. (Either leftmost-first or
/// leftmost-longest.)
pub fn iter(&self) -> PatternIter {
pub fn iter(&self) -> PatternIter<'_> {
PatternIter { patterns: self, i: 0 }
}
}
@ -226,7 +226,7 @@ impl<'p> Iterator for PatternIter<'p> {
pub struct Pattern<'a>(&'a [u8]);
impl<'a> fmt::Debug for Pattern<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Pattern")
.field("lit", &String::from_utf8_lossy(&self.0))
.finish()

View File

@ -1,7 +1,7 @@
use std::mem;
use packed::pattern::{PatternID, Patterns};
use Match;
use crate::packed::pattern::{PatternID, Patterns};
use crate::Match;
/// The type of the rolling hash used in the Rabin-Karp algorithm.
type Hash = usize;

View File

@ -4,8 +4,8 @@ use std::cmp;
use std::collections::BTreeMap;
use std::fmt;
use packed::pattern::{PatternID, Patterns};
use packed::teddy::Teddy;
use crate::packed::pattern::{PatternID, Patterns};
use crate::packed::teddy::Teddy;
/// A builder for constructing a Teddy matcher.
///
@ -73,7 +73,7 @@ impl Builder {
}
fn build_imp(&self, patterns: &Patterns) -> Option<Teddy> {
use packed::teddy::runtime;
use crate::packed::teddy::runtime;
// Most of the logic here is just about selecting the optimal settings,
// or perhaps even rejecting construction altogether. The choices
@ -119,7 +119,7 @@ impl Builder {
// safe to call functions marked with the `avx2` target feature.
match (masks.len(), avx, fat) {
(1, false, _) => Some(Teddy {
buckets: buckets,
buckets,
max_pattern_id: patterns.max_pattern_id(),
exec: runtime::Exec::TeddySlim1Mask128(
runtime::TeddySlim1Mask128 {
@ -128,7 +128,7 @@ impl Builder {
),
}),
(1, true, false) => Some(Teddy {
buckets: buckets,
buckets,
max_pattern_id: patterns.max_pattern_id(),
exec: runtime::Exec::TeddySlim1Mask256(
runtime::TeddySlim1Mask256 {
@ -137,7 +137,7 @@ impl Builder {
),
}),
(1, true, true) => Some(Teddy {
buckets: buckets,
buckets,
max_pattern_id: patterns.max_pattern_id(),
exec: runtime::Exec::TeddyFat1Mask256(
runtime::TeddyFat1Mask256 {
@ -146,7 +146,7 @@ impl Builder {
),
}),
(2, false, _) => Some(Teddy {
buckets: buckets,
buckets,
max_pattern_id: patterns.max_pattern_id(),
exec: runtime::Exec::TeddySlim2Mask128(
runtime::TeddySlim2Mask128 {
@ -156,7 +156,7 @@ impl Builder {
),
}),
(2, true, false) => Some(Teddy {
buckets: buckets,
buckets,
max_pattern_id: patterns.max_pattern_id(),
exec: runtime::Exec::TeddySlim2Mask256(
runtime::TeddySlim2Mask256 {
@ -166,7 +166,7 @@ impl Builder {
),
}),
(2, true, true) => Some(Teddy {
buckets: buckets,
buckets,
max_pattern_id: patterns.max_pattern_id(),
exec: runtime::Exec::TeddyFat2Mask256(
runtime::TeddyFat2Mask256 {
@ -176,7 +176,7 @@ impl Builder {
),
}),
(3, false, _) => Some(Teddy {
buckets: buckets,
buckets,
max_pattern_id: patterns.max_pattern_id(),
exec: runtime::Exec::TeddySlim3Mask128(
runtime::TeddySlim3Mask128 {
@ -187,7 +187,7 @@ impl Builder {
),
}),
(3, true, false) => Some(Teddy {
buckets: buckets,
buckets,
max_pattern_id: patterns.max_pattern_id(),
exec: runtime::Exec::TeddySlim3Mask256(
runtime::TeddySlim3Mask256 {
@ -198,7 +198,7 @@ impl Builder {
),
}),
(3, true, true) => Some(Teddy {
buckets: buckets,
buckets,
max_pattern_id: patterns.max_pattern_id(),
exec: runtime::Exec::TeddyFat3Mask256(
runtime::TeddyFat3Mask256 {
@ -296,7 +296,7 @@ impl<'p> Compiler<'p> {
}
impl<'p> fmt::Debug for Compiler<'p> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut buckets = vec![vec![]; self.buckets.len()];
for (i, bucket) in self.buckets.iter().enumerate() {
for &patid in bucket {
@ -400,7 +400,7 @@ impl Mask {
}
impl fmt::Debug for Mask {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let (mut parts_lo, mut parts_hi) = (vec![], vec![]);
for i in 0..32 {
parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i]));

View File

@ -1,11 +1,11 @@
#[cfg(target_arch = "x86_64")]
pub use packed::teddy::compile::Builder;
pub use crate::packed::teddy::compile::Builder;
#[cfg(not(target_arch = "x86_64"))]
pub use packed::teddy::fallback::Builder;
pub use crate::packed::teddy::fallback::Builder;
#[cfg(not(target_arch = "x86_64"))]
pub use packed::teddy::fallback::Teddy;
pub use crate::packed::teddy::fallback::Teddy;
#[cfg(target_arch = "x86_64")]
pub use packed::teddy::runtime::Teddy;
pub use crate::packed::teddy::runtime::Teddy;
#[cfg(target_arch = "x86_64")]
mod compile;
@ -14,8 +14,8 @@ mod runtime;
#[cfg(not(target_arch = "x86_64"))]
mod fallback {
use packed::pattern::Patterns;
use Match;
use crate::packed::pattern::Patterns;
use crate::Match;
#[derive(Clone, Debug, Default)]
pub struct Builder(());

View File

@ -51,10 +51,10 @@
use std::arch::x86_64::*;
use std::mem;
use packed::pattern::{PatternID, Patterns};
use packed::teddy::compile;
use packed::vector::*;
use Match;
use crate::packed::pattern::{PatternID, Patterns};
use crate::packed::teddy::compile;
use crate::packed::vector::*;
use crate::Match;
/// The Teddy runtime.
///

View File

@ -1,8 +1,8 @@
use std::collections::HashMap;
use std::usize;
use packed::{Config, MatchKind};
use Match;
use crate::packed::{Config, MatchKind};
use crate::Match;
/// A description of a single test against a multi-pattern searcher.
///

View File

@ -5,9 +5,9 @@ use std::u8;
use memchr::{memchr, memchr2, memchr3};
use ahocorasick::MatchKind;
use packed;
use Match;
use crate::ahocorasick::MatchKind;
use crate::packed;
use crate::Match;
/// A candidate is the result of running a prefilter on a haystack at a
/// particular position. The result is either no match, a confirmed match or
@ -80,6 +80,17 @@ pub trait Prefilter:
fn reports_false_positives(&self) -> bool {
true
}
/// Returns true if and only if this prefilter may look for a non-starting
/// position of a match.
///
/// This is useful in a streaming context where prefilters that don't look
/// for a starting position of a match can be quite difficult to deal with.
///
/// This returns false by default.
fn looks_for_non_start_of_match(&self) -> bool {
    // Default: per the doc comment above, assume the prefilter only
    // looks for starting positions of a match.
    false
}
}
impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
@ -191,6 +202,17 @@ impl PrefilterState {
}
}
/// Create a prefilter state that always disables the prefilter.
pub fn disabled() -> PrefilterState {
    // Marking the state inert appears to be what actually disables the
    // prefilter; the remaining counters are simply zeroed out.
    PrefilterState {
        inert: true,
        skips: 0,
        skipped: 0,
        max_match_len: 0,
        last_scan_at: 0,
    }
}
/// Update this state with the number of bytes skipped on the last
/// invocation of the prefilter.
#[inline]
@ -285,6 +307,7 @@ impl Builder {
/// All patterns added to an Aho-Corasick automaton should be added to this
/// builder before attempting to construct the prefilter.
pub fn build(&self) -> Option<PrefilterObj> {
// match (self.start_bytes.build(), self.rare_bytes.build()) {
match (self.start_bytes.build(), self.rare_bytes.build()) {
// If we could build both start and rare prefilters, then there are
// a few cases in which we'd want to use the start-byte prefilter
@ -371,8 +394,14 @@ struct RareBytesBuilder {
/// Whether this prefilter should account for ASCII case insensitivity or
/// not.
ascii_case_insensitive: bool,
/// A set of byte offsets associated with detected rare bytes. An entry is
/// only set if a rare byte is detected in a pattern.
/// A set of rare bytes, indexed by byte value.
rare_set: ByteSet,
/// A set of byte offsets associated with bytes in a pattern. An entry
/// corresponds to a particular bytes (its index) and is only non-zero if
/// the byte occurred at an offset greater than 0 in at least one pattern.
///
/// If a byte's offset is not representable in 8 bits, then the rare bytes
/// prefilter becomes inert.
byte_offsets: RareByteOffsets,
/// Whether this is available as a prefilter or not. This can be set to
/// false during construction if a condition is seen that invalidates the
@ -385,11 +414,43 @@ struct RareBytesBuilder {
rank_sum: u16,
}
/// A set of rare byte offsets, keyed by byte.
/// A set of bytes.
#[derive(Clone, Copy)]
struct ByteSet([bool; 256]);

impl ByteSet {
    /// Returns a set containing no bytes.
    fn empty() -> ByteSet {
        ByteSet([false; 256])
    }

    /// Adds the given byte to this set. Returns true if and only if the
    /// byte was not already present.
    fn insert(&mut self, b: u8) -> bool {
        let was_absent = !self.0[b as usize];
        self.0[b as usize] = true;
        was_absent
    }

    /// Returns true if and only if the given byte is in this set.
    fn contains(&self, b: u8) -> bool {
        self.0[b as usize]
    }
}

impl fmt::Debug for ByteSet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Render the set as the sorted list of byte values it contains.
        let bytes: Vec<u8> = (0..=255).filter(|&b| self.contains(b)).collect();
        f.debug_struct("ByteSet").field("set", &bytes).finish()
    }
}
/// A set of byte offsets, keyed by byte.
#[derive(Clone, Copy)]
struct RareByteOffsets {
/// When an item in this set has an offset of u8::MAX (255), then it is
/// considered unset.
/// Each entry corresponds to the maximum offset of the corresponding
/// byte across all patterns seen.
set: [RareByteOffset; 256],
}
@ -403,29 +464,17 @@ impl RareByteOffsets {
/// greater than the existing offset, then it overwrites the previous
/// value; only the maximum offset seen for each byte is kept.
///
/// The given offset must be active, otherwise this panics.
pub fn apply(&mut self, byte: u8, off: RareByteOffset) -> bool {
assert!(off.is_active());
let existing = &mut self.set[byte as usize];
if !existing.is_active() {
*existing = off;
true
} else {
if existing.max < off.max {
*existing = off;
}
false
}
pub fn set(&mut self, byte: u8, off: RareByteOffset) {
    // Keep the largest offset observed for this byte across all
    // patterns; a smaller offset never overwrites a larger one.
    self.set[byte as usize].max =
        cmp::max(self.set[byte as usize].max, off.max);
}
}
impl fmt::Debug for RareByteOffsets {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut offsets = vec![];
for off in self.set.iter() {
if off.is_active() {
if off.max > 0 {
offsets.push(off);
}
}
@ -448,34 +497,28 @@ struct RareByteOffset {
/// ineffective when it is asked to start scanning from a position that it
/// has already scanned past.
///
/// N.B. The maximum value for this is 254. A value of 255 indicates that
/// this is unused. If a rare byte is found at an offset of 255 or greater,
/// then the rare-byte prefilter is disabled for simplicity.
/// Using a `u8` here means that if we ever see a pattern that's longer
/// than 255 bytes, then the entire rare byte prefilter is disabled.
max: u8,
}
impl Default for RareByteOffset {
fn default() -> RareByteOffset {
RareByteOffset { max: u8::MAX }
RareByteOffset { max: 0 }
}
}
impl RareByteOffset {
/// Create a new rare byte offset. If the given offset is too big, then
/// an inactive `RareByteOffset` is returned.
fn new(max: usize) -> RareByteOffset {
if max > (u8::MAX - 1) as usize {
RareByteOffset::default()
/// None is returned. In that case, callers should render the rare bytes
/// prefilter inert.
fn new(max: usize) -> Option<RareByteOffset> {
if max > u8::MAX as usize {
None
} else {
RareByteOffset { max: max as u8 }
Some(RareByteOffset { max: max as u8 })
}
}
/// Returns true if and only if this offset is active. If it's inactive,
/// then it should not be used.
fn is_active(&self) -> bool {
self.max < u8::MAX
}
}
impl RareBytesBuilder {
@ -483,6 +526,7 @@ impl RareBytesBuilder {
fn new() -> RareBytesBuilder {
RareBytesBuilder {
ascii_case_insensitive: false,
rare_set: ByteSet::empty(),
byte_offsets: RareByteOffsets::empty(),
available: true,
count: 0,
@ -507,8 +551,8 @@ impl RareBytesBuilder {
return None;
}
let (mut bytes, mut len) = ([0; 3], 0);
for b in 0..256 {
if self.byte_offsets.set[b].is_active() {
for b in 0..=255 {
if self.rare_set.contains(b) {
bytes[len] = b as u8;
len += 1;
}
@ -539,15 +583,25 @@ impl RareBytesBuilder {
/// All patterns added to an Aho-Corasick automaton should be added to this
/// builder before attempting to construct the prefilter.
fn add(&mut self, bytes: &[u8]) {
// If we've already given up, then do nothing.
if !self.available {
return;
}
// If we've already blown our budget, then don't waste time looking
// for more rare bytes.
if self.count > 3 {
self.available = false;
return;
}
// If the pattern is too long, then our offset table is bunk, so
// give up.
if bytes.len() >= 256 {
self.available = false;
return;
}
let mut rarest = match bytes.get(0) {
None => return,
Some(&b) => (b, 0, freq_rank(b)),
Some(&b) => (b, freq_rank(b)),
};
// The idea here is to look for the rarest byte in each pattern, and
// add that to our set. As a special exception, if we see a byte that
@ -558,33 +612,44 @@ impl RareBytesBuilder {
// were searching for `Sherlock` and `lockjaw`, then this would pick
// `k` for both patterns, resulting in the use of `memchr` instead of
// `memchr2` for `k` and `j`.
let mut found = false;
for (pos, &b) in bytes.iter().enumerate() {
if self.byte_offsets.set[b as usize].is_active() {
self.add_rare_byte(b, pos);
return;
self.set_offset(pos, b);
if found {
continue;
}
if self.rare_set.contains(b) {
found = true;
continue;
}
let rank = freq_rank(b);
if rank < rarest.2 {
rarest = (b, pos, rank);
if rank < rarest.1 {
rarest = (b, rank);
}
}
self.add_rare_byte(rarest.0, rarest.1);
if !found {
self.add_rare_byte(rarest.0);
}
}
fn add_rare_byte(&mut self, byte: u8, pos: usize) {
self.add_one_byte(byte, pos);
fn set_offset(&mut self, pos: usize, byte: u8) {
// This unwrap is OK because pos is never bigger than our max.
let offset = RareByteOffset::new(pos).unwrap();
self.byte_offsets.set(byte, offset);
if self.ascii_case_insensitive {
self.add_one_byte(opposite_ascii_case(byte), pos);
self.byte_offsets.set(opposite_ascii_case(byte), offset);
}
}
fn add_one_byte(&mut self, byte: u8, pos: usize) {
let off = RareByteOffset::new(pos);
if !off.is_active() {
self.available = false;
return;
fn add_rare_byte(&mut self, byte: u8) {
self.add_one_rare_byte(byte);
if self.ascii_case_insensitive {
self.add_one_rare_byte(opposite_ascii_case(byte));
}
if self.byte_offsets.apply(byte, off) {
}
fn add_one_rare_byte(&mut self, byte: u8) {
if self.rare_set.insert(byte) {
self.count += 1;
self.rank_sum += freq_rank(byte) as u16;
}
@ -621,6 +686,33 @@ impl Prefilter for RareBytesOne {
fn heap_bytes(&self) -> usize {
0
}
fn looks_for_non_start_of_match(&self) -> bool {
// TODO: It should be possible to use a rare byte prefilter in a
// streaming context. The main problem is that we usually assume that
// if a prefilter has scanned some text and not found anything, then no
// match *starts* in that text. This doesn't matter in non-streaming
// contexts, but in a streaming context, if we're looking for a byte
// that doesn't start at the beginning of a match and don't find it,
// then it's still possible for a match to start at the end of the
// current buffer content. In order to fix this, the streaming searcher
// would need to become aware of prefilters that do this and use the
// appropriate offset in various places. It is quite a delicate change
// and probably shouldn't be attempted until streaming search has a
// better testing strategy. In particular, we'd really like to be able
// to vary the buffer size to force strange cases that occur at the
// edge of the buffer. If we make the buffer size minimal, then these
// cases occur more frequently and easier.
//
// This is also a bummer because this means that if the prefilter
// builder chose a rare byte prefilter, then a streaming search won't
// use any prefilter at all because the builder doesn't know how it's
// going to be used. Assuming we don't make streaming search aware of
// these special types of prefilters as described above, we could fix
// this by building a "backup" prefilter that could be used when the
// rare byte prefilter could not. But that's a bandaide. Sigh.
true
}
}
/// A prefilter for scanning for two "rare" bytes.
@ -655,6 +747,11 @@ impl Prefilter for RareBytesTwo {
fn heap_bytes(&self) -> usize {
0
}
fn looks_for_non_start_of_match(&self) -> bool {
// TODO: See Prefilter impl for RareBytesOne.
true
}
}
/// A prefilter for scanning for three "rare" bytes.
@ -690,6 +787,11 @@ impl Prefilter for RareBytesThree {
fn heap_bytes(&self) -> usize {
0
}
fn looks_for_non_start_of_match(&self) -> bool {
// TODO: See Prefilter impl for RareBytesOne.
true
}
}
/// A builder for constructing a starting byte prefilter.
@ -698,7 +800,7 @@ impl Prefilter for RareBytesThree {
/// matches by reporting all positions corresponding to a particular byte. This
/// generally only takes affect when there are at most 3 distinct possible
/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
/// distinct starting bytes (`f` and `b`), and this prefiler returns all
/// distinct starting bytes (`f` and `b`), and this prefilter returns all
/// occurrences of either `f` or `b`.
///
/// In some cases, a heuristic frequency analysis may determine that it would
@ -930,7 +1032,7 @@ pub fn opposite_ascii_case(b: u8) -> u8 {
/// Return the frequency rank of the given byte. The higher the rank, the more
/// common the byte (heuristically speaking).
fn freq_rank(b: u8) -> u8 {
use byte_frequencies::BYTE_FREQUENCIES;
use crate::byte_frequencies::BYTE_FREQUENCIES;
BYTE_FREQUENCIES[b as usize]
}

View File

@ -1,7 +1,7 @@
use std::fmt::Debug;
use std::hash::Hash;
use error::{Error, Result};
use crate::error::{Error, Result};
// NOTE: Most of this code was copied from regex-automata, but without the
// (de)serialization specific stuff.
@ -69,18 +69,7 @@ mod private {
/// other type. In particular, this crate provides implementations for `u8`,
/// `u16`, `u32`, `u64` and `usize`. (`u32` and `u64` are only provided for
/// targets that can represent all corresponding values in a `usize`.)
///
/// # Safety
///
/// This trait is unsafe because the correctness of its implementations may be
/// relied upon by other unsafe code. For example, one possible way to
/// implement this trait incorrectly would be to return a maximum identifier
/// in `max_id` that is greater than the real maximum identifier. This will
/// likely result in wrap-on-overflow semantics in release mode, which can in
/// turn produce incorrect state identifiers. Those state identifiers may then
/// in turn access out-of-bounds memory in an automaton's search routine, where
/// bounds checks are explicitly elided for performance reasons.
pub unsafe trait StateID:
pub trait StateID:
private::Sealed
+ Clone
+ Copy
@ -111,11 +100,11 @@ pub unsafe trait StateID:
/// Return the maximum state identifier supported by this representation.
///
/// Implementors must return a correct bound. Doing otherwise may result
/// in memory unsafety.
/// in unspecified behavior (but will not violate memory safety).
fn max_id() -> usize;
}
unsafe impl StateID for usize {
impl StateID for usize {
#[inline]
fn from_usize(n: usize) -> usize {
n
@ -132,7 +121,7 @@ unsafe impl StateID for usize {
}
}
unsafe impl StateID for u8 {
impl StateID for u8 {
#[inline]
fn from_usize(n: usize) -> u8 {
n as u8
@ -149,7 +138,7 @@ unsafe impl StateID for u8 {
}
}
unsafe impl StateID for u16 {
impl StateID for u16 {
#[inline]
fn from_usize(n: usize) -> u16 {
n as u16
@ -167,7 +156,7 @@ unsafe impl StateID for u16 {
}
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
unsafe impl StateID for u32 {
impl StateID for u32 {
#[inline]
fn from_usize(n: usize) -> u32 {
n as u32
@ -185,7 +174,7 @@ unsafe impl StateID for u32 {
}
#[cfg(target_pointer_width = "64")]
unsafe impl StateID for u64 {
impl StateID for u64 {
#[inline]
fn from_usize(n: usize) -> u64 {
n as u64

View File

@ -2,7 +2,7 @@ use std::collections::HashMap;
use std::io;
use std::usize;
use {AhoCorasickBuilder, Match, MatchKind};
use crate::{AhoCorasickBuilder, Match, MatchKind};
/// A description of a single test against an Aho-Corasick automaton.
///
@ -549,6 +549,39 @@ const ANCHORED_OVERLAPPING: &'static [SearchTest] = &[
t!(aover360, &["foo", "foofoo"], "foofoo", &[(0, 0, 3), (1, 0, 6)]),
];
/// Tests for ASCII case insensitivity.
///
/// These tests should all have the same behavior regardless of match semantics
/// or whether the search is overlapping.
const ASCII_CASE_INSENSITIVE: &'static [SearchTest] = &[
t!(acasei000, &["a"], "A", &[(0, 0, 1)]),
t!(acasei010, &["Samwise"], "SAMWISE", &[(0, 0, 7)]),
t!(acasei011, &["Samwise"], "SAMWISE.abcd", &[(0, 0, 7)]),
t!(acasei020, &["fOoBaR"], "quux foobar baz", &[(0, 5, 11)]),
];
/// Like ASCII_CASE_INSENSITIVE, but specifically for non-overlapping tests.
const ASCII_CASE_INSENSITIVE_NON_OVERLAPPING: &'static [SearchTest] = &[
t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3)]),
t!(acasei000, &["FOO", "foo"], "fOo", &[(0, 0, 3)]),
t!(acasei010, &["abc", "def"], "abcdef", &[(0, 0, 3), (1, 3, 6)]),
];
/// Like ASCII_CASE_INSENSITIVE, but specifically for overlapping tests.
const ASCII_CASE_INSENSITIVE_OVERLAPPING: &'static [SearchTest] = &[
t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3), (1, 0, 3)]),
t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3), (1, 0, 3)]),
// This is a regression test from:
// https://github.com/BurntSushi/aho-corasick/issues/68
// Previously, it was reporting a duplicate (1, 3, 6) match.
t!(
acasei010,
&["abc", "def", "abcdef"],
"abcdef",
&[(0, 0, 3), (2, 0, 6), (1, 3, 6)]
),
];
/// Regression tests that are applied to all Aho-Corasick combinations.
///
/// If regression tests are needed for specific match semantics, then add them
@ -706,6 +739,8 @@ macro_rules! testcombo {
$collection,
$kind,
|b: &mut AhoCorasickBuilder| {
// TODO: remove tests when option is removed.
#[allow(deprecated)]
b.dfa(true).byte_classes(false);
}
);
@ -714,6 +749,8 @@ macro_rules! testcombo {
$collection,
$kind,
|b: &mut AhoCorasickBuilder| {
// TODO: remove tests when option is removed.
#[allow(deprecated)]
b.dfa(true).premultiply(false);
}
);
@ -722,6 +759,8 @@ macro_rules! testcombo {
$collection,
$kind,
|b: &mut AhoCorasickBuilder| {
// TODO: remove tests when options are removed.
#[allow(deprecated)]
b.dfa(true).byte_classes(false).premultiply(false);
}
);
@ -797,6 +836,8 @@ testconfig!(
AC_STANDARD_OVERLAPPING,
Standard,
|b: &mut AhoCorasickBuilder| {
// TODO: remove tests when option is removed.
#[allow(deprecated)]
b.dfa(true).byte_classes(false);
}
);
@ -806,6 +847,8 @@ testconfig!(
AC_STANDARD_OVERLAPPING,
Standard,
|b: &mut AhoCorasickBuilder| {
// TODO: remove tests when option is removed.
#[allow(deprecated)]
b.dfa(true).premultiply(false);
}
);
@ -815,6 +858,8 @@ testconfig!(
AC_STANDARD_OVERLAPPING,
Standard,
|b: &mut AhoCorasickBuilder| {
// TODO: remove tests when options are removed.
#[allow(deprecated)]
b.dfa(true).byte_classes(false).premultiply(false);
}
);
@ -907,6 +952,99 @@ testconfig!(
}
);
// And also write out the test combinations for ASCII case insensitivity.
testconfig!(
acasei_standard_nfa_default,
&[ASCII_CASE_INSENSITIVE],
Standard,
|b: &mut AhoCorasickBuilder| {
b.prefilter(false).ascii_case_insensitive(true);
}
);
testconfig!(
acasei_standard_dfa_default,
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
Standard,
|b: &mut AhoCorasickBuilder| {
b.ascii_case_insensitive(true).dfa(true);
}
);
testconfig!(
overlapping,
acasei_standard_overlapping_nfa_default,
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
Standard,
|b: &mut AhoCorasickBuilder| {
b.ascii_case_insensitive(true);
}
);
testconfig!(
overlapping,
acasei_standard_overlapping_dfa_default,
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
Standard,
|b: &mut AhoCorasickBuilder| {
b.ascii_case_insensitive(true).dfa(true);
}
);
testconfig!(
acasei_leftmost_first_nfa_default,
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
LeftmostFirst,
|b: &mut AhoCorasickBuilder| {
b.ascii_case_insensitive(true);
}
);
testconfig!(
acasei_leftmost_first_dfa_default,
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
LeftmostFirst,
|b: &mut AhoCorasickBuilder| {
b.ascii_case_insensitive(true).dfa(true);
}
);
testconfig!(
acasei_leftmost_longest_nfa_default,
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
LeftmostLongest,
|b: &mut AhoCorasickBuilder| {
b.ascii_case_insensitive(true);
}
);
testconfig!(
acasei_leftmost_longest_dfa_default,
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
LeftmostLongest,
|b: &mut AhoCorasickBuilder| {
b.ascii_case_insensitive(true).dfa(true);
}
);
fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
which: TestCollection,
mut f: F,
) {
let get_match_triples =
|matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
matches
.into_iter()
.map(|m| (m.pattern(), m.start(), m.end()))
.collect()
};
for &tests in which {
for test in tests {
assert_eq!(
test.matches,
get_match_triples(f(&test)).as_slice(),
"test: {}, patterns: {:?}, haystack: {:?}",
test.name,
test.patterns,
test.haystack
);
}
}
}
#[test]
fn search_tests_have_unique_names() {
let assert = |constname, tests: &[SearchTest]| {
@ -996,27 +1134,119 @@ fn regression_ascii_case_insensitive_no_exponential() {
assert!(ac.find("").is_none());
}
fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
which: TestCollection,
mut f: F,
) {
let get_match_triples =
|matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
matches
.into_iter()
.map(|m| (m.pattern(), m.start(), m.end()))
.collect()
};
for &tests in which {
for test in tests {
// See: https://github.com/BurntSushi/aho-corasick/issues/53
//
// This test ensures that the rare byte prefilter works in a particular corner
// case. In particular, the shift offset detected for '/' in the patterns below
// was incorrect, leading to a false negative.
#[test]
fn regression_rare_byte_prefilter() {
use crate::AhoCorasick;
let ac = AhoCorasick::new_auto_configured(&["ab/j/", "x/"]);
assert!(ac.is_match("ab/j/"));
}
#[test]
fn regression_case_insensitive_prefilter() {
use crate::AhoCorasickBuilder;
for c in b'a'..b'z' {
for c2 in b'a'..b'z' {
let c = c as char;
let c2 = c2 as char;
let needle = format!("{}{}", c, c2).to_lowercase();
let haystack = needle.to_uppercase();
let ac = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.prefilter(true)
.build(&[&needle]);
assert_eq!(
test.matches,
get_match_triples(f(&test)).as_slice(),
"test: {}, patterns: {:?}, haystack: {:?}",
test.name,
test.patterns,
test.haystack
1,
ac.find_iter(&haystack).count(),
"failed to find {:?} in {:?}\n\nautomaton:\n{:?}",
needle,
haystack,
ac,
);
}
}
}
// See: https://github.com/BurntSushi/aho-corasick/issues/64
//
// This occurs when the rare byte prefilter is active.
#[test]
fn regression_stream_rare_byte_prefilter() {
use std::io::Read;
// NOTE: The test only fails if this ends with j.
const MAGIC: [u8; 5] = *b"1234j";
// NOTE: The test fails for value in 8188..=8191 These value put the string
// to search accross two call to read because the buffer size is 8192 by
// default.
const BEGIN: usize = 8191;
/// This is just a structure that implements Reader. The reader
/// implementation will simulate a file filled with 0, except for the MAGIC
/// string at offset BEGIN.
#[derive(Default)]
struct R {
read: usize,
}
impl Read for R {
fn read(&mut self, buf: &mut [u8]) -> ::std::io::Result<usize> {
//dbg!(buf.len());
if self.read > 100000 {
return Ok(0);
}
let mut from = 0;
if self.read < BEGIN {
from = buf.len().min(BEGIN - self.read);
for x in 0..from {
buf[x] = 0;
}
self.read += from;
}
if self.read >= BEGIN && self.read <= BEGIN + MAGIC.len() {
let to = buf.len().min(BEGIN + MAGIC.len() - self.read + from);
if to > from {
buf[from..to].copy_from_slice(
&MAGIC
[self.read - BEGIN..self.read - BEGIN + to - from],
);
self.read += to - from;
from = to;
}
}
for x in from..buf.len() {
buf[x] = 0;
self.read += 1;
}
Ok(buf.len())
}
}
fn run() -> ::std::io::Result<()> {
let aut = AhoCorasickBuilder::new().build(&[&MAGIC]);
// While reading from a vector, it works:
let mut buf = vec![];
R::default().read_to_end(&mut buf)?;
let from_whole = aut.find_iter(&buf).next().unwrap().start();
//But using stream_find_iter fails!
let mut file = R::default();
let begin = aut
.stream_find_iter(&mut file)
.next()
.expect("NOT FOUND!!!!")? // Panic here
.start();
assert_eq!(from_whole, begin);
Ok(())
}
run().unwrap()
}