mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-23 21:01:08 +00:00
Bug 1716518 - Upgrade aho-corasick to v0.7.18. r=emilio
Differential Revision: https://phabricator.services.mozilla.com/D117814
This commit is contained in:
parent
38e4a9776f
commit
c23265234d
4
Cargo.lock
generated
4
Cargo.lock
generated
@ -26,9 +26,9 @@ checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e"
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "0.7.6"
|
||||
version = "0.7.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "58fb5e95d83b38284460a5fda7d6470aa0b8844d283a0b614b8535e880800d2d"
|
||||
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
@ -1 +1 @@
|
||||
{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"a2f9a1263aa35a92af4ffc1935b264f062738bc25761aa62b3d582031d6bf5f0","DESIGN.md":"44d4516ef38d60e9638f756baf40bcd9eff1b8e8ce7538a1d8549e02d6605d48","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"626d74e4bdac78d2446c75c722a7e46d0eaa4e506a1068ff693b5abc338a384f","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/ahocorasick.rs":"46c57a83a75a8f25fdf19a15deae10748d12b8af9445ae74700a546a92024608","src/automaton.rs":"85e79ceb964f824fcceca026abd255980840116704834d70a1b9c44833df299f","src/buffer.rs":"c40992e7d1ba0bac6d1c268d41069aad81f2226686c64192ed888a60f66db8cd","src/byte_frequencies.rs":"2fb85b381c038c1e44ce94294531cdcd339dca48b1e61f41455666e802cbbc9e","src/classes.rs":"590f2e257bf7c630bea3a28d4a1f75c78db7a0802f5921aced017a056146b4e6","src/dfa.rs":"2fb1077edfefd2b7f7e9c0d9df55df1441d4571500a2c45aa5b41960a36441e4","src/error.rs":"36dbf2cefbfaa8a69186551320dbff023d3e82780a6c925e87c3e3997b967e66","src/lib.rs":"028ab998e8f0d1a98650b139bcca83681cbb52545060b9253b76d7e19117b53d","src/nfa.rs":"6bc3479ad37c576bba4bbdc9e3d0c6e69a4b7f0d9a88fcbbf727bf4a9b288494","src/packed/api.rs":"aa89627c7114c057c98ad1c7ab9ce18c6ed55267a6bcf7bc8efb917b6cfe5532","src/packed/mod.rs":"29c76ad3cbb1f831140cefac7a27fb504ac4af4f454975a571965b48aad417eb","src/packed/pattern.rs":"b88c57af057997da0a5a06f4c5604a7e598c20acfc11c15cd8977727f6e1cf9c","src/packed/rabinkarp.rs":"b3242a8631ea5607163dcbb641e4ac9c6da26774378da1e51651b0ab5656b390","src/packed/teddy/README.md":"5819f40d221af93288e705eadef5393a41d7a0900881b4d676e01fd65d5adf15","src/packed/teddy/compile.rs":"21b18cbee9bc33918b85b1dc51a0faed57acb426f61e6b72aeaf69faa7595701","src/packed/teddy/mod.rs":"f63db3419b1d378929bf0bc1f0e3b909ff3c38b9f2b6e86ba4546b8f39907cd3","src/packed/teddy/runtime.rs":"0a
1250ea73159b3be6e0fa9a3f55ecedbb2cb90cb798d1709e9f5ee48f8855d5","src/packed/tests.rs":"0b52ab9eef73a1a4f141f475a9fa98e54d447104aa69acba3a7f8248ce7164b2","src/packed/vector.rs":"ab3c0535fca5f09198d58cbfae44c292aeb3ce44bc92bca36d30dc72963639fc","src/prefilter.rs":"3dbe93d85c6fb985a9aea0b5eab003fe81a228e02adba00c8f63a35c3fd246b8","src/state_id.rs":"ebecd7046760e6bd72303f288be93342b446e7fe95f20b5ce23653d802c48b09","src/tests.rs":"9201cc0662bc9a1e8fa15c59e33a18a55ec6b3bd6bbea294d9cace0053bb8d24"},"package":"58fb5e95d83b38284460a5fda7d6470aa0b8844d283a0b614b8535e880800d2d"}
|
||||
{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"f61283fd900435313b9ba8c1b87a4b5b31d442f9b554222136ec8d1d3d1e39d8","DESIGN.md":"9065f33d818d1562244d36dc4781e2a351108030cee17f11c2ba512ca7b4c27e","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"741e7249c8d1d6a7ba9341d68253dbf4952477c5620ff37c5325f2e894b148b6","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/ahocorasick.rs":"6fcbe812eec7af44b104c6b8a27b0a2ea8d67c3d9aec73cb69d802b30be5f005","src/automaton.rs":"610b3e2c104c51bf4f51a6d07626c3972e9d1274ca276e987385a231b284cc8b","src/buffer.rs":"dae7ee7c1f846ca9cf115ba4949484000e1837b4fb7311f8d8c9a35011c9c26f","src/byte_frequencies.rs":"2fb85b381c038c1e44ce94294531cdcd339dca48b1e61f41455666e802cbbc9e","src/classes.rs":"99a53a2ed8eea8c13699def90e31dfdff9d0b90572b1db3cb534e3396e7a0ed0","src/dfa.rs":"25e4455b3e179a7e192108d05f3683993456b36e3ebed99f827558c52525b7e6","src/error.rs":"d34c2c9c815df5d9dedc46b4b3ce109cd2cee07825de643f0c574ec960367beb","src/lib.rs":"f0c48b0ee093dd8b3034d025d052c3667860c5d4a196cb178588012b719acea4","src/nfa.rs":"2f443951c78196126bfd237ed5770a69077e6190daeecd47131339c25e51a3d0","src/packed/api.rs":"ec58ff1b4375dd4ff88fb5859c7ede994fe08d31b7d3677720a086592aa0fe53","src/packed/mod.rs":"d7ee11d487a7f129f16dc8f1473442a7127905933f378504bae83df0f23c5e2a","src/packed/pattern.rs":"3abf3835d4c4f8a43753c52936a894d819f713f233fc046e19de5ef95200dcce","src/packed/rabinkarp.rs":"caf9563b7442c9b75c9cb520fa236c7a6da8173705889b8d79b69ede14a20767","src/packed/teddy/README.md":"5819f40d221af93288e705eadef5393a41d7a0900881b4d676e01fd65d5adf15","src/packed/teddy/compile.rs":"aad40b3f93d2c388b409b31fb2795d414a365237789d5b1a7510d97ceb8ce260","src/packed/teddy/mod.rs":"83b52bd80272970ad17234d0db293d17c1710ec582302bf516b203c8edec037e","src/packed/teddy/runtime.rs":"83
6146e90b320b14fa2c65fe4af7915a41f6fb04408aac5fac731c22ff46adae","src/packed/tests.rs":"b8dc4d3281ecd6d0fa2bf7ef16cf292a467dfdce64e470c7921e983bfa60fee2","src/packed/vector.rs":"ab3c0535fca5f09198d58cbfae44c292aeb3ce44bc92bca36d30dc72963639fc","src/prefilter.rs":"82a3eb6d5c0c3f10bc8d5f57d55d6d14cf4cf21c475bb5253e1921084063b8d7","src/state_id.rs":"519ec8c7bf3fa72103d4c561c193759759f535dca924c9853efe630f406d2029","src/tests.rs":"6522ed1b244513c01de5bbcf0fe35571454fdea2c2a9d8dfe13a04bf57b70eca"},"package":"1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"}
|
16
third_party/rust/aho-corasick/Cargo.toml
vendored
16
third_party/rust/aho-corasick/Cargo.toml
vendored
@ -11,8 +11,9 @@
|
||||
# will likely look very different (and much more reasonable)
|
||||
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "aho-corasick"
|
||||
version = "0.7.6"
|
||||
version = "0.7.18"
|
||||
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
||||
exclude = ["/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml"]
|
||||
autotests = false
|
||||
@ -32,16 +33,11 @@ debug = true
|
||||
[lib]
|
||||
name = "aho_corasick"
|
||||
[dependencies.memchr]
|
||||
version = "2.2.0"
|
||||
version = "2.4.0"
|
||||
default-features = false
|
||||
[dev-dependencies.doc-comment]
|
||||
version = "0.3.1"
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
[features]
|
||||
default = ["std"]
|
||||
std = ["memchr/use_std"]
|
||||
[badges.appveyor]
|
||||
repository = "BurntSushi/aho-corasick"
|
||||
|
||||
[badges.travis-ci]
|
||||
repository = "BurntSushi/aho-corasick"
|
||||
std = ["memchr/std"]
|
||||
|
18
third_party/rust/aho-corasick/DESIGN.md
vendored
18
third_party/rust/aho-corasick/DESIGN.md
vendored
@ -2,7 +2,7 @@ This document describes the internal design of this crate, which is an object
|
||||
lesson in what happens when you take a fairly simple old algorithm like
|
||||
Aho-Corasick and make it fast and production ready.
|
||||
|
||||
The target audience of this crate is Rust programmers that have some
|
||||
The target audience of this document is Rust programmers that have some
|
||||
familiarity with string searching, however, one does not need to know the
|
||||
Aho-Corasick algorithm in order to read this (it is explained below). One
|
||||
should, however, know what a trie is. (If you don't, go read its Wikipedia
|
||||
@ -13,7 +13,7 @@ own, Aho-Corasick isn't that complicated. The complex pieces come from the
|
||||
different variants of Aho-Corasick implemented in this crate. Specifically,
|
||||
they are:
|
||||
|
||||
* Aho-Corasick as an NFA, using dense transitions near root with sparse
|
||||
* Aho-Corasick as an NFA, using dense transitions near the root with sparse
|
||||
transitions elsewhere.
|
||||
* Aho-Corasick as a DFA. (An NFA is slower to search, but cheaper to construct
|
||||
and uses less memory.)
|
||||
@ -74,7 +74,7 @@ one is Aho-Corasick. It's a common solution because it's not too hard to
|
||||
implement, scales quite well even when searching for thousands of patterns and
|
||||
is generally pretty fast. Aho-Corasick does well here because, regardless of
|
||||
the number of patterns you're searching for, it always visits each byte in the
|
||||
haystack exactly ocne. This means, generally speaking, adding more patterns to
|
||||
haystack exactly once. This means, generally speaking, adding more patterns to
|
||||
an Aho-Corasick automaton does not make it slower. (Strictly speaking, however,
|
||||
this is not true, since a larger automaton will make less effective use of the
|
||||
CPU's cache.)
|
||||
@ -277,12 +277,12 @@ there are a small number of patterns.
|
||||
|
||||
# More DFA tricks
|
||||
|
||||
As described in the previous section, one of the downsides of using a DFA is
|
||||
that is uses more memory and can take longer to builder. One small way of
|
||||
mitigating these concerns is to map the alphabet used by the automaton into a
|
||||
smaller space. Typically, the alphabet of a DFA has 256 elements in it: one
|
||||
element for each possible value that fits into a byte. However, in many cases,
|
||||
one does not need the full alphabet. For example, if all patterns in an
|
||||
As described in the previous section, one of the downsides of using a DFA
|
||||
is that it uses more memory and can take longer to build. One small way of
|
||||
mitigating these concerns is to map the alphabet used by the automaton into
|
||||
a smaller space. Typically, the alphabet of a DFA has 256 elements in it:
|
||||
one element for each possible value that fits into a byte. However, in many
|
||||
cases, one does not need the full alphabet. For example, if all patterns in an
|
||||
Aho-Corasick automaton are ASCII letters, then this only uses up 52 distinct
|
||||
bytes. As far as the automaton is concerned, the rest of the 204 bytes are
|
||||
indistinguishable from one another: they will never discriminate between a
|
||||
|
29
third_party/rust/aho-corasick/README.md
vendored
29
third_party/rust/aho-corasick/README.md
vendored
@ -5,11 +5,10 @@ acceleration in some cases. This library provides multiple pattern
|
||||
search principally through an implementation of the
|
||||
[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
|
||||
which builds a finite state machine for executing searches in linear time.
|
||||
Features include case insensitive matching, overlapping matches and search &
|
||||
replace in streams.
|
||||
Features include case insensitive matching, overlapping matches, fast searching
|
||||
via SIMD and optional full DFA construction and search & replace in streams.
|
||||
|
||||
[![Linux build status](https://api.travis-ci.org/BurntSushi/aho-corasick.svg)](https://travis-ci.org/BurntSushi/aho-corasick)
|
||||
[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/aho-corasick?svg=true)](https://ci.appveyor.com/project/BurntSushi/aho-corasick)
|
||||
[![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions)
|
||||
[![](http://meritbadge.herokuapp.com/aho-corasick)](https://crates.io/crates/aho-corasick)
|
||||
|
||||
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
|
||||
@ -29,12 +28,6 @@ Add this to your `Cargo.toml`:
|
||||
aho-corasick = "0.7"
|
||||
```
|
||||
|
||||
and this to your crate root (if you're using Rust 2015):
|
||||
|
||||
```rust
|
||||
extern crate aho_corasick;
|
||||
```
|
||||
|
||||
|
||||
### Example: basic searching
|
||||
|
||||
@ -95,7 +88,6 @@ loading the entire stream into memory first.
|
||||
```rust
|
||||
use aho_corasick::AhoCorasick;
|
||||
|
||||
# fn example() -> Result<(), ::std::io::Error> {
|
||||
let patterns = &["fox", "brown", "quick"];
|
||||
let replace_with = &["sloth", "grey", "slow"];
|
||||
|
||||
@ -105,9 +97,9 @@ let rdr = "The quick brown fox.";
|
||||
let mut wtr = vec![];
|
||||
|
||||
let ac = AhoCorasick::new(patterns);
|
||||
ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?;
|
||||
ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)
|
||||
.expect("stream_replace_all failed");
|
||||
assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
|
||||
# Ok(()) }; example().unwrap()
|
||||
```
|
||||
|
||||
|
||||
@ -164,11 +156,16 @@ expression alternation. See `MatchKind` in the docs for more details.
|
||||
|
||||
### Minimum Rust version policy
|
||||
|
||||
This crate's minimum supported `rustc` version is `1.28.0`.
|
||||
This crate's minimum supported `rustc` version is `1.41.1`.
|
||||
|
||||
The current policy is that the minimum Rust version required to use this crate
|
||||
can be increased in minor version updates. For example, if `crate 1.0` requires
|
||||
Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust
|
||||
1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum
|
||||
version of Rust.
|
||||
|
||||
In general, this crate will be conservative with respect to the minimum
|
||||
supported version of Rust. In general, it will follow the `regex` crate's
|
||||
policy, since `regex` is an important dependent.
|
||||
supported version of Rust.
|
||||
|
||||
|
||||
### Future work
|
||||
|
126
third_party/rust/aho-corasick/src/ahocorasick.rs
vendored
126
third_party/rust/aho-corasick/src/ahocorasick.rs
vendored
@ -1,14 +1,14 @@
|
||||
use std::io;
|
||||
|
||||
use automaton::Automaton;
|
||||
use buffer::Buffer;
|
||||
use dfa::{self, DFA};
|
||||
use error::Result;
|
||||
use nfa::{self, NFA};
|
||||
use packed;
|
||||
use prefilter::PrefilterState;
|
||||
use state_id::StateID;
|
||||
use Match;
|
||||
use crate::automaton::Automaton;
|
||||
use crate::buffer::Buffer;
|
||||
use crate::dfa::{self, DFA};
|
||||
use crate::error::Result;
|
||||
use crate::nfa::{self, NFA};
|
||||
use crate::packed;
|
||||
use crate::prefilter::{Prefilter, PrefilterState};
|
||||
use crate::state_id::StateID;
|
||||
use crate::Match;
|
||||
|
||||
/// An automaton for searching multiple strings in linear time.
|
||||
///
|
||||
@ -502,7 +502,7 @@ impl<S: StateID> AhoCorasick<S> {
|
||||
/// The closure accepts three parameters: the match found, the text of
|
||||
/// the match and a string buffer with which to write the replaced text
|
||||
/// (if any). If the closure returns `true`, then it continues to the next
|
||||
/// match. If the closure returns false, then searching is stopped.
|
||||
/// match. If the closure returns `false`, then searching is stopped.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
@ -524,6 +524,24 @@ impl<S: StateID> AhoCorasick<S> {
|
||||
/// });
|
||||
/// assert_eq!("0 the 2 to the 0age", result);
|
||||
/// ```
|
||||
///
|
||||
/// Stopping the replacement by returning `false` (continued from the
|
||||
/// example above):
|
||||
///
|
||||
/// ```
|
||||
/// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
|
||||
/// # let patterns = &["append", "appendage", "app"];
|
||||
/// # let haystack = "append the app to the appendage";
|
||||
/// # let ac = AhoCorasickBuilder::new()
|
||||
/// # .match_kind(MatchKind::LeftmostFirst)
|
||||
/// # .build(patterns);
|
||||
/// let mut result = String::new();
|
||||
/// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
|
||||
/// dst.push_str(&mat.pattern().to_string());
|
||||
/// mat.pattern() != 2
|
||||
/// });
|
||||
/// assert_eq!("0 the 2 to the appendage", result);
|
||||
/// ```
|
||||
pub fn replace_all_with<F>(
|
||||
&self,
|
||||
haystack: &str,
|
||||
@ -536,7 +554,9 @@ impl<S: StateID> AhoCorasick<S> {
|
||||
for mat in self.find_iter(haystack) {
|
||||
dst.push_str(&haystack[last_match..mat.start()]);
|
||||
last_match = mat.end();
|
||||
replace_with(&mat, &haystack[mat.start()..mat.end()], dst);
|
||||
if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
|
||||
break;
|
||||
};
|
||||
}
|
||||
dst.push_str(&haystack[last_match..]);
|
||||
}
|
||||
@ -548,7 +568,7 @@ impl<S: StateID> AhoCorasick<S> {
|
||||
/// The closure accepts three parameters: the match found, the text of
|
||||
/// the match and a byte buffer with which to write the replaced text
|
||||
/// (if any). If the closure returns `true`, then it continues to the next
|
||||
/// match. If the closure returns false, then searching is stopped.
|
||||
/// match. If the closure returns `false`, then searching is stopped.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
@ -570,6 +590,24 @@ impl<S: StateID> AhoCorasick<S> {
|
||||
/// });
|
||||
/// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
|
||||
/// ```
|
||||
///
|
||||
/// Stopping the replacement by returning `false` (continued from the
|
||||
/// example above):
|
||||
///
|
||||
/// ```
|
||||
/// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
|
||||
/// # let patterns = &["append", "appendage", "app"];
|
||||
/// # let haystack = b"append the app to the appendage";
|
||||
/// # let ac = AhoCorasickBuilder::new()
|
||||
/// # .match_kind(MatchKind::LeftmostFirst)
|
||||
/// # .build(patterns);
|
||||
/// let mut result = vec![];
|
||||
/// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
|
||||
/// dst.extend(mat.pattern().to_string().bytes());
|
||||
/// mat.pattern() != 2
|
||||
/// });
|
||||
/// assert_eq!(b"0 the 2 to the appendage".to_vec(), result);
|
||||
/// ```
|
||||
pub fn replace_all_with_bytes<F>(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
@ -582,7 +620,9 @@ impl<S: StateID> AhoCorasick<S> {
|
||||
for mat in self.find_iter(haystack) {
|
||||
dst.extend(&haystack[last_match..mat.start()]);
|
||||
last_match = mat.end();
|
||||
replace_with(&mat, &haystack[mat.start()..mat.end()], dst);
|
||||
if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
|
||||
break;
|
||||
};
|
||||
}
|
||||
dst.extend(&haystack[last_match..]);
|
||||
}
|
||||
@ -735,9 +775,7 @@ impl<S: StateID> AhoCorasick<S> {
|
||||
/// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
|
||||
///
|
||||
/// The closure accepts three parameters: the match found, the text of
|
||||
/// the match and the writer with which to write the replaced text
|
||||
/// (if any). If the closure returns `true`, then it continues to the next
|
||||
/// match. If the closure returns false, then searching is stopped.
|
||||
/// the match and the writer with which to write the replaced text (if any).
|
||||
///
|
||||
/// After all matches are replaced, the writer is _not_ flushed.
|
||||
///
|
||||
@ -967,18 +1005,6 @@ impl<S: StateID> AhoCorasick<S> {
|
||||
///
|
||||
/// let ac = AhoCorasickBuilder::new()
|
||||
/// .dfa(true)
|
||||
/// .byte_classes(false)
|
||||
/// .build(&["foo", "bar", "baz"]);
|
||||
/// assert_eq!(20_768, ac.heap_bytes());
|
||||
///
|
||||
/// let ac = AhoCorasickBuilder::new()
|
||||
/// .dfa(true)
|
||||
/// .byte_classes(true) // default
|
||||
/// .build(&["foo", "bar", "baz"]);
|
||||
/// assert_eq!(1_248, ac.heap_bytes());
|
||||
///
|
||||
/// let ac = AhoCorasickBuilder::new()
|
||||
/// .dfa(true)
|
||||
/// .ascii_case_insensitive(true)
|
||||
/// .build(&["foo", "bar", "baz"]);
|
||||
/// assert_eq!(1_248, ac.heap_bytes());
|
||||
@ -1037,6 +1063,24 @@ impl<S: StateID> Imp<S> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the prefilter object, if one exists, for the underlying
|
||||
/// automaton.
|
||||
fn prefilter(&self) -> Option<&dyn Prefilter> {
|
||||
match *self {
|
||||
Imp::NFA(ref nfa) => nfa.prefilter(),
|
||||
Imp::DFA(ref dfa) => dfa.prefilter(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if and only if we should attempt to use a prefilter.
|
||||
fn use_prefilter(&self) -> bool {
|
||||
let p = match self.prefilter() {
|
||||
None => return false,
|
||||
Some(p) => p,
|
||||
};
|
||||
!p.looks_for_non_start_of_match()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn overlapping_find_at(
|
||||
&self,
|
||||
@ -1113,7 +1157,7 @@ impl<S: StateID> Imp<S> {
|
||||
///
|
||||
/// The lifetime `'b` refers to the lifetime of the haystack being searched.
|
||||
#[derive(Debug)]
|
||||
pub struct FindIter<'a, 'b, S: 'a + StateID> {
|
||||
pub struct FindIter<'a, 'b, S: StateID> {
|
||||
fsm: &'a Imp<S>,
|
||||
prestate: PrefilterState,
|
||||
haystack: &'b [u8],
|
||||
@ -1170,7 +1214,7 @@ impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> {
|
||||
///
|
||||
/// The lifetime `'b` refers to the lifetime of the haystack being searched.
|
||||
#[derive(Debug)]
|
||||
pub struct FindOverlappingIter<'a, 'b, S: 'a + StateID> {
|
||||
pub struct FindOverlappingIter<'a, 'b, S: StateID> {
|
||||
fsm: &'a Imp<S>,
|
||||
prestate: PrefilterState,
|
||||
haystack: &'b [u8],
|
||||
@ -1241,7 +1285,7 @@ impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> {
|
||||
///
|
||||
/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
|
||||
#[derive(Debug)]
|
||||
pub struct StreamFindIter<'a, R, S: 'a + StateID> {
|
||||
pub struct StreamFindIter<'a, R, S: StateID> {
|
||||
it: StreamChunkIter<'a, R, S>,
|
||||
}
|
||||
|
||||
@ -1276,7 +1320,7 @@ impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> {
|
||||
/// N.B. This does not actually implement Iterator because we need to borrow
|
||||
/// from the underlying reader. But conceptually, it's still an iterator.
|
||||
#[derive(Debug)]
|
||||
struct StreamChunkIter<'a, R, S: 'a + StateID> {
|
||||
struct StreamChunkIter<'a, R, S: StateID> {
|
||||
/// The AC automaton.
|
||||
fsm: &'a Imp<S>,
|
||||
/// State associated with this automaton's prefilter. It is a heuristic
|
||||
@ -1325,7 +1369,11 @@ impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
|
||||
"stream searching is only supported for Standard match semantics"
|
||||
);
|
||||
|
||||
let prestate = PrefilterState::new(ac.max_pattern_len());
|
||||
let prestate = if ac.imp.use_prefilter() {
|
||||
PrefilterState::new(ac.max_pattern_len())
|
||||
} else {
|
||||
PrefilterState::disabled()
|
||||
};
|
||||
let buf = Buffer::new(ac.imp.max_pattern_len());
|
||||
let state_id = ac.imp.start_state();
|
||||
StreamChunkIter {
|
||||
@ -1621,7 +1669,7 @@ impl AhoCorasickBuilder {
|
||||
// N.B. Using byte classes can actually be faster by improving
|
||||
// locality, but this only really applies for multi-megabyte
|
||||
// automata (i.e., automata that don't fit in your CPU's cache).
|
||||
self.dfa(true).byte_classes(false);
|
||||
self.dfa(true);
|
||||
} else if patterns.len() <= 5000 {
|
||||
self.dfa(true);
|
||||
}
|
||||
@ -1809,7 +1857,7 @@ impl AhoCorasickBuilder {
|
||||
/// finite automaton (NFA) is used instead.
|
||||
///
|
||||
/// The main benefit to a DFA is that it can execute searches more quickly
|
||||
/// than a DFA (perhaps 2-4 times as fast). The main drawback is that the
|
||||
/// than an NFA (perhaps 2-4 times as fast). The main drawback is that the
|
||||
/// DFA uses more space and can take much longer to build.
|
||||
///
|
||||
/// Enabling this option does not change the time complexity for
|
||||
@ -1868,6 +1916,10 @@ impl AhoCorasickBuilder {
|
||||
/// overall performance.
|
||||
///
|
||||
/// This option is enabled by default.
|
||||
#[deprecated(
|
||||
since = "0.7.16",
|
||||
note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57"
|
||||
)]
|
||||
pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
|
||||
self.dfa_builder.byte_classes(yes);
|
||||
self
|
||||
@ -1896,6 +1948,10 @@ impl AhoCorasickBuilder {
|
||||
/// non-premultiplied form only requires 8 bits.
|
||||
///
|
||||
/// This option is enabled by default.
|
||||
#[deprecated(
|
||||
since = "0.7.16",
|
||||
note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57"
|
||||
)]
|
||||
pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
|
||||
self.dfa_builder.premultiply(yes);
|
||||
self
|
||||
|
354
third_party/rust/aho-corasick/src/automaton.rs
vendored
354
third_party/rust/aho-corasick/src/automaton.rs
vendored
@ -1,7 +1,7 @@
|
||||
use ahocorasick::MatchKind;
|
||||
use prefilter::{self, Candidate, Prefilter, PrefilterState};
|
||||
use state_id::{dead_id, fail_id, StateID};
|
||||
use Match;
|
||||
use crate::ahocorasick::MatchKind;
|
||||
use crate::prefilter::{self, Candidate, Prefilter, PrefilterState};
|
||||
use crate::state_id::{dead_id, fail_id, StateID};
|
||||
use crate::Match;
|
||||
|
||||
// NOTE: This trait essentially started as a copy of the same trait from
|
||||
// regex-automata, with some wording changed since we use this trait for
|
||||
@ -28,6 +28,42 @@ use Match;
|
||||
// for tracking the state ID and one that doesn't. We should ideally do the
|
||||
// same for standard searching, but my sanity stopped me.
|
||||
|
||||
// SAFETY RATIONALE: Previously, the code below went to some length to remove
|
||||
// all bounds checks. This generally produced tighter assembly and led to
|
||||
// 20-50% improvements in micro-benchmarks on corpora made up of random
|
||||
// characters. This somewhat makes sense, since the branch predictor is going
|
||||
// to be at its worst on random text.
|
||||
//
|
||||
// However, using the aho-corasick-debug tool and manually benchmarking
|
||||
// different inputs, the code *with* bounds checks actually wound up being
|
||||
// slightly faster:
|
||||
//
|
||||
// $ cat input
|
||||
// Sherlock Holmes
|
||||
// John Watson
|
||||
// Professor Moriarty
|
||||
// Irene Adler
|
||||
// Mary Watson
|
||||
//
|
||||
// $ aho-corasick-debug-safe \
|
||||
// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
|
||||
// pattern read time: 32.824µs
|
||||
// automaton build time: 444.687µs
|
||||
// automaton heap usage: 72392 bytes
|
||||
// match count: 639
|
||||
// count time: 1.809961702s
|
||||
//
|
||||
// $ aho-corasick-debug-master \
|
||||
// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
|
||||
// pattern read time: 31.425µs
|
||||
// automaton build time: 317.434µs
|
||||
// automaton heap usage: 72392 bytes
|
||||
// match count: 639
|
||||
// count time: 2.059157705s
|
||||
//
|
||||
// I was able to reproduce this result on two different machines (an i5 and
|
||||
// an i7). Therefore, we go the route of safe code for now.
|
||||
|
||||
/// A trait describing the interface of an Aho-Corasick finite state machine.
|
||||
///
|
||||
/// Every automaton has exactly one fail state, one dead state and exactly one
|
||||
@ -39,8 +75,8 @@ use Match;
|
||||
/// only when at least one match has been observed.
|
||||
///
|
||||
/// Every automaton also has one or more match states, such that
|
||||
/// `Automaton::is_match_state_unchecked(id)` returns `true` if and only if
|
||||
/// `id` corresponds to a match state.
|
||||
/// `Automaton::is_match_state(id)` returns `true` if and only if `id`
|
||||
/// corresponds to a match state.
|
||||
pub trait Automaton {
|
||||
/// The representation used for state identifiers in this automaton.
|
||||
///
|
||||
@ -123,20 +159,12 @@ pub trait Automaton {
|
||||
/// must ensure that the given identifier corresponds to a valid automaton
|
||||
/// state. Implementors must, in turn, ensure that this routine is safe for
|
||||
/// all valid state identifiers and for all possible `u8` values.
|
||||
unsafe fn next_state_unchecked(
|
||||
&self,
|
||||
current: Self::ID,
|
||||
input: u8,
|
||||
) -> Self::ID;
|
||||
fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
|
||||
|
||||
/// Like next_state_unchecked, but debug_asserts that the underlying
|
||||
/// Like next_state, but debug_asserts that the underlying
|
||||
/// implementation never returns a `fail_id()` for the next state.
|
||||
unsafe fn next_state_unchecked_no_fail(
|
||||
&self,
|
||||
current: Self::ID,
|
||||
input: u8,
|
||||
) -> Self::ID {
|
||||
let next = self.next_state_unchecked(current, input);
|
||||
fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID {
|
||||
let next = self.next_state(current, input);
|
||||
// We should never see a transition to the failure state.
|
||||
debug_assert!(
|
||||
next != fail_id(),
|
||||
@ -174,7 +202,7 @@ pub trait Automaton {
|
||||
}
|
||||
}
|
||||
|
||||
// It's important for this to always be inlined. Namely, it's only caller
|
||||
// It's important for this to always be inlined. Namely, its only caller
|
||||
// is standard_find_at, and the inlining should remove the case analysis
|
||||
// for prefilter scanning when there is no prefilter available.
|
||||
#[inline(always)]
|
||||
@ -183,66 +211,49 @@ pub trait Automaton {
|
||||
prestate: &mut PrefilterState,
|
||||
prefilter: Option<&dyn Prefilter>,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
mut at: usize,
|
||||
state_id: &mut Self::ID,
|
||||
) -> Option<Match> {
|
||||
// This is necessary for guaranteeing a safe API, since we use the
|
||||
// state ID below in a function that exhibits UB if called with an
|
||||
// invalid state ID.
|
||||
assert!(
|
||||
self.is_valid(*state_id),
|
||||
"{} is not a valid state ID",
|
||||
state_id.to_usize()
|
||||
);
|
||||
unsafe {
|
||||
let start = haystack.as_ptr();
|
||||
let end = haystack[haystack.len()..].as_ptr();
|
||||
let mut ptr = haystack[at..].as_ptr();
|
||||
while ptr < end {
|
||||
if let Some(pre) = prefilter {
|
||||
let at = ptr as usize - start as usize;
|
||||
if prestate.is_effective(at)
|
||||
&& *state_id == self.start_state()
|
||||
{
|
||||
let c = prefilter::next(prestate, pre, haystack, at)
|
||||
.into_option();
|
||||
match c {
|
||||
None => return None,
|
||||
Some(i) => {
|
||||
ptr = start.offset(i as isize);
|
||||
}
|
||||
while at < haystack.len() {
|
||||
if let Some(pre) = prefilter {
|
||||
if prestate.is_effective(at) && *state_id == self.start_state()
|
||||
{
|
||||
let c = prefilter::next(prestate, pre, haystack, at)
|
||||
.into_option();
|
||||
match c {
|
||||
None => return None,
|
||||
Some(i) => {
|
||||
at = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
// SAFETY: next_state is safe for all possible u8 values,
|
||||
// so the only thing we're concerned about is the validity
|
||||
// of `state_id`. `state_id` either comes from the caller
|
||||
// (in which case, we assert above that it is valid), or it
|
||||
// comes from the return value of next_state, which is also
|
||||
// guaranteed to be valid.
|
||||
*state_id = self.next_state_unchecked_no_fail(*state_id, *ptr);
|
||||
ptr = ptr.offset(1);
|
||||
// This routine always quits immediately after seeing a
|
||||
// match, and since dead states can only come after seeing
|
||||
// a match, seeing a dead state here is impossible. (Unless
|
||||
// we have an anchored automaton, in which case, dead states
|
||||
// are used to stop a search.)
|
||||
debug_assert!(
|
||||
*state_id != dead_id() || self.anchored(),
|
||||
"standard find should never see a dead state"
|
||||
);
|
||||
|
||||
if self.is_match_or_dead_state(*state_id) {
|
||||
return if *state_id == dead_id() {
|
||||
None
|
||||
} else {
|
||||
let end = ptr as usize - start as usize;
|
||||
self.get_match(*state_id, 0, end)
|
||||
};
|
||||
}
|
||||
}
|
||||
None
|
||||
// CORRECTNESS: next_state is correct for all possible u8 values,
|
||||
// so the only thing we're concerned about is the validity of
|
||||
// `state_id`. `state_id` either comes from the caller (in which
|
||||
// case, we assume it is correct), or it comes from the return
|
||||
// value of next_state, which is guaranteed to be correct.
|
||||
*state_id = self.next_state_no_fail(*state_id, haystack[at]);
|
||||
at += 1;
|
||||
// This routine always quits immediately after seeing a
|
||||
// match, and since dead states can only come after seeing
|
||||
// a match, seeing a dead state here is impossible. (Unless
|
||||
// we have an anchored automaton, in which case, dead states
|
||||
// are used to stop a search.)
|
||||
debug_assert!(
|
||||
*state_id != dead_id() || self.anchored(),
|
||||
"standard find should never see a dead state"
|
||||
);
|
||||
|
||||
if self.is_match_or_dead_state(*state_id) {
|
||||
return if *state_id == dead_id() {
|
||||
None
|
||||
} else {
|
||||
self.get_match(*state_id, 0, at)
|
||||
};
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Execute a search using leftmost (either first or longest) match
|
||||
@ -276,7 +287,7 @@ pub trait Automaton {
|
||||
}
|
||||
}
|
||||
|
||||
// It's important for this to always be inlined. Namely, it's only caller
|
||||
// It's important for this to always be inlined. Namely, its only caller
|
||||
// is leftmost_find_at, and the inlining should remove the case analysis
|
||||
// for prefilter scanning when there is no prefilter available.
|
||||
#[inline(always)]
|
||||
@ -285,76 +296,58 @@ pub trait Automaton {
|
||||
prestate: &mut PrefilterState,
|
||||
prefilter: Option<&dyn Prefilter>,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
mut at: usize,
|
||||
state_id: &mut Self::ID,
|
||||
) -> Option<Match> {
|
||||
debug_assert!(self.match_kind().is_leftmost());
|
||||
// This is necessary for guaranteeing a safe API, since we use the
|
||||
// state ID below in a function that exhibits UB if called with an
|
||||
// invalid state ID.
|
||||
assert!(
|
||||
self.is_valid(*state_id),
|
||||
"{} is not a valid state ID",
|
||||
state_id.to_usize()
|
||||
);
|
||||
if self.anchored() && at > 0 && *state_id == self.start_state() {
|
||||
return None;
|
||||
}
|
||||
unsafe {
|
||||
let start = haystack.as_ptr();
|
||||
let end = haystack[haystack.len()..].as_ptr();
|
||||
let mut ptr = haystack[at..].as_ptr();
|
||||
|
||||
let mut last_match = self.get_match(*state_id, 0, at);
|
||||
while ptr < end {
|
||||
if let Some(pre) = prefilter {
|
||||
let at = ptr as usize - start as usize;
|
||||
if prestate.is_effective(at)
|
||||
&& *state_id == self.start_state()
|
||||
{
|
||||
let c = prefilter::next(prestate, pre, haystack, at)
|
||||
.into_option();
|
||||
match c {
|
||||
None => return None,
|
||||
Some(i) => {
|
||||
ptr = start.offset(i as isize);
|
||||
}
|
||||
let mut last_match = self.get_match(*state_id, 0, at);
|
||||
while at < haystack.len() {
|
||||
if let Some(pre) = prefilter {
|
||||
if prestate.is_effective(at) && *state_id == self.start_state()
|
||||
{
|
||||
let c = prefilter::next(prestate, pre, haystack, at)
|
||||
.into_option();
|
||||
match c {
|
||||
None => return None,
|
||||
Some(i) => {
|
||||
at = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
// SAFETY: next_state is safe for all possible u8 values,
|
||||
// so the only thing we're concerned about is the validity
|
||||
// of `state_id`. `state_id` either comes from the caller
|
||||
// (in which case, we assert above that it is valid), or it
|
||||
// comes from the return value of next_state, which is also
|
||||
// guaranteed to be valid.
|
||||
*state_id = self.next_state_unchecked_no_fail(*state_id, *ptr);
|
||||
ptr = ptr.offset(1);
|
||||
if self.is_match_or_dead_state(*state_id) {
|
||||
if *state_id == dead_id() {
|
||||
// The only way to enter into a dead state is if a
|
||||
// match has been found, so we assert as much. This
|
||||
// is different from normal automata, where you might
|
||||
// enter a dead state if you know a subsequent match
|
||||
// will never be found (regardless of whether a match
|
||||
// has already been found). For Aho-Corasick, it is
|
||||
// built so that we can match at any position, so the
|
||||
// possibility of a match always exists.
|
||||
//
|
||||
// (Unless we have an anchored automaton, in which
|
||||
// case, dead states are used to stop a search.)
|
||||
debug_assert!(
|
||||
last_match.is_some() || self.anchored(),
|
||||
"failure state should only be seen after match"
|
||||
);
|
||||
return last_match;
|
||||
}
|
||||
let end = ptr as usize - start as usize;
|
||||
last_match = self.get_match(*state_id, 0, end);
|
||||
}
|
||||
}
|
||||
last_match
|
||||
// CORRECTNESS: next_state is correct for all possible u8 values,
|
||||
// so the only thing we're concerned about is the validity of
|
||||
// `state_id`. `state_id` either comes from the caller (in which
|
||||
// case, we assume it is correct), or it comes from the return
|
||||
// value of next_state, which is guaranteed to be correct.
|
||||
*state_id = self.next_state_no_fail(*state_id, haystack[at]);
|
||||
at += 1;
|
||||
if self.is_match_or_dead_state(*state_id) {
|
||||
if *state_id == dead_id() {
|
||||
// The only way to enter into a dead state is if a match
|
||||
// has been found, so we assert as much. This is different
|
||||
// from normal automata, where you might enter a dead state
|
||||
// if you know a subsequent match will never be found
|
||||
// (regardless of whether a match has already been found).
|
||||
// For Aho-Corasick, it is built so that we can match at
|
||||
// any position, so the possibility of a match always
|
||||
// exists.
|
||||
//
|
||||
// (Unless we have an anchored automaton, in which case,
|
||||
// dead states are used to stop a search.)
|
||||
debug_assert!(
|
||||
last_match.is_some() || self.anchored(),
|
||||
"failure state should only be seen after match"
|
||||
);
|
||||
return last_match;
|
||||
}
|
||||
last_match = self.get_match(*state_id, 0, at);
|
||||
}
|
||||
}
|
||||
last_match
|
||||
}
|
||||
|
||||
/// This is like leftmost_find_at, but does not need to track a caller
|
||||
@ -393,7 +386,7 @@ pub trait Automaton {
|
||||
}
|
||||
}
|
||||
|
||||
// It's important for this to always be inlined. Namely, it's only caller
|
||||
// It's important for this to always be inlined. Namely, its only caller
|
||||
// is leftmost_find_at_no_state, and the inlining should remove the case
|
||||
// analysis for prefilter scanning when there is no prefilter available.
|
||||
#[inline(always)]
|
||||
@ -402,7 +395,7 @@ pub trait Automaton {
|
||||
prestate: &mut PrefilterState,
|
||||
prefilter: Option<&dyn Prefilter>,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
mut at: usize,
|
||||
) -> Option<Match> {
|
||||
debug_assert!(self.match_kind().is_leftmost());
|
||||
if self.anchored() && at > 0 {
|
||||
@ -422,63 +415,54 @@ pub trait Automaton {
|
||||
};
|
||||
}
|
||||
}
|
||||
let mut state_id = self.start_state();
|
||||
unsafe {
|
||||
let start = haystack.as_ptr();
|
||||
let end = haystack[haystack.len()..].as_ptr();
|
||||
let mut ptr = haystack[at..].as_ptr();
|
||||
|
||||
let mut last_match = self.get_match(state_id, 0, at);
|
||||
while ptr < end {
|
||||
if let Some(pre) = prefilter {
|
||||
let at = ptr as usize - start as usize;
|
||||
if prestate.is_effective(at)
|
||||
&& state_id == self.start_state()
|
||||
{
|
||||
match prefilter::next(prestate, pre, haystack, at) {
|
||||
Candidate::None => return None,
|
||||
// Since we aren't tracking a state ID, we can
|
||||
// quit early once we know we have a match.
|
||||
Candidate::Match(m) => return Some(m),
|
||||
Candidate::PossibleStartOfMatch(i) => {
|
||||
ptr = start.offset(i as isize);
|
||||
}
|
||||
let mut state_id = self.start_state();
|
||||
let mut last_match = self.get_match(state_id, 0, at);
|
||||
while at < haystack.len() {
|
||||
if let Some(pre) = prefilter {
|
||||
if prestate.is_effective(at) && state_id == self.start_state()
|
||||
{
|
||||
match prefilter::next(prestate, pre, haystack, at) {
|
||||
Candidate::None => return None,
|
||||
// Since we aren't tracking a state ID, we can
|
||||
// quit early once we know we have a match.
|
||||
Candidate::Match(m) => return Some(m),
|
||||
Candidate::PossibleStartOfMatch(i) => {
|
||||
at = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
// SAFETY: next_state is safe for all possible u8 values,
|
||||
// so the only thing we're concerned about is the validity
|
||||
// of `state_id`. `state_id` either comes from the caller
|
||||
// (in which case, we assert above that it is valid), or it
|
||||
// comes from the return value of next_state, which is also
|
||||
// guaranteed to be valid.
|
||||
state_id = self.next_state_unchecked_no_fail(state_id, *ptr);
|
||||
ptr = ptr.offset(1);
|
||||
if self.is_match_or_dead_state(state_id) {
|
||||
if state_id == dead_id() {
|
||||
// The only way to enter into a dead state is if a
|
||||
// match has been found, so we assert as much. This
|
||||
// is different from normal automata, where you might
|
||||
// enter a dead state if you know a subsequent match
|
||||
// will never be found (regardless of whether a match
|
||||
// has already been found). For Aho-Corasick, it is
|
||||
// built so that we can match at any position, so the
|
||||
// possibility of a match always exists.
|
||||
//
|
||||
// (Unless we have an anchored automaton, in which
|
||||
// case, dead states are used to stop a search.)
|
||||
debug_assert!(
|
||||
last_match.is_some() || self.anchored(),
|
||||
"failure state should only be seen after match"
|
||||
);
|
||||
return last_match;
|
||||
}
|
||||
let end = ptr as usize - start as usize;
|
||||
last_match = self.get_match(state_id, 0, end);
|
||||
}
|
||||
}
|
||||
last_match
|
||||
// CORRECTNESS: next_state is correct for all possible u8 values,
|
||||
// so the only thing we're concerned about is the validity of
|
||||
// `state_id`. `state_id` either comes from the caller (in which
|
||||
// case, we assume it is correct), or it comes from the return
|
||||
// value of next_state, which is guaranteed to be correct.
|
||||
state_id = self.next_state_no_fail(state_id, haystack[at]);
|
||||
at += 1;
|
||||
if self.is_match_or_dead_state(state_id) {
|
||||
if state_id == dead_id() {
|
||||
// The only way to enter into a dead state is if a
|
||||
// match has been found, so we assert as much. This
|
||||
// is different from normal automata, where you might
|
||||
// enter a dead state if you know a subsequent match
|
||||
// will never be found (regardless of whether a match
|
||||
// has already been found). For Aho-Corasick, it is
|
||||
// built so that we can match at any position, so the
|
||||
// possibility of a match always exists.
|
||||
//
|
||||
// (Unless we have an anchored automaton, in which
|
||||
// case, dead states are used to stop a search.)
|
||||
debug_assert!(
|
||||
last_match.is_some() || self.anchored(),
|
||||
"failure state should only be seen after match"
|
||||
);
|
||||
return last_match;
|
||||
}
|
||||
last_match = self.get_match(state_id, 0, at);
|
||||
}
|
||||
}
|
||||
last_match
|
||||
}
|
||||
|
||||
/// Execute an overlapping search.
|
||||
|
6
third_party/rust/aho-corasick/src/buffer.rs
vendored
6
third_party/rust/aho-corasick/src/buffer.rs
vendored
@ -50,7 +50,9 @@ impl Buffer {
|
||||
// reasons, so we set a lower bound of `8 * min`.
|
||||
//
|
||||
// TODO: It would be good to find a way to test the streaming
|
||||
// implementation with the minimal buffer size.
|
||||
// implementation with the minimal buffer size. For now, we just
|
||||
// uncomment out the next line and comment out the subsequent line.
|
||||
// let capacity = 1 + min;
|
||||
let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
|
||||
Buffer { buf: vec![0; capacity], min, end: 0 }
|
||||
}
|
||||
@ -117,6 +119,8 @@ impl Buffer {
|
||||
// SAFETY: A buffer contains Copy data, so there's no problem
|
||||
// moving it around. Safety also depends on our indices being in
|
||||
// bounds, which they always should be, given the assert above.
|
||||
//
|
||||
// TODO: Switch to [T]::copy_within once our MSRV is high enough.
|
||||
ptr::copy(
|
||||
self.buf[roll_start..].as_ptr(),
|
||||
self.buf.as_mut_ptr(),
|
||||
|
6
third_party/rust/aho-corasick/src/classes.rs
vendored
6
third_party/rust/aho-corasick/src/classes.rs
vendored
@ -36,7 +36,7 @@ impl ByteClasses {
|
||||
pub fn get(&self, byte: u8) -> u8 {
|
||||
// SAFETY: This is safe because all dense transitions have
|
||||
// exactly 256 elements, so all u8 values are valid indices.
|
||||
unsafe { *self.0.get_unchecked(byte as usize) }
|
||||
self.0[byte as usize]
|
||||
}
|
||||
|
||||
/// Return the total number of elements in the alphabet represented by
|
||||
@ -64,7 +64,7 @@ impl ByteClasses {
|
||||
/// hasn't been converted to equivalence classes yet. Picking an arbitrary
|
||||
/// byte from each equivalence class then permits a full exploration of
|
||||
/// the NFA instead of using every possible byte value.
|
||||
pub fn representatives(&self) -> ByteClassRepresentatives {
|
||||
pub fn representatives(&self) -> ByteClassRepresentatives<'_> {
|
||||
ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
|
||||
}
|
||||
|
||||
@ -85,7 +85,7 @@ impl ByteClasses {
|
||||
}
|
||||
|
||||
impl fmt::Debug for ByteClasses {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if self.is_singleton() {
|
||||
write!(f, "ByteClasses({{singletons}})")
|
||||
} else {
|
||||
|
40
third_party/rust/aho-corasick/src/dfa.rs
vendored
40
third_party/rust/aho-corasick/src/dfa.rs
vendored
@ -1,13 +1,13 @@
|
||||
use std::mem::size_of;
|
||||
|
||||
use ahocorasick::MatchKind;
|
||||
use automaton::Automaton;
|
||||
use classes::ByteClasses;
|
||||
use error::Result;
|
||||
use nfa::{PatternID, PatternLength, NFA};
|
||||
use prefilter::{Prefilter, PrefilterObj, PrefilterState};
|
||||
use state_id::{dead_id, fail_id, premultiply_overflow_error, StateID};
|
||||
use Match;
|
||||
use crate::ahocorasick::MatchKind;
|
||||
use crate::automaton::Automaton;
|
||||
use crate::classes::ByteClasses;
|
||||
use crate::error::Result;
|
||||
use crate::nfa::{PatternID, PatternLength, NFA};
|
||||
use crate::prefilter::{Prefilter, PrefilterObj, PrefilterState};
|
||||
use crate::state_id::{dead_id, fail_id, premultiply_overflow_error, StateID};
|
||||
use crate::Match;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum DFA<S> {
|
||||
@ -43,6 +43,10 @@ impl<S: StateID> DFA<S> {
|
||||
self.repr().pattern_count
|
||||
}
|
||||
|
||||
pub fn prefilter(&self) -> Option<&dyn Prefilter> {
|
||||
self.repr().prefilter.as_ref().map(|p| p.as_ref())
|
||||
}
|
||||
|
||||
pub fn start_state(&self) -> S {
|
||||
self.repr().start_id
|
||||
}
|
||||
@ -189,9 +193,9 @@ impl<S: StateID> Automaton for Standard<S> {
|
||||
self.repr().match_count(id)
|
||||
}
|
||||
|
||||
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
|
||||
fn next_state(&self, current: S, input: u8) -> S {
|
||||
let o = current.to_usize() * 256 + input as usize;
|
||||
*self.repr().trans.get_unchecked(o)
|
||||
self.repr().trans[o]
|
||||
}
|
||||
}
|
||||
|
||||
@ -248,11 +252,11 @@ impl<S: StateID> Automaton for ByteClass<S> {
|
||||
self.repr().match_count(id)
|
||||
}
|
||||
|
||||
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
|
||||
fn next_state(&self, current: S, input: u8) -> S {
|
||||
let alphabet_len = self.repr().byte_classes.alphabet_len();
|
||||
let input = self.repr().byte_classes.get(input);
|
||||
let o = current.to_usize() * alphabet_len + input as usize;
|
||||
*self.repr().trans.get_unchecked(o)
|
||||
self.repr().trans[o]
|
||||
}
|
||||
}
|
||||
|
||||
@ -317,9 +321,9 @@ impl<S: StateID> Automaton for Premultiplied<S> {
|
||||
self.repr().matches[o].len()
|
||||
}
|
||||
|
||||
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
|
||||
fn next_state(&self, current: S, input: u8) -> S {
|
||||
let o = current.to_usize() + input as usize;
|
||||
*self.repr().trans.get_unchecked(o)
|
||||
self.repr().trans[o]
|
||||
}
|
||||
}
|
||||
|
||||
@ -384,10 +388,10 @@ impl<S: StateID> Automaton for PremultipliedByteClass<S> {
|
||||
self.repr().matches[o].len()
|
||||
}
|
||||
|
||||
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
|
||||
fn next_state(&self, current: S, input: u8) -> S {
|
||||
let input = self.repr().byte_classes.get(input);
|
||||
let o = current.to_usize() + input as usize;
|
||||
*self.repr().trans.get_unchecked(o)
|
||||
self.repr().trans[o]
|
||||
}
|
||||
}
|
||||
|
||||
@ -637,8 +641,8 @@ impl Builder {
|
||||
heap_bytes: 0,
|
||||
prefilter: nfa.prefilter_obj().map(|p| p.clone()),
|
||||
byte_classes: byte_classes.clone(),
|
||||
trans: trans,
|
||||
matches: matches,
|
||||
trans,
|
||||
matches,
|
||||
};
|
||||
for id in (0..nfa.state_len()).map(S::from_usize) {
|
||||
repr.matches[id.to_usize()].extend_from_slice(nfa.matches(id));
|
||||
|
2
third_party/rust/aho-corasick/src/error.rs
vendored
2
third_party/rust/aho-corasick/src/error.rs
vendored
@ -68,7 +68,7 @@ impl error::Error for Error {
|
||||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self.kind {
|
||||
ErrorKind::StateIDOverflow { max } => write!(
|
||||
f,
|
||||
|
32
third_party/rust/aho-corasick/src/lib.rs
vendored
32
third_party/rust/aho-corasick/src/lib.rs
vendored
@ -168,13 +168,14 @@ naive solutions, it is generally slower than more specialized algorithms that
|
||||
are accelerated using vector instructions such as SIMD.
|
||||
|
||||
For that reason, this library will internally use a "prefilter" to attempt
|
||||
to accelerate searches when possible. Currently, this library has fairly
|
||||
limited implementation that only applies when there are 3 or fewer unique
|
||||
starting bytes among all patterns in an automaton.
|
||||
to accelerate searches when possible. Currently, this library has several
|
||||
different algorithms it might use depending on the patterns provided. Once the
|
||||
number of patterns gets too big, prefilters are no longer used.
|
||||
|
||||
While a prefilter is generally good to have on by default since it works well
|
||||
in the common case, it can lead to less predictable or even sub-optimal
|
||||
performance in some cases. For that reason, prefilters can be disabled via
|
||||
While a prefilter is generally good to have on by default since it works
|
||||
well in the common case, it can lead to less predictable or even sub-optimal
|
||||
performance in some cases. For that reason, prefilters can be explicitly
|
||||
disabled via
|
||||
[`AhoCorasickBuilder::prefilter`](struct.AhoCorasickBuilder.html#method.prefilter).
|
||||
*/
|
||||
|
||||
@ -185,20 +186,19 @@ performance in some cases. For that reason, prefilters can be disabled via
|
||||
#[cfg(not(feature = "std"))]
|
||||
compile_error!("`std` feature is currently required to build this crate");
|
||||
|
||||
extern crate memchr;
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
extern crate doc_comment;
|
||||
// #[cfg(doctest)]
|
||||
// #[macro_use]
|
||||
// extern crate doc_comment;
|
||||
|
||||
#[cfg(test)]
|
||||
doctest!("../README.md");
|
||||
// #[cfg(doctest)]
|
||||
// doctest!("../README.md");
|
||||
|
||||
pub use ahocorasick::{
|
||||
pub use crate::ahocorasick::{
|
||||
AhoCorasick, AhoCorasickBuilder, FindIter, FindOverlappingIter, MatchKind,
|
||||
StreamFindIter,
|
||||
};
|
||||
pub use error::{Error, ErrorKind};
|
||||
pub use state_id::StateID;
|
||||
pub use crate::error::{Error, ErrorKind};
|
||||
pub use crate::state_id::StateID;
|
||||
|
||||
mod ahocorasick;
|
||||
mod automaton;
|
||||
@ -292,6 +292,6 @@ impl Match {
|
||||
|
||||
#[inline]
|
||||
fn from_span(id: usize, start: usize, end: usize) -> Match {
|
||||
Match { pattern: id, len: end - start, end: end }
|
||||
Match { pattern: id, len: end - start, end }
|
||||
}
|
||||
}
|
||||
|
71
third_party/rust/aho-corasick/src/nfa.rs
vendored
71
third_party/rust/aho-corasick/src/nfa.rs
vendored
@ -4,13 +4,13 @@ use std::fmt;
|
||||
use std::mem::size_of;
|
||||
use std::ops::{Index, IndexMut};
|
||||
|
||||
use ahocorasick::MatchKind;
|
||||
use automaton::Automaton;
|
||||
use classes::{ByteClassBuilder, ByteClasses};
|
||||
use error::Result;
|
||||
use prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
|
||||
use state_id::{dead_id, fail_id, usize_to_state_id, StateID};
|
||||
use Match;
|
||||
use crate::ahocorasick::MatchKind;
|
||||
use crate::automaton::Automaton;
|
||||
use crate::classes::{ByteClassBuilder, ByteClasses};
|
||||
use crate::error::Result;
|
||||
use crate::prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
|
||||
use crate::state_id::{dead_id, fail_id, usize_to_state_id, StateID};
|
||||
use crate::Match;
|
||||
|
||||
/// The identifier for a pattern, which is simply the position of the pattern
|
||||
/// in the sequence of patterns given by the caller.
|
||||
@ -172,7 +172,7 @@ impl<S: StateID> NFA<S> {
|
||||
self.state_mut(id)
|
||||
}
|
||||
|
||||
fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<S> {
|
||||
fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S> {
|
||||
IterTransitionsMut::new(self, id)
|
||||
}
|
||||
|
||||
@ -194,7 +194,7 @@ impl<S: StateID> NFA<S> {
|
||||
trans,
|
||||
// Anchored automatons do not have any failure transitions.
|
||||
fail: if self.anchored { dead_id() } else { self.start_id },
|
||||
depth: depth,
|
||||
depth,
|
||||
matches: vec![],
|
||||
});
|
||||
Ok(id)
|
||||
@ -207,7 +207,7 @@ impl<S: StateID> NFA<S> {
|
||||
trans,
|
||||
// Anchored automatons do not have any failure transitions.
|
||||
fail: if self.anchored { dead_id() } else { self.start_id },
|
||||
depth: depth,
|
||||
depth,
|
||||
matches: vec![],
|
||||
});
|
||||
Ok(id)
|
||||
@ -262,14 +262,14 @@ impl<S: StateID> Automaton for NFA<S> {
|
||||
self.states[id.to_usize()].matches.len()
|
||||
}
|
||||
|
||||
unsafe fn next_state_unchecked(&self, mut current: S, input: u8) -> S {
|
||||
fn next_state(&self, mut current: S, input: u8) -> S {
|
||||
// This terminates since:
|
||||
//
|
||||
// 1. `State.fail` never points to fail_id().
|
||||
// 2. All `State.fail` values point to a state closer to `start`.
|
||||
// 3. The start state has no transitions to fail_id().
|
||||
loop {
|
||||
let state = self.states.get_unchecked(current.to_usize());
|
||||
let state = &self.states[current.to_usize()];
|
||||
let next = state.next_state(input);
|
||||
if next != fail_id() {
|
||||
return next;
|
||||
@ -335,9 +335,9 @@ impl<S: StateID> State<S> {
|
||||
|
||||
/// Represents the transitions for a single dense state.
|
||||
///
|
||||
/// The primary purpose here is to encapsulate unchecked index access. Namely,
|
||||
/// since a dense representation always contains 256 elements, all values of
|
||||
/// `u8` are valid indices.
|
||||
/// The primary purpose here is to encapsulate index access. Namely, since a
|
||||
/// dense representation always contains 256 elements, all values of `u8` are
|
||||
/// valid indices.
|
||||
#[derive(Clone, Debug)]
|
||||
struct Dense<S>(Vec<S>);
|
||||
|
||||
@ -362,7 +362,7 @@ impl<S> Index<u8> for Dense<S> {
|
||||
fn index(&self, i: u8) -> &S {
|
||||
// SAFETY: This is safe because all dense transitions have
|
||||
// exactly 256 elements, so all u8 values are valid indices.
|
||||
unsafe { self.0.get_unchecked(i as usize) }
|
||||
&self.0[i as usize]
|
||||
}
|
||||
}
|
||||
|
||||
@ -371,7 +371,7 @@ impl<S> IndexMut<u8> for Dense<S> {
|
||||
fn index_mut(&mut self, i: u8) -> &mut S {
|
||||
// SAFETY: This is safe because all dense transitions have
|
||||
// exactly 256 elements, so all u8 values are valid indices.
|
||||
unsafe { self.0.get_unchecked_mut(i as usize) }
|
||||
&mut self.0[i as usize]
|
||||
}
|
||||
}
|
||||
|
||||
@ -497,7 +497,7 @@ impl<S: StateID> Transitions<S> {
|
||||
/// is iterating over transitions, the caller can still mutate the NFA. This
|
||||
/// is useful when creating failure transitions.
|
||||
#[derive(Debug)]
|
||||
struct IterTransitionsMut<'a, S: StateID + 'a> {
|
||||
struct IterTransitionsMut<'a, S: StateID> {
|
||||
nfa: &'a mut NFA<S>,
|
||||
state_id: S,
|
||||
cur: usize,
|
||||
@ -619,7 +619,7 @@ struct Compiler<'a, S: StateID> {
|
||||
impl<'a, S: StateID> Compiler<'a, S> {
|
||||
fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> {
|
||||
Ok(Compiler {
|
||||
builder: builder,
|
||||
builder,
|
||||
prefilter: prefilter::Builder::new(builder.match_kind)
|
||||
.ascii_case_insensitive(builder.ascii_case_insensitive),
|
||||
nfa: NFA {
|
||||
@ -702,6 +702,10 @@ impl<'a, S: StateID> Compiler<'a, S> {
|
||||
// building a DFA. They would technically be useful for the
|
||||
// NFA, but it would require a second pass over the patterns.
|
||||
self.byte_classes.set_range(b, b);
|
||||
if self.builder.ascii_case_insensitive {
|
||||
let b = opposite_ascii_case(b);
|
||||
self.byte_classes.set_range(b, b);
|
||||
}
|
||||
|
||||
// If the transition from prev using the current byte already
|
||||
// exists, then just move through it. Otherwise, add a new
|
||||
@ -854,10 +858,17 @@ impl<'a, S: StateID> Compiler<'a, S> {
|
||||
while let Some(id) = queue.pop_front() {
|
||||
let mut it = self.nfa.iter_transitions_mut(id);
|
||||
while let Some((b, next)) = it.next() {
|
||||
if !seen.contains(next) {
|
||||
queue.push_back(next);
|
||||
seen.insert(next);
|
||||
if seen.contains(next) {
|
||||
// The only way to visit a duplicate state in a transition
|
||||
// list is when ASCII case insensitivity is enabled. In
|
||||
// this case, we want to skip it since it's redundant work.
|
||||
// But it would also end up duplicating matches, which
|
||||
// results in reporting duplicate matches in some cases.
|
||||
// See the 'acasei010' regression test.
|
||||
continue;
|
||||
}
|
||||
queue.push_back(next);
|
||||
seen.insert(next);
|
||||
|
||||
let mut fail = it.nfa().state(id).fail;
|
||||
while it.nfa().state(fail).next_state(b) == fail_id() {
|
||||
@ -1008,10 +1019,17 @@ impl<'a, S: StateID> Compiler<'a, S> {
|
||||
|
||||
// Queue up the next state.
|
||||
let next = item.next_queued_state(it.nfa(), next_id);
|
||||
if !seen.contains(next.id) {
|
||||
queue.push_back(next);
|
||||
seen.insert(next.id);
|
||||
if seen.contains(next.id) {
|
||||
// The only way to visit a duplicate state in a transition
|
||||
// list is when ASCII case insensitivity is enabled. In
|
||||
// this case, we want to skip it since it's redundant work.
|
||||
// But it would also end up duplicating matches, which
|
||||
// results in reporting duplicate matches in some cases.
|
||||
// See the 'acasei010' regression test.
|
||||
continue;
|
||||
}
|
||||
queue.push_back(next);
|
||||
seen.insert(next.id);
|
||||
|
||||
// Find the failure state for next. Same as standard.
|
||||
let mut fail = it.nfa().state(item.id).fail;
|
||||
@ -1256,9 +1274,10 @@ impl Iterator for AllBytesIter {
|
||||
}
|
||||
|
||||
impl<S: StateID> fmt::Debug for NFA<S> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
writeln!(f, "NFA(")?;
|
||||
writeln!(f, "match_kind: {:?}", self.match_kind)?;
|
||||
writeln!(f, "prefilter: {:?}", self.prefilter)?;
|
||||
writeln!(f, "{}", "-".repeat(79))?;
|
||||
for (id, s) in self.states.iter().enumerate() {
|
||||
let mut trans = vec![];
|
||||
|
12
third_party/rust/aho-corasick/src/packed/api.rs
vendored
12
third_party/rust/aho-corasick/src/packed/api.rs
vendored
@ -1,9 +1,9 @@
|
||||
use std::u16;
|
||||
|
||||
use packed::pattern::Patterns;
|
||||
use packed::rabinkarp::RabinKarp;
|
||||
use packed::teddy::{self, Teddy};
|
||||
use Match;
|
||||
use crate::packed::pattern::Patterns;
|
||||
use crate::packed::rabinkarp::RabinKarp;
|
||||
use crate::packed::teddy::{self, Teddy};
|
||||
use crate::Match;
|
||||
|
||||
/// This is a limit placed on the total number of patterns we're willing to try
|
||||
/// and match at once. As more sophisticated algorithms are added, this number
|
||||
@ -269,8 +269,8 @@ impl Builder {
|
||||
};
|
||||
Some(Searcher {
|
||||
config: self.config.clone(),
|
||||
patterns: patterns,
|
||||
rabinkarp: rabinkarp,
|
||||
patterns,
|
||||
rabinkarp,
|
||||
search_kind,
|
||||
minimum_len,
|
||||
})
|
||||
|
@ -105,7 +105,7 @@ common reasons:
|
||||
no searcher is built.
|
||||
*/
|
||||
|
||||
pub use packed::api::{Builder, Config, FindIter, MatchKind, Searcher};
|
||||
pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher};
|
||||
|
||||
mod api;
|
||||
mod pattern;
|
||||
|
@ -4,7 +4,7 @@ use std::mem;
|
||||
use std::u16;
|
||||
use std::usize;
|
||||
|
||||
use packed::api::MatchKind;
|
||||
use crate::packed::api::MatchKind;
|
||||
|
||||
/// The type used for representing a pattern identifier.
|
||||
///
|
||||
@ -155,7 +155,7 @@ impl Patterns {
|
||||
|
||||
/// Return the pattern with the given identifier. If such a pattern does
|
||||
/// not exist, then this panics.
|
||||
pub fn get(&self, id: PatternID) -> Pattern {
|
||||
pub fn get(&self, id: PatternID) -> Pattern<'_> {
|
||||
Pattern(&self.by_id[id as usize])
|
||||
}
|
||||
|
||||
@ -167,7 +167,7 @@ impl Patterns {
|
||||
/// Callers must ensure that a pattern with the given identifier exists
|
||||
/// before using this method.
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern {
|
||||
pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> {
|
||||
Pattern(self.by_id.get_unchecked(id as usize))
|
||||
}
|
||||
|
||||
@ -189,7 +189,7 @@ impl Patterns {
|
||||
/// the order provided by this iterator, then the result is guaranteed
|
||||
/// to satisfy the correct match semantics. (Either leftmost-first or
|
||||
/// leftmost-longest.)
|
||||
pub fn iter(&self) -> PatternIter {
|
||||
pub fn iter(&self) -> PatternIter<'_> {
|
||||
PatternIter { patterns: self, i: 0 }
|
||||
}
|
||||
}
|
||||
@ -226,7 +226,7 @@ impl<'p> Iterator for PatternIter<'p> {
|
||||
pub struct Pattern<'a>(&'a [u8]);
|
||||
|
||||
impl<'a> fmt::Debug for Pattern<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("Pattern")
|
||||
.field("lit", &String::from_utf8_lossy(&self.0))
|
||||
.finish()
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::mem;
|
||||
|
||||
use packed::pattern::{PatternID, Patterns};
|
||||
use Match;
|
||||
use crate::packed::pattern::{PatternID, Patterns};
|
||||
use crate::Match;
|
||||
|
||||
/// The type of the rolling hash used in the Rabin-Karp algorithm.
|
||||
type Hash = usize;
|
||||
|
@ -4,8 +4,8 @@ use std::cmp;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
|
||||
use packed::pattern::{PatternID, Patterns};
|
||||
use packed::teddy::Teddy;
|
||||
use crate::packed::pattern::{PatternID, Patterns};
|
||||
use crate::packed::teddy::Teddy;
|
||||
|
||||
/// A builder for constructing a Teddy matcher.
|
||||
///
|
||||
@ -73,7 +73,7 @@ impl Builder {
|
||||
}
|
||||
|
||||
fn build_imp(&self, patterns: &Patterns) -> Option<Teddy> {
|
||||
use packed::teddy::runtime;
|
||||
use crate::packed::teddy::runtime;
|
||||
|
||||
// Most of the logic here is just about selecting the optimal settings,
|
||||
// or perhaps even rejecting construction altogether. The choices
|
||||
@ -119,7 +119,7 @@ impl Builder {
|
||||
// safe to call functions marked with the `avx2` target feature.
|
||||
match (masks.len(), avx, fat) {
|
||||
(1, false, _) => Some(Teddy {
|
||||
buckets: buckets,
|
||||
buckets,
|
||||
max_pattern_id: patterns.max_pattern_id(),
|
||||
exec: runtime::Exec::TeddySlim1Mask128(
|
||||
runtime::TeddySlim1Mask128 {
|
||||
@ -128,7 +128,7 @@ impl Builder {
|
||||
),
|
||||
}),
|
||||
(1, true, false) => Some(Teddy {
|
||||
buckets: buckets,
|
||||
buckets,
|
||||
max_pattern_id: patterns.max_pattern_id(),
|
||||
exec: runtime::Exec::TeddySlim1Mask256(
|
||||
runtime::TeddySlim1Mask256 {
|
||||
@ -137,7 +137,7 @@ impl Builder {
|
||||
),
|
||||
}),
|
||||
(1, true, true) => Some(Teddy {
|
||||
buckets: buckets,
|
||||
buckets,
|
||||
max_pattern_id: patterns.max_pattern_id(),
|
||||
exec: runtime::Exec::TeddyFat1Mask256(
|
||||
runtime::TeddyFat1Mask256 {
|
||||
@ -146,7 +146,7 @@ impl Builder {
|
||||
),
|
||||
}),
|
||||
(2, false, _) => Some(Teddy {
|
||||
buckets: buckets,
|
||||
buckets,
|
||||
max_pattern_id: patterns.max_pattern_id(),
|
||||
exec: runtime::Exec::TeddySlim2Mask128(
|
||||
runtime::TeddySlim2Mask128 {
|
||||
@ -156,7 +156,7 @@ impl Builder {
|
||||
),
|
||||
}),
|
||||
(2, true, false) => Some(Teddy {
|
||||
buckets: buckets,
|
||||
buckets,
|
||||
max_pattern_id: patterns.max_pattern_id(),
|
||||
exec: runtime::Exec::TeddySlim2Mask256(
|
||||
runtime::TeddySlim2Mask256 {
|
||||
@ -166,7 +166,7 @@ impl Builder {
|
||||
),
|
||||
}),
|
||||
(2, true, true) => Some(Teddy {
|
||||
buckets: buckets,
|
||||
buckets,
|
||||
max_pattern_id: patterns.max_pattern_id(),
|
||||
exec: runtime::Exec::TeddyFat2Mask256(
|
||||
runtime::TeddyFat2Mask256 {
|
||||
@ -176,7 +176,7 @@ impl Builder {
|
||||
),
|
||||
}),
|
||||
(3, false, _) => Some(Teddy {
|
||||
buckets: buckets,
|
||||
buckets,
|
||||
max_pattern_id: patterns.max_pattern_id(),
|
||||
exec: runtime::Exec::TeddySlim3Mask128(
|
||||
runtime::TeddySlim3Mask128 {
|
||||
@ -187,7 +187,7 @@ impl Builder {
|
||||
),
|
||||
}),
|
||||
(3, true, false) => Some(Teddy {
|
||||
buckets: buckets,
|
||||
buckets,
|
||||
max_pattern_id: patterns.max_pattern_id(),
|
||||
exec: runtime::Exec::TeddySlim3Mask256(
|
||||
runtime::TeddySlim3Mask256 {
|
||||
@ -198,7 +198,7 @@ impl Builder {
|
||||
),
|
||||
}),
|
||||
(3, true, true) => Some(Teddy {
|
||||
buckets: buckets,
|
||||
buckets,
|
||||
max_pattern_id: patterns.max_pattern_id(),
|
||||
exec: runtime::Exec::TeddyFat3Mask256(
|
||||
runtime::TeddyFat3Mask256 {
|
||||
@ -296,7 +296,7 @@ impl<'p> Compiler<'p> {
|
||||
}
|
||||
|
||||
impl<'p> fmt::Debug for Compiler<'p> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let mut buckets = vec![vec![]; self.buckets.len()];
|
||||
for (i, bucket) in self.buckets.iter().enumerate() {
|
||||
for &patid in bucket {
|
||||
@ -400,7 +400,7 @@ impl Mask {
|
||||
}
|
||||
|
||||
impl fmt::Debug for Mask {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let (mut parts_lo, mut parts_hi) = (vec![], vec![]);
|
||||
for i in 0..32 {
|
||||
parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i]));
|
||||
|
@ -1,11 +1,11 @@
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
pub use packed::teddy::compile::Builder;
|
||||
pub use crate::packed::teddy::compile::Builder;
|
||||
#[cfg(not(target_arch = "x86_64"))]
|
||||
pub use packed::teddy::fallback::Builder;
|
||||
pub use crate::packed::teddy::fallback::Builder;
|
||||
#[cfg(not(target_arch = "x86_64"))]
|
||||
pub use packed::teddy::fallback::Teddy;
|
||||
pub use crate::packed::teddy::fallback::Teddy;
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
pub use packed::teddy::runtime::Teddy;
|
||||
pub use crate::packed::teddy::runtime::Teddy;
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
mod compile;
|
||||
@ -14,8 +14,8 @@ mod runtime;
|
||||
|
||||
#[cfg(not(target_arch = "x86_64"))]
|
||||
mod fallback {
|
||||
use packed::pattern::Patterns;
|
||||
use Match;
|
||||
use crate::packed::pattern::Patterns;
|
||||
use crate::Match;
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct Builder(());
|
||||
|
@ -51,10 +51,10 @@
|
||||
use std::arch::x86_64::*;
|
||||
use std::mem;
|
||||
|
||||
use packed::pattern::{PatternID, Patterns};
|
||||
use packed::teddy::compile;
|
||||
use packed::vector::*;
|
||||
use Match;
|
||||
use crate::packed::pattern::{PatternID, Patterns};
|
||||
use crate::packed::teddy::compile;
|
||||
use crate::packed::vector::*;
|
||||
use crate::Match;
|
||||
|
||||
/// The Teddy runtime.
|
||||
///
|
||||
|
@ -1,8 +1,8 @@
|
||||
use std::collections::HashMap;
|
||||
use std::usize;
|
||||
|
||||
use packed::{Config, MatchKind};
|
||||
use Match;
|
||||
use crate::packed::{Config, MatchKind};
|
||||
use crate::Match;
|
||||
|
||||
/// A description of a single test against a multi-pattern searcher.
|
||||
///
|
||||
|
222
third_party/rust/aho-corasick/src/prefilter.rs
vendored
222
third_party/rust/aho-corasick/src/prefilter.rs
vendored
@ -5,9 +5,9 @@ use std::u8;
|
||||
|
||||
use memchr::{memchr, memchr2, memchr3};
|
||||
|
||||
use ahocorasick::MatchKind;
|
||||
use packed;
|
||||
use Match;
|
||||
use crate::ahocorasick::MatchKind;
|
||||
use crate::packed;
|
||||
use crate::Match;
|
||||
|
||||
/// A candidate is the result of running a prefilter on a haystack at a
|
||||
/// particular position. The result is either no match, a confirmed match or
|
||||
@ -80,6 +80,17 @@ pub trait Prefilter:
|
||||
fn reports_false_positives(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
/// Returns true if and only if this prefilter may look for a non-starting
|
||||
/// position of a match.
|
||||
///
|
||||
/// This is useful in a streaming context where prefilters that don't look
|
||||
/// for a starting position of a match can be quite difficult to deal with.
|
||||
///
|
||||
/// This returns false by default.
|
||||
fn looks_for_non_start_of_match(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
|
||||
@ -191,6 +202,17 @@ impl PrefilterState {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a prefilter state that always disables the prefilter.
|
||||
pub fn disabled() -> PrefilterState {
|
||||
PrefilterState {
|
||||
skips: 0,
|
||||
skipped: 0,
|
||||
max_match_len: 0,
|
||||
inert: true,
|
||||
last_scan_at: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Update this state with the number of bytes skipped on the last
|
||||
/// invocation of the prefilter.
|
||||
#[inline]
|
||||
@ -285,6 +307,7 @@ impl Builder {
|
||||
/// All patterns added to an Aho-Corasick automaton should be added to this
|
||||
/// builder before attempting to construct the prefilter.
|
||||
pub fn build(&self) -> Option<PrefilterObj> {
|
||||
// match (self.start_bytes.build(), self.rare_bytes.build()) {
|
||||
match (self.start_bytes.build(), self.rare_bytes.build()) {
|
||||
// If we could build both start and rare prefilters, then there are
|
||||
// a few cases in which we'd want to use the start-byte prefilter
|
||||
@ -371,8 +394,14 @@ struct RareBytesBuilder {
|
||||
/// Whether this prefilter should account for ASCII case insensitivity or
|
||||
/// not.
|
||||
ascii_case_insensitive: bool,
|
||||
/// A set of byte offsets associated with detected rare bytes. An entry is
|
||||
/// only set if a rare byte is detected in a pattern.
|
||||
/// A set of rare bytes, indexed by byte value.
|
||||
rare_set: ByteSet,
|
||||
/// A set of byte offsets associated with bytes in a pattern. An entry
|
||||
/// corresponds to a particular byte (its index) and is only non-zero if
|
||||
/// the byte occurred at an offset greater than 0 in at least one pattern.
|
||||
///
|
||||
/// If a byte's offset is not representable in 8 bits, then the rare bytes
|
||||
/// prefilter becomes inert.
|
||||
byte_offsets: RareByteOffsets,
|
||||
/// Whether this is available as a prefilter or not. This can be set to
|
||||
/// false during construction if a condition is seen that invalidates the
|
||||
@ -385,11 +414,43 @@ struct RareBytesBuilder {
|
||||
rank_sum: u16,
|
||||
}
|
||||
|
||||
/// A set of rare byte offsets, keyed by byte.
|
||||
/// A set of bytes.
|
||||
#[derive(Clone, Copy)]
|
||||
struct ByteSet([bool; 256]);
|
||||
|
||||
impl ByteSet {
|
||||
fn empty() -> ByteSet {
|
||||
ByteSet([false; 256])
|
||||
}
|
||||
|
||||
fn insert(&mut self, b: u8) -> bool {
|
||||
let new = !self.contains(b);
|
||||
self.0[b as usize] = true;
|
||||
new
|
||||
}
|
||||
|
||||
fn contains(&self, b: u8) -> bool {
|
||||
self.0[b as usize]
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for ByteSet {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let mut bytes = vec![];
|
||||
for b in 0..=255 {
|
||||
if self.contains(b) {
|
||||
bytes.push(b);
|
||||
}
|
||||
}
|
||||
f.debug_struct("ByteSet").field("set", &bytes).finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// A set of byte offsets, keyed by byte.
|
||||
#[derive(Clone, Copy)]
|
||||
struct RareByteOffsets {
|
||||
/// When an item in this set has an offset of u8::MAX (255), then it is
|
||||
/// considered unset.
|
||||
/// Each entry corresponds to the maximum offset of the corresponding
|
||||
/// byte across all patterns seen.
|
||||
set: [RareByteOffset; 256],
|
||||
}
|
||||
|
||||
@ -403,29 +464,17 @@ impl RareByteOffsets {
|
||||
/// greater than the existing offset, then it overwrites the previous
|
||||
/// value and returns false. If there is no previous value set, then this
|
||||
/// sets it and returns true.
|
||||
///
|
||||
/// The given offset must be active, otherwise this panics.
|
||||
pub fn apply(&mut self, byte: u8, off: RareByteOffset) -> bool {
|
||||
assert!(off.is_active());
|
||||
|
||||
let existing = &mut self.set[byte as usize];
|
||||
if !existing.is_active() {
|
||||
*existing = off;
|
||||
true
|
||||
} else {
|
||||
if existing.max < off.max {
|
||||
*existing = off;
|
||||
}
|
||||
false
|
||||
}
|
||||
pub fn set(&mut self, byte: u8, off: RareByteOffset) {
|
||||
self.set[byte as usize].max =
|
||||
cmp::max(self.set[byte as usize].max, off.max);
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for RareByteOffsets {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let mut offsets = vec![];
|
||||
for off in self.set.iter() {
|
||||
if off.is_active() {
|
||||
if off.max > 0 {
|
||||
offsets.push(off);
|
||||
}
|
||||
}
|
||||
@ -448,34 +497,28 @@ struct RareByteOffset {
|
||||
/// ineffective when it is asked to start scanning from a position that it
|
||||
/// has already scanned past.
|
||||
///
|
||||
/// N.B. The maximum value for this is 254. A value of 255 indicates that
|
||||
/// this is unused. If a rare byte is found at an offset of 255 or greater,
|
||||
/// then the rare-byte prefilter is disabled for simplicity.
|
||||
/// Using a `u8` here means that if we ever see a pattern that's longer
|
||||
/// than 255 bytes, then the entire rare byte prefilter is disabled.
|
||||
max: u8,
|
||||
}
|
||||
|
||||
impl Default for RareByteOffset {
|
||||
fn default() -> RareByteOffset {
|
||||
RareByteOffset { max: u8::MAX }
|
||||
RareByteOffset { max: 0 }
|
||||
}
|
||||
}
|
||||
|
||||
impl RareByteOffset {
|
||||
/// Create a new rare byte offset. If the given offset is too big, then
|
||||
/// an inactive `RareByteOffset` is returned.
|
||||
fn new(max: usize) -> RareByteOffset {
|
||||
if max > (u8::MAX - 1) as usize {
|
||||
RareByteOffset::default()
|
||||
/// None is returned. In that case, callers should render the rare bytes
|
||||
/// prefilter inert.
|
||||
fn new(max: usize) -> Option<RareByteOffset> {
|
||||
if max > u8::MAX as usize {
|
||||
None
|
||||
} else {
|
||||
RareByteOffset { max: max as u8 }
|
||||
Some(RareByteOffset { max: max as u8 })
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if and only if this offset is active. If it's inactive,
|
||||
/// then it should not be used.
|
||||
fn is_active(&self) -> bool {
|
||||
self.max < u8::MAX
|
||||
}
|
||||
}
|
||||
|
||||
impl RareBytesBuilder {
|
||||
@ -483,6 +526,7 @@ impl RareBytesBuilder {
|
||||
fn new() -> RareBytesBuilder {
|
||||
RareBytesBuilder {
|
||||
ascii_case_insensitive: false,
|
||||
rare_set: ByteSet::empty(),
|
||||
byte_offsets: RareByteOffsets::empty(),
|
||||
available: true,
|
||||
count: 0,
|
||||
@ -507,8 +551,8 @@ impl RareBytesBuilder {
|
||||
return None;
|
||||
}
|
||||
let (mut bytes, mut len) = ([0; 3], 0);
|
||||
for b in 0..256 {
|
||||
if self.byte_offsets.set[b].is_active() {
|
||||
for b in 0..=255 {
|
||||
if self.rare_set.contains(b) {
|
||||
bytes[len] = b as u8;
|
||||
len += 1;
|
||||
}
|
||||
@ -539,15 +583,25 @@ impl RareBytesBuilder {
|
||||
/// All patterns added to an Aho-Corasick automaton should be added to this
|
||||
/// builder before attempting to construct the prefilter.
|
||||
fn add(&mut self, bytes: &[u8]) {
|
||||
// If we've already given up, then do nothing.
|
||||
if !self.available {
|
||||
return;
|
||||
}
|
||||
// If we've already blown our budget, then don't waste time looking
|
||||
// for more rare bytes.
|
||||
if self.count > 3 {
|
||||
self.available = false;
|
||||
return;
|
||||
}
|
||||
// If the pattern is too long, then our offset table is bunk, so
|
||||
// give up.
|
||||
if bytes.len() >= 256 {
|
||||
self.available = false;
|
||||
return;
|
||||
}
|
||||
let mut rarest = match bytes.get(0) {
|
||||
None => return,
|
||||
Some(&b) => (b, 0, freq_rank(b)),
|
||||
Some(&b) => (b, freq_rank(b)),
|
||||
};
|
||||
// The idea here is to look for the rarest byte in each pattern, and
|
||||
// add that to our set. As a special exception, if we see a byte that
|
||||
@ -558,33 +612,44 @@ impl RareBytesBuilder {
|
||||
// were searching for `Sherlock` and `lockjaw`, then this would pick
|
||||
// `k` for both patterns, resulting in the use of `memchr` instead of
|
||||
// `memchr2` for `k` and `j`.
|
||||
let mut found = false;
|
||||
for (pos, &b) in bytes.iter().enumerate() {
|
||||
if self.byte_offsets.set[b as usize].is_active() {
|
||||
self.add_rare_byte(b, pos);
|
||||
return;
|
||||
self.set_offset(pos, b);
|
||||
if found {
|
||||
continue;
|
||||
}
|
||||
if self.rare_set.contains(b) {
|
||||
found = true;
|
||||
continue;
|
||||
}
|
||||
let rank = freq_rank(b);
|
||||
if rank < rarest.2 {
|
||||
rarest = (b, pos, rank);
|
||||
if rank < rarest.1 {
|
||||
rarest = (b, rank);
|
||||
}
|
||||
}
|
||||
self.add_rare_byte(rarest.0, rarest.1);
|
||||
if !found {
|
||||
self.add_rare_byte(rarest.0);
|
||||
}
|
||||
}
|
||||
|
||||
fn add_rare_byte(&mut self, byte: u8, pos: usize) {
|
||||
self.add_one_byte(byte, pos);
|
||||
fn set_offset(&mut self, pos: usize, byte: u8) {
|
||||
// This unwrap is OK because pos is never bigger than our max.
|
||||
let offset = RareByteOffset::new(pos).unwrap();
|
||||
self.byte_offsets.set(byte, offset);
|
||||
if self.ascii_case_insensitive {
|
||||
self.add_one_byte(opposite_ascii_case(byte), pos);
|
||||
self.byte_offsets.set(opposite_ascii_case(byte), offset);
|
||||
}
|
||||
}
|
||||
|
||||
fn add_one_byte(&mut self, byte: u8, pos: usize) {
|
||||
let off = RareByteOffset::new(pos);
|
||||
if !off.is_active() {
|
||||
self.available = false;
|
||||
return;
|
||||
fn add_rare_byte(&mut self, byte: u8) {
|
||||
self.add_one_rare_byte(byte);
|
||||
if self.ascii_case_insensitive {
|
||||
self.add_one_rare_byte(opposite_ascii_case(byte));
|
||||
}
|
||||
if self.byte_offsets.apply(byte, off) {
|
||||
}
|
||||
|
||||
fn add_one_rare_byte(&mut self, byte: u8) {
|
||||
if self.rare_set.insert(byte) {
|
||||
self.count += 1;
|
||||
self.rank_sum += freq_rank(byte) as u16;
|
||||
}
|
||||
@ -621,6 +686,33 @@ impl Prefilter for RareBytesOne {
|
||||
fn heap_bytes(&self) -> usize {
|
||||
0
|
||||
}
|
||||
|
||||
fn looks_for_non_start_of_match(&self) -> bool {
|
||||
// TODO: It should be possible to use a rare byte prefilter in a
|
||||
// streaming context. The main problem is that we usually assume that
|
||||
// if a prefilter has scanned some text and not found anything, then no
|
||||
// match *starts* in that text. This doesn't matter in non-streaming
|
||||
// contexts, but in a streaming context, if we're looking for a byte
|
||||
// that doesn't start at the beginning of a match and don't find it,
|
||||
// then it's still possible for a match to start at the end of the
|
||||
// current buffer content. In order to fix this, the streaming searcher
|
||||
// would need to become aware of prefilters that do this and use the
|
||||
// appropriate offset in various places. It is quite a delicate change
|
||||
// and probably shouldn't be attempted until streaming search has a
|
||||
// better testing strategy. In particular, we'd really like to be able
|
||||
// to vary the buffer size to force strange cases that occur at the
|
||||
// edge of the buffer. If we make the buffer size minimal, then these
|
||||
// cases occur more frequently and more easily.
|
||||
//
|
||||
// This is also a bummer because this means that if the prefilter
|
||||
// builder chose a rare byte prefilter, then a streaming search won't
|
||||
// use any prefilter at all because the builder doesn't know how it's
|
||||
// going to be used. Assuming we don't make streaming search aware of
|
||||
// these special types of prefilters as described above, we could fix
|
||||
// this by building a "backup" prefilter that could be used when the
|
||||
// rare byte prefilter could not. But that's a band-aid. Sigh.
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// A prefilter for scanning for two "rare" bytes.
|
||||
@ -655,6 +747,11 @@ impl Prefilter for RareBytesTwo {
|
||||
fn heap_bytes(&self) -> usize {
|
||||
0
|
||||
}
|
||||
|
||||
fn looks_for_non_start_of_match(&self) -> bool {
|
||||
// TODO: See Prefilter impl for RareBytesOne.
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// A prefilter for scanning for three "rare" bytes.
|
||||
@ -690,6 +787,11 @@ impl Prefilter for RareBytesThree {
|
||||
fn heap_bytes(&self) -> usize {
|
||||
0
|
||||
}
|
||||
|
||||
fn looks_for_non_start_of_match(&self) -> bool {
|
||||
// TODO: See Prefilter impl for RareBytesOne.
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder for constructing a starting byte prefilter.
|
||||
@ -698,7 +800,7 @@ impl Prefilter for RareBytesThree {
|
||||
/// matches by reporting all positions corresponding to a particular byte. This
|
||||
/// generally only takes effect when there are at most 3 distinct possible
|
||||
/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
|
||||
/// distinct starting bytes (`f` and `b`), and this prefiler returns all
|
||||
/// distinct starting bytes (`f` and `b`), and this prefilter returns all
|
||||
/// occurrences of either `f` or `b`.
|
||||
///
|
||||
/// In some cases, a heuristic frequency analysis may determine that it would
|
||||
@ -930,7 +1032,7 @@ pub fn opposite_ascii_case(b: u8) -> u8 {
|
||||
/// Return the frequency rank of the given byte. The higher the rank, the more
|
||||
/// common the byte (heuristically speaking).
|
||||
fn freq_rank(b: u8) -> u8 {
|
||||
use byte_frequencies::BYTE_FREQUENCIES;
|
||||
use crate::byte_frequencies::BYTE_FREQUENCIES;
|
||||
BYTE_FREQUENCIES[b as usize]
|
||||
}
|
||||
|
||||
|
27
third_party/rust/aho-corasick/src/state_id.rs
vendored
27
third_party/rust/aho-corasick/src/state_id.rs
vendored
@ -1,7 +1,7 @@
|
||||
use std::fmt::Debug;
|
||||
use std::hash::Hash;
|
||||
|
||||
use error::{Error, Result};
|
||||
use crate::error::{Error, Result};
|
||||
|
||||
// NOTE: Most of this code was copied from regex-automata, but without the
|
||||
// (de)serialization specific stuff.
|
||||
@ -69,18 +69,7 @@ mod private {
|
||||
/// other type. In particular, this crate provides implementations for `u8`,
|
||||
/// `u16`, `u32`, `u64` and `usize`. (`u32` and `u64` are only provided for
|
||||
/// targets that can represent all corresponding values in a `usize`.)
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// This trait is unsafe because the correctness of its implementations may be
|
||||
/// relied upon by other unsafe code. For example, one possible way to
|
||||
/// implement this trait incorrectly would be to return a maximum identifier
|
||||
/// in `max_id` that is greater than the real maximum identifier. This will
|
||||
/// likely result in wrap-on-overflow semantics in release mode, which can in
|
||||
/// turn produce incorrect state identifiers. Those state identifiers may then
|
||||
/// in turn access out-of-bounds memory in an automaton's search routine, where
|
||||
/// bounds checks are explicitly elided for performance reasons.
|
||||
pub unsafe trait StateID:
|
||||
pub trait StateID:
|
||||
private::Sealed
|
||||
+ Clone
|
||||
+ Copy
|
||||
@ -111,11 +100,11 @@ pub unsafe trait StateID:
|
||||
/// Return the maximum state identifier supported by this representation.
|
||||
///
|
||||
/// Implementors must return a correct bound. Doing otherwise may result
|
||||
/// in memory unsafety.
|
||||
/// in unspecified behavior (but will not violate memory safety).
|
||||
fn max_id() -> usize;
|
||||
}
|
||||
|
||||
unsafe impl StateID for usize {
|
||||
impl StateID for usize {
|
||||
#[inline]
|
||||
fn from_usize(n: usize) -> usize {
|
||||
n
|
||||
@ -132,7 +121,7 @@ unsafe impl StateID for usize {
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl StateID for u8 {
|
||||
impl StateID for u8 {
|
||||
#[inline]
|
||||
fn from_usize(n: usize) -> u8 {
|
||||
n as u8
|
||||
@ -149,7 +138,7 @@ unsafe impl StateID for u8 {
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl StateID for u16 {
|
||||
impl StateID for u16 {
|
||||
#[inline]
|
||||
fn from_usize(n: usize) -> u16 {
|
||||
n as u16
|
||||
@ -167,7 +156,7 @@ unsafe impl StateID for u16 {
|
||||
}
|
||||
|
||||
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
|
||||
unsafe impl StateID for u32 {
|
||||
impl StateID for u32 {
|
||||
#[inline]
|
||||
fn from_usize(n: usize) -> u32 {
|
||||
n as u32
|
||||
@ -185,7 +174,7 @@ unsafe impl StateID for u32 {
|
||||
}
|
||||
|
||||
#[cfg(target_pointer_width = "64")]
|
||||
unsafe impl StateID for u64 {
|
||||
impl StateID for u64 {
|
||||
#[inline]
|
||||
fn from_usize(n: usize) -> u64 {
|
||||
n as u64
|
||||
|
270
third_party/rust/aho-corasick/src/tests.rs
vendored
270
third_party/rust/aho-corasick/src/tests.rs
vendored
@ -2,7 +2,7 @@ use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::usize;
|
||||
|
||||
use {AhoCorasickBuilder, Match, MatchKind};
|
||||
use crate::{AhoCorasickBuilder, Match, MatchKind};
|
||||
|
||||
/// A description of a single test against an Aho-Corasick automaton.
|
||||
///
|
||||
@ -549,6 +549,39 @@ const ANCHORED_OVERLAPPING: &'static [SearchTest] = &[
|
||||
t!(aover360, &["foo", "foofoo"], "foofoo", &[(0, 0, 3), (1, 0, 6)]),
|
||||
];
|
||||
|
||||
/// Tests for ASCII case insensitivity.
|
||||
///
|
||||
/// These tests should all have the same behavior regardless of match semantics
|
||||
/// or whether the search is overlapping.
|
||||
const ASCII_CASE_INSENSITIVE: &'static [SearchTest] = &[
|
||||
t!(acasei000, &["a"], "A", &[(0, 0, 1)]),
|
||||
t!(acasei010, &["Samwise"], "SAMWISE", &[(0, 0, 7)]),
|
||||
t!(acasei011, &["Samwise"], "SAMWISE.abcd", &[(0, 0, 7)]),
|
||||
t!(acasei020, &["fOoBaR"], "quux foobar baz", &[(0, 5, 11)]),
|
||||
];
|
||||
|
||||
/// Like ASCII_CASE_INSENSITIVE, but specifically for non-overlapping tests.
|
||||
const ASCII_CASE_INSENSITIVE_NON_OVERLAPPING: &'static [SearchTest] = &[
|
||||
t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3)]),
|
||||
t!(acasei000, &["FOO", "foo"], "fOo", &[(0, 0, 3)]),
|
||||
t!(acasei010, &["abc", "def"], "abcdef", &[(0, 0, 3), (1, 3, 6)]),
|
||||
];
|
||||
|
||||
/// Like ASCII_CASE_INSENSITIVE, but specifically for overlapping tests.
|
||||
const ASCII_CASE_INSENSITIVE_OVERLAPPING: &'static [SearchTest] = &[
|
||||
t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3), (1, 0, 3)]),
|
||||
t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3), (1, 0, 3)]),
|
||||
// This is a regression test from:
|
||||
// https://github.com/BurntSushi/aho-corasick/issues/68
|
||||
// Previously, it was reporting a duplicate (1, 3, 6) match.
|
||||
t!(
|
||||
acasei010,
|
||||
&["abc", "def", "abcdef"],
|
||||
"abcdef",
|
||||
&[(0, 0, 3), (2, 0, 6), (1, 3, 6)]
|
||||
),
|
||||
];
|
||||
|
||||
/// Regression tests that are applied to all Aho-Corasick combinations.
|
||||
///
|
||||
/// If regression tests are needed for specific match semantics, then add them
|
||||
@ -706,6 +739,8 @@ macro_rules! testcombo {
|
||||
$collection,
|
||||
$kind,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
// TODO: remove tests when option is removed.
|
||||
#[allow(deprecated)]
|
||||
b.dfa(true).byte_classes(false);
|
||||
}
|
||||
);
|
||||
@ -714,6 +749,8 @@ macro_rules! testcombo {
|
||||
$collection,
|
||||
$kind,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
// TODO: remove tests when option is removed.
|
||||
#[allow(deprecated)]
|
||||
b.dfa(true).premultiply(false);
|
||||
}
|
||||
);
|
||||
@ -722,6 +759,8 @@ macro_rules! testcombo {
|
||||
$collection,
|
||||
$kind,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
// TODO: remove tests when options are removed.
|
||||
#[allow(deprecated)]
|
||||
b.dfa(true).byte_classes(false).premultiply(false);
|
||||
}
|
||||
);
|
||||
@ -797,6 +836,8 @@ testconfig!(
|
||||
AC_STANDARD_OVERLAPPING,
|
||||
Standard,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
// TODO: remove tests when option is removed.
|
||||
#[allow(deprecated)]
|
||||
b.dfa(true).byte_classes(false);
|
||||
}
|
||||
);
|
||||
@ -806,6 +847,8 @@ testconfig!(
|
||||
AC_STANDARD_OVERLAPPING,
|
||||
Standard,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
// TODO: remove tests when option is removed.
|
||||
#[allow(deprecated)]
|
||||
b.dfa(true).premultiply(false);
|
||||
}
|
||||
);
|
||||
@ -815,6 +858,8 @@ testconfig!(
|
||||
AC_STANDARD_OVERLAPPING,
|
||||
Standard,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
// TODO: remove tests when options are removed.
|
||||
#[allow(deprecated)]
|
||||
b.dfa(true).byte_classes(false).premultiply(false);
|
||||
}
|
||||
);
|
||||
@ -907,6 +952,99 @@ testconfig!(
|
||||
}
|
||||
);
|
||||
|
||||
// And also write out the test combinations for ASCII case insensitivity.
|
||||
testconfig!(
|
||||
acasei_standard_nfa_default,
|
||||
&[ASCII_CASE_INSENSITIVE],
|
||||
Standard,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
b.prefilter(false).ascii_case_insensitive(true);
|
||||
}
|
||||
);
|
||||
testconfig!(
|
||||
acasei_standard_dfa_default,
|
||||
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
|
||||
Standard,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
b.ascii_case_insensitive(true).dfa(true);
|
||||
}
|
||||
);
|
||||
testconfig!(
|
||||
overlapping,
|
||||
acasei_standard_overlapping_nfa_default,
|
||||
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
|
||||
Standard,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
b.ascii_case_insensitive(true);
|
||||
}
|
||||
);
|
||||
testconfig!(
|
||||
overlapping,
|
||||
acasei_standard_overlapping_dfa_default,
|
||||
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
|
||||
Standard,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
b.ascii_case_insensitive(true).dfa(true);
|
||||
}
|
||||
);
|
||||
testconfig!(
|
||||
acasei_leftmost_first_nfa_default,
|
||||
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
|
||||
LeftmostFirst,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
b.ascii_case_insensitive(true);
|
||||
}
|
||||
);
|
||||
testconfig!(
|
||||
acasei_leftmost_first_dfa_default,
|
||||
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
|
||||
LeftmostFirst,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
b.ascii_case_insensitive(true).dfa(true);
|
||||
}
|
||||
);
|
||||
testconfig!(
|
||||
acasei_leftmost_longest_nfa_default,
|
||||
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
|
||||
LeftmostLongest,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
b.ascii_case_insensitive(true);
|
||||
}
|
||||
);
|
||||
testconfig!(
|
||||
acasei_leftmost_longest_dfa_default,
|
||||
&[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
|
||||
LeftmostLongest,
|
||||
|b: &mut AhoCorasickBuilder| {
|
||||
b.ascii_case_insensitive(true).dfa(true);
|
||||
}
|
||||
);
|
||||
|
||||
fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
|
||||
which: TestCollection,
|
||||
mut f: F,
|
||||
) {
|
||||
let get_match_triples =
|
||||
|matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
|
||||
matches
|
||||
.into_iter()
|
||||
.map(|m| (m.pattern(), m.start(), m.end()))
|
||||
.collect()
|
||||
};
|
||||
for &tests in which {
|
||||
for test in tests {
|
||||
assert_eq!(
|
||||
test.matches,
|
||||
get_match_triples(f(&test)).as_slice(),
|
||||
"test: {}, patterns: {:?}, haystack: {:?}",
|
||||
test.name,
|
||||
test.patterns,
|
||||
test.haystack
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn search_tests_have_unique_names() {
|
||||
let assert = |constname, tests: &[SearchTest]| {
|
||||
@ -996,27 +1134,119 @@ fn regression_ascii_case_insensitive_no_exponential() {
|
||||
assert!(ac.find("").is_none());
|
||||
}
|
||||
|
||||
fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
|
||||
which: TestCollection,
|
||||
mut f: F,
|
||||
) {
|
||||
let get_match_triples =
|
||||
|matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
|
||||
matches
|
||||
.into_iter()
|
||||
.map(|m| (m.pattern(), m.start(), m.end()))
|
||||
.collect()
|
||||
};
|
||||
for &tests in which {
|
||||
for test in tests {
|
||||
// See: https://github.com/BurntSushi/aho-corasick/issues/53
|
||||
//
|
||||
// This test ensures that the rare byte prefilter works in a particular corner
|
||||
// case. In particular, the shift offset detected for '/' in the patterns below
|
||||
// was incorrect, leading to a false negative.
|
||||
#[test]
|
||||
fn regression_rare_byte_prefilter() {
|
||||
use crate::AhoCorasick;
|
||||
|
||||
let ac = AhoCorasick::new_auto_configured(&["ab/j/", "x/"]);
|
||||
assert!(ac.is_match("ab/j/"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn regression_case_insensitive_prefilter() {
|
||||
use crate::AhoCorasickBuilder;
|
||||
|
||||
for c in b'a'..b'z' {
|
||||
for c2 in b'a'..b'z' {
|
||||
let c = c as char;
|
||||
let c2 = c2 as char;
|
||||
let needle = format!("{}{}", c, c2).to_lowercase();
|
||||
let haystack = needle.to_uppercase();
|
||||
let ac = AhoCorasickBuilder::new()
|
||||
.ascii_case_insensitive(true)
|
||||
.prefilter(true)
|
||||
.build(&[&needle]);
|
||||
assert_eq!(
|
||||
test.matches,
|
||||
get_match_triples(f(&test)).as_slice(),
|
||||
"test: {}, patterns: {:?}, haystack: {:?}",
|
||||
test.name,
|
||||
test.patterns,
|
||||
test.haystack
|
||||
1,
|
||||
ac.find_iter(&haystack).count(),
|
||||
"failed to find {:?} in {:?}\n\nautomaton:\n{:?}",
|
||||
needle,
|
||||
haystack,
|
||||
ac,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// See: https://github.com/BurntSushi/aho-corasick/issues/64
|
||||
//
|
||||
// This occurs when the rare byte prefilter is active.
|
||||
#[test]
|
||||
fn regression_stream_rare_byte_prefilter() {
|
||||
use std::io::Read;
|
||||
|
||||
// NOTE: The test only fails if this ends with j.
|
||||
const MAGIC: [u8; 5] = *b"1234j";
|
||||
|
||||
// NOTE: The test fails for values in 8188..=8191. These values put the string
|
||||
// to search across two calls to read because the buffer size is 8192 by
|
||||
// default.
|
||||
const BEGIN: usize = 8191;
|
||||
|
||||
/// This is just a structure that implements `Read`. The reader
|
||||
/// implementation will simulate a file filled with 0, except for the MAGIC
|
||||
/// string at offset BEGIN.
|
||||
#[derive(Default)]
|
||||
struct R {
|
||||
read: usize,
|
||||
}
|
||||
|
||||
impl Read for R {
|
||||
fn read(&mut self, buf: &mut [u8]) -> ::std::io::Result<usize> {
|
||||
//dbg!(buf.len());
|
||||
if self.read > 100000 {
|
||||
return Ok(0);
|
||||
}
|
||||
let mut from = 0;
|
||||
if self.read < BEGIN {
|
||||
from = buf.len().min(BEGIN - self.read);
|
||||
for x in 0..from {
|
||||
buf[x] = 0;
|
||||
}
|
||||
self.read += from;
|
||||
}
|
||||
if self.read >= BEGIN && self.read <= BEGIN + MAGIC.len() {
|
||||
let to = buf.len().min(BEGIN + MAGIC.len() - self.read + from);
|
||||
if to > from {
|
||||
buf[from..to].copy_from_slice(
|
||||
&MAGIC
|
||||
[self.read - BEGIN..self.read - BEGIN + to - from],
|
||||
);
|
||||
self.read += to - from;
|
||||
from = to;
|
||||
}
|
||||
}
|
||||
for x in from..buf.len() {
|
||||
buf[x] = 0;
|
||||
self.read += 1;
|
||||
}
|
||||
Ok(buf.len())
|
||||
}
|
||||
}
|
||||
|
||||
fn run() -> ::std::io::Result<()> {
|
||||
let aut = AhoCorasickBuilder::new().build(&[&MAGIC]);
|
||||
|
||||
// While reading from a vector, it works:
|
||||
let mut buf = vec![];
|
||||
R::default().read_to_end(&mut buf)?;
|
||||
let from_whole = aut.find_iter(&buf).next().unwrap().start();
|
||||
|
||||
//But using stream_find_iter fails!
|
||||
let mut file = R::default();
|
||||
let begin = aut
|
||||
.stream_find_iter(&mut file)
|
||||
.next()
|
||||
.expect("NOT FOUND!!!!")? // Panic here
|
||||
.start();
|
||||
assert_eq!(from_whole, begin);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
run().unwrap()
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user