mirror of
https://gitee.com/openharmony/third_party_rust_aho-corasick
synced 2024-11-26 17:12:09 +00:00
Initial commit. Not finished.
This commit is contained in:
commit
4aa21954e4
9
.gitignore
vendored
Normal file
9
.gitignore
vendored
Normal file
@ -0,0 +1,9 @@
|
||||
.*.swp
|
||||
doc
|
||||
tags
|
||||
examples/ss10pusa.csv
|
||||
build
|
||||
target
|
||||
Cargo.lock
|
||||
scratch*
|
||||
bench_large/huge
|
12
.travis.yml
Normal file
12
.travis.yml
Normal file
@ -0,0 +1,12 @@
|
||||
language: rust
|
||||
rust:
|
||||
- 1.0.0
|
||||
- beta
|
||||
- nightly
|
||||
script:
|
||||
- cargo build --verbose
|
||||
- cargo test --verbose
|
||||
- cargo doc
|
||||
- if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
|
||||
cargo bench --verbose;
|
||||
fi
|
3
COPYING
Normal file
3
COPYING
Normal file
@ -0,0 +1,3 @@
|
||||
This project is dual-licensed under the Unlicense and MIT licenses.
|
||||
|
||||
You may use this code under the terms of either license.
|
17
Cargo.toml
Normal file
17
Cargo.toml
Normal file
@ -0,0 +1,17 @@
|
||||
[package]
|
||||
name = "aho-corasick"
|
||||
version = "0.1.0" #:version
|
||||
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
||||
description = "Fast multiple substring searching with finite state machines."
|
||||
documentation = "http://burntsushi.net/rustdoc/aho-corasick/"
|
||||
homepage = "https://github.com/BurntSushi/aho-corasick"
|
||||
repository = "https://github.com/BurntSushi/aho-corasick"
|
||||
readme = "README.md"
|
||||
keywords = ["string", "search", "text", "aho", "corasick"]
|
||||
license = "Unlicense/MIT"
|
||||
|
||||
[lib]
|
||||
name = "aho_corasick"
|
||||
|
||||
[dependencies]
|
||||
memchr = "0.1.*"
|
21
LICENSE-MIT
Normal file
21
LICENSE-MIT
Normal file
@ -0,0 +1,21 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Andrew Gallant
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
14
Makefile
Normal file
14
Makefile
Normal file
@ -0,0 +1,14 @@
|
||||
all:
|
||||
echo Nothing to do...
|
||||
|
||||
ctags:
|
||||
ctags --recurse --options=ctags.rust --languages=Rust
|
||||
|
||||
docs:
|
||||
cargo doc
|
||||
in-dir ./target/doc fix-perms
|
||||
rscp ./target/doc/* gopher:~/www/burntsushi.net/rustdoc/
|
||||
|
||||
push:
|
||||
git push origin master
|
||||
git push github master
|
31
README.md
Normal file
31
README.md
Normal file
@ -0,0 +1,31 @@
|
||||
**UNDER DEVELOPMENT**
|
||||
|
||||
This crate provides a fast implementation of the
|
||||
[Aho-Corasick](http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
|
||||
algorithm. Its intended use case is for fast substring matching, particularly
|
||||
when matching multiple substrings in a search text. This is achieved by
|
||||
compiling the substrings into a finite state machine.
|
||||
|
||||
This implementation provides optimal algorithmic time complexity. Construction
|
||||
of the finite state machine is `O(p)` where `p` is the length of the substrings
|
||||
concatenated. Matching against search text is `O(n + p + m)`, where `n` is
|
||||
the length of the search text and `m` is the number of matches.
|
||||
|
||||
[![Build status](https://api.travis-ci.org/BurntSushi/aho-corasick.png)](https://travis-ci.org/BurntSushi/aho-corasick)
|
||||
[![](http://meritbadge.herokuapp.com/aho-corasick)](https://crates.io/crates/aho-corasick)
|
||||
|
||||
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
|
||||
|
||||
|
||||
### Documentation
|
||||
|
||||
[http://burntsushi.net/rustdoc/aho-corasick/](http://burntsushi.net/rustdoc/aho-corasick/).
|
||||
|
||||
|
||||
### Alternatives
|
||||
|
||||
Aho-Corasick is useful for matching multiple substrings against many long
|
||||
strings. If your long string is fixed, then you might consider building a
|
||||
[suffix array](https://github.com/BurntSushi/suffix)
|
||||
of the search text (which takes `O(n)` time). Matches can then be found in
|
||||
`O(p log n)` time.
|
24
UNLICENSE
Normal file
24
UNLICENSE
Normal file
@ -0,0 +1,24 @@
|
||||
This is free and unencumbered software released into the public domain.
|
||||
|
||||
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||
distribute this software, either in source code form or as a compiled
|
||||
binary, for any purpose, commercial or non-commercial, and by any
|
||||
means.
|
||||
|
||||
In jurisdictions that recognize copyright laws, the author or authors
|
||||
of this software dedicate any and all copyright interest in the
|
||||
software to the public domain. We make this dedication for the benefit
|
||||
of the public at large and to the detriment of our heirs and
|
||||
successors. We intend this dedication to be an overt act of
|
||||
relinquishment in perpetuity of all present and future rights to this
|
||||
software under copyright law.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
For more information, please refer to <http://unlicense.org/>
|
11
ctags.rust
Normal file
11
ctags.rust
Normal file
@ -0,0 +1,11 @@
|
||||
--langdef=Rust
|
||||
--langmap=Rust:.rs
|
||||
--regex-Rust=/^[ \t]*(#\[[^\]]\][ \t]*)*(pub[ \t]+)?(extern[ \t]+)?("[^"]+"[ \t]+)?(unsafe[ \t]+)?fn[ \t]+([a-zA-Z0-9_]+)/\6/f,functions,function definitions/
|
||||
--regex-Rust=/^[ \t]*(pub[ \t]+)?type[ \t]+([a-zA-Z0-9_]+)/\2/T,types,type definitions/
|
||||
--regex-Rust=/^[ \t]*(pub[ \t]+)?enum[ \t]+([a-zA-Z0-9_]+)/\2/g,enum,enumeration names/
|
||||
--regex-Rust=/^[ \t]*(pub[ \t]+)?struct[ \t]+([a-zA-Z0-9_]+)/\2/s,structure names/
|
||||
--regex-Rust=/^[ \t]*(pub[ \t]+)?mod[ \t]+([a-zA-Z0-9_]+)/\2/m,modules,module names/
|
||||
--regex-Rust=/^[ \t]*(pub[ \t]+)?static[ \t]+([a-zA-Z0-9_]+)/\2/c,consts,static constants/
|
||||
--regex-Rust=/^[ \t]*(pub[ \t]+)?trait[ \t]+([a-zA-Z0-9_]+)/\2/t,traits,traits/
|
||||
--regex-Rust=/^[ \t]*(pub[ \t]+)?impl([ \t\n]+<.*>)?[ \t]+([a-zA-Z0-9_]+)/\3/i,impls,trait implementations/
|
||||
--regex-Rust=/^[ \t]*macro_rules![ \t]+([a-zA-Z0-9_]+)/\1/d,macros,macro definitions/
|
1
session.vim
Normal file
1
session.vim
Normal file
@ -0,0 +1 @@
|
||||
au BufWritePost *.rs silent!make ctags > /dev/null 2>&1
|
144
src/lib.rs
Normal file
144
src/lib.rs
Normal file
@ -0,0 +1,144 @@
|
||||
/*!
A fast implementation of the Aho-Corasick string search algorithm.
*/
|
||||
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Builder {
|
||||
pats: Vec<String>,
|
||||
}
|
||||
|
||||
impl Builder {
|
||||
pub fn new() -> Builder {
|
||||
Builder { pats: vec![] }
|
||||
}
|
||||
|
||||
pub fn add<S: Into<String>>(mut self, s: S) -> Builder {
|
||||
self.pats.push(s.into());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> Automaton {
|
||||
Automaton::new(self.pats)
|
||||
}
|
||||
}
|
||||
|
||||
/// Index of a pattern in `Automaton::pats`.
type PatIdx = usize;

/// Index of a state in `Automaton::states`.
type StateIdx = usize;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Automaton {
|
||||
pats: Vec<String>,
|
||||
states: Vec<State>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct State {
|
||||
out: Vec<PatIdx>,
|
||||
fail: StateIdx,
|
||||
goto: Vec<StateIdx>, // indexed by alphabet
|
||||
}
|
||||
|
||||
impl Automaton {
|
||||
fn new(pats: Vec<String>) -> Automaton {
|
||||
Automaton {
|
||||
pats: vec![], // filled in later, avoid wrath of borrow checker
|
||||
states: vec![State::new()],
|
||||
}.build(pats)
|
||||
}
|
||||
|
||||
fn build(mut self, pats: Vec<String>) -> Automaton {
|
||||
let rooti = self.add_state(State::new());
|
||||
for (pati, pat) in pats.iter().enumerate() {
|
||||
let mut previ = rooti;
|
||||
for &b in pat.as_bytes() {
|
||||
if let Some(si) = self.states[previ].goto(b) {
|
||||
previ = si;
|
||||
} else {
|
||||
let nexti = self.add_state(State::new());
|
||||
self.states[previ].goto[b as usize] = nexti;
|
||||
previ = nexti;
|
||||
}
|
||||
}
|
||||
self.states[previ].out.push(pati);
|
||||
}
|
||||
for v in &mut self.states[rooti].goto {
|
||||
if *v == 0 {
|
||||
*v = 1;
|
||||
}
|
||||
}
|
||||
self.pats = pats;
|
||||
self
|
||||
}
|
||||
|
||||
fn add_state(&mut self, state: State) -> StateIdx {
|
||||
let i = self.states.len();
|
||||
self.states.push(state);
|
||||
i
|
||||
}
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn new() -> State {
|
||||
State {
|
||||
out: vec![],
|
||||
fail: 1,
|
||||
goto: vec![0; 256],
|
||||
}
|
||||
}
|
||||
|
||||
fn goto(&self, b: u8) -> Option<StateIdx> {
|
||||
let i = self.goto[b as usize];
|
||||
if i == 0 { None } else { Some(i) }
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl fmt::Debug for Automaton {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
use std::iter::repeat;
|
||||
|
||||
try!(writeln!(f, "{}", repeat('-').take(79).collect::<String>()));
|
||||
try!(writeln!(f, "Patterns: {:?}", self.pats));
|
||||
for (i, state) in self.states.iter().enumerate().skip(1) {
|
||||
try!(writeln!(f, "{:3}: {}", i, state.debug(i == 1)));
|
||||
}
|
||||
write!(f, "{}", repeat('-').take(79).collect::<String>())
|
||||
}
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn debug(&self, root: bool) -> String {
|
||||
format!("State {{ out: {:?}, fail: {:?}, goto: {{{}}} }}",
|
||||
self.out, self.fail, self.dense_goto_string(root))
|
||||
}
|
||||
|
||||
fn dense_goto_string(&self, root: bool) -> String {
|
||||
use std::char::from_u32;
|
||||
|
||||
let mut goto = vec![];
|
||||
for (i, &state) in self.goto.iter().enumerate() {
|
||||
if (!root && state == 0) || (root && state == 1) { continue; }
|
||||
goto.push(format!("{} => {}", from_u32(i as u32).unwrap(), state));
|
||||
}
|
||||
goto.connect(", ")
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for State {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "{}", self.debug(false))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::Builder;

    /// Smoke test: building a small automaton and rendering it with `Debug`
    /// must not panic, and the rendering must list the patterns.
    #[test]
    fn scratch() {
        let aut = Builder::new().add("he").add("she").build();
        let rendered = format!("{:?}", aut);
        println!("{}", rendered);
        assert!(rendered.contains("Patterns: [\"he\", \"she\"]"));
    }
}
|
Loading…
Reference in New Issue
Block a user