Initial commit. Not finished.

This commit is contained in:
Andrew Gallant 2015-06-11 21:15:43 -04:00
commit 4aa21954e4
11 changed files with 287 additions and 0 deletions

9
.gitignore vendored Normal file
View File

@ -0,0 +1,9 @@
.*.swp
doc
tags
examples/ss10pusa.csv
build
target
Cargo.lock
scratch*
bench_large/huge

12
.travis.yml Normal file
View File

@ -0,0 +1,12 @@
language: rust
rust:
- 1.0.0
- beta
- nightly
script:
- cargo build --verbose
- cargo test --verbose
- cargo doc
- if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
cargo bench --verbose;
fi

3
COPYING Normal file
View File

@ -0,0 +1,3 @@
This project is dual-licensed under the Unlicense and MIT licenses.
You may use this code under the terms of either license.

17
Cargo.toml Normal file
View File

@ -0,0 +1,17 @@
[package]
name = "aho-corasick"
version = "0.1.0" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = "Fast multiple substring searching with finite state machines."
documentation = "http://burntsushi.net/rustdoc/aho-corasick/"
homepage = "https://github.com/BurntSushi/aho-corasick"
repository = "https://github.com/BurntSushi/aho-corasick"
readme = "README.md"
keywords = ["string", "search", "text", "aho", "corasick"]
license = "Unlicense/MIT"
[lib]
name = "aho_corasick"
[dependencies]
memchr = "0.1.*"

21
LICENSE-MIT Normal file
View File

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2015 Andrew Gallant
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

14
Makefile Normal file
View File

@ -0,0 +1,14 @@
# None of these targets produces a file with its own name, so declare them all
# phony. Otherwise a file or directory called e.g. `docs` or `ctags` would make
# the corresponding target look up to date and its recipe would be skipped.
.PHONY: all ctags docs push

# Default target: intentionally does nothing.
all:
	echo Nothing to do...

# Regenerate the tags file for the Rust sources using the rules in ctags.rust.
ctags:
	ctags --recurse --options=ctags.rust --languages=Rust

# Build the API documentation and copy it to the web host.
# NOTE(review): `in-dir`, `fix-perms` and `rscp` are not defined in this
# repository; presumably they are the author's personal helper scripts.
docs:
	cargo doc
	in-dir ./target/doc fix-perms
	rscp ./target/doc/* gopher:~/www/burntsushi.net/rustdoc/

# Push master to both remotes.
push:
	git push origin master
	git push github master

31
README.md Normal file
View File

@ -0,0 +1,31 @@
**UNDER DEVELOPMENT**
This crate provides a fast implementation of the
[Aho-Corasick](http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
algorithm. Its intended use case is for fast substring matching, particularly
when matching multiple substrings in a search text. This is achieved by
compiling the substrings into a finite state machine.
This implementation provides optimal algorithmic time complexity. Construction
of the finite state machine is `O(p)` where `p` is the length of the substrings
concatenated. Matching against search text is `O(n + p + m)`, where `n` is
the length of the search text and `m` is the number of matches.
[![Build status](https://api.travis-ci.org/BurntSushi/aho-corasick.png)](https://travis-ci.org/BurntSushi/aho-corasick)
[![](http://meritbadge.herokuapp.com/aho-corasick)](https://crates.io/crates/aho-corasick)
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
### Documentation
[http://burntsushi.net/rustdoc/aho-corasick/](http://burntsushi.net/rustdoc/aho-corasick/).
### Alternatives
Aho-Corasick is useful for matching multiple substrings against many long
strings. If your long string is fixed, then you might consider building a
[suffix array](https://github.com/BurntSushi/suffix)
of the search text (which takes `O(n)` time). Matches can then be found in
`O(p log n)` time.

24
UNLICENSE Normal file
View File

@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>

11
ctags.rust Normal file
View File

@ -0,0 +1,11 @@
--langdef=Rust
--langmap=Rust:.rs
--regex-Rust=/^[ \t]*(#\[[^\]]\][ \t]*)*(pub[ \t]+)?(extern[ \t]+)?("[^"]+"[ \t]+)?(unsafe[ \t]+)?fn[ \t]+([a-zA-Z0-9_]+)/\6/f,functions,function definitions/
--regex-Rust=/^[ \t]*(pub[ \t]+)?type[ \t]+([a-zA-Z0-9_]+)/\2/T,types,type definitions/
--regex-Rust=/^[ \t]*(pub[ \t]+)?enum[ \t]+([a-zA-Z0-9_]+)/\2/g,enum,enumeration names/
--regex-Rust=/^[ \t]*(pub[ \t]+)?struct[ \t]+([a-zA-Z0-9_]+)/\2/s,structure names/
--regex-Rust=/^[ \t]*(pub[ \t]+)?mod[ \t]+([a-zA-Z0-9_]+)/\2/m,modules,module names/
--regex-Rust=/^[ \t]*(pub[ \t]+)?static[ \t]+([a-zA-Z0-9_]+)/\2/c,consts,static constants/
--regex-Rust=/^[ \t]*(pub[ \t]+)?trait[ \t]+([a-zA-Z0-9_]+)/\2/t,traits,traits/
--regex-Rust=/^[ \t]*(pub[ \t]+)?impl([ \t\n]+<.*>)?[ \t]+([a-zA-Z0-9_]+)/\3/i,impls,trait implementations/
--regex-Rust=/^[ \t]*macro_rules![ \t]+([a-zA-Z0-9_]+)/\1/d,macros,macro definitions/

1
session.vim Normal file
View File

@ -0,0 +1 @@
au BufWritePost *.rs silent!make ctags > /dev/null 2>&1

144
src/lib.rs Normal file
View File

@ -0,0 +1,144 @@
/*!
A fast implementation of the Aho-Corasick string search algorithm.
*/
use std::fmt;
/// Accumulates search patterns and compiles them into an `Automaton`.
///
/// Patterns are appended with `add` and consumed by `build`.
#[derive(Clone, Debug)]
pub struct Builder {
/// Patterns in the order they were added; a pattern's position in this
/// list is the index recorded for it in the automaton.
pats: Vec<String>,
}
impl Builder {
pub fn new() -> Builder {
Builder { pats: vec![] }
}
pub fn add<S: Into<String>>(mut self, s: S) -> Builder {
self.pats.push(s.into());
self
}
pub fn build(self) -> Automaton {
Automaton::new(self.pats)
}
}
/// Position of a pattern in `Automaton::pats`.
type PatIdx = usize;
/// Position of a state in `Automaton::states`.
type StateIdx = usize;
/// A byte-level state machine over a set of patterns, produced by
/// `Builder::build`.
///
/// The construction code visible in this file builds only the trie (`goto`)
/// transitions; it never assigns a state's `fail` field.
#[derive(Clone)]
pub struct Automaton {
/// The patterns the automaton was built from, in the order they were added.
pats: Vec<String>,
/// All states. Index 0 is a placeholder that is never entered, because a
/// transition value of 0 means "no transition"; index 1 is the root.
states: Vec<State>,
}
/// A single state of the automaton.
#[derive(Clone)]
struct State {
/// Indices of the patterns that end exactly at this state.
out: Vec<PatIdx>,
/// Initialised to 1 (the root's index) and not modified by the code visible
/// in this file. NOTE(review): presumably the failure transition — confirm.
fail: StateIdx,
/// Next state for each of the 256 possible byte values; 0 means there is
/// no transition on that byte.
goto: Vec<StateIdx>, // indexed by alphabet
}
impl Automaton {
    /// Builds an automaton for `pats`.
    ///
    /// The state list is seeded with one placeholder at index 0 so that no
    /// real state ever has index 0; a transition value of 0 can then stand for
    /// "no transition".
    fn new(pats: Vec<String>) -> Automaton {
        let seed = Automaton {
            // Assigned at the end of `build`, once the patterns are no longer
            // borrowed by the insertion loop.
            pats: vec![],
            states: vec![State::new()],
        };
        seed.build(pats)
    }

    /// Inserts every pattern into the trie one byte at a time, records at the
    /// final state of each pattern which pattern ends there, and finally turns
    /// every missing transition of the root into a self-loop.
    fn build(mut self, pats: Vec<String>) -> Automaton {
        let rooti = self.add_state(State::new());
        for (pati, pat) in pats.iter().enumerate() {
            let mut curi = rooti;
            for byte in pat.bytes() {
                curi = match self.states[curi].goto(byte) {
                    // The prefix read so far is already in the trie.
                    Some(existing) => existing,
                    // First time this prefix is seen: grow the trie by one state.
                    None => {
                        let fresh = self.add_state(State::new());
                        self.states[curi].goto[byte as usize] = fresh;
                        fresh
                    }
                };
            }
            self.states[curi].out.push(pati);
        }
        // The root lives at index 1; bytes that start no pattern keep the
        // machine in the root.
        for slot in self.states[rooti].goto.iter_mut().filter(|slot| **slot == 0) {
            *slot = 1;
        }
        self.pats = pats;
        self
    }

    /// Appends `state` to the state list and returns the index it was stored at.
    fn add_state(&mut self, state: State) -> StateIdx {
        self.states.push(state);
        self.states.len() - 1
    }
}
impl State {
    /// Creates a state with no outputs, `fail` set to 1, and a 256-entry
    /// transition table in which every entry is 0 (no transition).
    fn new() -> State {
        let transitions: Vec<StateIdx> = vec![0; 256];
        State {
            goto: transitions,
            fail: 1,
            out: Vec::new(),
        }
    }

    /// Returns the state reached on byte `b`, or `None` when the table entry
    /// for `b` is 0.
    fn goto(&self, b: u8) -> Option<StateIdx> {
        match self.goto[b as usize] {
            0 => None,
            next => Some(next),
        }
    }
}
impl fmt::Debug for Automaton {
    /// Writes a multi-line dump: a 79-dash rule, the pattern list, one line
    /// per state (the placeholder at index 0 is skipped), and a closing rule
    /// without a trailing newline.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let rule: String = (0..79).map(|_| '-').collect();

        try!(writeln!(f, "{}", rule));
        try!(writeln!(f, "Patterns: {:?}", self.pats));
        for (index, state) in self.states.iter().enumerate().skip(1) {
            // Index 1 is the root, which hides its self-loops when rendered.
            let is_root = index == 1;
            try!(writeln!(f, "{:3}: {}", index, state.debug(is_root)));
        }
        write!(f, "{}", rule)
    }
}
impl State {
fn debug(&self, root: bool) -> String {
format!("State {{ out: {:?}, fail: {:?}, goto: {{{}}} }}",
self.out, self.fail, self.dense_goto_string(root))
}
fn dense_goto_string(&self, root: bool) -> String {
use std::char::from_u32;
let mut goto = vec![];
for (i, &state) in self.goto.iter().enumerate() {
if (!root && state == 0) || (root && state == 1) { continue; }
goto.push(format!("{} => {}", from_u32(i as u32).unwrap(), state));
}
goto.connect(", ")
}
}
impl fmt::Debug for State {
    /// Formats the state with the non-root rendering of `State::debug`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let rendered = self.debug(false);
        f.write_str(&rendered)
    }
}
#[cfg(test)]
mod tests {
    use super::Builder;

    /// Smoke test for trie construction with the patterns "he" and "she".
    #[test]
    fn scratch() {
        let aut = Builder::new().add("he").add("she").build();
        // Shown with `cargo test -- --nocapture`; handy while the crate is
        // still under development.
        println!("{:?}", aut);

        assert_eq!(aut.pats, vec!["he".to_string(), "she".to_string()]);
        // Placeholder + root + one state per byte: the two patterns share no
        // prefix, so "he" adds 2 states and "she" adds 3.
        assert_eq!(aut.states.len(), 7);
        // States are numbered in insertion order, so "he" ends at state 3 and
        // "she" at state 6.
        assert_eq!(aut.states[3].out, vec![0]);
        assert_eq!(aut.states[6].out, vec![1]);
    }
}