impl: add wasm simd support

This commit adds simd acceleration support to the `memmem` module. This
is added with the freshly-stabilized support from rust-lang/rust#86204.
This mostly just cribs off the generic simd support for 128-bit types
built for sse, copying bits and pieces of code here and there. Some
refactoring happened internally to help reduce duplication where
possible.

I ran some initial benchmarks with the `memmem/krate/*` regex and a
hacked up single-threaded version of criterion. Some [initial
comparisons][compare] using Wasmtime as a runtime show a lot of
improvements, but there are some slowdowns as well.

[compare]: https://gist.github.com/alexcrichton/6a72e682e7b6d505ade605359fbe3f2d

PR #84
This commit is contained in:
Alex Crichton
2021-12-22 12:59:57 -06:00
committed by GitHub
parent e57582346c
commit c28170ec18
9 changed files with 287 additions and 113 deletions
+15
View File
@@ -27,6 +27,7 @@ jobs:
- stable
- stable-32
- stable-mips
- wasm
- beta
- nightly
- macos
@@ -62,6 +63,10 @@ jobs:
- build: win-gnu
os: windows-2019
rust: stable-x86_64-gnu
- build: wasm
os: ubuntu-18.04
rust: stable-x86_64-gnu
wasm: true
steps:
- name: Checkout repository
uses: actions/checkout@v1
@@ -81,6 +86,16 @@ jobs:
cargo install --git https://github.com/rust-embedded/cross
echo "CARGO=cross" >> $GITHUB_ENV
echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV
- name: Download Wasmtime
if: matrix.wasm
run: |
rustup target add wasm32-wasi
echo "CARGO_BUILD_TARGET=wasm32-wasi" >> $GITHUB_ENV
echo "RUSTFLAGS=-Ctarget-feature=+simd128" >> $GITHUB_ENV
curl -LO https://github.com/bytecodealliance/wasmtime/releases/download/v0.32.0/wasmtime-v0.32.0-x86_64-linux.tar.xz
tar xvf wasmtime-v0.32.0-x86_64-linux.tar.xz
echo `pwd`/wasmtime-v0.32.0-x86_64-linux >> $GITHUB_PATH
echo "CARGO_TARGET_WASM32_WASI_RUNNER=wasmtime run --wasm-features simd --" >> $GITHUB_ENV
- name: Show command used for Cargo
run: |
echo "cargo command is: ${{ env.CARGO }}"
+25 -10
View File
@@ -640,44 +640,47 @@ pub(crate) mod sliceslice {
}
pub(crate) fn prebuilt(_: &str) -> impl Fn(&str) -> bool + 'static {
unimplemented!("sliceslice only runs on x86")
if true {
unimplemented!("sliceslice only runs on x86")
}
|_| false
}
pub(crate) fn oneshotiter<'a>(
haystack: &'a str,
needle: &'a str,
_haystack: &'a str,
_needle: &'a str,
) -> impl Iterator<Item = usize> + 'static {
std::iter::from_fn(move || {
unimplemented!("sliceslice only runs on x86")
})
}
pub(crate) fn prebuiltiter(needle: &str) -> super::super::NoIter {
pub(crate) fn prebuiltiter(_needle: &str) -> super::super::NoIter {
unimplemented!("sliceslice only runs on x86")
}
}
pub(crate) mod rev {
pub(crate) fn oneshot(haystack: &str, needle: &str) -> bool {
pub(crate) fn oneshot(_haystack: &str, _needle: &str) -> bool {
unimplemented!("sliceslice does not support reverse searches")
}
pub(crate) fn prebuilt(
needle: &str,
_needle: &str,
) -> impl Fn(&str) -> bool + 'static {
|_| unimplemented!("sliceslice does not support reverse searches")
}
pub(crate) fn oneshotiter(
haystack: &str,
needle: &str,
_haystack: &str,
_needle: &str,
) -> impl Iterator<Item = usize> + 'static {
std::iter::from_fn(move || {
unimplemented!("sliceslice does not support reverse searches")
})
}
pub(crate) fn prebuiltiter(needle: &str) -> super::super::NoIter {
pub(crate) fn prebuiltiter(_needle: &str) -> super::super::NoIter {
unimplemented!("sliceslice does not support reverse searches")
}
}
@@ -693,9 +696,21 @@ pub(crate) mod libc {
}
pub(crate) mod fwd {
// On wasm32 the C `memmem` symbol is declared by hand instead of being
// imported from the `libc` crate (all other targets use `libc::memmem`).
// NOTE(review): presumably the `libc` crate does not expose `memmem` for
// this target while the linked C library still provides it -- confirm.
#[cfg(target_arch = "wasm32")]
extern "C" {
fn memmem(
haystack: *const libc::c_void,
haystack_len: usize,
needle: *const libc::c_void,
needle_len: usize,
) -> *const libc::c_void;
}
#[cfg(not(target_arch = "wasm32"))]
use libc::memmem;
fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
let p = unsafe {
libc::memmem(
memmem(
haystack.as_ptr() as *const libc::c_void,
haystack.len(),
needle.as_ptr() as *const libc::c_void,
+21 -7
View File
@@ -15,15 +15,29 @@ fn main() {
// is not a problem. In that case, the fastest option will be chosen at
// runtime.
fn enable_simd_optimizations() {
if is_env_set("CARGO_CFG_MEMCHR_DISABLE_AUTO_SIMD")
|| !target_has_feature("sse2")
{
if is_env_set("CARGO_CFG_MEMCHR_DISABLE_AUTO_SIMD") {
return;
}
println!("cargo:rustc-cfg=memchr_runtime_simd");
println!("cargo:rustc-cfg=memchr_runtime_sse2");
println!("cargo:rustc-cfg=memchr_runtime_sse42");
println!("cargo:rustc-cfg=memchr_runtime_avx");
let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap();
match &arch[..] {
"x86_64" => {
if !target_has_feature("sse2") {
return;
}
println!("cargo:rustc-cfg=memchr_runtime_simd");
println!("cargo:rustc-cfg=memchr_runtime_sse2");
println!("cargo:rustc-cfg=memchr_runtime_sse42");
println!("cargo:rustc-cfg=memchr_runtime_avx");
}
"wasm32" | "wasm64" => {
if !target_has_feature("simd128") {
return;
}
println!("cargo:rustc-cfg=memchr_runtime_simd");
println!("cargo:rustc-cfg=memchr_runtime_wasm128");
}
_ => {}
}
}
// This adds a `memchr_libc` cfg if and only if libc can be used, if no other
+37 -48
View File
@@ -146,16 +146,17 @@ macro_rules! define_memmem_simple_tests {
}
mod byte_frequencies;
#[cfg(all(target_arch = "x86_64", memchr_runtime_simd))]
#[cfg(memchr_runtime_simd)]
mod genericsimd;
mod prefilter;
mod rabinkarp;
mod rarebytes;
mod twoway;
mod util;
// SIMD is only supported on x86_64 currently.
#[cfg(target_arch = "x86_64")]
#[cfg(memchr_runtime_simd)]
mod vector;
#[cfg(all(memchr_runtime_wasm128))]
mod wasm;
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
mod x86;
@@ -773,47 +774,47 @@ enum SearcherKind {
TwoWay(twoway::Forward),
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
GenericSIMD128(x86::sse::Forward),
#[cfg(memchr_runtime_wasm128)]
GenericSIMD128(wasm::Forward),
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
GenericSIMD256(x86::avx::Forward),
}
impl<'n> Searcher<'n> {
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> {
use self::SearcherKind::*;
let ninfo = NeedleInfo::new(needle);
let prefn =
prefilter::forward(&config.prefilter, &ninfo.rarebytes, needle);
let kind = if needle.len() == 0 {
Empty
} else if needle.len() == 1 {
OneByte(needle[0])
} else if let Some(fwd) = x86::avx::Forward::new(&ninfo, needle) {
GenericSIMD256(fwd)
} else if let Some(fwd) = x86::sse::Forward::new(&ninfo, needle) {
GenericSIMD128(fwd)
} else {
TwoWay(twoway::Forward::new(needle))
let mk = |kind: SearcherKind| {
let prefn = prefilter::forward(
&config.prefilter,
&ninfo.rarebytes,
needle,
);
Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind }
};
Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind }
}
if needle.len() == 0 {
return mk(Empty);
}
if needle.len() == 1 {
return mk(OneByte(needle[0]));
}
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
{
if let Some(fwd) = x86::avx::Forward::new(&ninfo, needle) {
return mk(GenericSIMD256(fwd));
} else if let Some(fwd) = x86::sse::Forward::new(&ninfo, needle) {
return mk(GenericSIMD128(fwd));
}
}
#[cfg(all(target_arch = "wasm32", memchr_runtime_simd))]
{
if let Some(fwd) = wasm::Forward::new(&ninfo, needle) {
return mk(GenericSIMD128(fwd));
}
}
#[cfg(not(all(not(miri), target_arch = "x86_64", memchr_runtime_simd)))]
fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> {
use self::SearcherKind::*;
let ninfo = NeedleInfo::new(needle);
let prefn =
prefilter::forward(&config.prefilter, &ninfo.rarebytes, needle);
let kind = if needle.len() == 0 {
Empty
} else if needle.len() == 1 {
OneByte(needle[0])
} else {
TwoWay(twoway::Forward::new(needle))
};
Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind }
mk(TwoWay(twoway::Forward::new(needle)))
}
/// Return a fresh prefilter state that can be used with this searcher.
@@ -844,11 +845,7 @@ impl<'n> Searcher<'n> {
Empty => Empty,
OneByte(b) => OneByte(b),
TwoWay(tw) => TwoWay(tw),
#[cfg(all(
not(miri),
target_arch = "x86_64",
memchr_runtime_simd
))]
#[cfg(all(not(miri), memchr_runtime_simd))]
GenericSIMD128(gs) => GenericSIMD128(gs),
#[cfg(all(
not(miri),
@@ -873,11 +870,7 @@ impl<'n> Searcher<'n> {
Empty => Empty,
OneByte(b) => OneByte(b),
TwoWay(tw) => TwoWay(tw),
#[cfg(all(
not(miri),
target_arch = "x86_64",
memchr_runtime_simd
))]
#[cfg(all(not(miri), memchr_runtime_simd))]
GenericSIMD128(gs) => GenericSIMD128(gs),
#[cfg(all(
not(miri),
@@ -921,11 +914,7 @@ impl<'n> Searcher<'n> {
self.find_tw(tw, state, haystack, needle)
}
}
#[cfg(all(
not(miri),
target_arch = "x86_64",
memchr_runtime_simd
))]
#[cfg(all(not(miri), memchr_runtime_simd))]
GenericSIMD128(ref gs) => {
// The SIMD matcher can't handle particularly short haystacks,
// so we fall back to RK in these cases.
+41 -34
View File
@@ -1,8 +1,10 @@
use crate::memmem::{rarebytes::RareNeedleBytes, NeedleInfo};
mod fallback;
#[cfg(all(target_arch = "x86_64", memchr_runtime_simd))]
#[cfg(memchr_runtime_simd)]
mod genericsimd;
#[cfg(all(not(miri), target_arch = "wasm32", memchr_runtime_simd))]
mod wasm;
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
mod x86;
@@ -90,6 +92,21 @@ pub(crate) type PrefilterFnTy = unsafe fn(
needle: &[u8],
) -> Option<usize>;
/// Fallback shared by the vectorized prefilters (SSE2 on x86_64, `v128` on
/// wasm32) for haystacks that are too small for the vector routine: just run
/// memchr on the rarest needle byte and be done with it.
///
/// It is likely that this code path is rarely exercised, since a higher level
/// routine will probably dispatch to Rabin-Karp for such a small haystack.
#[cfg(memchr_runtime_simd)]
fn simple_memchr_fallback(
    _prestate: &mut PrefilterState,
    ninfo: &NeedleInfo,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    let (rare, _) = ninfo.rarebytes.as_rare_ordered_usize();
    let hit = crate::memchr(needle[rare], haystack)?;
    // Shift the match back by the rare byte's offset within the needle,
    // clamping at the start of the haystack.
    Some(hit.saturating_sub(rare))
}
impl PrefilterFn {
/// Create a new prefilter function from the function pointer given.
///
@@ -269,7 +286,6 @@ impl PrefilterState {
/// On x86_64 with runtime SIMD detection enabled (which is the default), we
/// try to use an AVX prefilter, followed by SSE. On wasm32 with SIMD enabled,
/// the `v128` prefilter is used. Otherwise, the generic memchr-based fallback
/// is used, provided the rarest needle byte has a low enough rank.
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
#[inline(always)]
pub(crate) fn forward(
config: &Prefilter,
@@ -280,20 +296,33 @@ pub(crate) fn forward(
return None;
}
#[cfg(feature = "std")]
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
{
if cfg!(memchr_runtime_avx) {
if is_x86_feature_detected!("avx2") {
// SAFETY: x86::avx::find only requires the avx2 feature,
// which we've just checked above.
return unsafe { Some(PrefilterFn::new(x86::avx::find)) };
#[cfg(feature = "std")]
{
if cfg!(memchr_runtime_avx) {
if is_x86_feature_detected!("avx2") {
// SAFETY: x86::avx::find only requires the avx2 feature,
// which we've just checked above.
return unsafe { Some(PrefilterFn::new(x86::avx::find)) };
}
}
}
if cfg!(memchr_runtime_sse2) {
// SAFETY: x86::sse::find only requires the sse2 feature, which is
// guaranteed to be available on x86_64.
return unsafe { Some(PrefilterFn::new(x86::sse::find)) };
}
}
if cfg!(memchr_runtime_sse2) {
// SAFETY: x86::sse::find only requires the sse2 feature, which is
// guaranteed to be available on x86_64.
return unsafe { Some(PrefilterFn::new(x86::sse::find)) };
#[cfg(all(not(miri), target_arch = "wasm32", memchr_runtime_simd))]
{
// SAFETY: `wasm::find` is actually a safe function
//
// Also note that the `if true` is here to prevent, on wasm with simd,
// rustc warning about the code below being dead code.
if true {
return unsafe { Some(PrefilterFn::new(wasm::find)) };
}
}
// Check that our rarest byte has a reasonably low rank. The main issue
// here is that the fallback prefilter can perform pretty poorly if it's
@@ -306,28 +335,6 @@ pub(crate) fn forward(
None
}
/// Determine which prefilter function, if any, to use.
///
/// Since SIMD is currently only supported on x86_64, this will just select
/// the fallback prefilter if the rare bytes provided have a low enough rank.
#[cfg(not(all(not(miri), target_arch = "x86_64", memchr_runtime_simd)))]
#[inline(always)]
pub(crate) fn forward(
config: &Prefilter,
rare: &RareNeedleBytes,
needle: &[u8],
) -> Option<PrefilterFn> {
if config.is_none() || needle.len() <= 1 {
return None;
}
let (rare1_rank, _) = rare.as_ranks(needle);
if rare1_rank <= MAX_FALLBACK_RANK {
// SAFETY: fallback::find is safe to call in all environments.
return unsafe { Some(PrefilterFn::new(fallback::find)) };
}
None
}
/// Return the minimum length of the haystack in which a prefilter should be
/// used. If the haystack is below this length, then it's probably not worth
/// the overhead of running the prefilter.
+39
View File
@@ -0,0 +1,39 @@
use core::arch::wasm32::v128;
use crate::memmem::{
prefilter::{PrefilterFnTy, PrefilterState},
NeedleInfo,
};
// Check that the functions below satisfy the Prefilter function type.
const _: PrefilterFnTy = find;
/// A `v128`-accelerated candidate finder for single-substring search.
///
/// This instantiates the generic SIMD prefilter with wasm's 128-bit vector
/// type and hands it `simple_memchr_fallback` as its fallback routine.
#[target_feature(enable = "simd128")]
pub(crate) fn find(
    prestate: &mut PrefilterState,
    ninfo: &NeedleInfo,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    use super::{genericsimd, simple_memchr_fallback};

    // NOTE(review): the generic routine is called inside `unsafe`; presumably
    // its contract is that the vector type's target feature is available,
    // which the `simd128` attribute on this function provides -- confirm.
    unsafe {
        genericsimd::find::<v128>(
            prestate,
            ninfo,
            haystack,
            needle,
            simple_memchr_fallback,
        )
    }
}
#[cfg(all(test, feature = "std"))]
mod tests {
// Runs the shared prefilter test-suite against the wasm `v128` prefilter.
#[test]
#[cfg(not(miri))]
fn prefilter_permutations() {
use crate::memmem::prefilter::tests::PrefilterTest;
// SAFETY: super::find is a safe function on wasm32; it only needs the
// simd128 target feature, which this module is compiled with.
unsafe { PrefilterTest::run_all_tests(super::find) };
}
}
+1 -14
View File
@@ -21,25 +21,12 @@ pub(crate) unsafe fn find(
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
// If the haystack is too small for SSE2, then just run memchr on the
// rarest byte and be done with it. (It is likely that this code path is
// rarely exercised, since a higher level routine will probably dispatch to
// Rabin-Karp for such a small haystack.)
fn simple_memchr_fallback(
_prestate: &mut PrefilterState,
ninfo: &NeedleInfo,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
let (rare, _) = ninfo.rarebytes.as_rare_ordered_usize();
crate::memchr(needle[rare], haystack).map(|i| i.saturating_sub(rare))
}
super::super::genericsimd::find::<__m128i>(
prestate,
ninfo,
haystack,
needle,
simple_memchr_fallback,
super::super::simple_memchr_fallback,
)
}
+33
View File
@@ -96,3 +96,36 @@ mod x86avx {
}
}
}
/// `Vector` implementation for WebAssembly's 128-bit SIMD type, treating the
/// register as sixteen `u8` lanes.
#[cfg(target_arch = "wasm32")]
mod wasm_simd128 {
    use super::Vector;
    use core::arch::wasm32::{
        u8x16_bitmask, u8x16_eq, u8x16_splat, v128, v128_and, v128_load,
    };

    impl Vector for v128 {
        /// Broadcast `byte` into every lane.
        #[inline(always)]
        unsafe fn splat(byte: u8) -> v128 {
            u8x16_splat(byte)
        }

        /// Load 16 bytes starting at `data`.
        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> v128 {
            v128_load(data as *const v128)
        }

        /// Collapse the vector into a lane bitmask, widened to `u32`.
        #[inline(always)]
        unsafe fn movemask(self) -> u32 {
            u32::from(u8x16_bitmask(self))
        }

        /// Lane-wise byte equality against `vector2`.
        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> v128 {
            u8x16_eq(self, vector2)
        }

        /// Bitwise AND with `vector2`.
        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> v128 {
            v128_and(self, vector2)
        }
    }
}
+75
View File
@@ -0,0 +1,75 @@
use core::arch::wasm32::v128;
use crate::memmem::{genericsimd, NeedleInfo};
/// A vectorized forward substring searcher, accelerated with wasm's `v128`
/// type, that only works on small needles.
///
/// This is a thin newtype over the generic SIMD searcher.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Forward(genericsimd::Forward);

impl Forward {
    /// Create a new "generic simd" forward searcher, or return `None` if one
    /// could not be created from the given inputs.
    pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward> {
        if cfg!(memchr_runtime_simd) {
            genericsimd::Forward::new(ninfo, needle).map(Forward)
        } else {
            None
        }
    }

    /// Returns the minimum length of haystack that is needed for this searcher
    /// to work. Passing a haystack with a length smaller than this will cause
    /// `find` to panic.
    #[inline(always)]
    pub(crate) fn min_haystack_len(&self) -> usize {
        let Forward(inner) = self;
        inner.min_haystack_len::<v128>()
    }

    /// Search `haystack` for `needle`, delegating to the routine compiled
    /// with the `simd128` target feature.
    #[inline(always)]
    pub(crate) fn find(
        &self,
        haystack: &[u8],
        needle: &[u8],
    ) -> Option<usize> {
        self.find_impl(haystack, needle)
    }

    /// The implementation of find marked with the appropriate target feature.
    #[target_feature(enable = "simd128")]
    fn find_impl(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
        let Forward(inner) = self;
        // NOTE(review): the generic routine is invoked inside `unsafe`;
        // presumably it requires the `simd128` feature enabled above and a
        // haystack of at least `min_haystack_len()` bytes -- confirm.
        unsafe { genericsimd::fwd_find::<v128>(inner, haystack, needle) }
    }
}
#[cfg(all(test, feature = "std", not(miri)))]
mod tests {
    use crate::memmem::{prefilter::PrefilterState, NeedleInfo};

    /// Adapter giving `Forward::find` the prefilter function signature so it
    /// can be driven by the shared prefilter test harness.
    fn find(
        _: &mut PrefilterState,
        ninfo: &NeedleInfo,
        haystack: &[u8],
        needle: &[u8],
    ) -> Option<usize> {
        let searcher = super::Forward::new(ninfo, needle).unwrap();
        searcher.find(haystack, needle)
    }

    #[test]
    fn prefilter_permutations() {
        use crate::memmem::prefilter::tests::PrefilterTest;

        unsafe {
            PrefilterTest::run_all_tests_filter(find, |t| {
                // This substring searcher only works on certain configs, so
                // keep only the tests for which Forward::new succeeds and
                // whose haystack is long enough for the searcher.
                super::Forward::new(&t.ninfo, &t.needle)
                    .map_or(false, |fwd| {
                        t.haystack.len() >= fwd.min_haystack_len()
                    })
            })
        }
    }
}