Initial commit

Amanieu d'Antras 2018-10-29 13:49:20 +00:00
commit 2e2e827538
16 changed files with 5188 additions and 0 deletions

.gitignore vendored Normal file (3 lines)

@@ -0,0 +1,3 @@
/target
**/*.rs.bk
Cargo.lock

.travis.yml Normal file (24 lines)

@@ -0,0 +1,24 @@
language: rust
sudo: false

matrix:
  include:
    - rust: 1.29.0
    - rust: stable
    - rust: beta
    - rust: nightly
    - rust: nightly
      env:
        - FEATURES='nightly'

branches:
  only:
    - staging
    - trying
    - master

script:
  - cargo build --verbose --features "$FEATURES"
  - cargo test --verbose --features "$FEATURES"
  - if [ "$TRAVIS_RUST_VERSION" == "nightly" ]; then cargo bench --verbose --features "$FEATURES"; fi
  - cargo doc --verbose --features "$FEATURES"

CHANGELOG.md Normal file (14 lines)

@@ -0,0 +1,14 @@
# Change Log
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).
## [Unreleased]
## v0.1.0 - 2018-10-29
- Initial release
[Unreleased]: https://github.com/Amanieu/hashbrown/compare/v0.1.0...HEAD

Cargo.toml Normal file (20 lines)

@@ -0,0 +1,20 @@
[package]
name = "hashbrown"
version = "0.1.0"
authors = ["Amanieu d'Antras <amanieu@gmail.com>"]
description = "A faster replacement for HashMap"
license = "Apache-2.0/MIT"
repository = "https://github.com/Amanieu/hashbrown"
readme = "README.md"
keywords = ["hash", "no_std", "hashmap"]
categories = ["data-structures", "no-std"]
[dependencies]
scopeguard = { version = "0.3", default-features = false }
byteorder = { version = "1.0", default-features = false }
[dev-dependencies]
rustc-hash = "1.0"
[features]
nightly = []

LICENSE-APACHE Normal file (201 lines)

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

LICENSE-MIT Normal file (25 lines)

@@ -0,0 +1,25 @@
Copyright (c) 2016 Amanieu d'Antras
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

README.md Normal file (81 lines)

@@ -0,0 +1,81 @@
hashbrown
=========
[![Build Status](https://travis-ci.com/Amanieu/hashbrown.svg?branch=master)](https://travis-ci.com/Amanieu/hashbrown) [![Crates.io](https://img.shields.io/crates/v/hashbrown.svg)](https://crates.io/crates/hashbrown)
> A high-performance hash table for Rust.
## [Documentation](https://docs.rs/hashbrown)
## [Change log](CHANGELOG.md)
## Features
- Drop-in replacement for the standard library `HashMap` and `HashSet` types.
- Around 2x faster than `FxHashMap` and 8x faster than the standard `HashMap`.
- Compatible with `#[no_std]` (currently requires nightly for the `alloc` crate).
- Empty hash maps do not allocate any memory.
- Uses SIMD to speed up lookups. The algorithm is based on Google's ["Swiss Table"](https://abseil.io/blog/20180927-swisstables) hash map.
- Explained in detail in [this video](https://www.youtube.com/watch?v=ncHmEUmJZf4).
## Performance
Compared to `std::collections::HashMap`:
```
name                stdhash ns/iter   hashbrown ns/iter   diff ns/iter   diff %     speedup
find_existing       23,831            2,935               -20,896        -87.68%    x 8.12
find_nonexisting    25,326            2,283               -23,043        -90.99%    x 11.09
get_remove_insert   124               25                  -99            -79.84%    x 4.96
grow_by_insertion   197               177                 -20            -10.15%    x 1.11
hashmap_as_queue    72                18                  -54            -75.00%    x 4.00
new_drop            14                0                   -14            -100.00%   x inf
new_insert_drop     78                55                  -23            -29.49%    x 1.42
```
Compared to `rustc_hash::FxHashMap`:
```
name                fxhash ns/iter    hashbrown ns/iter   diff ns/iter   diff %     speedup
find_existing       5,951             2,935               -3,016         -50.68%    x 2.03
find_nonexisting    4,637             2,283               -2,354         -50.77%    x 2.03
get_remove_insert   29                25                  -4             -13.79%    x 1.16
grow_by_insertion   160               177                 17             10.62%     x 0.90
hashmap_as_queue    22                18                  -4             -18.18%    x 1.22
new_drop            9                 0                   -9             -100.00%   x inf
new_insert_drop     64                55                  -9             -14.06%    x 1.16
```
## Usage
Add this to your `Cargo.toml`:
```toml
[dependencies]
hashbrown = "0.1"
```
and this to your crate root:
```rust
extern crate hashbrown;
```
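As a quick illustration (a minimal usage sketch, not taken from the crate's own docs), the map mirrors the `std::collections::HashMap` API:
```rust
extern crate hashbrown;

use hashbrown::HashMap;

fn main() {
    // Same API shape as std::collections::HashMap.
    let mut map = HashMap::new();
    map.insert("answer", 42);
    assert_eq!(map.get("answer"), Some(&42));
    map.remove("answer");
    assert!(map.is_empty());
}
```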
This crate has the following Cargo features:
- `nightly`: Enables nightly-only features: `no_std` support and ~10% speedup from branch hint intrinsics.
## License
Licensed under either of
* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
at your option.
### Contribution
Unless you explicitly state otherwise, any contribution intentionally submitted
for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any
additional terms or conditions.

benches/bench.rs Normal file (125 lines)

@@ -0,0 +1,125 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#![feature(test)]
extern crate hashbrown;
extern crate test;
extern crate rustc_hash;
use test::Bencher;
use std::hash::Hash;
use hashbrown::HashMap;
//use rustc_hash::FxHashMap as HashMap;
//use std::collections::HashMap;
fn new_map<K: Eq + Hash, V>() -> HashMap<K, V> {
HashMap::default()
}
#[bench]
fn new_drop(b: &mut Bencher) {
b.iter(|| {
let m: HashMap<i32, i32> = new_map();
assert_eq!(m.len(), 0);
})
}
#[bench]
fn new_insert_drop(b: &mut Bencher) {
b.iter(|| {
let mut m = new_map();
m.insert(0, 0);
assert_eq!(m.len(), 1);
})
}
#[bench]
fn grow_by_insertion(b: &mut Bencher) {
let mut m = new_map();
for i in 1..1001 {
m.insert(i, i);
}
let mut k = 1001;
b.iter(|| {
m.insert(k, k);
k += 1;
});
}
#[bench]
fn find_existing(b: &mut Bencher) {
let mut m = new_map();
for i in 1..1001 {
m.insert(i, i);
}
b.iter(|| {
for i in 1..1001 {
m.contains_key(&i);
}
});
}
#[bench]
fn find_nonexisting(b: &mut Bencher) {
let mut m = new_map();
for i in 1..1001 {
m.insert(i, i);
}
b.iter(|| {
for i in 1001..2001 {
m.contains_key(&i);
}
});
}
#[bench]
fn hashmap_as_queue(b: &mut Bencher) {
let mut m = new_map();
for i in 1..1001 {
m.insert(i, i);
}
let mut k = 1;
b.iter(|| {
m.remove(&k);
m.insert(k + 1000, k + 1000);
k += 1;
});
}
#[bench]
fn get_remove_insert(b: &mut Bencher) {
let mut m = new_map();
for i in 1..1001 {
m.insert(i, i);
}
let mut k = 1;
b.iter(|| {
m.get(&(k + 400));
m.get(&(k + 2000));
m.remove(&k);
m.insert(k + 1000, k + 1000);
k += 1;
})
}

src/fx.rs Normal file (112 lines)

@@ -0,0 +1,112 @@
//! Fast, non-cryptographic hash used by rustc and Firefox.
use core::default::Default;
use core::hash::{BuildHasherDefault, Hasher};
use core::mem::size_of;
use core::ops::BitXor;
use byteorder::{ByteOrder, NativeEndian};
/// Type alias for a `BuildHasherDefault` using the `fx` hash algorithm.
pub type FxHashBuilder = BuildHasherDefault<FxHasher>;
/// A speedy hash algorithm for use within rustc. The hashmap in liballoc
/// by default uses SipHash which isn't quite as speedy as we want. In the
/// compiler we're not really worried about DOS attempts, so we use a fast
/// non-cryptographic hash.
///
/// This is the same as the algorithm used by Firefox -- which is a homespun
/// one not based on any widely-known algorithm -- though modified to produce
/// 64-bit hash values instead of 32-bit hash values. It consistently
/// out-performs an FNV-based hash within rustc itself -- the collision rate is
/// similar or slightly worse than FNV, but the speed of the hash function
/// itself is much higher because it works on up to 8 bytes at a time.
pub struct FxHasher {
hash: usize,
}
#[cfg(target_pointer_width = "32")]
const K: usize = 0x9e3779b9;
#[cfg(target_pointer_width = "64")]
const K: usize = 0x517cc1b727220a95;
impl Default for FxHasher {
#[inline]
fn default() -> FxHasher {
FxHasher { hash: 0 }
}
}
impl FxHasher {
#[inline]
fn add_to_hash(&mut self, i: usize) {
self.hash = self.hash.rotate_left(5).bitxor(i).wrapping_mul(K);
}
}
impl Hasher for FxHasher {
#[inline]
fn write(&mut self, mut bytes: &[u8]) {
#[cfg(target_pointer_width = "32")]
let read_usize = |bytes| NativeEndian::read_u32(bytes);
#[cfg(target_pointer_width = "64")]
let read_usize = |bytes| NativeEndian::read_u64(bytes);
let mut hash = FxHasher { hash: self.hash };
assert!(size_of::<usize>() <= 8);
while bytes.len() >= size_of::<usize>() {
hash.add_to_hash(read_usize(bytes) as usize);
bytes = &bytes[size_of::<usize>()..];
}
if (size_of::<usize>() > 4) && (bytes.len() >= 4) {
hash.add_to_hash(NativeEndian::read_u32(bytes) as usize);
bytes = &bytes[4..];
}
if (size_of::<usize>() > 2) && bytes.len() >= 2 {
hash.add_to_hash(NativeEndian::read_u16(bytes) as usize);
bytes = &bytes[2..];
}
if (size_of::<usize>() > 1) && bytes.len() >= 1 {
hash.add_to_hash(bytes[0] as usize);
}
self.hash = hash.hash;
}
#[inline]
fn write_u8(&mut self, i: u8) {
self.add_to_hash(i as usize);
}
#[inline]
fn write_u16(&mut self, i: u16) {
self.add_to_hash(i as usize);
}
#[inline]
fn write_u32(&mut self, i: u32) {
self.add_to_hash(i as usize);
}
#[cfg(target_pointer_width = "32")]
#[inline]
fn write_u64(&mut self, i: u64) {
self.add_to_hash(i as usize);
self.add_to_hash((i >> 32) as usize);
}
#[cfg(target_pointer_width = "64")]
#[inline]
fn write_u64(&mut self, i: u64) {
self.add_to_hash(i as usize);
}
#[inline]
fn write_usize(&mut self, i: usize) {
self.add_to_hash(i);
}
#[inline]
fn finish(&self) -> u64 {
self.hash as u64
}
}
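
To make the mixing step above concrete, here is a standalone sketch (not part of the commit) of the rotate/XOR/multiply update, using the 64-bit `K` constant defined in this file:

```rust
// Illustrative sketch of the Fx mixing step (64-bit K taken from the file above).
const K: u64 = 0x517cc1b727220a95;

fn add_to_hash(hash: u64, word: u64) -> u64 {
    // Rotate, XOR in the new word, then multiply by the large odd constant.
    (hash.rotate_left(5) ^ word).wrapping_mul(K)
}

fn main() {
    // Mix two 64-bit words, as FxHasher::write_u64 would on a 64-bit target.
    let mut h = 0u64;
    for &w in &[0xdead_beef_u64, 0x1234_5678] {
        h = add_to_hash(h, w);
    }
    println!("fx-style hash: {:#018x}", h);
}
```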

src/lib.rs Normal file (34 lines)

@@ -0,0 +1,34 @@
//! A high-performance replacement for the standard library `HashMap`.
//!
//! The API of this crate mirrors that of the hash table implementation in
//! `std::collections`.
#![no_std]
#![cfg_attr(
feature = "nightly",
feature(alloc, allocator_api, ptr_offset_from, test, core_intrinsics)
)]
#![warn(missing_docs)]
#[cfg(feature = "nightly")]
extern crate alloc;
extern crate byteorder;
extern crate scopeguard;
#[cfg(not(feature = "nightly"))]
extern crate std as alloc;
mod fx;
mod map;
mod raw;
mod set;
pub mod hash_map {
//! A hash map implemented with quadratic probing and SIMD lookup.
pub use map::*;
}
pub mod hash_set {
//! A hash set implemented as a `HashMap` where the value is `()`.
pub use set::*;
}
pub use map::HashMap;
pub use set::HashSet;

src/map.rs Normal file (1900 lines)

File diff suppressed because it is too large.

src/raw/bitmask.rs Normal file (85 lines)

@@ -0,0 +1,85 @@
use raw::imp::{BitMaskWord, BITMASK_MASK, BITMASK_SHIFT};
/// A bit mask which contains the result of a `Match` operation on a `Group`,
/// and allows iterating through the matching positions.
///
/// The bit mask is arranged so that low-order bits represent lower memory
/// addresses for group match results.
#[derive(Copy, Clone)]
pub struct BitMask(pub BitMaskWord);
impl BitMask {
/// Returns a new `BitMask` with all bits inverted.
#[inline]
#[must_use]
pub fn invert(self) -> BitMask {
BitMask(self.0 ^ BITMASK_MASK)
}
/// Returns a new `BitMask` with the lowest bit removed.
#[inline]
#[must_use]
pub fn remove_lowest_bit(self) -> BitMask {
BitMask(self.0 & (self.0 - 1))
}
/// Returns whether the `BitMask` has at least one set bit.
#[inline]
pub fn any_bit_set(self) -> bool {
self.0 != 0
}
/// Returns the first set bit in the `BitMask`, if there is one.
#[inline]
pub fn lowest_set_bit(self) -> Option<usize> {
if self.0 == 0 {
None
} else {
Some(self.trailing_zeros())
}
}
/// Returns the number of trailing zeroes in the `BitMask`.
#[inline]
pub fn trailing_zeros(self) -> usize {
// ARM doesn't have a CTZ instruction, and instead uses RBIT + CLZ.
// However older ARM versions (pre-ARMv7) don't have RBIT and need to
// emulate it instead. Since we only have 1 bit set in each byte we can
// use REV + CLZ instead.
if cfg!(target_arch = "arm") && BITMASK_SHIFT >= 3 {
self.0.swap_bytes().leading_zeros() as usize >> BITMASK_SHIFT
} else {
self.0.trailing_zeros() as usize >> BITMASK_SHIFT
}
}
/// Returns the number of leading zeroes in the `BitMask`.
#[inline]
pub fn leading_zeros(self) -> usize {
self.0.leading_zeros() as usize >> BITMASK_SHIFT
}
}
impl IntoIterator for BitMask {
type Item = usize;
type IntoIter = BitMaskIter;
#[inline]
fn into_iter(self) -> BitMaskIter {
BitMaskIter(self)
}
}
/// Iterator over the contents of a `BitMask`, returning the indices of set
/// bits.
pub struct BitMaskIter(BitMask);
impl Iterator for BitMaskIter {
type Item = usize;
#[inline]
fn next(&mut self) -> Option<usize> {
let bit = self.0.lowest_set_bit()?;
self.0 = self.0.remove_lowest_bit();
Some(bit)
}
}
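
For illustration (a sketch, not part of the commit), this is how the generic `u64` representation described above iterates over matches: one 0x80 marker bit per byte, `trailing_zeros` shifted down by `BITMASK_SHIFT` to get the byte index, and `mask & (mask - 1)` to drop the lowest bit:

```rust
// Sketch of BitMaskIter for the generic one-bit-per-byte (u64) representation.
fn main() {
    const BITMASK_SHIFT: u32 = 3; // divide a bit position by 8 to get a byte index
    // Suppose bytes 1 and 5 of a group matched: their 0x80 marker bits are set.
    let mut mask: u64 = (0x80u64 << 8) | (0x80u64 << 40);
    let mut indices = Vec::new();
    while mask != 0 {
        indices.push((mask.trailing_zeros() >> BITMASK_SHIFT) as usize);
        mask &= mask - 1; // same effect as BitMask::remove_lowest_bit
    }
    assert_eq!(indices, vec![1, 5]);
    println!("matching byte indices: {:?}", indices);
}
```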

src/raw/generic.rs Normal file (127 lines)

@@ -0,0 +1,127 @@
use core::{mem, ptr};
use raw::bitmask::BitMask;
use raw::EMPTY;
// Use the native word size as the group size. Using a 64-bit group size on
// a 32-bit architecture will just end up being more expensive because
// shifts and multiplies will need to be emulated.
#[cfg(any(
target_pointer_width = "64",
target_arch = "aarch64",
target_arch = "x86_64",
))]
type GroupWord = u64;
#[cfg(all(
target_pointer_width = "32",
not(target_arch = "aarch64"),
not(target_arch = "x86_64"),
))]
type GroupWord = u32;
pub type BitMaskWord = GroupWord;
pub const BITMASK_SHIFT: u32 = 3;
pub const BITMASK_MASK: GroupWord = 0x8080808080808080u64 as GroupWord;
/// Helper function to replicate a byte across a `GroupWord`.
#[inline]
fn repeat(byte: u8) -> GroupWord {
let repeat = byte as GroupWord;
let repeat = repeat | repeat.wrapping_shl(8);
let repeat = repeat | repeat.wrapping_shl(16);
// This last line is a no-op with a 32-bit GroupWord
repeat | repeat.wrapping_shl(32)
}
/// Abstraction over a group of control bytes which can be scanned in
/// parallel.
///
/// This implementation uses a word-sized integer.
pub struct Group(GroupWord);
// We perform all operations in the native endianness, and convert to
// little-endian just before creating a BitMask. This can potentially
// enable the compiler to eliminate unnecessary byte swaps if we are
// only checking whether a BitMask is empty.
impl Group {
/// Number of bytes in the group.
pub const WIDTH: usize = mem::size_of::<Self>();
/// Returns a full group of empty bytes, suitable for use as the initial
/// value for an empty hash table.
///
/// This is guaranteed to be aligned to the group size.
#[inline]
pub fn static_empty() -> &'static [u8] {
#[repr(C)]
struct Dummy {
_align: [GroupWord; 0],
bytes: [u8; Group::WIDTH],
};
const DUMMY: Dummy = Dummy {
_align: [],
bytes: [EMPTY; Group::WIDTH],
};
&DUMMY.bytes
}
/// Loads a group of bytes starting at the given address.
#[inline]
pub unsafe fn load(ptr: *const u8) -> Group {
Group(ptr::read_unaligned(ptr as *const _))
}
/// Loads a group of bytes starting at the given address, which must be
/// aligned to `WIDTH`.
#[inline]
pub unsafe fn load_aligned(ptr: *const u8) -> Group {
Group(ptr::read(ptr as *const _))
}
/// Stores the group of bytes to the given address, which must be
/// aligned to `WIDTH`.
#[inline]
pub unsafe fn store_aligned(&self, ptr: *mut u8) {
ptr::write(ptr as *mut _, self.0);
}
/// Returns a `BitMask` indicating all bytes in the group which *may*
/// have the given value.
///
/// This function may return a false positive in certain cases where
/// the byte in the group differs from the searched value only in its
/// lowest bit. This is fine because:
/// - This never happens for `EMPTY` and `DELETED`, only full entries.
/// - The check for key equality will catch these.
/// - This only happens if there is at least 1 true match.
/// - The chance of this happening is very low (< 1% chance per byte).
#[inline]
pub fn match_byte(&self, byte: u8) -> BitMask {
// This algorithm is derived from
// http://graphics.stanford.edu/~seander/bithacks.html#ValueInWord
let cmp = self.0 ^ repeat(byte);
BitMask(((cmp - repeat(0x01)) & !cmp & repeat(0x80)).to_le())
}
/// Returns a `BitMask` indicating all bytes in the group which are
/// `EMPTY`.
#[inline]
pub fn match_empty(&self) -> BitMask {
BitMask((self.0 & (self.0 << 1) & repeat(0x80)).to_le())
}
/// Returns a `BitMask` indicating all bytes in the group which are
/// `EMPTY` or `DELETED`.
#[inline]
pub fn match_empty_or_deleted(&self) -> BitMask {
BitMask((self.0 & repeat(0x80)).to_le())
}
/// Performs the following transformation on all bytes in the group:
/// - `EMPTY => EMPTY`
/// - `DELETED => EMPTY`
/// - `FULL => DELETED`
#[inline]
pub fn convert_special_to_empty_and_full_to_deleted(&self) -> Group {
Group(((self.0 & repeat(0x80)) >> 7) * 0xff)
}
}
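
As a worked example of the `match_byte` bit trick above (a sketch under the same 64-bit layout, not part of the commit):

```rust
// Worked example of the haszero-style trick used by Group::match_byte.
fn repeat(byte: u8) -> u64 {
    u64::from_ne_bytes([byte; 8])
}

fn main() {
    // Control bytes [7, 3, 7, 0xFF, 0, 7, 2, 1] in memory order; we search for 7.
    let group = u64::from_le_bytes([7, 3, 7, 0xFF, 0, 7, 2, 1]);
    let cmp = group ^ repeat(7);
    // Bytes equal to 7 become 0x00 in `cmp`; the subtraction/AND below then
    // sets the 0x80 marker bit of exactly those bytes (modulo the rare false
    // positives discussed in the doc comment above).
    let mask = cmp.wrapping_sub(repeat(0x01)) & !cmp & repeat(0x80);
    let hits: Vec<u32> = (0..8u32).filter(|&i| mask & (0x80u64 << (i * 8)) != 0).collect();
    assert_eq!(hits, vec![0, 2, 5]);
    println!("bytes matching 7: {:?}", hits);
}
```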

src/raw/mod.rs Normal file (946 lines)

@@ -0,0 +1,946 @@
use alloc::alloc::{alloc, dealloc, handle_alloc_error};
use core::alloc::Layout;
use core::hint;
use core::iter::FusedIterator;
use core::marker::PhantomData;
use core::mem;
use core::mem::ManuallyDrop;
use core::ptr::NonNull;
use scopeguard::guard;
// Branch prediction hint. This is currently only available on nightly but it
// consistently improves performance by 10-15%.
#[cfg(feature = "nightly")]
use core::intrinsics::likely;
#[cfg(not(feature = "nightly"))]
#[inline]
fn likely(b: bool) -> bool {
b
}
// Use the SSE2 implementation if possible: it allows us to scan 16 buckets at
// once instead of 8.
#[cfg(all(
target_feature = "sse2",
any(target_arch = "x86", target_arch = "x86_64")
))]
#[path = "sse2.rs"]
mod imp;
#[cfg(not(all(
target_feature = "sse2",
any(target_arch = "x86", target_arch = "x86_64")
)))]
#[path = "generic.rs"]
mod imp;
mod bitmask;
use raw::bitmask::BitMask;
use raw::imp::Group;
/// Control byte value for an empty bucket.
const EMPTY: u8 = 0b11111111;
/// Control byte value for a deleted bucket.
const DELETED: u8 = 0b10000000;
/// Checks whether a control byte represents a full bucket (top bit is clear).
#[inline]
fn is_full(ctrl: u8) -> bool {
ctrl & 0x80 == 0
}
/// Checks whether a control byte represents a special value (top bit is set).
#[inline]
fn is_special(ctrl: u8) -> bool {
ctrl & 0x80 != 0
}
/// Checks whether a special control value is EMPTY (just check 1 bit).
#[inline]
fn special_is_empty(ctrl: u8) -> bool {
debug_assert!(is_special(ctrl));
ctrl & 0x01 != 0
}
/// Primary hash function, used to select the initial bucket to probe from.
#[inline]
fn h1(hash: u64) -> usize {
hash as usize
}
/// Secondary hash function, saved in the low 7 bits of the control byte.
#[inline]
fn h2(hash: u64) -> u8 {
// Grab the top 7 bits of the hash. While the hash is normally a full 64-bit
// value, some hash functions (such as FxHash) produce a usize result
// instead, which means that the top 32 bits are 0 on 32-bit platforms.
let hash_len = usize::min(mem::size_of::<usize>(), mem::size_of::<u64>());
let top7 = hash >> (hash_len * 8 - 7);
(top7 & 0x7f) as u8
}
/// Probe sequence based on triangular numbers, which is guaranteed (since our
/// table size is a power of two) to visit every group of elements exactly once.
struct ProbeSeq {
mask: usize,
offset: usize,
index: usize,
}
impl Iterator for ProbeSeq {
type Item = usize;
#[inline]
fn next(&mut self) -> Option<usize> {
// We should have found an empty bucket by now and ended the probe.
debug_assert!(self.index <= self.mask, "Went past end of probe sequence");
let result = self.offset;
self.index += Group::WIDTH;
self.offset += self.index;
self.offset &= self.mask;
Some(result)
}
}
/// Returns the number of buckets needed to hold the given number of items,
/// taking the maximum load factor into account.
#[inline]
fn capacity_to_buckets(cap: usize) -> usize {
let adjusted_cap = if cap < 8 {
// Need at least 1 free bucket on small tables
cap + 1
} else {
// Otherwise require 1/8 buckets to be empty (87.5% load)
cap.checked_mul(8).expect("Hash table capacity overflow") / 7
};
// Any overflows will have been caught by the checked_mul.
adjusted_cap.next_power_of_two()
}
/// Returns the maximum effective capacity for the given bucket mask, taking
/// the maximum load factor into account.
#[inline]
fn bucket_mask_to_capacity(bucket_mask: usize) -> usize {
if bucket_mask < 8 {
bucket_mask
} else {
((bucket_mask + 1) / 8) * 7
}
}
// Returns a Layout which describes the allocation required for a hash table,
// and the offset of the buckets in the allocation.
#[inline]
#[cfg(feature = "nightly")]
fn calculate_layout<T>(buckets: usize) -> Option<(Layout, usize)> {
debug_assert!(buckets.is_power_of_two());
// Array of buckets
let data = Layout::array::<T>(buckets).ok()?;
// Array of control bytes. This must be aligned to the group size.
//
// We add `Group::WIDTH` control bytes at the end of the array which
// replicate the bytes at the start of the array and thus avoid the need to
// perform bounds-checking while probing.
let ctrl = Layout::array::<u8>(buckets + Group::WIDTH)
.ok()?
.align_to(Group::WIDTH);
ctrl.extend(data).ok()
}
// Returns a Layout which describes the allocation required for a hash table,
// and the offset of the buckets in the allocation.
#[inline]
#[cfg(not(feature = "nightly"))]
fn calculate_layout<T>(buckets: usize) -> Option<(Layout, usize)> {
debug_assert!(buckets.is_power_of_two());
// Manual layout calculation since Layout methods are not yet stable.
let data_align = usize::max(mem::align_of::<T>(), Group::WIDTH);
let data_offset = (buckets + Group::WIDTH).checked_add(data_align - 1)? & !(data_align - 1);
let len = data_offset.checked_add(mem::size_of::<T>().checked_mul(buckets)?)?;
unsafe {
Some((
Layout::from_size_align_unchecked(len, data_align),
data_offset,
))
}
}
/// A reference to a hash table bucket containing a `T`.
pub struct Bucket<T> {
ptr: NonNull<T>,
}
impl<T> Clone for Bucket<T> {
#[inline]
fn clone(&self) -> Self {
Bucket { ptr: self.ptr }
}
}
impl<T> Bucket<T> {
#[inline]
unsafe fn from_ptr(ptr: *const T) -> Self {
Bucket {
ptr: NonNull::new_unchecked(ptr as *mut T),
}
}
#[inline]
pub unsafe fn drop(&self) {
self.ptr.as_ptr().drop_in_place();
}
#[inline]
pub unsafe fn read(&self) -> T {
self.ptr.as_ptr().read()
}
#[inline]
pub unsafe fn write(&self, val: T) {
self.ptr.as_ptr().write(val);
}
#[inline]
pub unsafe fn as_ref<'a>(&self) -> &'a T {
&*self.ptr.as_ptr()
}
#[inline]
pub unsafe fn as_mut<'a>(&self) -> &'a mut T {
&mut *self.ptr.as_ptr()
}
}
/// A raw hash table with an unsafe API.
pub struct RawTable<T> {
ctrl: NonNull<u8>,
bucket_mask: usize,
data: NonNull<T>,
items: usize,
growth_left: usize,
}
impl<T> RawTable<T> {
/// Creates a new empty hash table without allocating any memory.
///
/// In effect this returns a table with exactly 1 bucket. However we can
/// leave the data pointer dangling since that bucket is never written to
/// due to our load factor forcing us to always have at least 1 free bucket.
#[inline]
pub fn new() -> RawTable<T> {
RawTable {
data: NonNull::dangling(),
ctrl: NonNull::from(&Group::static_empty()[0]),
bucket_mask: 0,
items: 0,
growth_left: 0,
}
}
/// Allocates a new hash table with the given number of buckets.
///
/// The control bytes are left uninitialized.
#[inline]
unsafe fn new_uninitialized(buckets: usize) -> RawTable<T> {
let (layout, data_offset) =
calculate_layout::<T>(buckets).expect("Hash table capacity overflow");
let ctrl = NonNull::new(alloc(layout)).unwrap_or_else(|| handle_alloc_error(layout));
let data = NonNull::new_unchecked(ctrl.as_ptr().add(data_offset) as *mut T);
RawTable {
data,
ctrl,
bucket_mask: buckets - 1,
items: 0,
growth_left: bucket_mask_to_capacity(buckets - 1),
}
}
/// Allocates a new hash table with at least enough capacity for inserting
/// the given number of elements without reallocating.
pub fn with_capacity(capacity: usize) -> RawTable<T> {
if capacity == 0 {
RawTable::new()
} else {
unsafe {
let result = RawTable::new_uninitialized(capacity_to_buckets(capacity));
result
.ctrl(0)
.write_bytes(EMPTY, result.buckets() + Group::WIDTH);
// If we have fewer buckets than the group width then we need to
// fill in unused spaces in the trailing control bytes with
// DELETED entries. See the comments in set_ctrl.
if result.buckets() < Group::WIDTH {
result
.ctrl(result.buckets())
.write_bytes(DELETED, Group::WIDTH - result.buckets());
}
result
}
}
}
/// Deallocates the table without dropping any entries.
#[inline]
unsafe fn free_buckets(&mut self) {
let (layout, _) =
calculate_layout::<T>(self.buckets()).unwrap_or_else(|| hint::unreachable_unchecked());
dealloc(self.ctrl.as_ptr(), layout);
}
/// Returns the index of a bucket from a `Bucket`.
#[inline]
#[cfg(feature = "nightly")]
unsafe fn bucket_index(&self, bucket: &Bucket<T>) -> usize {
bucket.ptr.as_ptr().offset_from(self.data.as_ptr()) as usize
}
/// Returns the index of a bucket from a `Bucket`.
#[inline]
#[cfg(not(feature = "nightly"))]
unsafe fn bucket_index(&self, bucket: &Bucket<T>) -> usize {
(bucket.ptr.as_ptr() as usize - self.data.as_ptr() as usize) / mem::size_of::<T>()
}
/// Returns a pointer to a control byte.
#[inline]
unsafe fn ctrl(&self, index: usize) -> *mut u8 {
debug_assert!(index < self.buckets() + Group::WIDTH);
self.ctrl.as_ptr().add(index)
}
/// Returns a pointer to an element in the table.
#[inline]
pub unsafe fn bucket(&self, index: usize) -> Bucket<T> {
debug_assert_ne!(self.bucket_mask, 0);
debug_assert!(index < self.buckets());
Bucket::from_ptr(self.data.as_ptr().add(index))
}
/// Erases an element from the table without dropping it.
#[inline]
pub unsafe fn erase_no_drop(&mut self, item: &Bucket<T>) {
let index = self.bucket_index(item);
let index_before = index.wrapping_sub(Group::WIDTH) & self.bucket_mask;
let empty_before = Group::load(self.ctrl(index_before)).match_empty();
let empty_after = Group::load(self.ctrl(index)).match_empty();
// If we are inside a contiguous block of Group::WIDTH full or deleted
// cells then a probe window may have seen a full block when trying to
// insert. We therefore need to keep that block non-empty so that
// lookups will continue searching to the next probe window.
let ctrl = if empty_before.trailing_zeros() + empty_after.leading_zeros() >= Group::WIDTH {
DELETED
} else {
self.growth_left += 1;
EMPTY
};
self.set_ctrl(index, ctrl);
self.items -= 1;
}
/// Returns an iterator for a probe sequence on the table.
///
/// This iterator never terminates, but is guaranteed to visit each bucket
/// group exactly once.
#[inline]
fn probe_seq(&self, hash: u64) -> ProbeSeq {
ProbeSeq {
mask: self.bucket_mask,
offset: h1(hash) & self.bucket_mask,
index: 0,
}
}
/// Sets a control byte, and possibly also the replicated control byte at
/// the end of the array.
#[inline]
unsafe fn set_ctrl(&self, index: usize, ctrl: u8) {
// Replicate the first Group::WIDTH control bytes at the end of
// the array without using a branch:
// - If index >= Group::WIDTH then index == index2.
// - Otherwise index2 == self.bucket_mask + 1 + index.
//
// The very last replicated control byte is never actually read because
// we mask the initial index for unaligned loads, but we write it
// anyways because it makes the set_ctrl implementation simpler.
//
// If there are fewer buckets than Group::WIDTH then this code will
// replicate the buckets at the end of the trailing group. For example
// with 2 buckets and a group size of 4, the control bytes will look
// like this:
//
// Real | Replicated
// -------------------------------------------------
// | [A] | [B] | [DELETED] | [DELETED] | [A] | [B] |
// -------------------------------------------------
let index2 = ((index.wrapping_sub(Group::WIDTH)) & self.bucket_mask) + Group::WIDTH;
*self.ctrl(index) = ctrl;
*self.ctrl(index2) = ctrl;
}
/// Searches for an empty or deleted bucket which is suitable for inserting
/// a new element.
///
/// There must be at least 1 empty bucket in the table.
#[inline]
fn find_insert_slot(&self, hash: u64) -> usize {
for pos in self.probe_seq(hash) {
let group = unsafe { Group::load(self.ctrl(pos)) };
if let Some(bit) = group.match_empty_or_deleted().lowest_set_bit() {
return (pos + bit) & self.bucket_mask;
}
}
// probe_seq never returns.
unreachable!();
}
/// Marks all table buckets as empty without dropping their contents.
#[inline]
fn clear_no_drop(&mut self) {
unsafe {
self.ctrl(0)
.write_bytes(EMPTY, self.buckets() + Group::WIDTH);
}
self.items = 0;
self.growth_left = bucket_mask_to_capacity(self.bucket_mask);
}
/// Removes all elements from the table without freeing the backing memory.
#[inline]
pub fn clear(&mut self) {
// Ensure that the table is reset even if one of the drops panic
let self_ = guard(self, |self_| self_.clear_no_drop());
for item in self_.iter() {
unsafe {
item.drop();
}
}
}
/// Shrinks the table to fit `max(self.len(), min_size)` elements.
#[inline]
pub fn shrink_to(&mut self, min_size: usize, hasher: impl Fn(&T) -> u64) {
let min_size = usize::max(self.items, min_size);
if bucket_mask_to_capacity(self.bucket_mask) / 2 > min_size {
self.resize(min_size, hasher);
}
}
/// Ensures that at least `additional` items can be inserted into the table
/// without reallocation.
#[inline]
pub fn reserve(&mut self, additional: usize, hasher: impl Fn(&T) -> u64) {
if additional > self.growth_left {
self.reserve_rehash(additional, hasher);
}
}
/// Out-of-line slow path for `reserve`.
#[cold]
#[inline(never)]
fn reserve_rehash(&mut self, additional: usize, hasher: impl Fn(&T) -> u64) {
let new_items = self
.items
.checked_add(additional)
.expect("Hash table capacity overflow");
// Rehash in-place without re-allocating if we have plenty of spare
// capacity that is locked up due to DELETED entries.
if new_items < bucket_mask_to_capacity(self.bucket_mask) / 2 {
self.rehash_in_place(hasher);
} else {
self.resize(new_items, hasher);
}
}
/// Rehashes the contents of the table in place (i.e. without changing the
/// allocation).
///
/// If `hasher` panics then some of the table's contents may be lost.
fn rehash_in_place(&mut self, hasher: impl Fn(&T) -> u64) {
unsafe {
// Bulk convert all full control bytes to DELETED, and all DELETED
// control bytes to EMPTY. This effectively frees up all buckets
// containing a DELETED entry.
for i in (0..self.buckets()).step_by(Group::WIDTH) {
let group = Group::load_aligned(self.ctrl(i));
let group = group.convert_special_to_empty_and_full_to_deleted();
group.store_aligned(self.ctrl(i));
}
// Fix up the trailing control bytes. See the comments in set_ctrl.
if self.buckets() < Group::WIDTH {
self.ctrl(0)
.copy_to(self.ctrl(Group::WIDTH), self.buckets());
self.ctrl(self.buckets())
.write_bytes(DELETED, Group::WIDTH - self.buckets());
} else {
self.ctrl(0)
.copy_to(self.ctrl(self.buckets()), Group::WIDTH);
}
// If the hash function panics then properly clean up any elements
// that we haven't rehashed yet. We unfortunately can't preserve the
// elements since we have lost their hashes and have no way of recovering them
// without risking another panic.
let mut guard = guard(self, |self_| {
for i in 0..self_.buckets() {
if *self_.ctrl(i) == DELETED {
self_.set_ctrl(i, EMPTY);
self_.bucket(i).drop();
self_.items -= 1;
}
}
self_.growth_left = bucket_mask_to_capacity(self_.bucket_mask) - self_.items;
});
// At this point, DELETED elements are elements that we haven't
// rehashed yet. Find them and re-insert them at their ideal
// position.
'outer: for i in 0..guard.buckets() {
if *guard.ctrl(i) != DELETED {
continue;
}
'inner: loop {
// Hash the current item
let item = guard.bucket(i);
let hash = hasher(item.as_ref());
// Search for a suitable place to put it
let new_i = guard.find_insert_slot(hash);
// Probing works by scanning through all of the control
// bytes in groups, which may not be aligned to the group
// size. If both the new and old position fall within the
// same unaligned group, then there is no benefit in moving
// it and we can just continue to the next item.
let probe_index = |pos| {
((pos - guard.probe_seq(hash).offset) & guard.bucket_mask) / Group::WIDTH
};
if likely(probe_index(i) == probe_index(new_i)) {
guard.set_ctrl(i, h2(hash));
continue 'outer;
}
// We are moving the current item to a new position. Write
// our H2 to the control byte of the new position.
let prev_ctrl = *guard.ctrl(new_i);
guard.set_ctrl(new_i, h2(hash));
if prev_ctrl == EMPTY {
// If the target slot is empty, simply move the current
// element into the new slot and clear the old control
// byte.
guard.set_ctrl(i, EMPTY);
guard.bucket(new_i).write(item.read());
continue 'outer;
} else {
// If the target slot is occupied, swap the two elements
// and then continue processing the element that we just
// swapped into the old slot.
debug_assert_eq!(prev_ctrl, DELETED);
mem::swap(guard.bucket(new_i).as_mut(), item.as_mut());
continue 'inner;
}
}
}
guard.growth_left = bucket_mask_to_capacity(guard.bucket_mask) - guard.items;
mem::forget(guard);
}
}
/// Allocates a new table of a different size and moves the contents of the
/// current table into it.
fn resize(&mut self, capacity: usize, hasher: impl Fn(&T) -> u64) {
unsafe {
debug_assert!(self.items <= capacity);
// Allocate and initialize the new table.
let mut new_table = RawTable::with_capacity(capacity);
new_table.growth_left -= self.items;
new_table.items = self.items;
// The hash function may panic, in which case we simply free the new
// table without dropping any elements that may have been copied into
// it.
let mut new_table = guard(ManuallyDrop::new(new_table), |new_table| {
if new_table.bucket_mask != 0 {
new_table.free_buckets();
}
});
// Copy all elements to the new table.
for item in self.iter() {
// This may panic.
let hash = hasher(item.as_ref());
// We can use a simpler version of insert() here since there are no
// DELETED entries.
let index = new_table.find_insert_slot(hash);
new_table.set_ctrl(index, h2(hash));
new_table.bucket(index).write(item.read());
}
// We successfully copied all elements without panicking. Now replace
// self with the new table. The old table will have its memory freed but
// the items will not be dropped (since they have been moved into the
// new table).
mem::swap(self, &mut new_table);
}
}
/// Inserts a new element into the table.
///
/// This does not check if the given element already exists in the table.
#[inline]
pub fn insert(&mut self, hash: u64, value: T, hasher: impl Fn(&T) -> u64) -> Bucket<T> {
self.reserve(1, hasher);
unsafe {
let index = self.find_insert_slot(hash);
let bucket = self.bucket(index);
// If we are replacing a DELETED entry then we don't need to update
// the load counter.
let old_ctrl = *self.ctrl(index);
self.growth_left -= special_is_empty(old_ctrl) as usize;
self.set_ctrl(index, h2(hash));
bucket.write(value);
self.items += 1;
bucket
}
}
/// Searches for an element in the table.
#[inline]
pub fn find(&self, hash: u64, eq: impl Fn(&T) -> bool) -> Option<Bucket<T>> {
unsafe {
for pos in self.probe_seq(hash) {
let group = Group::load(self.ctrl(pos));
for bit in group.match_byte(h2(hash)) {
let index = (pos + bit) & self.bucket_mask;
let bucket = self.bucket(index);
if likely(eq(bucket.as_ref())) {
return Some(bucket);
}
}
if likely(group.match_empty().any_bit_set()) {
return None;
}
}
}
// probe_seq never returns.
unreachable!();
}
/// Returns the number of elements the map can hold without reallocating.
///
/// This number is a lower bound; the table might be able to hold
/// more, but is guaranteed to be able to hold at least this many.
#[inline]
pub fn capacity(&self) -> usize {
self.items + self.growth_left
}
/// Returns the number of elements in the table.
#[inline]
pub fn len(&self) -> usize {
self.items
}
/// Returns the number of buckets in the table.
#[inline]
fn buckets(&self) -> usize {
self.bucket_mask + 1
}
/// Returns an iterator over every element in the table.
#[inline]
pub fn iter(&self) -> RawIter<T> {
unsafe {
let current_group = Group::load_aligned(self.ctrl.as_ptr())
.match_empty_or_deleted()
.invert();
RawIter {
data: self.data.as_ptr(),
ctrl: self.ctrl.as_ptr(),
current_group,
end: self.ctrl(self.bucket_mask),
items: self.items,
}
}
}
/// Returns an iterator which removes all elements from the table without
/// freeing the memory.
#[inline]
pub fn drain(&mut self) -> RawDrain<T> {
RawDrain {
iter: self.iter(),
table: NonNull::from(self),
_marker: PhantomData,
}
}
}
impl<T: Clone> Clone for RawTable<T> {
fn clone(&self) -> Self {
if self.bucket_mask == 0 {
Self::new()
} else {
unsafe {
let mut new_table = ManuallyDrop::new(Self::new_uninitialized(self.buckets()));
// Copy the control bytes unchanged. We do this in a single pass
self.ctrl(0)
.copy_to_nonoverlapping(new_table.ctrl(0), self.buckets() + Group::WIDTH);
{
// The cloning of elements may panic, in which case we need
// to make sure we drop only the elements that have been
// cloned so far.
let mut guard = guard((0, &mut new_table), |(index, new_table)| {
for i in 0..=*index {
if is_full(*new_table.ctrl(i)) {
new_table.bucket(i).drop();
}
}
new_table.free_buckets();
});
for from in self.iter() {
let index = self.bucket_index(&from);
let to = guard.1.bucket(index);
to.write(from.as_ref().clone());
// Update the index in case we need to unwind.
guard.0 = index;
}
// Successfully cloned all items, no need to clean up.
mem::forget(guard);
}
// Return the newly created table.
new_table.items = self.items;
new_table.growth_left = self.growth_left;
ManuallyDrop::into_inner(new_table)
}
}
}
}
impl<T> Drop for RawTable<T> {
#[inline]
fn drop(&mut self) {
if self.bucket_mask != 0 {
unsafe {
for item in self.iter() {
item.drop();
}
self.free_buckets();
}
}
}
}
impl<T> IntoIterator for RawTable<T> {
type Item = T;
type IntoIter = RawIntoIter<T>;
#[inline]
fn into_iter(self) -> RawIntoIter<T> {
unsafe {
let alloc = if self.bucket_mask != 0 {
let (layout, _) = calculate_layout::<T>(self.buckets())
.unwrap_or_else(|| hint::unreachable_unchecked());
Some((self.ctrl.cast(), layout))
} else {
None
};
let iter = self.iter();
mem::forget(self);
RawIntoIter { iter, alloc }
}
}
}
/// Iterator which returns a raw pointer to every full bucket in the table.
pub struct RawIter<T> {
// Using *const here for covariance
data: *const T,
ctrl: *const u8,
current_group: BitMask,
end: *const u8,
items: usize,
}
impl<T> Clone for RawIter<T> {
#[inline]
fn clone(&self) -> Self {
RawIter {
data: self.data,
ctrl: self.ctrl,
current_group: self.current_group,
end: self.end,
items: self.items,
}
}
}
impl<T> Iterator for RawIter<T> {
type Item = Bucket<T>;
#[inline]
fn next(&mut self) -> Option<Bucket<T>> {
unsafe {
loop {
if let Some(index) = self.current_group.lowest_set_bit() {
self.current_group = self.current_group.remove_lowest_bit();
self.items -= 1;
return Some(Bucket::from_ptr(self.data.add(index)));
}
self.ctrl = self.ctrl.add(Group::WIDTH);
if self.ctrl >= self.end {
// We don't check against items == 0 here to allow the
// compiler to optimize away the item count entirely if the
// iterator length is never queried.
debug_assert_eq!(self.items, 0);
return None;
}
self.data = self.data.add(Group::WIDTH);
self.current_group = Group::load_aligned(self.ctrl)
.match_empty_or_deleted()
.invert();
}
}
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
(self.items, Some(self.items))
}
}
impl<T> ExactSizeIterator for RawIter<T> {}
impl<T> FusedIterator for RawIter<T> {}
/// Iterator which consumes a table and returns elements.
pub struct RawIntoIter<T> {
iter: RawIter<T>,
alloc: Option<(NonNull<u8>, Layout)>,
}
impl<'a, T> RawIntoIter<T> {
#[inline]
pub fn iter(&self) -> RawIter<T> {
self.iter.clone()
}
}
impl<T> Drop for RawIntoIter<T> {
#[inline]
fn drop(&mut self) {
unsafe {
// Drop all remaining elements
while let Some(item) = self.iter.next() {
item.drop();
}
// Free the table
if let Some((ptr, layout)) = self.alloc {
dealloc(ptr.as_ptr(), layout);
}
}
}
}
impl<T> Iterator for RawIntoIter<T> {
type Item = T;
#[inline]
fn next(&mut self) -> Option<T> {
unsafe { Some(self.iter.next()?.read()) }
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<T> ExactSizeIterator for RawIntoIter<T> {}
impl<T> FusedIterator for RawIntoIter<T> {}
/// Iterator which consumes elements without freeing the table storage.
pub struct RawDrain<'a, T: 'a> {
iter: RawIter<T>,
// We don't use a &'a RawTable<T> because we want RawDrain to be covariant
// over 'a.
table: NonNull<RawTable<T>>,
_marker: PhantomData<&'a RawTable<T>>,
}
impl<'a, T> RawDrain<'a, T> {
#[inline]
pub fn iter(&self) -> RawIter<T> {
self.iter.clone()
}
}
impl<'a, T> Drop for RawDrain<'a, T> {
#[inline]
fn drop(&mut self) {
unsafe {
// Ensure that the table is reset even if one of the drops panic
let _guard = guard(self.table, |table| table.as_mut().clear_no_drop());
// Drop all remaining elements
while let Some(item) = self.iter.next() {
item.drop();
}
}
}
}
impl<'a, T> Iterator for RawDrain<'a, T> {
type Item = T;
#[inline]
fn next(&mut self) -> Option<T> {
unsafe {
let item = self.iter.next()?;
// Mark the item as DELETED in the table and decrement the item
// counter. We don't need to use the full delete algorithm like
// erase_no_drop since we will just clear the control bytes when
// the RawDrain is dropped.
let index = self.table.as_ref().bucket_index(&item);
*self.table.as_mut().ctrl(index) = DELETED;
self.table.as_mut().items -= 1;
Some(item.read())
}
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<'a, T> ExactSizeIterator for RawDrain<'a, T> {}
impl<'a, T> FusedIterator for RawDrain<'a, T> {}
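
To illustrate the probe-sequence claim above (a sketch with a hypothetical group width of 4, not part of the commit): because the step grows by `Group::WIDTH` each iteration (triangular numbers) and the table size is a power of two, every group offset is produced exactly once:

```rust
// Sketch: triangular-number probing visits each group-aligned offset once.
fn main() {
    const GROUP_WIDTH: usize = 4; // hypothetical group width for the example
    let buckets = 32usize; // table size, always a power of two
    let mask = buckets - 1;
    let mut offset = 0usize; // would be h1(hash) & mask in the real table
    let mut index = 0usize;
    let mut seen = std::collections::BTreeSet::new();
    for _ in 0..(buckets / GROUP_WIDTH) {
        seen.insert(offset);
        index += GROUP_WIDTH; // same update as ProbeSeq::next
        offset = (offset + index) & mask;
    }
    // All 8 group starts (0, 4, ..., 28) were produced, none repeated.
    assert_eq!(seen.len(), buckets / GROUP_WIDTH);
    println!("visited offsets: {:?}", seen);
}
```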

src/raw/sse2.rs Normal file (98 lines)

@@ -0,0 +1,98 @@
use core::mem;
use raw::bitmask::BitMask;
use raw::EMPTY;
#[cfg(target_arch = "x86")]
use core::arch::x86;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64 as x86;
pub type BitMaskWord = u32;
pub const BITMASK_SHIFT: u32 = 0;
pub const BITMASK_MASK: u32 = 0xffff;
/// Abstraction over a group of control bytes which can be scanned in
/// parallel.
///
/// This implementation uses a 128-bit SSE value.
pub struct Group(x86::__m128i);
impl Group {
/// Number of bytes in the group.
pub const WIDTH: usize = mem::size_of::<Self>();
/// Returns a full group of empty bytes, suitable for use as the initial
/// value for an empty hash table.
///
/// This is guaranteed to be aligned to the group size.
#[inline]
pub fn static_empty() -> &'static [u8] {
#[repr(C)]
struct Dummy {
_align: [x86::__m128i; 0],
bytes: [u8; Group::WIDTH],
};
const DUMMY: Dummy = Dummy {
_align: [],
bytes: [EMPTY; Group::WIDTH],
};
&DUMMY.bytes
}
/// Loads a group of bytes starting at the given address.
#[inline]
pub unsafe fn load(ptr: *const u8) -> Group {
Group(x86::_mm_loadu_si128(ptr as *const _))
}
/// Loads a group of bytes starting at the given address, which must be
/// aligned to `WIDTH`.
#[inline]
pub unsafe fn load_aligned(ptr: *const u8) -> Group {
Group(x86::_mm_load_si128(ptr as *const _))
}
/// Stores the group of bytes to the given address, which must be
/// aligned to `WIDTH`.
#[inline]
pub unsafe fn store_aligned(&self, ptr: *mut u8) {
x86::_mm_store_si128(ptr as *mut _, self.0);
}
/// Returns a `BitMask` indicating all bytes in the group which have
/// the given value.
#[inline]
pub fn match_byte(&self, byte: u8) -> BitMask {
unsafe {
let cmp = x86::_mm_cmpeq_epi8(self.0, x86::_mm_set1_epi8(byte as i8));
BitMask(x86::_mm_movemask_epi8(cmp) as u32)
}
}
/// Returns a `BitMask` indicating all bytes in the group which are
/// `EMPTY`.
#[inline]
pub fn match_empty(&self) -> BitMask {
self.match_byte(EMPTY)
}
/// Returns a `BitMask` indicating all bytes in the group which are
/// `EMPTY` or `DELETED`.
#[inline]
pub fn match_empty_or_deleted(&self) -> BitMask {
unsafe { BitMask(x86::_mm_movemask_epi8(self.0) as u32) }
}
/// Performs the following transformation on all bytes in the group:
/// - `EMPTY => EMPTY`
/// - `DELETED => EMPTY`
/// - `FULL => DELETED`
#[inline]
pub fn convert_special_to_empty_and_full_to_deleted(&self) -> Group {
unsafe {
let zero = x86::_mm_setzero_si128();
let special = x86::_mm_cmpgt_epi8(zero, self.0);
Group(x86::_mm_or_si128(special, x86::_mm_set1_epi8(0x80u8 as i8)))
}
}
}
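
For illustration (a sketch, not part of the commit), `match_byte` boils down to a byte-wise compare plus `_mm_movemask_epi8`, which packs the top bit of each of the 16 bytes into a 16-bit mask:

```rust
// Sketch of the SSE2 match_byte path; only runs on x86_64, where SSE2 is
// always available.
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    unsafe {
        // A group of 16 control bytes with the value 0x23 at positions 1 and 9.
        let mut ctrl = [0xFFu8; 16];
        ctrl[1] = 0x23;
        ctrl[9] = 0x23;
        let group = _mm_loadu_si128(ctrl.as_ptr() as *const __m128i);
        let cmp = _mm_cmpeq_epi8(group, _mm_set1_epi8(0x23));
        let mask = _mm_movemask_epi8(cmp) as u32;
        // One bit per byte, lowest address first.
        assert_eq!(mask, (1u32 << 1) | (1u32 << 9));
        println!("match mask: {:#06x}", mask);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```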

src/set.rs Normal file (1393 lines)

File diff suppressed because it is too large.