Bug 1794001 - Part 1: Import the rure crate for rust regex ffi, r=xpcom-reviewers,supply-chain-reviewers,kmag

While we already have an in-tree `regex-ffi` crate which provides basic access to regex functionality for use in FormAutofillNative, the `regex` crate itself provides and maintains its own C API as the `rure` crate. This patch vendors in `rure` to allow us to use the more fully featured official FFI. Differential Revision: https://phabricator.services.mozilla.com/D158873
2024-10-07 18:04:46 +00:00 · 2022-10-13 21:46:54 +00:00 · 2022-10-13 21:46:54 +00:00 · 93a9c67b35
commit 93a9c67b35
parent 9127c66b99
21 changed files with 15516 additions and 0 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2166,6 +2166,7 @@ dependencies = [
 "qcms",
 "regex-ffi",
 "rsdparsa_capi",
+ "rure",
 "rusqlite",
 "rust_minidump_writer_linux",
 "static_prefs",
@ -4501,6 +4502,16 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5d79b4b604167921892e84afbbaad9d5ad74e091bf6c511d9dbfb0593f09fabd"

+[[package]]
+name = "rure"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3de09595e75baee10da378a1fadfb50d04334a031d69dfb74d0cee3a94aa24c"
+dependencies = [
+ "libc",
+ "regex",
+]
+
 [[package]]
 name = "rusqlite"
 version = "0.27.0"
--- a/supply-chain/audits.toml
+++ b/supply-chain/audits.toml
@ -953,6 +953,33 @@ who = "Mike Hommey <mh+mozilla@glandium.org>"
 criteria = "safe-to-deploy"
 delta = "0.7.0 -> 0.7.1"

+[[audits.rure]]
+who = "Nika Layzell <nika@thelayzells.com>"
+criteria = "safe-to-deploy"
+version = "0.2.2"
+notes = """
+This is a fairly straightforward FFI wrapper crate for `regex`, maintained by
+the `regex` developers in the same repository.
+
+This crate is explicitly designed for FFI use, and should not be used directly
+by Rust code. The exported `extern \"C\"` functions are not marked as `unsafe`,
+meaning that it is technically incorrect to use them from within Rust code,
+however they are reasonable to use from C code.
+
+The unsafe code in this crate heavily depends on the C caller maintaining
+invariants, however these invariants are clearly documented in the `rure.h`
+file, bundled with the crate.
+
+I have checked the signatures of each function both in C++ and in Rust to
+ensure they match.  In some places, the C header file `rure.h` is missing a
+`const` qualifier which could be present given the Rust code; however, this
+will have no impact on ABI, and is fairly normal for FFI crates.
+
+Panics are handled in all Rust FFI methods, meaning that projects which do not
+disable unwinding will still consistently abort (using `libc::abort()`) if a
+panic occurs in the Rust code.
+"""
+
 [[audits.rust_decimal]]
 who = "Mike Hommey <mh+mozilla@glandium.org>"
 criteria = "safe-to-deploy"
--- a/third_party/rust/rure/.cargo-checksum.json
+++ b/third_party/rust/rure/.cargo-checksum.json
@ -0,0 +1 @@
+{"files":{"Cargo.toml":"6bed7b80456a66969f4fe9bb5341a0b927a7cd58e036441cbb3b79d67d86c24a","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"e8462c4064a376c2b2d729cc766064cc97decd6a2bb325cf9c7b50be9b8897ce","ctest/compile":"48b692b2aca8b61dfbe372f46d3aeb242893cfa2d81b0a89a73eb2f5db6b6e27","ctest/test.c":"6565808675763c42f8f10bd95445eaab4eaa3618efcf8ec215d98c3a1cfe756d","examples/compile":"471a781860b733f9aa9c1691f33ac8e8a4e85efcb97540942432ba5b58fbb982","examples/iter.c":"ad8312b2271ee19bfaf681d1d8338afaa89e4b180174f008b8cf951a6275776f","examples/sherlock.txt":"242ec73a70f0a03dcbe007e32038e7deeaee004aaec9a09a07fa322743440fa8","include/rure.h":"ddd6056d434d4efaf6ad30b8a38798d61ad385b0c9866988f9b2d4306dc1a99a","src/error.rs":"965c0207eb6d9cf644580d13b2d2d3bd310ab5c1ff65cb1fc04abdbd08ce7fe8","src/lib.rs":"9e99e774ee2a3db507d1e2cd7142b680411d90cf2b033c19ea9a7ea59ae4ba98","src/macros.rs":"ef2d468c1babe1b2252e62ad953b14ce58afb87768dc88612a70df27456038d2","src/rure.rs":"a889bbf35ab2d0018eac1122fe69abbbe2880fb8f5da211a1f60f703fddb5c82","test":"e8b91d4378b3ba09b7dfecdfa733765569778f57bc1c72cecc718e4ad63c1537"},"package":"f3de09595e75baee10da378a1fadfb50d04334a031d69dfb74d0cee3a94aa24c"}
--- a/third_party/rust/rure/Cargo.toml
+++ b/third_party/rust/rure/Cargo.toml
@ -0,0 +1,38 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2018"
+name = "rure"
+version = "0.2.2"
+authors = ["The Rust Project Developers"]
+description = """
+A C API for Rust's regular expression library.
+"""
+homepage = "https://github.com/rust-lang/regex"
+documentation = "https://github.com/rust-lang/regex/tree/master/regex-capi"
+readme = "README.md"
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/rust-lang/regex"
+
+[lib]
+name = "rure"
+crate-type = [
+    "staticlib",
+    "cdylib",
+    "rlib",
+]
+
+[dependencies.libc]
+version = "0.2"
+
+[dependencies.regex]
+version = "1"
--- a/third_party/rust/rure/LICENSE-APACHE
+++ b/third_party/rust/rure/LICENSE-APACHE
@ -0,0 +1,201 @@
+                              Apache License
+                        Version 2.0, January 2004
+                     http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+   "License" shall mean the terms and conditions for use, reproduction,
+   and distribution as defined by Sections 1 through 9 of this document.
+
+   "Licensor" shall mean the copyright owner or entity authorized by
+   the copyright owner that is granting the License.
+
+   "Legal Entity" shall mean the union of the acting entity and all
+   other entities that control, are controlled by, or are under common
+   control with that entity. For the purposes of this definition,
+   "control" means (i) the power, direct or indirect, to cause the
+   direction or management of such entity, whether by contract or
+   otherwise, or (ii) ownership of fifty percent (50%) or more of the
+   outstanding shares, or (iii) beneficial ownership of such entity.
+
+   "You" (or "Your") shall mean an individual or Legal Entity
+   exercising permissions granted by this License.
+
+   "Source" form shall mean the preferred form for making modifications,
+   including but not limited to software source code, documentation
+   source, and configuration files.
+
+   "Object" form shall mean any form resulting from mechanical
+   transformation or translation of a Source form, including but
+   not limited to compiled object code, generated documentation,
+   and conversions to other media types.
+
+   "Work" shall mean the work of authorship, whether in Source or
+   Object form, made available under the License, as indicated by a
+   copyright notice that is included in or attached to the work
+   (an example is provided in the Appendix below).
+
+   "Derivative Works" shall mean any work, whether in Source or Object
+   form, that is based on (or derived from) the Work and for which the
+   editorial revisions, annotations, elaborations, or other modifications
+   represent, as a whole, an original work of authorship. For the purposes
+   of this License, Derivative Works shall not include works that remain
+   separable from, or merely link (or bind by name) to the interfaces of,
+   the Work and Derivative Works thereof.
+
+   "Contribution" shall mean any work of authorship, including
+   the original version of the Work and any modifications or additions
+   to that Work or Derivative Works thereof, that is intentionally
+   submitted to Licensor for inclusion in the Work by the copyright owner
+   or by an individual or Legal Entity authorized to submit on behalf of
+   the copyright owner. For the purposes of this definition, "submitted"
+   means any form of electronic, verbal, or written communication sent
+   to the Licensor or its representatives, including but not limited to
+   communication on electronic mailing lists, source code control systems,
+   and issue tracking systems that are managed by, or on behalf of, the
+   Licensor for the purpose of discussing and improving the Work, but
+   excluding communication that is conspicuously marked or otherwise
+   designated in writing by the copyright owner as "Not a Contribution."
+
+   "Contributor" shall mean Licensor and any individual or Legal Entity
+   on behalf of whom a Contribution has been received by Licensor and
+   subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   copyright license to reproduce, prepare Derivative Works of,
+   publicly display, publicly perform, sublicense, and distribute the
+   Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   (except as stated in this section) patent license to make, have made,
+   use, offer to sell, sell, import, and otherwise transfer the Work,
+   where such license applies only to those patent claims licensable
+   by such Contributor that are necessarily infringed by their
+   Contribution(s) alone or by combination of their Contribution(s)
+   with the Work to which such Contribution(s) was submitted. If You
+   institute patent litigation against any entity (including a
+   cross-claim or counterclaim in a lawsuit) alleging that the Work
+   or a Contribution incorporated within the Work constitutes direct
+   or contributory patent infringement, then any patent licenses
+   granted to You under this License for that Work shall terminate
+   as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+   Work or Derivative Works thereof in any medium, with or without
+   modifications, and in Source or Object form, provided that You
+   meet the following conditions:
+
+   (a) You must give any other recipients of the Work or
+       Derivative Works a copy of this License; and
+
+   (b) You must cause any modified files to carry prominent notices
+       stating that You changed the files; and
+
+   (c) You must retain, in the Source form of any Derivative Works
+       that You distribute, all copyright, patent, trademark, and
+       attribution notices from the Source form of the Work,
+       excluding those notices that do not pertain to any part of
+       the Derivative Works; and
+
+   (d) If the Work includes a "NOTICE" text file as part of its
+       distribution, then any Derivative Works that You distribute must
+       include a readable copy of the attribution notices contained
+       within such NOTICE file, excluding those notices that do not
+       pertain to any part of the Derivative Works, in at least one
+       of the following places: within a NOTICE text file distributed
+       as part of the Derivative Works; within the Source form or
+       documentation, if provided along with the Derivative Works; or,
+       within a display generated by the Derivative Works, if and
+       wherever such third-party notices normally appear. The contents
+       of the NOTICE file are for informational purposes only and
+       do not modify the License. You may add Your own attribution
+       notices within Derivative Works that You distribute, alongside
+       or as an addendum to the NOTICE text from the Work, provided
+       that such additional attribution notices cannot be construed
+       as modifying the License.
+
+   You may add Your own copyright statement to Your modifications and
+   may provide additional or different license terms and conditions
+   for use, reproduction, or distribution of Your modifications, or
+   for any such Derivative Works as a whole, provided Your use,
+   reproduction, and distribution of the Work otherwise complies with
+   the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+   any Contribution intentionally submitted for inclusion in the Work
+   by You to the Licensor shall be under the terms and conditions of
+   this License, without any additional terms or conditions.
+   Notwithstanding the above, nothing herein shall supersede or modify
+   the terms of any separate license agreement you may have executed
+   with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+   names, trademarks, service marks, or product names of the Licensor,
+   except as required for reasonable and customary use in describing the
+   origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+   agreed to in writing, Licensor provides the Work (and each
+   Contributor provides its Contributions) on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+   implied, including, without limitation, any warranties or conditions
+   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+   PARTICULAR PURPOSE. You are solely responsible for determining the
+   appropriateness of using or redistributing the Work and assume any
+   risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+   whether in tort (including negligence), contract, or otherwise,
+   unless required by applicable law (such as deliberate and grossly
+   negligent acts) or agreed to in writing, shall any Contributor be
+   liable to You for damages, including any direct, indirect, special,
+   incidental, or consequential damages of any character arising as a
+   result of this License or out of the use or inability to use the
+   Work (including but not limited to damages for loss of goodwill,
+   work stoppage, computer failure or malfunction, or any and all
+   other commercial damages or losses), even if such Contributor
+   has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+   the Work or Derivative Works thereof, You may choose to offer,
+   and charge a fee for, acceptance of support, warranty, indemnity,
+   or other liability obligations and/or rights consistent with this
+   License. However, in accepting such obligations, You may act only
+   on Your own behalf and on Your sole responsibility, not on behalf
+   of any other Contributor, and only if You agree to indemnify,
+   defend, and hold each Contributor harmless for any liability
+   incurred by, or claims asserted against, such Contributor by reason
+   of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+   To apply the Apache License to your work, attach the following
+   boilerplate notice, with the fields enclosed by brackets "[]"
+   replaced with your own identifying information. (Don't include
+   the brackets!)  The text should be enclosed in the appropriate
+   comment syntax for the file format. We also recommend that a
+   file or class name and description of purpose be included on the
+   same "printed page" as the copyright notice for easier
+   identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
--- a/third_party/rust/rure/LICENSE-MIT
+++ b/third_party/rust/rure/LICENSE-MIT
@ -0,0 +1,25 @@
+Copyright (c) 2014 The Rust Project Developers
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
--- a/third_party/rust/rure/README.md
+++ b/third_party/rust/rure/README.md
@ -0,0 +1,103 @@
+C API for RUst's REgex engine
+=============================
+rure is a C API to Rust's regex library, which guarantees linear time
+searching using finite automata. In exchange, it must give up some common
+regex features such as backreferences and arbitrary lookaround. It does
+however include capturing groups, lazy matching, Unicode support and word
+boundary assertions. Its matching semantics generally correspond to Perl's,
+or "leftmost first." Namely, the match locations reported correspond to the
+first match that would be found by a backtracking engine.
+
+The header file (`include/rure.h`) serves as the primary API documentation of
+this library. Types and flags are documented first, and functions follow.
+
+The syntax and possibly other useful things are documented in the Rust
+API documentation: https://docs.rs/regex
+
+
+Examples
+--------
+There are readable examples in the `ctest` and `examples` sub-directories.
+
+Assuming you have
+[Rust and Cargo installed](https://www.rust-lang.org/downloads.html)
+(and a C compiler), then this should work to run the `iter` example:
+
+```
+$ git clone https://github.com/rust-lang/regex
+$ cd regex/regex-capi/examples
+$ ./compile
+$ LD_LIBRARY_PATH=../target/release ./iter
+```
+
+
+Performance
+-----------
+It's fast. Its core matching engine is a lazy DFA, which is what GNU grep
+and RE2 use. Like GNU grep, this regex engine can detect multi byte literals
+in the regex and will use fast literal string searching to quickly skip
+through the input to find possible match locations.
+
+All memory usage is bounded and all searching takes linear time with respect
+to the input string.
+
+For more details, see the PERFORMANCE guide:
+https://github.com/rust-lang/regex/blob/master/PERFORMANCE.md
+
+
+Text encoding
+-------------
+All regular expressions must be valid UTF-8.
+
+The text encoding of haystacks is more complicated. To a first
+approximation, haystacks should be UTF-8. In fact, UTF-8 (and, one
+supposes, ASCII) is the only well defined text encoding supported by this
+library. It is impossible to match UTF-16, UTF-32 or any other encoding
+without first transcoding it to UTF-8.
+
+With that said, haystacks do not need to be valid UTF-8, and if they aren't
+valid UTF-8, no performance penalty is paid. Whether invalid UTF-8 is
+matched or not depends on the regular expression. For example, with the
+`RURE_FLAG_UNICODE` flag enabled, the regex `.` is guaranteed to match a
+single UTF-8 encoding of a Unicode codepoint (sans LF). In particular,
+it will not match invalid UTF-8 such as `\xFF`, nor will it match surrogate
+codepoints or "alternate" (i.e., non-minimal) encodings of codepoints.
+However, with the `RURE_FLAG_UNICODE` flag disabled, the regex `.` will match
+any *single* arbitrary byte (sans LF), including `\xFF`.
+
+This provides a useful invariant: wherever `RURE_FLAG_UNICODE` is set, the
+corresponding regex is guaranteed to match valid UTF-8. Invalid UTF-8 will
+always prevent a match from happening when the flag is set. Since flags can be
+toggled in the regular expression itself, this allows one to pick and choose
+which parts of the regular expression must match UTF-8 or not.
+
+Some good advice is to always enable the `RURE_FLAG_UNICODE` flag (which is
+enabled when using `rure_compile_must`) and selectively disable the flag when
+one wants to match arbitrary bytes. The flag can be disabled in a regular
+expression with `(?-u)`.
+
+Finally, if one wants to match specific invalid UTF-8 bytes, then you can
+use escape sequences. e.g., `(?-u)\\xFF` will match `\xFF`. It's not
+possible to use C literal escape sequences in this case since regular
+expressions must be valid UTF-8.
+
+
+Aborts
+------
+This library will abort your process if an unwinding panic is caught in the
+Rust code. Generally, a panic occurs when there is a bug in the program or
+if allocation failed. It is possible to cause this behavior by passing
+invalid inputs to some functions. For example, giving an invalid capture
+group index to `rure_captures_at` will cause Rust's bounds checks to fail,
+which will cause a panic, which will be caught and printed to stderr. The
+process will then `abort`.
+
+
+Missing
+-------
+There are a few things missing from the C API that are present in the Rust API.
+There's no particular (known) reason why they are missing; they just haven't
+been implemented yet.
+
+* Splitting a string by a regex.
+* Replacing regex matches in a string with some other text.
--- a/third_party/rust/rure/ctest/compile
+++ b/third_party/rust/rure/ctest/compile
@ -0,0 +1,8 @@
+#!/bin/sh
+
+set -ex
+
+cargo build --manifest-path ../Cargo.toml
+gcc -DDEBUG -o test test.c -ansi -Wall -I../include -L../../target/debug -lrure
+# If you're using librure.a, then you'll need to link other stuff:
+# -lutil -ldl -lpthread -lgcc_s -lc -lm -lrt -lutil -lrure
--- a/third_party/rust/rure/ctest/test.c
+++ b/third_party/rust/rure/ctest/test.c
@ -0,0 +1,591 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "rure.h"
+
+#ifndef DEBUG
+  #define DEBUG false
+#endif
+
+/*
+ * Checks that the end-anchored pattern \p{So}$ matches a haystack whose last
+ * three bytes are E2 98 83.
+ *
+ * Returns true when rure_is_match reports a match, false otherwise. A
+ * diagnostic is written to stderr on failure when DEBUG is enabled.
+ */
+bool test_is_match() {
+    const char *text = "snowman: \xE2\x98\x83";
+    const uint8_t *bytes = (const uint8_t *)text;
+    rure *regex = rure_compile_must("\\p{So}$");
+    bool ok = rure_is_match(regex, bytes, strlen(text), 0);
+
+    if (!ok && DEBUG) {
+        fprintf(stderr,
+                "[test_is_match] expected match, but got no match\n");
+    }
+    rure_free(regex);
+    return ok;
+}
+
+/*
+ * Exercises rure_shortest_match with the pattern "a+" on the haystack "aaaaa".
+ *
+ * The test passes only if a match is reported and the end offset written
+ * through the out-parameter equals 1. Diagnostics go to stderr when DEBUG is
+ * enabled.
+ */
+bool test_shortest_match() {
+    const char *text = "aaaaa";
+    const size_t want_end = 1;
+    size_t got_end = 0;
+    bool ok = true;
+
+    rure *regex = rure_compile_must("a+");
+    if (!rure_shortest_match(regex, (const uint8_t *)text, strlen(text), 0,
+                             &got_end)) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_shortest_match] expected match, "
+                    "but got no match\n");
+        }
+        ok = false;
+    }
+    /* The end offset is compared even when no match was reported. */
+    if (!(got_end == want_end)) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_shortest_match] expected match end location %zu "
+                    "but got %zu\n", want_end, got_end);
+        }
+        ok = false;
+    }
+    rure_free(regex);
+    return ok;
+}
+
+/*
+ * Exercises rure_find with the pattern \p{So}$ on a haystack made of the nine
+ * ASCII bytes "snowman: " followed by the three bytes E2 98 83.
+ *
+ * The test passes only if a match is reported and the returned span is
+ * start 9, end 12. Diagnostics go to stderr when DEBUG is enabled.
+ */
+bool test_find() {
+    const char *text = "snowman: \xE2\x98\x83";
+    const size_t want_start = 9;
+    const size_t want_end = 12;
+    rure_match found = {0};
+    bool ok = true;
+
+    rure *regex = rure_compile_must("\\p{So}$");
+    if (!rure_find(regex, (const uint8_t *)text, strlen(text), 0, &found)) {
+        if (DEBUG) {
+            fprintf(stderr, "[test_find] expected match, but got no match\n");
+        }
+        ok = false;
+    }
+    /* The span is compared even when no match was reported. */
+    if (!(found.start == want_start && found.end == want_end)) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_find] expected match at (%zu, %zu), but "
+                    "got match at (%zu, %zu)\n",
+                    want_start, want_end, found.start, found.end);
+        }
+        ok = false;
+    }
+    rure_free(regex);
+    return ok;
+}
+
+/*
+ * Exercises capture-group extraction on a haystack made of the nine ASCII
+ * bytes "snowman: " followed by the three bytes E2 98 83.
+ *
+ * Checks, in order:
+ *   - rure_find_captures reports a match (a failure here is recorded but the
+ *     remaining checks still run);
+ *   - rure_captures_len reports 3;
+ *   - the group named "snowman" resolves to index 2;
+ *   - group 2 spans start 9, end 12.
+ * A wrong group count or name index jumps straight to cleanup, because the
+ * later checks would not be meaningful.
+ *
+ * Returns true only if every check performed succeeded. Diagnostics go to
+ * stderr when DEBUG is enabled.
+ */
+bool test_captures() {
+    bool passed = true;
+    const char *haystack = "snowman: \xE2\x98\x83";
+
+    rure *re = rure_compile_must(".(.*(?P<snowman>\\p{So}))$");
+    rure_match match = {0};
+    rure_captures *caps = rure_captures_new(re);
+    bool matched = rure_find_captures(re, (const uint8_t *)haystack,
+                                      strlen(haystack), 0, caps);
+    if (!matched) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_captures] expected match, but got no match\n");
+        }
+        passed = false;
+    }
+    size_t expect_captures_len = 3;
+    size_t captures_len = rure_captures_len(caps);
+    if (captures_len != expect_captures_len) {
+        if (DEBUG) {
+            /* Both values are size_t, so the unsigned %zu is required. */
+            fprintf(stderr,
+                    "[test_captures] "
+                    "expected capture group length to be %zu, but "
+                    "got %zu\n",
+                    expect_captures_len, captures_len);
+        }
+        passed = false;
+        goto done;
+    }
+    int32_t expect_capture_index = 2;
+    int32_t capture_index = rure_capture_name_index(re, "snowman");
+    if (capture_index != expect_capture_index) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_captures] "
+                    "expected capture index %d for name 'snowman', but "
+                    "got %d\n",
+                    expect_capture_index, capture_index);
+        }
+        passed = false;
+        goto done;
+    }
+    size_t expect_start = 9;
+    size_t expect_end = 12;
+    rure_captures_at(caps, 2, &match);
+    if (match.start != expect_start || match.end != expect_end) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_captures] "
+                    "expected capture 2 match at (%zu, %zu), "
+                    "but got match at (%zu, %zu)\n",
+                    expect_start, expect_end, match.start, match.end);
+        }
+        passed = false;
+    }
+done:
+    rure_captures_free(caps);
+    rure_free(re);
+    return passed;
+}
+
+/*
+ * Exercises the match iterator with the pattern \w+(\w) on "abc xyz".
+ *
+ * The first step uses rure_iter_next and expects the span start 0, end 3.
+ * The second step uses rure_iter_next_captures and expects capture group 1 to
+ * span start 6, end 7. The first failing check ends the test.
+ *
+ * Returns true only if every check succeeded. Diagnostics go to stderr when
+ * DEBUG is enabled.
+ */
+bool test_iter() {
+    const uint8_t *text = (const uint8_t *)"abc xyz";
+    size_t text_len = strlen((const char *)text);
+    size_t want_start = 0;
+    size_t want_end = 3;
+    /* Stays false unless every check below is passed. */
+    bool ok = false;
+
+    rure *regex = rure_compile_must("\\w+(\\w)");
+    rure_match span = {0};
+    rure_captures *groups = rure_captures_new(regex);
+    rure_iter *cursor = rure_iter_new(regex);
+
+    if (!rure_iter_next(cursor, text, text_len, &span)) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_iter] expected first match, but got no match\n");
+        }
+        goto done;
+    }
+    if (!(span.start == want_start && span.end == want_end)) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_iter] expected first match at (%zu, %zu), but "
+                    "got match at (%zu, %zu)\n",
+                    want_start, want_end, span.start, span.end);
+        }
+        goto done;
+    }
+
+    if (!rure_iter_next_captures(cursor, text, text_len, groups)) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_iter] expected second match, but got no match\n");
+        }
+        goto done;
+    }
+    rure_captures_at(groups, 1, &span);
+    want_start = 6;
+    want_end = 7;
+    if (!(span.start == want_start && span.end == want_end)) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_iter] expected second match at (%zu, %zu), but "
+                    "got match at (%zu, %zu)\n",
+                    want_start, want_end, span.start, span.end);
+        }
+        goto done;
+    }
+    ok = true;
+done:
+    rure_iter_free(cursor);
+    rure_captures_free(groups);
+    rure_free(regex);
+    return ok;
+}
+
+/*
+ * Compares one capture-group name against the expected name.
+ *
+ * expect: the name the caller expects.
+ * given:  the name actually obtained.
+ *
+ * Returns true when the two strings are equal. Otherwise returns false and,
+ * when DEBUG is enabled, writes both names to stderr.
+ */
+bool test_iter_capture_name(char *expect, char *given) {
+    if (strcmp(expect, given) == 0) {
+        return true;
+    }
+    if (DEBUG) {
+        fprintf(stderr,
+                "[test_iter_capture_name] expected first capture "
+                "name '%s' got '%s'\n",
+                expect, given);
+    }
+    return false;
+}
+
+/*
+ * Exercises the capture-name iterator on a pattern with the named groups
+ * "year", "month" and "day".
+ *
+ * The first name yielded is consumed without being compared (presumably it
+ * belongs to the implicit whole-match group -- confirm against rure.h). The
+ * next three names must be "year", "month" and "day", in that order.
+ *
+ * Every call to rure_iter_capture_names_next is checked, so an iterator that
+ * ends early is reported as a failure rather than leaving `name` stale.
+ *
+ * Returns true only if all names were produced and matched. Diagnostics go to
+ * stderr when DEBUG is enabled.
+ */
+bool test_iter_capture_names() {
+    bool passed = true;
+    char *expected[] = {"year", "month", "day"};
+    size_t expected_len = sizeof(expected) / sizeof(expected[0]);
+    size_t i;
+
+    char *name = NULL;
+    rure *re = rure_compile_must(
+        "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})");
+    rure_iter_capture_names *it = rure_iter_capture_names_new(re);
+
+    if (!rure_iter_capture_names_next(it, &name)) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_iter_capture_names] expected a first name, "
+                    "but got none\n");
+        }
+        passed = false;
+        goto done;
+    }
+
+    for (i = 0; i < expected_len; i++) {
+        if (!rure_iter_capture_names_next(it, &name)) {
+            if (DEBUG) {
+                fprintf(stderr,
+                        "[test_iter_capture_names] expected capture name "
+                        "'%s', but got none\n",
+                        expected[i]);
+            }
+            passed = false;
+            goto done;
+        }
+        passed = test_iter_capture_name(expected[i], name);
+        if (!passed) {
+            goto done;
+        }
+    }
+done:
+    rure_iter_capture_names_free(it);
+    rure_free(re);
+    return passed;
+}
+
+/*
+ * Verifies that compile flags take effect. The regex is compiled with a flags
+ * value of 0, which turns every flag off, Unicode mode included. With Unicode
+ * mode off, `.` may match an arbitrary byte that is not valid UTF-8, such as
+ * \xFF. (With Unicode mode on, \xFF would not match `.`)
+ *
+ * Returns true when the single byte \xFF is matched, false otherwise.
+ */
+bool test_flags() {
+    const char *pat = ".";
+    const char *text = "\xFF";
+    rure *regex = rure_compile((const uint8_t *)pat, strlen(pat),
+                               0, NULL, NULL);
+    bool ok = rure_is_match(regex, (const uint8_t *)text, strlen(text), 0);
+
+    if (!ok && DEBUG) {
+        fprintf(stderr, "[test_flags] expected match, but got no match\n");
+    }
+    rure_free(regex);
+    return ok;
+}
+
+/*
+ * Checks error reporting for a pattern that fails to compile.
+ *
+ * Compiles the one-byte pattern "(" and expects rure_compile to return NULL
+ * and the error object's message to contain the text "unclosed group".
+ *
+ * Returns true only if both expectations hold. Diagnostics go to stderr when
+ * DEBUG is enabled.
+ */
+bool test_compile_error() {
+    bool passed = true;
+    rure_error *err = rure_error_new();
+    rure *re = rure_compile((const uint8_t *)"(", 1, 0, NULL, err);
+    if (re != NULL) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_compile_error] "
+                    "expected NULL regex pointer, but got non-NULL pointer\n");
+        }
+        passed = false;
+        /* Compilation unexpectedly succeeded; release the regex. */
+        rure_free(re);
+    }
+    const char *msg = rure_error_message(err);
+    if (NULL == strstr(msg, "unclosed group")) {
+        if (DEBUG) {
+            /* The diagnostic names the same text that strstr searched for. */
+            fprintf(stderr,
+                    "[test_compile_error] "
+                    "expected an 'unclosed group' error message, but "
+                    "got this instead: '%s'\n", msg);
+        }
+        passed = false;
+    }
+    rure_error_free(err);
+    return passed;
+}
+
+/*
+ * With the compiled-program size limit set to 0, compiling "\w{100}" must
+ * fail: rure_compile has to return NULL and the error message must mention
+ * that the size limit was exceeded.
+ *
+ * Returns true when the test passes.
+ */
+bool test_compile_error_size_limit() {
+    bool passed = true;
+    const char *pattern = "\\w{100}";
+    rure_options *opts = rure_options_new();
+    rure_options_size_limit(opts, 0);
+    rure_error *err = rure_error_new();
+    /*
+     * Use strlen so the length covers exactly the 7 pattern bytes and does
+     * not pull the terminating NUL into the pattern.
+     */
+    rure *re = rure_compile((const uint8_t *)pattern, strlen(pattern),
+                            0, opts, err);
+    if (re != NULL) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_compile_error_size_limit] "
+                    "expected NULL regex pointer, but got non-NULL pointer\n");
+        }
+        passed = false;
+        rure_free(re);
+    }
+    /* msg is owned by err and stays valid until rure_error_free below. */
+    const char *msg = rure_error_message(err);
+    if (NULL == strstr(msg, "exceeds size")) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_compile_error_size_limit] "
+                    "expected an 'exceeds size' error message, but "
+                    "got this instead: '%s'\n", msg);
+        }
+        passed = false;
+    }
+    rure_options_free(opts);
+    rure_error_free(err);
+    return passed;
+}
+
+/*
+ * Compiles a set of six patterns and checks, against the haystack "foobar":
+ *   - the set reports the number of patterns it was compiled with,
+ *   - rure_set_is_match finds a match in "foobar" and none in "",
+ *   - rure_set_matches flags exactly the patterns listed in match_target.
+ *
+ * Returns true when the test passes.
+ */
+bool test_regex_set_matches() {
+
+#define PAT_COUNT 6
+
+    const char *patterns[] = {
+        "foo", "barfoo", "\\w+", "\\d+", "foobar", "bar"
+    };
+    const size_t patterns_lengths[] = {
+        3, 6, 3, 3, 6, 3
+    };
+    /* Expected per-pattern result, in the order the patterns are listed. */
+    const bool match_target[] = {
+        true, false, true, false, true, true
+    };
+    const uint8_t *haystack = (const uint8_t *) "foobar";
+    bool matches[PAT_COUNT];
+    bool passed = false;
+    int i;
+
+    rure_error *err = rure_error_new();
+    rure_set *re = rure_compile_set((const uint8_t **) patterns,
+                                    patterns_lengths,
+                                    PAT_COUNT,
+                                    0,
+                                    NULL,
+                                    err);
+    if (re != NULL) {
+        /* Short-circuiting keeps the checks in order and stops at the first
+         * failure. */
+        passed = rure_set_len(re) == PAT_COUNT
+            && rure_set_is_match(re, haystack, 6, 0)
+            && !rure_set_is_match(re, (const uint8_t *) "", 0, 0)
+            && rure_set_matches(re, haystack, 6, 0, matches);
+
+        for (i = 0; passed && i < PAT_COUNT; ++i) {
+            passed = (matches[i] == match_target[i]);
+        }
+        rure_set_free(re);
+    }
+    rure_error_free(err);
+    return passed;
+
+#undef PAT_COUNT
+}
+
+/*
+ * Checks that the `start` offset is honored by the regex-set search
+ * functions:
+ *   - no pattern matches in the first 7 bytes of "foobiasdr" from offset 2,
+ *   - all three patterns match in "fooobar" from offset 0,
+ *   - only "bar" matches in "fooobar" from offset 1.
+ *
+ * Returns true when the test passes.
+ */
+bool test_regex_set_match_start() {
+
+#define PAT_COUNT 3
+
+    bool passed = true;
+    const char *patterns[] = {
+        "foo", "bar", "fooo"
+    };
+    const size_t patterns_lengths[] = {
+        3, 3, 4
+    };
+    const char *haystack = "fooobar";
+    /* Exactly the 7 haystack bytes; the terminating NUL is not searched. */
+    const size_t haystack_len = strlen(haystack);
+    /* Expected per-pattern results when searching from offsets 0 and 1. */
+    const bool target_from_0[] = {
+        true, true, true
+    };
+    const bool target_from_1[] = {
+        false, true, false
+    };
+    bool matches[PAT_COUNT];
+    int i;
+
+    rure_error *err = rure_error_new();
+    rure_set *re = rure_compile_set((const uint8_t **) patterns,
+                                    patterns_lengths,
+                                    PAT_COUNT,
+                                    0,
+                                    NULL,
+                                    err);
+    if (re == NULL) {
+        passed = false;
+        goto done2;
+    }
+
+    if (rure_set_len(re) != PAT_COUNT) {
+        passed = false;
+        goto done1;
+    }
+
+    if (rure_set_is_match(re, (const uint8_t *)"foobiasdr", 7, 2)) {
+        passed = false;
+        goto done1;
+    }
+
+    if (!rure_set_matches(re, (const uint8_t *)haystack, haystack_len, 0,
+                          matches)) {
+        passed = false;
+        goto done1;
+    }
+    for (i = 0; i < PAT_COUNT; ++i) {
+        if (matches[i] != target_from_0[i]) {
+            passed = false;
+            goto done1;
+        }
+    }
+
+    if (!rure_set_matches(re, (const uint8_t *)haystack, haystack_len, 1,
+                          matches)) {
+        passed = false;
+        goto done1;
+    }
+    for (i = 0; i < PAT_COUNT; ++i) {
+        if (matches[i] != target_from_1[i]) {
+            passed = false;
+            goto done1;
+        }
+    }
+
+done1:
+    rure_set_free(re);
+done2:
+    rure_error_free(err);
+    return passed;
+
+#undef PAT_COUNT
+}
+
+/*
+ * Checks that options are applied when compiling a regex set: with the
+ * compiled-program size limit set to 0, compiling the single pattern
+ * "\w{100}" must fail with an error message that mentions the exceeded size
+ * limit.
+ *
+ * Returns true when the test passes.
+ */
+bool test_regex_set_options() {
+
+    bool passed = true;
+    rure_options *opts = rure_options_new();
+    rure_options_size_limit(opts, 0);
+    rure_error *err = rure_error_new();
+
+    const char *patterns[] = { "\\w{100}" };
+    /* 7 pattern bytes; the terminating NUL is not part of the pattern. */
+    const size_t patterns_lengths[] = { 7 };
+
+    rure_set *re = rure_compile_set(
+        (const uint8_t **) patterns, patterns_lengths, 1, 0, opts, err);
+    if (re != NULL) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_regex_set_options] "
+                    "expected NULL regex pointer, but got non-NULL pointer\n");
+        }
+        passed = false;
+        rure_set_free(re);
+    }
+    /* msg is owned by err and stays valid until rure_error_free below. */
+    const char *msg = rure_error_message(err);
+    if (NULL == strstr(msg, "exceeds size")) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_regex_set_options] "
+                    "expected an 'exceeds size' error message, but "
+                    "got this instead: '%s'\n", msg);
+        }
+        passed = false;
+    }
+    rure_options_free(opts);
+    rure_error_free(err);
+    return passed;
+}
+
+/*
+ * Checks that rure_escape_must escapes every regex meta character in
+ * "^[a-z]+.*$".
+ *
+ * Returns true when the test passes.
+ */
+bool test_escape() {
+    bool passed = true;
+
+    const char *pattern = "^[a-z]+.*$";
+    const char *expected_escaped = "\\^\\[a\\-z\\]\\+\\.\\*\\$";
+
+    const char *escaped = rure_escape_must(pattern);
+    if (!escaped) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_escape] expected escaped, but got no escaped\n");
+        }
+        /* Nothing was allocated, so there is nothing to free. */
+        return false;
+    }
+    if (strcmp(escaped, expected_escaped) != 0) {
+        if (DEBUG) {
+            fprintf(stderr,
+                    "[test_escape] expected \"%s\", but got \"%s\"\n",
+                    expected_escaped, escaped);
+        }
+        passed = false;
+    }
+    /* The escaped string must be released through rure_cstring_free. */
+    rure_cstring_free((char *) escaped);
+    return passed;
+}
+
+/*
+ * Runs a single test function and reports the outcome on stderr as
+ * "PASSED: <name>" or "FAILED: <name>".
+ *
+ * On failure *passed is set to false; on success it is left untouched, so one
+ * flag can accumulate the result of a whole series of tests.
+ */
+void run_test(bool (test)(), const char *name, bool *passed) {
+    if (test()) {
+        fprintf(stderr, "PASSED: %s\n", name);
+        return;
+    }
+    *passed = false;
+    fprintf(stderr, "FAILED: %s\n", name);
+}
+
+/*
+ * Runs every test in order, reporting each result through run_test.
+ *
+ * Exit status is 0 when all tests pass and 1 when at least one fails.
+ */
+int main() {
+    /* Each entry pairs a test function with the name printed for it. */
+    static const struct {
+        bool (*fn)();
+        const char *name;
+    } tests[] = {
+        { test_is_match, "test_is_match" },
+        { test_shortest_match, "test_shortest_match" },
+        { test_find, "test_find" },
+        { test_captures, "test_captures" },
+        { test_iter, "test_iter" },
+        { test_iter_capture_names, "test_iter_capture_names" },
+        { test_flags, "test_flags" },
+        { test_compile_error, "test_compile_error" },
+        { test_compile_error_size_limit, "test_compile_error_size_limit" },
+        { test_regex_set_matches, "test_regex_set_matches" },
+        { test_regex_set_options, "test_regex_set_options" },
+        { test_regex_set_match_start, "test_regex_set_match_start" },
+        { test_escape, "test_escape" }
+    };
+    bool passed = true;
+    size_t i;
+
+    for (i = 0; i < sizeof(tests) / sizeof(tests[0]); ++i) {
+        run_test(tests[i].fn, tests[i].name, &passed);
+    }
+
+    return passed ? 0 : 1;
+}
--- a/third_party/rust/rure/examples/compile
+++ b/third_party/rust/rure/examples/compile
@ -0,0 +1,9 @@
+#!/bin/sh
+
+set -ex
+
+# N.B. Add `--release` flag to `cargo build` to make the example run faster.
+cargo build --manifest-path ../Cargo.toml
+gcc -O3 -DDEBUG -o iter iter.c -ansi -Wall -I../include -L../../target/debug -lrure
+# If you're using librure.a, then you'll need to link other stuff:
+# -lutil -ldl -lpthread -lgcc_s -lc -lm -lrt -lutil -lrure
--- a/third_party/rust/rure/examples/iter.c
+++ b/third_party/rust/rure/examples/iter.c
@ -0,0 +1,99 @@
+/*
+ * This example code shows how to iterate over all regex matches in a file,
+ * emit the match location and print the contents of a capturing group.
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "rure.h"
+
+/*
+ * Memory-maps sherlock.txt, finds every match of `(\w+)\s+Holmes`
+ * (case insensitive, Unicode enabled) and prints the text of the first
+ * capturing group together with the byte offsets of the full match.
+ *
+ * Exits with status 1 if the file cannot be opened, inspected or mapped, if
+ * it is empty or too large, or if the pattern fails to compile.
+ */
+int main() {
+    /* Open a file and mmap it. */
+    int fd = open("sherlock.txt", O_RDONLY);
+    if (fd == -1) {
+        perror("failed to open sherlock.txt");
+        exit(1);
+    }
+    struct stat status;
+    if (fstat(fd, &status) == -1) {
+        perror("failed to stat sherlock.txt");
+        exit(1);
+    }
+    /*
+     * The two size checks below are not failed system calls, so errno holds
+     * nothing relevant: report them with fprintf instead of perror, which
+     * would append an unrelated errno description.
+     */
+    if ((uintmax_t)status.st_size > SIZE_MAX) {
+        fprintf(stderr, "file too big\n");
+        exit(1);
+    }
+    if (status.st_size == 0) {
+        fprintf(stderr, "file empty\n");
+        exit(1);
+    }
+    size_t sherlock_len = (size_t)status.st_size;
+    const uint8_t *sherlock = (const uint8_t *)mmap(
+        NULL, sherlock_len, PROT_READ, MAP_PRIVATE, fd, 0);
+    /* The descriptor is not used again after the mmap call. */
+    close(fd);
+    if (sherlock == MAP_FAILED) {
+        perror("could not mmap file");
+        exit(1);
+    }
+
+    /*
+     * Compile the regular expression. A more convenient routine,
+     * rure_compile_must, is also available, which will print an error message
+     * to stderr and abort the process if the regex compilation fails.
+     * We show the full gory details here as an example.
+     */
+    const char *pattern = "(\\w+)\\s+Holmes";
+    size_t pattern_len = strlen(pattern);
+    rure_error *err = rure_error_new();
+    rure *re = rure_compile((const uint8_t *)pattern, pattern_len,
+                            RURE_FLAG_UNICODE | RURE_FLAG_CASEI, NULL, err);
+    if (NULL == re) {
+        /* A null regex means compilation failed and an error exists. */
+        printf("compilation of %s failed: %s\n",
+               pattern, rure_error_message(err));
+        rure_error_free(err);
+        munmap((char*)sherlock, sherlock_len);
+        exit(1);
+    }
+    rure_error_free(err);
+
+    /*
+     * Create an iterator to find all successive non-overlapping matches.
+     * For each match, we extract the location of the capturing group.
+     */
+    rure_match group0 = {0};
+    rure_match group1 = {0};
+    rure_captures *caps = rure_captures_new(re);
+    rure_iter *it = rure_iter_new(re);
+
+    while (rure_iter_next_captures(it, sherlock, sherlock_len, caps)) {
+        /*
+         * Get the location of the full match and the capturing group.
+         * We know that both accesses are successful since the body of the
+         * loop only executes if there is a match and both capture groups
+         * must match in order for the entire regex to match.
+         *
+         * N.B. The zeroth group corresponds to the full match of the regex.
+         */
+        rure_captures_at(caps, 0, &group0);
+        rure_captures_at(caps, 1, &group1);
+        printf("%.*s (match at: %zu, %zu)\n",
+               (int)(group1.end - group1.start),
+               sherlock + group1.start,
+               group0.start, group0.end);
+    }
+
+    /* Free all our resources. */
+    munmap((char*)sherlock, sherlock_len);
+    rure_captures_free(caps);
+    rure_iter_free(it);
+    rure_free(re);
+    return 0;
+}
--- a/third_party/rust/rure/examples/sherlock.txt
+++ b/third_party/rust/rure/examples/sherlock.txt
--- a/third_party/rust/rure/include/rure.h
+++ b/third_party/rust/rure/include/rure.h
@ -0,0 +1,585 @@
+#ifndef _RURE_H
+#define _RURE_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * rure is the type of a compiled regular expression.
+ *
+ * An rure can be safely used from multiple threads simultaneously.
+ */
+typedef struct rure rure;
+
+/*
+ * rure_set is the type of a set of compiled regular expressions.
+ *
+ * An rure_set can be safely used from multiple threads simultaneously.
+ */
+typedef struct rure_set rure_set;
+
+/*
+ * rure_options is the set of non-flag configuration options for compiling
+ * a regular expression. Currently, only two options are available: setting
+ * the size limit of the compiled program and setting the size limit of the
+ * cache of states that the DFA uses while searching.
+ *
+ * For most uses, the default settings will work fine, and NULL can be passed
+ * wherever a *rure_options is expected.
+*/
+typedef struct rure_options rure_options;
+
+/*
+ * The flags listed below can be used in rure_compile to set the default
+ * flags. All flags can otherwise be toggled in the expression itself using
+ * standard syntax, e.g., `(?i)` turns case insensitive matching on and `(?-i)`
+ * disables it.
+ */
+/* The case insensitive (i) flag. */
+#define RURE_FLAG_CASEI (1 << 0)
+/* The multi-line matching (m) flag. (^ and $ match new line boundaries.) */
+#define RURE_FLAG_MULTI (1 << 1)
+/* The any character (s) flag. (. matches new line.) */
+#define RURE_FLAG_DOTNL (1 << 2)
+/* The greedy swap (U) flag. (e.g., + is ungreedy and +? is greedy.) */
+#define RURE_FLAG_SWAP_GREED (1 << 3)
+/* The ignore whitespace (x) flag. */
+#define RURE_FLAG_SPACE (1 << 4)
+/* The Unicode (u) flag. */
+#define RURE_FLAG_UNICODE (1 << 5)
+/* The default set of flags enabled when no flags are set. */
+#define RURE_DEFAULT_FLAGS RURE_FLAG_UNICODE
+
+/*
+ * rure_match corresponds to the location of a single match in a haystack.
+ */
+typedef struct rure_match {
+    /* The start position. */
+    size_t start;
+    /* The end position. */
+    size_t end;
+} rure_match;
+
+/*
+ * rure_captures represents storage for sub-capture locations of a match.
+ *
+ * Computing the capture groups of a match can carry a significant performance
+ * penalty, so their use in the API is optional.
+ *
+ * An rure_captures value can be reused in multiple calls to rure_find_captures,
+ * so long as it is used with the compiled regular expression that created
+ * it.
+ *
+ * An rure_captures value may outlive its corresponding rure and can be freed
+ * independently.
+ *
+ * It is not safe to use from multiple threads simultaneously.
+ */
+typedef struct rure_captures rure_captures;
+
+/*
+ * rure_iter is an iterator over successive non-overlapping matches in a
+ * particular haystack.
+ *
+ * An rure_iter value may not outlive its corresponding rure and should be freed
+ * before its corresponding rure is freed.
+ *
+ * It is not safe to use from multiple threads simultaneously.
+ */
+typedef struct rure_iter rure_iter;
+
+/*
+ * rure_iter_capture_names is an iterator over the list of capture group names
+ * in this particular rure.
+ *
+ * An rure_iter_capture_names value may not outlive its corresponding rure,
+ * and should be freed before its corresponding rure is freed.
+ *
+ * It is not safe to use from multiple threads simultaneously.
+ */
+typedef struct rure_iter_capture_names rure_iter_capture_names;
+
+/*
+ * rure_error is an error that caused compilation to fail.
+ *
+ * Most errors are syntax errors but an error can be returned if the compiled
+ * regular expression would be too big.
+ *
+ * Whenever a function accepts an *rure_error, it is safe to pass NULL. (But
+ * you will not get access to the error if one occurred.)
+ *
+ * It is not safe to use from multiple threads simultaneously.
+ */
+typedef struct rure_error rure_error;
+
+/*
+ * rure_compile_must compiles the given pattern into a regular expression. If
+ * compilation fails for any reason, an error message is printed to stderr and
+ * the process is aborted.
+ *
+ * The pattern given should be in UTF-8. For convenience, this accepts a C
+ * string, which means the pattern cannot usefully contain NUL. If your pattern
+ * may contain NUL, consider using a regular expression escape sequence, or
+ * just use rure_compile.
+ *
+ * This uses RURE_DEFAULT_FLAGS.
+ *
+ * The compiled expression returned may be used from multiple threads
+ * simultaneously.
+ */
+rure *rure_compile_must(const char *pattern);
+
+/*
+ * rure_compile compiles the given pattern into a regular expression. The
+ * pattern must be valid UTF-8 and the length corresponds to the number of
+ * bytes in the pattern.
+ *
+ * flags is a bitfield. Valid values are constants declared with prefix
+ * RURE_FLAG_.
+ *
+ * options contains non-flag configuration settings. If it's NULL, default
+ * settings are used. options may be freed immediately after a call to
+ * rure_compile.
+ *
+ * error is set if there was a problem compiling the pattern (including if the
+ * pattern is not valid UTF-8). If error is NULL, then no error information
+ * is returned. In all cases, if an error occurs, NULL is returned.
+ *
+ * The compiled expression returned may be used from multiple threads
+ * simultaneously.
+ */
+rure *rure_compile(const uint8_t *pattern, size_t length,
+                   uint32_t flags, rure_options *options,
+                   rure_error *error);
+
+/*
+ * rure_free frees the given compiled regular expression.
+ *
+ * This must be called at most once for any rure.
+ */
+void rure_free(rure *re);
+
+/*
+ * rure_is_match returns true if and only if re matches anywhere in haystack.
+ *
+ * haystack may contain arbitrary bytes, but ASCII compatible text is more
+ * useful. UTF-8 is even more useful. Other text encodings aren't supported.
+ * length should be the number of bytes in haystack.
+ *
+ * start is the position at which to start searching. Note that setting the
+ * start position is distinct from incrementing the pointer, since the regex
+ * engine may look at bytes before the start position to determine match
+ * information. For example, if the start position is greater than 0, then the
+ * \A ("begin text") anchor can never match.
+ *
+ * rure_is_match should be preferred to rure_find since it may be faster.
+ *
+ * N.B. The performance of this search is not impacted by the presence of
+ * capturing groups in your regular expression.
+ */
+bool rure_is_match(rure *re, const uint8_t *haystack, size_t length,
+                   size_t start);
+
+/*
+ * rure_find returns true if and only if re matches anywhere in haystack.
+ * If a match is found, then its start and end offsets (in bytes) are set
+ * on the match pointer given.
+ *
+ * haystack may contain arbitrary bytes, but ASCII compatible text is more
+ * useful. UTF-8 is even more useful. Other text encodings aren't supported.
+ * length should be the number of bytes in haystack.
+ *
+ * start is the position at which to start searching. Note that setting the
+ * start position is distinct from incrementing the pointer, since the regex
+ * engine may look at bytes before the start position to determine match
+ * information. For example, if the start position is greater than 0, then the
+ * \A ("begin text") anchor can never match.
+ *
+ * rure_find should be preferred to rure_find_captures since it may be faster.
+ *
+ * N.B. The performance of this search is not impacted by the presence of
+ * capturing groups in your regular expression.
+ */
+bool rure_find(rure *re, const uint8_t *haystack, size_t length,
+               size_t start, rure_match *match);
+
+/*
+ * rure_find_captures returns true if and only if re matches anywhere in
+ * haystack. If a match is found, then all of its capture locations are stored
+ * in the captures pointer given.
+ *
+ * haystack may contain arbitrary bytes, but ASCII compatible text is more
+ * useful. UTF-8 is even more useful. Other text encodings aren't supported.
+ * length should be the number of bytes in haystack.
+ *
+ * start is the position at which to start searching. Note that setting the
+ * start position is distinct from incrementing the pointer, since the regex
+ * engine may look at bytes before the start position to determine match
+ * information. For example, if the start position is greater than 0, then the
+ * \A ("begin text") anchor can never match.
+ *
+ * Only use this function if you specifically need access to capture locations.
+ * It is not necessary to use this function just because your regular
+ * expression contains capturing groups.
+ *
+ * Capture locations can be accessed using the rure_captures_* functions.
+ *
+ * N.B. The performance of this search can be impacted by the number of
+ * capturing groups. If you're using this function, it may be beneficial to
+ * use non-capturing groups (e.g., `(?:re)`) where possible.
+ */
+bool rure_find_captures(rure *re, const uint8_t *haystack, size_t length,
+                        size_t start, rure_captures *captures);
+
+/*
+ * rure_shortest_match returns true if and only if re matches anywhere in
+ * haystack. If a match is found, then its end location is stored in the
+ * pointer given. The end location is the place at which the regex engine
+ * determined that a match exists, but may occur before the end of the proper
+ * leftmost-first match.
+ *
+ * haystack may contain arbitrary bytes, but ASCII compatible text is more
+ * useful. UTF-8 is even more useful. Other text encodings aren't supported.
+ * length should be the number of bytes in haystack.
+ *
+ * start is the position at which to start searching. Note that setting the
+ * start position is distinct from incrementing the pointer, since the regex
+ * engine may look at bytes before the start position to determine match
+ * information. For example, if the start position is greater than 0, then the
+ * \A ("begin text") anchor can never match.
+ *
+ * rure_shortest_match should be preferred to rure_find since it may be faster.
+ *
+ * N.B. The performance of this search is not impacted by the presence of
+ * capturing groups in your regular expression.
+ */
+bool rure_shortest_match(rure *re, const uint8_t *haystack, size_t length,
+                         size_t start, size_t *end);
+
+/*
+ * rure_capture_name_index returns the capture index for the name given. If
+ * no such named capturing group exists in re, then -1 is returned.
+ *
+ * The capture index may be used with rure_captures_at.
+ *
+ * This function never returns 0 since the first capture group always
+ * corresponds to the entire match and is always unnamed.
+ */
+int32_t rure_capture_name_index(rure *re, const char *name);
+
+/*
+ * rure_iter_capture_names_new creates a new capture_names iterator.
+ *
+ * An iterator will report all successive capture group names of re.
+ */
+rure_iter_capture_names *rure_iter_capture_names_new(rure *re);
+
+/*
+ * rure_iter_capture_names_free frees the iterator given.
+ *
+ * It must be called at most once.
+ */
+void rure_iter_capture_names_free(rure_iter_capture_names *it);
+
+/*
+ * rure_iter_capture_names_next advances the iterator and returns true
+ * if and only if another capture group name exists.
+ *
+ * The value of the capture group name is written to the provided pointer.
+ */
+bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name);
+
+/*
+ * rure_iter_new creates a new iterator.
+ *
+ * An iterator will report all successive non-overlapping matches of re.
+ * When calling iterator functions, the same haystack and length must be
+ * supplied to all invocations. (Strict pointer equality is, however, not
+ * required.)
+ */
+rure_iter *rure_iter_new(rure *re);
+
+/*
+ * rure_iter_free frees the iterator given.
+ *
+ * It must be called at most once.
+ */
+void rure_iter_free(rure_iter *it);
+
+/*
+ * rure_iter_next advances the iterator and returns true if and only if a
+ * match was found. If a match is found, then the match pointer is set with the
+ * start and end location of the match, in bytes.
+ *
+ * If no match is found, then subsequent calls will return false indefinitely.
+ *
+ * haystack may contain arbitrary bytes, but ASCII compatible text is more
+ * useful. UTF-8 is even more useful. Other text encodings aren't supported.
+ * length should be the number of bytes in haystack. The given haystack must
+ * be logically equivalent to all other haystacks given to this iterator.
+ *
+ * rure_iter_next should be preferred to rure_iter_next_captures since it may
+ * be faster.
+ *
+ * N.B. The performance of this search is not impacted by the presence of
+ * capturing groups in your regular expression.
+ */
+bool rure_iter_next(rure_iter *it, const uint8_t *haystack, size_t length,
+                    rure_match *match);
+
+/*
+ * rure_iter_next_captures advances the iterator and returns true if and only if a
+ * match was found. If a match is found, then all of its capture locations are
+ * stored in the captures pointer given.
+ *
+ * If no match is found, then subsequent calls will return false indefinitely.
+ *
+ * haystack may contain arbitrary bytes, but ASCII compatible text is more
+ * useful. UTF-8 is even more useful. Other text encodings aren't supported.
+ * length should be the number of bytes in haystack. The given haystack must
+ * be logically equivalent to all other haystacks given to this iterator.
+ *
+ * Only use this function if you specifically need access to capture locations.
+ * It is not necessary to use this function just because your regular
+ * expression contains capturing groups.
+ *
+ * Capture locations can be accessed using the rure_captures_* functions.
+ *
+ * N.B. The performance of this search can be impacted by the number of
+ * capturing groups. If you're using this function, it may be beneficial to
+ * use non-capturing groups (e.g., `(?:re)`) where possible.
+ */
+bool rure_iter_next_captures(rure_iter *it,
+                             const uint8_t *haystack, size_t length,
+                             rure_captures *captures);
+
+/*
+ * rure_captures_new allocates storage for all capturing groups in re.
+ *
+ * An rure_captures value may be reused on subsequent calls to
+ * rure_find_captures or rure_iter_next_captures.
+ *
+ * An rure_captures value may be freed independently of re, although any
+ * particular rure_captures should be used only with the re given here.
+ *
+ * It is not safe to use an rure_captures value from multiple threads
+ * simultaneously.
+ */
+rure_captures *rure_captures_new(rure *re);
+
+/*
+ * rure_captures_free frees the given captures.
+ *
+ * This must be called at most once.
+ */
+void rure_captures_free(rure_captures *captures);
+
+/*
+ * rure_captures_at returns true if and only if the capturing group at the
+ * index given was part of a match. If so, the given match pointer is populated
+ * with the start and end location (in bytes) of the capturing group.
+ *
+ * If no capture group with the index i exists, then false is
+ * returned. (A capturing group exists if and only if i is less than
+ * rure_captures_len(captures).)
+ *
+ * Note that index 0 corresponds to the full match.
+ */
+bool rure_captures_at(rure_captures *captures, size_t i, rure_match *match);
+
+/*
+ * rure_captures_len returns the number of capturing groups in the given
+ * captures.
+ */
+size_t rure_captures_len(rure_captures *captures);
+
+/*
+ * rure_options_new allocates space for options.
+ *
+ * Options may be freed immediately after a call to rure_compile, but otherwise
+ * may be freely used in multiple calls to rure_compile.
+ *
+ * It is not safe to set options from multiple threads simultaneously. It is
+ * safe to call rure_compile from multiple threads simultaneously using the
+ * same options pointer.
+ */
+rure_options *rure_options_new();
+
+/*
+ * rure_options_free frees the given options.
+ *
+ * This must be called at most once.
+ */
+void rure_options_free(rure_options *options);
+
+/*
+ * rure_options_size_limit sets the approximate size limit of the compiled
+ * regular expression.
+ *
+ * This size limit roughly corresponds to the number of bytes occupied by a
+ * single compiled program. If the program would exceed this number, then a
+ * compilation error will be returned from rure_compile.
+ */
+void rure_options_size_limit(rure_options *options, size_t limit);
+
+/*
+ * rure_options_dfa_size_limit sets the approximate size of the cache used by
+ * the DFA during search.
+ *
+ * This roughly corresponds to the number of bytes that the DFA will use while
+ * searching.
+ *
+ * Note that this is a *per thread* limit. There is no way to set a global
+ * limit. In particular, if a regular expression is used from multiple threads
+ * simultaneously, then each thread may use up to the number of bytes
+ * specified here.
+ */
+void rure_options_dfa_size_limit(rure_options *options, size_t limit);
+
+/*
+ * rure_compile_set compiles the given list of patterns into a single regular
+ * expression which can be matched in a linear-scan. Each pattern in patterns
+ * must be valid UTF-8 and the length of each pattern in patterns corresponds
+ * to a byte length in patterns_lengths.
+ *
+ * The number of patterns to compile is specified by patterns_count. patterns
+ * must contain at least this many entries.
+ *
+ * flags is a bitfield. Valid values are constants declared with prefix
+ * RURE_FLAG_.
+ *
+ * options contains non-flag configuration settings. If it's NULL, default
+ * settings are used. options may be freed immediately after a call to
+ * rure_compile_set.
+ *
+ * error is set if there was a problem compiling the pattern.
+ *
+ * The compiled expression set returned may be used from multiple threads.
+ */
+rure_set *rure_compile_set(const uint8_t **patterns,
+                           const size_t *patterns_lengths,
+                           size_t patterns_count,
+                           uint32_t flags,
+                           rure_options *options,
+                           rure_error *error);
+
+/*
+ * rure_set_free frees the given compiled regular expression set.
+ *
+ * This must be called at most once for any rure_set.
+ */
+void rure_set_free(rure_set *re);
+
+/*
+ * rure_set_is_match returns true if and only if any regexes within the set
+ * match anywhere in the haystack. Once a match has been located, the
+ * matching engine will quit immediately.
+ *
+ * haystack may contain arbitrary bytes, but ASCII compatible text is more
+ * useful. UTF-8 is even more useful. Other text encodings aren't supported.
+ * length should be the number of bytes in haystack.
+ *
+ * start is the position at which to start searching. Note that setting the
+ * start position is distinct from incrementing the pointer, since the regex
+ * engine may look at bytes before the start position to determine match
+ * information. For example, if the start position is greater than 0, then the
+ * \A ("begin text") anchor can never match.
+ */
+bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length,
+                       size_t start);
+
+/*
+ * rure_set_matches compares each regex in the set against the haystack and
+ * modifies matches with the match result of each pattern. Match results are
+ * ordered in the same way as the rure_set was compiled. For example,
+ * index 0 of matches corresponds to the first pattern passed to
+ * `rure_compile_set`.
+ *
+ * haystack may contain arbitrary bytes, but ASCII compatible text is more
+ * useful. UTF-8 is even more useful. Other text encodings aren't supported.
+ * length should be the number of bytes in haystack.
+ *
+ * start is the position at which to start searching. Note that setting the
+ * start position is distinct from incrementing the pointer, since the regex
+ * engine may look at bytes before the start position to determine match
+ * information. For example, if the start position is greater than 0, then the
+ * \A ("begin text") anchor can never match.
+ *
+ * matches must point to an array with at least as many elements as the
+ * number of patterns the rure_set was compiled with.
+ *
+ * Only use this function if you specifically need to know which regexes
+ * matched within the set. To determine if any of the regexes matched without
+ * caring which, use rure_set_is_match.
+ */
+bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length,
+                      size_t start, bool *matches);
+
+/*
+ * rure_set_len returns the number of patterns rure_set was compiled with.
+ */
+size_t rure_set_len(rure_set *re);
+
+/*
+ * rure_error_new allocates space for an error.
+ *
+ * If error information is desired, then rure_error_new should be called
+ * to create an rure_error pointer, and that pointer can be passed to
+ * rure_compile. If an error occurred, then rure_compile will return NULL and
+ * the error pointer will be set. A message can then be extracted.
+ *
+ * It is not safe to use errors from multiple threads simultaneously. An error
+ * value may be reused on subsequent calls to rure_compile.
+ */
+rure_error *rure_error_new();
+
+/*
+ * rure_error_free frees the error given.
+ *
+ * This must be called at most once.
+ */
+void rure_error_free(rure_error *err);
+
+/*
+ * rure_error_message returns a NUL terminated string that describes the error
+ * message.
+ *
+ * The pointer returned must not be freed. Instead, it will be freed when
+ * rure_error_free is called. If err is used in subsequent calls to
+ * rure_compile, then this pointer may change or become invalid.
+ */
+const char *rure_error_message(rure_error *err);
+
+/*
+ * rure_escape_must returns a NUL terminated string where all meta characters
+ * have been escaped. If escaping fails for any reason, an error message is
+ * printed to stderr and the process is aborted.
+ *
+ * The pattern given should be in UTF-8. For convenience, this accepts a C
+ * string, which means the pattern cannot contain a NUL byte. These correspond
+ * to the only two failure conditions of this function. That is, if the caller
+ * guarantees that the given pattern is valid UTF-8 and does not contain a
+ * NUL byte, then this is guaranteed to succeed (modulo out-of-memory errors).
+ *
+ * The pointer returned must not be freed directly. Instead, it should be freed
+ * by calling rure_cstring_free.
+ */
+const char *rure_escape_must(const char *pattern);
+
+/*
+ * rure_cstring_free frees the string given.
+ *
+ * This must be called at most once per string.
+ */
+void rure_cstring_free(char *s);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/third_party/rust/rure/src/error.rs
+++ b/third_party/rust/rure/src/error.rs
@ -0,0 +1,79 @@
+use std::ffi;
+use std::ffi::CString;
+use std::fmt;
+use std::str;
+
+use libc::c_char;
+use regex;
+
+/// Error value handed across the C boundary.
+///
+/// `kind` records what went wrong. `message` caches the C string most
+/// recently produced by `rure_error_message`, which keeps the pointer handed
+/// to the caller alive until the message is regenerated or this value is
+/// dropped or overwritten.
+#[derive(Debug)]
+pub struct Error {
+    message: Option<CString>,
+    kind: ErrorKind,
+}
+
+/// The underlying cause stored inside an `Error`.
+#[derive(Debug)]
+pub enum ErrorKind {
+    /// No error has been recorded.
+    None,
+    /// The bytes supplied by the caller were not valid UTF-8.
+    Str(str::Utf8Error),
+    /// The `regex` crate reported an error while building a pattern.
+    Regex(regex::Error),
+    /// A string that had to be converted to a C string contained a NUL byte.
+    Nul(ffi::NulError),
+}
+
+impl Error {
+    /// Builds an error of the given kind with no cached message.
+    pub fn new(kind: ErrorKind) -> Error {
+        Error { kind, message: None }
+    }
+
+    /// Reports whether this value holds an actual error, i.e. anything other
+    /// than `ErrorKind::None`.
+    pub fn is_err(&self) -> bool {
+        // Kept exhaustive on purpose so that adding a variant forces a
+        // decision here.
+        match self.kind {
+            ErrorKind::Str(_) => true,
+            ErrorKind::Regex(_) => true,
+            ErrorKind::Nul(_) => true,
+            ErrorKind::None => false,
+        }
+    }
+}
+
+impl fmt::Display for Error {
+    /// Writes "no error" for `ErrorKind::None`; otherwise forwards to the
+    /// `Display` implementation of the wrapped error.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match &self.kind {
+            ErrorKind::None => f.write_str("no error"),
+            ErrorKind::Str(inner) => fmt::Display::fmt(inner, f),
+            ErrorKind::Regex(inner) => fmt::Display::fmt(inner, f),
+            ErrorKind::Nul(inner) => fmt::Display::fmt(inner, f),
+        }
+    }
+}
+
+ffi_fn! {
+    fn rure_error_new() -> *mut Error {
+        Box::into_raw(Box::new(Error::new(ErrorKind::None)))
+    }
+}
+
+ffi_fn! {
+    fn rure_error_free(err: *mut Error) {
+        unsafe { drop(Box::from_raw(err)); }
+    }
+}
+
+ffi_fn! {
+    fn rure_error_message(err: *mut Error) -> *const c_char {
+        let err = unsafe { &mut *err };
+        let cmsg = match CString::new(format!("{}", err)) {
+            Ok(msg) => msg,
+            Err(err) => {
+                // I guess this can probably happen if the regex itself has a
+                // NUL, and that NUL re-occurs in the context presented by the
+                // error message. In this case, just show as much as we can.
+                let nul = err.nul_position();
+                let msg = err.into_vec();
+                CString::new(msg[0..nul].to_owned()).unwrap()
+            }
+        };
+        let p = cmsg.as_ptr();
+        err.message = Some(cmsg);
+        p
+    }
+}
--- a/third_party/rust/rure/src/lib.rs
+++ b/third_party/rust/rure/src/lib.rs
@ -0,0 +1,7 @@
+#[macro_use]
+mod macros;
+mod error;
+mod rure;
+
+pub use crate::error::*;
+pub use crate::rure::*;
--- a/third_party/rust/rure/src/macros.rs
+++ b/third_party/rust/rure/src/macros.rs
@ -0,0 +1,36 @@
+// Declares an exported `#[no_mangle] pub extern` function whose body runs
+// inside `catch_unwind`. If the body panics, the panic message is written to
+// stderr and the process is aborted instead of unwinding out of the function.
+//
+// The generated function also brings `io`, `Write`, `panic`,
+// `AssertUnwindSafe` and `libc::abort` into scope via `use` statements placed
+// ahead of the body.
+macro_rules! ffi_fn {
+    // With a return type and a trailing comma after the last argument:
+    // strip the comma and defer to the next arm.
+    (fn $name:ident($($arg:ident: $arg_ty:ty),*,) -> $ret:ty $body:block) => {
+        ffi_fn!(fn $name($($arg: $arg_ty),*) -> $ret $body);
+    };
+    // Main arm: emits the actual function.
+    (fn $name:ident($($arg:ident: $arg_ty:ty),*) -> $ret:ty $body:block) => {
+        #[no_mangle]
+        pub extern fn $name($($arg: $arg_ty),*) -> $ret {
+            use ::std::io::{self, Write};
+            use ::std::panic::{self, AssertUnwindSafe};
+            use ::libc::abort;
+            // The body becomes a closure, so a `return` inside it yields the
+            // value of the whole exported function.
+            match panic::catch_unwind(AssertUnwindSafe(move || $body)) {
+                Ok(v) => v,
+                Err(err) => {
+                    // Panic payloads are handled when they are `&str` or
+                    // `String`; anything else gets a fixed placeholder.
+                    let msg = if let Some(&s) = err.downcast_ref::<&str>() {
+                        s.to_owned()
+                    } else if let Some(s) = err.downcast_ref::<String>() {
+                        s.to_owned()
+                    } else {
+                        "UNABLE TO SHOW RESULT OF PANIC.".to_owned()
+                    };
+                    // Best effort: a failure to write to stderr is ignored.
+                    let _ = writeln!(
+                        &mut io::stderr(),
+                        "panic unwind caught, aborting: {:?}",
+                        msg);
+                    unsafe { abort() }
+                }
+            }
+        }
+    };
+    // No return type, trailing comma: defer to the main arm with `-> ()`.
+    (fn $name:ident($($arg:ident: $arg_ty:ty),*,) $body:block) => {
+        ffi_fn!(fn $name($($arg: $arg_ty),*) -> () $body);
+    };
+    // No return type, no trailing comma: defer to the main arm with `-> ()`.
+    (fn $name:ident($($arg:ident: $arg_ty:ty),*) $body:block) => {
+        ffi_fn!(fn $name($($arg: $arg_ty),*) -> () $body);
+    };
+}
--- a/third_party/rust/rure/src/rure.rs
+++ b/third_party/rust/rure/src/rure.rs
@ -0,0 +1,629 @@
+use std::collections::HashMap;
+use std::ffi::{CStr, CString};
+use std::ops::Deref;
+use std::ptr;
+use std::slice;
+use std::str;
+
+use libc::{c_char, size_t};
+use regex::bytes;
+
+use crate::error::{Error, ErrorKind};
+
+// Bit flags accepted by `rure_compile` and `rure_compile_set`. Each bit is
+// forwarded to the builder option named in its comment.
+// Builder option: `case_insensitive`.
+const RURE_FLAG_CASEI: u32 = 1 << 0;
+// Builder option: `multi_line`.
+const RURE_FLAG_MULTI: u32 = 1 << 1;
+// Builder option: `dot_matches_new_line`.
+const RURE_FLAG_DOTNL: u32 = 1 << 2;
+// Builder option: `swap_greed`.
+const RURE_FLAG_SWAP_GREED: u32 = 1 << 3;
+// Builder option: `ignore_whitespace`.
+const RURE_FLAG_SPACE: u32 = 1 << 4;
+// Builder option: `unicode`.
+const RURE_FLAG_UNICODE: u32 = 1 << 5;
+// Flags used by `rure_compile_must`: Unicode support only.
+const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE;
+
+/// A compiled regex together with a lookup table from capture group name to
+/// capture group index (filled in by `rure_compile`).
+pub struct Regex {
+    re: bytes::Regex,
+    capture_names: HashMap<String, i32>,
+}
+
+/// Compilation limits that are passed to the regex builders' `size_limit`
+/// and `dfa_size_limit` settings.
+pub struct Options {
+    size_limit: usize,
+    dfa_size_limit: usize,
+}
+
+// Wrapper around `bytes::RegexSet`; it dereferences to the inner set.
+//
+// NOTE(review): an earlier version of this comment said that the `Exec`
+// structure is used directly because `RegexSet` did not support options or
+// matching at an arbitrary position. The field below is a plain
+// `bytes::RegexSet`, so that remark looks out of date -- confirm upstream.
+pub struct RegexSet {
+    re: bytes::RegexSet,
+}
+
+/// C-layout pair of offsets describing one match; the functions in this file
+/// fill it from a match's `start()` and `end()` values.
+#[repr(C)]
+pub struct rure_match {
+    pub start: size_t,
+    pub end: size_t,
+}
+
+/// Capture group locations, created from a regex by `rure_captures_new` and
+/// filled in by the capture-searching functions.
+pub struct Captures(bytes::Locations);
+
+/// State for iterating over successive matches of a regex.
+///
+/// `re` is a raw pointer to the regex being searched, `last_end` is the
+/// offset at which the next search starts, and `last_match` is the end offset
+/// of the most recently reported match (if any).
+pub struct Iter {
+    re: *const Regex,
+    last_end: usize,
+    last_match: Option<usize>,
+}
+
+/// State for iterating over the capture group names of a regex.
+///
+/// `capture_names` is typed with a `'static` lifetime although it is obtained
+/// from a regex reached through a raw pointer, so the regex presumably has to
+/// outlive this iterator -- TODO confirm against the C header documentation.
+/// `name_ptrs` records every C string handed out so that they can be released
+/// when the iterator is freed.
+pub struct IterCaptureNames {
+    capture_names: bytes::CaptureNames<'static>,
+    name_ptrs: Vec<*mut c_char>,
+}
+
+/// Lets a `Regex` wrapper be used wherever a `bytes::Regex` method is needed.
+impl Deref for Regex {
+    type Target = bytes::Regex;
+
+    fn deref(&self) -> &Self::Target {
+        let inner = &self.re;
+        inner
+    }
+}
+
+/// Lets a `RegexSet` wrapper be used wherever a `bytes::RegexSet` method is
+/// needed.
+impl Deref for RegexSet {
+    type Target = bytes::RegexSet;
+
+    fn deref(&self) -> &Self::Target {
+        let inner = &self.re;
+        inner
+    }
+}
+
+impl Default for Options {
+    /// Default limits: 10 MiB for `size_limit` and 2 MiB for
+    /// `dfa_size_limit`.
+    fn default() -> Options {
+        const MIB: usize = 1 << 20;
+        Options { dfa_size_limit: 2 * MIB, size_limit: 10 * MIB }
+    }
+}
+
+// Compiles a NUL-terminated pattern with the default flags and no options.
+// If compilation fails, the error is printed to stderr and the process is
+// aborted.
+ffi_fn! {
+    fn rure_compile_must(pattern: *const c_char) -> *const Regex {
+        // SAFETY: the caller must supply a valid NUL-terminated C string.
+        let pattern_len = unsafe { CStr::from_ptr(pattern).to_bytes().len() };
+        let mut failure = Error::new(ErrorKind::None);
+        let compiled = rure_compile(
+            pattern as *const u8,
+            pattern_len,
+            RURE_DEFAULT_FLAGS,
+            ptr::null(),
+            &mut failure,
+        );
+        if failure.is_err() {
+            // `writeln!`, `io` and `abort` are in scope through `ffi_fn!`.
+            let _ = writeln!(&mut io::stderr(), "{}", failure);
+            let _ = writeln!(
+                &mut io::stderr(), "aborting from rure_compile_must");
+            unsafe { abort() }
+        }
+        compiled
+    }
+}
+
+// Compiles `length` bytes at `pattern` into a regex.
+//
+// `flags` is a combination of the `RURE_FLAG_*` bits. `options` may be null,
+// in which case the builder's own limits are used. On failure a null pointer
+// is returned and, when `error` is non-null, the cause is stored in `*error`.
+ffi_fn! {
+    fn rure_compile(
+        pattern: *const u8,
+        length: size_t,
+        flags: u32,
+        options: *const Options,
+        error: *mut Error,
+    ) -> *const Regex {
+        let raw = unsafe { slice::from_raw_parts(pattern, length) };
+        let source = match str::from_utf8(raw) {
+            Ok(source) => source,
+            Err(utf8_err) => {
+                if !error.is_null() {
+                    unsafe { *error = Error::new(ErrorKind::Str(utf8_err)); }
+                }
+                return ptr::null();
+            }
+        };
+
+        let mut builder = bytes::RegexBuilder::new(source);
+        if let Some(opts) = unsafe { options.as_ref() } {
+            builder.size_limit(opts.size_limit);
+            builder.dfa_size_limit(opts.dfa_size_limit);
+        }
+        let has = |bit: u32| flags & bit != 0;
+        builder.case_insensitive(has(RURE_FLAG_CASEI));
+        builder.multi_line(has(RURE_FLAG_MULTI));
+        builder.dot_matches_new_line(has(RURE_FLAG_DOTNL));
+        builder.swap_greed(has(RURE_FLAG_SWAP_GREED));
+        builder.ignore_whitespace(has(RURE_FLAG_SPACE));
+        builder.unicode(has(RURE_FLAG_UNICODE));
+
+        let compiled = match builder.build() {
+            Ok(compiled) => compiled,
+            Err(build_err) => {
+                if !error.is_null() {
+                    unsafe {
+                        *error = Error::new(ErrorKind::Regex(build_err));
+                    }
+                }
+                return ptr::null();
+            }
+        };
+
+        // Index the named groups so lookups by name do not need to walk the
+        // capture list again.
+        let capture_names: HashMap<String, i32> = compiled
+            .capture_names()
+            .enumerate()
+            .filter_map(|(idx, name)| name.map(|n| (n.to_owned(), idx as i32)))
+            .collect();
+        let boxed: *const Regex = Box::into_raw(Box::new(Regex {
+            re: compiled,
+            capture_names,
+        }));
+        boxed
+    }
+}
+
+// Reclaims and drops a regex.
+ffi_fn! {
+    fn rure_free(re: *const Regex) {
+        // SAFETY: the caller must pass a pointer produced by `Box::into_raw`
+        // for a `Regex` (as `rure_compile` does) that has not been freed.
+        let owned = unsafe { Box::from_raw(re as *mut Regex) };
+        drop(owned);
+    }
+}
+
+// Returns the result of `is_match_at` for the `len` bytes at `haystack`,
+// with the search beginning at offset `start`.
+ffi_fn! {
+    fn rure_is_match(
+        re: *const Regex,
+        haystack: *const u8,
+        len: size_t,
+        start: size_t,
+    ) -> bool {
+        let regex = unsafe { &*re };
+        let text = unsafe { slice::from_raw_parts(haystack, len) };
+        let matched = regex.is_match_at(text, start);
+        matched
+    }
+}
+
+// Runs `find_at` on the haystack from offset `start`. Returns true when a
+// match is found; its offsets are written to `match_info` unless that
+// pointer is null.
+ffi_fn! {
+    fn rure_find(
+        re: *const Regex,
+        haystack: *const u8,
+        len: size_t,
+        start: size_t,
+        match_info: *mut rure_match,
+    ) -> bool {
+        let regex = unsafe { &*re };
+        let text = unsafe { slice::from_raw_parts(haystack, len) };
+        match regex.find_at(text, start) {
+            None => false,
+            Some(found) => {
+                if !match_info.is_null() {
+                    unsafe {
+                        (*match_info).start = found.start();
+                        (*match_info).end = found.end();
+                    }
+                }
+                true
+            }
+        }
+    }
+}
+
+// Runs `read_captures_at` on the haystack from offset `start`, recording
+// group locations into `captures`. Returns true when a match is found.
+ffi_fn! {
+    fn rure_find_captures(
+        re: *const Regex,
+        haystack: *const u8,
+        len: size_t,
+        start: size_t,
+        captures: *mut Captures,
+    ) -> bool {
+        let regex = unsafe { &*re };
+        let text = unsafe { slice::from_raw_parts(haystack, len) };
+        let locations = unsafe { &mut (*captures).0 };
+        let found = regex.read_captures_at(locations, text, start);
+        found.is_some()
+    }
+}
+
+// Runs `shortest_match_at` on the haystack from offset `start`. Returns true
+// when a match is found; the offset it reports is written to `end` unless
+// that pointer is null.
+ffi_fn! {
+    fn rure_shortest_match(
+        re: *const Regex,
+        haystack: *const u8,
+        len: size_t,
+        start: size_t,
+        end: *mut usize,
+    ) -> bool {
+        let regex = unsafe { &*re };
+        let text = unsafe { slice::from_raw_parts(haystack, len) };
+        if let Some(stop) = regex.shortest_match_at(text, start) {
+            if !end.is_null() {
+                unsafe { *end = stop; }
+            }
+            true
+        } else {
+            false
+        }
+    }
+}
+
+// Looks up the index of the capture group called `name`. Returns -1 when the
+// name is not valid UTF-8 or no group has that name.
+ffi_fn! {
+    fn rure_capture_name_index(
+        re: *const Regex,
+        name: *const c_char,
+    ) -> i32 {
+        let regex = unsafe { &*re };
+        // SAFETY: the caller must supply a valid NUL-terminated C string.
+        let wanted = unsafe { CStr::from_ptr(name) };
+        match wanted.to_str() {
+            Ok(key) => regex.capture_names.get(key).copied().unwrap_or(-1),
+            Err(_) => -1,
+        }
+    }
+}
+
+// Creates an iterator over the capture group names of `re`.
+//
+// The iterator borrows from the regex behind the raw pointer, so the regex
+// presumably has to stay alive for as long as the iterator is used.
+ffi_fn! {
+    fn rure_iter_capture_names_new(
+        re: *const Regex,
+    ) -> *mut IterCaptureNames {
+        let regex = unsafe { &*re };
+        let state = IterCaptureNames {
+            name_ptrs: Vec::new(),
+            capture_names: regex.re.capture_names(),
+        };
+        Box::into_raw(Box::new(state))
+    }
+}
+
+// Frees a capture-name iterator together with every name string it handed
+// out.
+ffi_fn! {
+    fn rure_iter_capture_names_free(it: *mut IterCaptureNames) {
+        unsafe {
+            // SAFETY: the caller must pass a pointer returned by
+            // `rure_iter_capture_names_new` that has not been freed yet.
+            let mut owned = Box::from_raw(it);
+            // Release the names newest-first; each pointer came from
+            // `CString::into_raw`.
+            for name in owned.name_ptrs.drain(..).rev() {
+                drop(CString::from_raw(name));
+            }
+            drop(owned);
+        }
+    }
+}
+
+// Advances the capture-name iterator. On success writes a pointer to a
+// NUL-terminated copy of the next name into `*capture_name` and returns
+// true; unnamed groups yield an empty string. Returns false when
+// `capture_name` is null, when the groups are exhausted, or when the name
+// cannot be turned into a C string.
+ffi_fn! {
+    fn rure_iter_capture_names_next(
+        it: *mut IterCaptureNames,
+        capture_name: *mut *mut c_char,
+    ) -> bool {
+        if capture_name.is_null() {
+            return false;
+        }
+
+        let state = unsafe { &mut *it };
+        let name = match it_next(state) {
+            Some(name) => name,
+            None => return false,
+        };
+        let owned = match CString::new(name.as_bytes()) {
+            Ok(owned) => owned,
+            Err(_) => return false,
+        };
+        // The iterator keeps track of the string so that it can be released
+        // when the iterator is freed.
+        let raw = owned.into_raw();
+        state.name_ptrs.push(raw);
+        unsafe { *capture_name = raw; }
+        return true;
+
+        // Yields the next group name, mapping unnamed groups to "".
+        fn it_next(state: &mut IterCaptureNames) -> Option<&'static str> {
+            state.capture_names.next().map(|name| name.unwrap_or(""))
+        }
+    }
+}
+
+// Creates a match iterator for `re`, positioned at offset 0 with no previous
+// match recorded.
+ffi_fn! {
+    fn rure_iter_new(
+        re: *const Regex,
+    ) -> *mut Iter {
+        let state = Iter { re, last_end: 0, last_match: None };
+        Box::into_raw(Box::new(state))
+    }
+}
+
+// Reclaims and drops a match iterator.
+ffi_fn! {
+    fn rure_iter_free(it: *mut Iter) {
+        // SAFETY: the caller must pass a pointer produced by `Box::into_raw`
+        // for an `Iter` (as `rure_iter_new` does) that has not been freed.
+        let owned = unsafe { Box::from_raw(it) };
+        drop(owned);
+    }
+}
+
+// Advances the iterator to the next match in the haystack.
+//
+// Returns true and, when `match_info` is non-null, writes the match offsets
+// into it. Returns false once there are no more matches. An empty match that
+// ends exactly where the previously reported match ended is skipped.
+ffi_fn! {
+    fn rure_iter_next(
+        it: *mut Iter,
+        haystack: *const u8,
+        len: size_t,
+        match_info: *mut rure_match,
+    ) -> bool {
+        let it = unsafe { &mut *it };
+        let re = unsafe { &*it.re };
+        let text = unsafe { slice::from_raw_parts(haystack, len) };
+        loop {
+            if it.last_end > text.len() {
+                return false;
+            }
+            let (s, e) = match re.find_at(text, it.last_end) {
+                None => return false,
+                Some(m) => (m.start(), m.end()),
+            };
+            if s == e {
+                // This is an empty match. To ensure we make progress, start
+                // the next search just past it. Advancing by a single byte
+                // from the previous search position instead would re-find
+                // this same empty match once per byte whenever it lies far
+                // ahead (e.g. `$` on a long haystack).
+                it.last_end = e + 1;
+                // Don't accept empty matches immediately following a match.
+                // Just move on to the next match.
+                if Some(e) == it.last_match {
+                    continue;
+                }
+            } else {
+                it.last_end = e;
+            }
+            it.last_match = Some(e);
+            if !match_info.is_null() {
+                unsafe {
+                    (*match_info).start = s;
+                    (*match_info).end = e;
+                }
+            }
+            return true;
+        }
+    }
+}
+
+// Advances the iterator to the next match in the haystack, recording the
+// capture group locations into `captures`.
+//
+// Returns true when a match was found and false once there are no more
+// matches. An empty match that ends exactly where the previously reported
+// match ended is skipped.
+ffi_fn! {
+    fn rure_iter_next_captures(
+        it: *mut Iter,
+        haystack: *const u8,
+        len: size_t,
+        captures: *mut Captures,
+    ) -> bool {
+        let it = unsafe { &mut *it };
+        let re = unsafe { &*it.re };
+        let slots = unsafe { &mut (*captures).0 };
+        let text = unsafe { slice::from_raw_parts(haystack, len) };
+        loop {
+            if it.last_end > text.len() {
+                return false;
+            }
+            let (s, e) = match re.read_captures_at(slots, text, it.last_end) {
+                None => return false,
+                Some(m) => (m.start(), m.end()),
+            };
+            if s == e {
+                // This is an empty match. To ensure we make progress, start
+                // the next search just past it. Advancing by a single byte
+                // from the previous search position instead would re-find
+                // this same empty match once per byte whenever it lies far
+                // ahead (e.g. `$` on a long haystack).
+                it.last_end = e + 1;
+                // Don't accept empty matches immediately following a match.
+                // Just move on to the next match.
+                if Some(e) == it.last_match {
+                    continue;
+                }
+            } else {
+                it.last_end = e;
+            }
+            it.last_match = Some(e);
+            return true;
+        }
+    }
+}
+
+// Allocates capture storage for `re`, built from the regex's `locations()`.
+ffi_fn! {
+    fn rure_captures_new(re: *const Regex) -> *mut Captures {
+        let regex = unsafe { &*re };
+        let boxed = Box::new(Captures(regex.locations()));
+        Box::into_raw(boxed)
+    }
+}
+
+// Reclaims and drops capture storage.
+ffi_fn! {
+    fn rure_captures_free(captures: *const Captures) {
+        // SAFETY: the caller must pass a pointer produced by `Box::into_raw`
+        // for a `Captures` (as `rure_captures_new` does), not yet freed.
+        let owned = unsafe { Box::from_raw(captures as *mut Captures) };
+        drop(owned);
+    }
+}
+
+// Reads the location of capture group `i`. Returns false when `pos(i)` has
+// no location for that group; otherwise returns true and, when `match_info`
+// is non-null, writes the group's offsets into it.
+ffi_fn! {
+    fn rure_captures_at(
+        captures: *const Captures,
+        i: size_t,
+        match_info: *mut rure_match,
+    ) -> bool {
+        let locations = unsafe { &(*captures).0 };
+        if let Some((from, to)) = locations.pos(i) {
+            if !match_info.is_null() {
+                unsafe {
+                    (*match_info).start = from;
+                    (*match_info).end = to;
+                }
+            }
+            true
+        } else {
+            false
+        }
+    }
+}
+
+// Returns the `len()` of the capture locations.
+ffi_fn! {
+    fn rure_captures_len(captures: *const Captures) -> size_t {
+        let locations = unsafe { &(*captures).0 };
+        locations.len()
+    }
+}
+
+// Allocates an options value holding the default limits.
+ffi_fn! {
+    fn rure_options_new() -> *mut Options {
+        let defaults = Options::default();
+        Box::into_raw(Box::new(defaults))
+    }
+}
+
+// Reclaims and drops an options value.
+ffi_fn! {
+    fn rure_options_free(options: *mut Options) {
+        // SAFETY: the caller must pass a pointer produced by `Box::into_raw`
+        // for an `Options` (as `rure_options_new` does), not yet freed.
+        let owned = unsafe { Box::from_raw(options) };
+        drop(owned);
+    }
+}
+
+// Stores `limit` as the `size_limit` of the given options.
+ffi_fn! {
+    fn rure_options_size_limit(options: *mut Options, limit: size_t) {
+        unsafe {
+            (*options).size_limit = limit;
+        }
+    }
+}
+
+// Stores `limit` as the `dfa_size_limit` of the given options.
+ffi_fn! {
+    fn rure_options_dfa_size_limit(options: *mut Options, limit: size_t) {
+        unsafe {
+            (*options).dfa_size_limit = limit;
+        }
+    }
+}
+
+// Compiles `patterns_count` patterns into a regex set. `patterns` and
+// `patterns_lengths` are parallel arrays of pattern pointers and byte
+// lengths.
+//
+// `flags` is a combination of the `RURE_FLAG_*` bits. `options` may be null.
+// On failure a null pointer is returned and, when `error` is non-null, the
+// cause is stored in `*error`.
+ffi_fn! {
+    fn rure_compile_set(
+        patterns: *const *const u8,
+        patterns_lengths: *const size_t,
+        patterns_count: size_t,
+        flags: u32,
+        options: *const Options,
+        error: *mut Error
+    ) -> *const RegexSet {
+        let starts = unsafe {
+            slice::from_raw_parts(patterns, patterns_count)
+        };
+        let lengths = unsafe {
+            slice::from_raw_parts(patterns_lengths, patterns_count)
+        };
+
+        let mut sources = Vec::with_capacity(patterns_count);
+        for (&start, &length) in starts.iter().zip(lengths) {
+            let raw = unsafe { slice::from_raw_parts(start, length) };
+            match str::from_utf8(raw) {
+                Ok(source) => sources.push(source),
+                Err(utf8_err) => {
+                    if !error.is_null() {
+                        unsafe {
+                            *error = Error::new(ErrorKind::Str(utf8_err));
+                        }
+                    }
+                    return ptr::null();
+                }
+            }
+        }
+
+        let mut builder = bytes::RegexSetBuilder::new(sources);
+        if let Some(opts) = unsafe { options.as_ref() } {
+            builder.size_limit(opts.size_limit);
+            builder.dfa_size_limit(opts.dfa_size_limit);
+        }
+        let has = |bit: u32| flags & bit != 0;
+        builder.case_insensitive(has(RURE_FLAG_CASEI));
+        builder.multi_line(has(RURE_FLAG_MULTI));
+        builder.dot_matches_new_line(has(RURE_FLAG_DOTNL));
+        builder.swap_greed(has(RURE_FLAG_SWAP_GREED));
+        builder.ignore_whitespace(has(RURE_FLAG_SPACE));
+        builder.unicode(has(RURE_FLAG_UNICODE));
+
+        let compiled = match builder.build() {
+            Ok(compiled) => compiled,
+            Err(build_err) => {
+                if !error.is_null() {
+                    unsafe {
+                        *error = Error::new(ErrorKind::Regex(build_err));
+                    }
+                }
+                return ptr::null();
+            }
+        };
+        let boxed: *const RegexSet =
+            Box::into_raw(Box::new(RegexSet { re: compiled }));
+        boxed
+    }
+}
+
+// Reclaims and drops a regex set.
+ffi_fn! {
+    fn rure_set_free(re: *const RegexSet) {
+        // SAFETY: the caller must pass a pointer produced by `Box::into_raw`
+        // for a `RegexSet` (as `rure_compile_set` does), not yet freed.
+        let owned = unsafe { Box::from_raw(re as *mut RegexSet) };
+        drop(owned);
+    }
+}
+
+// Returns the result of the set's `is_match_at` for the `len` bytes at
+// `haystack`, with the search beginning at offset `start`.
+ffi_fn! {
+    fn rure_set_is_match(
+        re: *const RegexSet,
+        haystack: *const u8,
+        len: size_t,
+        start: size_t
+    ) -> bool {
+        let set = unsafe { &*re };
+        let text = unsafe { slice::from_raw_parts(haystack, len) };
+        let matched = set.is_match_at(text, start);
+        matched
+    }
+}
+
+// Runs the set's `read_matches_at` on the haystack from offset `start` and
+// returns its result. `matches` is treated as an array of `re.len()`
+// booleans, which are all cleared before the search fills them in.
+ffi_fn! {
+    fn rure_set_matches(
+        re: *const RegexSet,
+        haystack: *const u8,
+        len: size_t,
+        start: size_t,
+        matches: *mut bool
+    ) -> bool {
+        let set = unsafe { &*re };
+        let text = unsafe { slice::from_raw_parts(haystack, len) };
+        let results = unsafe {
+            slice::from_raw_parts_mut(matches, set.len())
+        };
+
+        // read_matches_at isn't guaranteed to set non-matches to false
+        results.iter_mut().for_each(|slot| *slot = false);
+        set.read_matches_at(results, text, start)
+    }
+}
+
+// Returns the `len()` of the regex set.
+ffi_fn! {
+    fn rure_set_len(re: *const RegexSet) -> size_t {
+        let set = unsafe { &*re };
+        set.len()
+    }
+}
+
+// Escapes the regex meta characters in a NUL-terminated pattern and returns
+// the result as a newly allocated C string. If escaping fails, the error is
+// printed to stderr and the process is aborted.
+ffi_fn! {
+    fn rure_escape_must(pattern: *const c_char) -> *const c_char {
+        // SAFETY: the caller must supply a valid NUL-terminated C string.
+        let pattern_len = unsafe { CStr::from_ptr(pattern).to_bytes().len() };
+        let mut failure = Error::new(ErrorKind::None);
+        let escaped = rure_escape(
+            pattern as *const u8, pattern_len, &mut failure);
+        if failure.is_err() {
+            // `writeln!`, `io` and `abort` are in scope through `ffi_fn!`.
+            let _ = writeln!(&mut io::stderr(), "{}", failure);
+            let _ = writeln!(
+                &mut io::stderr(), "aborting from rure_escape_must");
+            unsafe { abort() }
+        }
+        escaped
+    }
+}
+
+/// A helper function that implements fallible escaping in a way that returns
+/// an error if escaping failed.
+///
+/// This should ideally be exposed, but it needs API design work. In
+/// particular, this should not return a C string, but a `const uint8_t *`
+/// instead, since it may contain a NUL byte.
+fn rure_escape(
+    pattern: *const u8,
+    length: size_t,
+    error: *mut Error,
+) -> *const c_char {
+    // Stores `kind` in `*error` when the caller supplied an error slot.
+    let report = |kind: ErrorKind| {
+        if !error.is_null() {
+            unsafe { *error = Error::new(kind); }
+        }
+    };
+
+    let raw: &[u8] = unsafe { slice::from_raw_parts(pattern, length) };
+    let source = match str::from_utf8(raw) {
+        Ok(source) => source,
+        Err(utf8_err) => {
+            report(ErrorKind::Str(utf8_err));
+            return ptr::null();
+        }
+    };
+    // Conversion to a C string fails if the escaped text contains a NUL.
+    match CString::new(regex::escape(source)) {
+        Ok(escaped) => escaped.into_raw() as *const c_char,
+        Err(nul_err) => {
+            report(ErrorKind::Nul(nul_err));
+            ptr::null()
+        }
+    }
+}
+
+// Reclaims and drops a C string that was handed out via
+// `CString::into_raw`.
+ffi_fn! {
+    fn rure_cstring_free(s: *mut c_char) {
+        // SAFETY: the caller must pass a pointer obtained from
+        // `CString::into_raw` that has not been freed yet.
+        let owned = unsafe { CString::from_raw(s) };
+        drop(owned);
+    }
+}
--- a/third_party/rust/rure/test
+++ b/third_party/rust/rure/test
@ -0,0 +1,7 @@
+#!/bin/sh
+
+# Builds the crate, then runs the `compile` script and the resulting binary
+# in `ctest` and in `examples`, pointing the dynamic loader at the debug
+# build output.
+
+# Stop at the first failing command.
+set -e
+
+cargo build --verbose
+(cd ctest && ./compile && LD_LIBRARY_PATH=../../target/debug ./test)
+(cd examples && ./compile && LD_LIBRARY_PATH=../../target/debug ./iter)
--- a/toolkit/library/rust/moz.build
+++ b/toolkit/library/rust/moz.build
@ -46,3 +46,9 @@ if CONFIG["CPU_ARCH"] != "x86":

 if CONFIG["MOZ_BITS_DOWNLOAD"]:
    RUST_TESTS += ["bits_client"]
+
+# Export the `rure` crate's included .h file. The symbols defined in that file
+# will be exported from the `gkrust-shared` crate.
+EXPORTS += [
+    "/third_party/rust/rure/include/rure.h",
+]
--- a/toolkit/library/rust/shared/Cargo.toml
+++ b/toolkit/library/rust/shared/Cargo.toml
@ -67,6 +67,7 @@ unic-langid-ffi = { path = "../../../../intl/locale/rust/unic-langid-ffi" }
 fluent-langneg = { version = "0.13", features = ["cldr"] }
 fluent-langneg-ffi = { path = "../../../../intl/locale/rust/fluent-langneg-ffi" }
 regex-ffi = { path = "../../../components/regex-ffi" }
+rure = "0.2.2"
 rust_minidump_writer_linux = { path = "../../../crashreporter/rust_minidump_writer_linux", optional = true }
 gecko-profiler = { path = "../../../../tools/profiler/rust-api"}
 midir_impl = { path = "../../../../dom/midi/midir_impl", optional = true }
--- a/toolkit/library/rust/shared/lib.rs
+++ b/toolkit/library/rust/shared/lib.rs
@ -74,6 +74,7 @@ extern crate fluent;
 extern crate fluent_ffi;

 extern crate regex_ffi;
+extern crate rure;

 extern crate fluent_fallback;
 extern crate l10nregistry_ffi;
				`@ -0,0 +1 @@`
				{"files":{"Cargo.toml":"6bed7b80456a66969f4fe9bb5341a0b927a7cd58e036441cbb3b79d67d86c24a","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"e8462c4064a376c2b2d729cc766064cc97decd6a2bb325cf9c7b50be9b8897ce","ctest/compile":"48b692b2aca8b61dfbe372f46d3aeb242893cfa2d81b0a89a73eb2f5db6b6e27","ctest/test.c":"6565808675763c42f8f10bd95445eaab4eaa3618efcf8ec215d98c3a1cfe756d","examples/compile":"471a781860b733f9aa9c1691f33ac8e8a4e85efcb97540942432ba5b58fbb982","examples/iter.c":"ad8312b2271ee19bfaf681d1d8338afaa89e4b180174f008b8cf951a6275776f","examples/sherlock.txt":"242ec73a70f0a03dcbe007e32038e7deeaee004aaec9a09a07fa322743440fa8","include/rure.h":"ddd6056d434d4efaf6ad30b8a38798d61ad385b0c9866988f9b2d4306dc1a99a","src/error.rs":"965c0207eb6d9cf644580d13b2d2d3bd310ab5c1ff65cb1fc04abdbd08ce7fe8","src/lib.rs":"9e99e774ee2a3db507d1e2cd7142b680411d90cf2b033c19ea9a7ea59ae4ba98","src/macros.rs":"ef2d468c1babe1b2252e62ad953b14ce58afb87768dc88612a70df27456038d2","src/rure.rs":"a889bbf35ab2d0018eac1122fe69abbbe2880fb8f5da211a1f60f703fddb5c82","test":"e8b91d4378b3ba09b7dfecdfa733765569778f57bc1c72cecc718e4ad63c1537"},"package":"f3de09595e75baee10da378a1fadfb50d04334a031d69dfb74d0cee3a94aa24c"}