Bug 1794001 - Part 1: Import the rure crate for rust regex ffi, r=xpcom-reviewers,supply-chain-reviewers,kmag

While we already have an in-tree `regex-ffi` crate which provides basic
access to regex functionality for use in FormAutofillNative, the `regex`
crate itself provides and maintains its own c api as the `rure` crate.
This patch vendors in `rure` to allow us to use the more-fully-featured
official ffi.

Differential Revision: https://phabricator.services.mozilla.com/D158873
This commit is contained in:
Nika Layzell 2022-10-13 21:46:54 +00:00
parent 9127c66b99
commit 93a9c67b35
21 changed files with 15516 additions and 0 deletions

11
Cargo.lock generated
View File

@ -2166,6 +2166,7 @@ dependencies = [
"qcms",
"regex-ffi",
"rsdparsa_capi",
"rure",
"rusqlite",
"rust_minidump_writer_linux",
"static_prefs",
@ -4501,6 +4502,16 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d79b4b604167921892e84afbbaad9d5ad74e091bf6c511d9dbfb0593f09fabd"
[[package]]
name = "rure"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3de09595e75baee10da378a1fadfb50d04334a031d69dfb74d0cee3a94aa24c"
dependencies = [
"libc",
"regex",
]
[[package]]
name = "rusqlite"
version = "0.27.0"

View File

@ -953,6 +953,33 @@ who = "Mike Hommey <mh+mozilla@glandium.org>"
criteria = "safe-to-deploy"
delta = "0.7.0 -> 0.7.1"
[[audits.rure]]
who = "Nika Layzell <nika@thelayzells.com>"
criteria = "safe-to-deploy"
version = "0.2.2"
notes = """
This is a fairly straightforward FFI wrapper crate for `regex`, maintained by
the `regex` developers in the same repository.
This crate is explicitly designed for FFI use, and should not be used directly
by Rust code. The exported `extern \"C\"` functions are not marked as `unsafe`,
meaning that it is technically incorrect to use them from within Rust code,
however they are reasonable to use from C code.
The unsafe code in this crate heavily depends on the C caller maintaining
invariants, however these invariants are clearly documented in the `rure.h`
file, bundled with the crate.
I have checked the signatures of each function both in C++ and in the Rust to
ensure they match. In some places, the c `rure.h` header file is missing a
`const` qualifier which could be present given the Rust code, however this will
have no impact on ABI, and is fairly normal for FFI crates.
Panics are handled in all Rust FFI methods, meaning that projects which do not
disable unwinding will still consistently abort (using `libc::abort()`) if a
panic occurs in the Rust code.
"""
[[audits.rust_decimal]]
who = "Mike Hommey <mh+mozilla@glandium.org>"
criteria = "safe-to-deploy"

View File

@ -0,0 +1 @@
{"files":{"Cargo.toml":"6bed7b80456a66969f4fe9bb5341a0b927a7cd58e036441cbb3b79d67d86c24a","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"e8462c4064a376c2b2d729cc766064cc97decd6a2bb325cf9c7b50be9b8897ce","ctest/compile":"48b692b2aca8b61dfbe372f46d3aeb242893cfa2d81b0a89a73eb2f5db6b6e27","ctest/test.c":"6565808675763c42f8f10bd95445eaab4eaa3618efcf8ec215d98c3a1cfe756d","examples/compile":"471a781860b733f9aa9c1691f33ac8e8a4e85efcb97540942432ba5b58fbb982","examples/iter.c":"ad8312b2271ee19bfaf681d1d8338afaa89e4b180174f008b8cf951a6275776f","examples/sherlock.txt":"242ec73a70f0a03dcbe007e32038e7deeaee004aaec9a09a07fa322743440fa8","include/rure.h":"ddd6056d434d4efaf6ad30b8a38798d61ad385b0c9866988f9b2d4306dc1a99a","src/error.rs":"965c0207eb6d9cf644580d13b2d2d3bd310ab5c1ff65cb1fc04abdbd08ce7fe8","src/lib.rs":"9e99e774ee2a3db507d1e2cd7142b680411d90cf2b033c19ea9a7ea59ae4ba98","src/macros.rs":"ef2d468c1babe1b2252e62ad953b14ce58afb87768dc88612a70df27456038d2","src/rure.rs":"a889bbf35ab2d0018eac1122fe69abbbe2880fb8f5da211a1f60f703fddb5c82","test":"e8b91d4378b3ba09b7dfecdfa733765569778f57bc1c72cecc718e4ad63c1537"},"package":"f3de09595e75baee10da378a1fadfb50d04334a031d69dfb74d0cee3a94aa24c"}

38
third_party/rust/rure/Cargo.toml vendored Normal file
View File

@ -0,0 +1,38 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "rure"
version = "0.2.2"
authors = ["The Rust Project Developers"]
description = """
A C API for Rust's regular expression library.
"""
homepage = "https://github.com/rust-lang/regex"
documentation = "https://github.com/rust-lang/regex/tree/master/regex-capi"
readme = "README.md"
license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-lang/regex"
[lib]
name = "rure"
crate-type = [
"staticlib",
"cdylib",
"rlib",
]
[dependencies.libc]
version = "0.2"
[dependencies.regex]
version = "1"

201
third_party/rust/rure/LICENSE-APACHE vendored Normal file
View File

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

25
third_party/rust/rure/LICENSE-MIT vendored Normal file
View File

@ -0,0 +1,25 @@
Copyright (c) 2014 The Rust Project Developers
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

103
third_party/rust/rure/README.md vendored Normal file
View File

@ -0,0 +1,103 @@
C API for RUst's REgex engine
=============================
rure is a C API to Rust's regex library, which guarantees linear time
searching using finite automata. In exchange, it must give up some common
regex features such as backreferences and arbitrary lookaround. It does
however include capturing groups, lazy matching, Unicode support and word
boundary assertions. Its matching semantics generally correspond to Perl's,
or "leftmost first." Namely, the match locations reported correspond to the
first match that would be found by a backtracking engine.
The header file (`include/rure.h`) serves as the primary API documentation of
this library. Types and flags are documented first, and functions follow.
The syntax and possibly other useful things are documented in the Rust
API documentation: https://docs.rs/regex
Examples
--------
There are readable examples in the `ctest` and `examples` sub-directories.
Assuming you have
[Rust and Cargo installed](https://www.rust-lang.org/downloads.html)
(and a C compiler), then this should work to run the `iter` example:
```
$ git clone https://github.com/rust-lang/regex
$ cd regex/regex-capi/examples
$ ./compile
$ LD_LIBRARY_PATH=../target/release ./iter
```
Performance
-----------
It's fast. Its core matching engine is a lazy DFA, which is what GNU grep
and RE2 use. Like GNU grep, this regex engine can detect multi byte literals
in the regex and will use fast literal string searching to quickly skip
through the input to find possible match locations.
All memory usage is bounded and all searching takes linear time with respect
to the input string.
For more details, see the PERFORMANCE guide:
https://github.com/rust-lang/regex/blob/master/PERFORMANCE.md
Text encoding
-------------
All regular expressions must be valid UTF-8.
The text encoding of haystacks is more complicated. To a first
approximation, haystacks should be UTF-8. In fact, UTF-8 (and, one
supposes, ASCII) is the only well defined text encoding supported by this
library. It is impossible to match UTF-16, UTF-32 or any other encoding
without first transcoding it to UTF-8.
With that said, haystacks do not need to be valid UTF-8, and if they aren't
valid UTF-8, no performance penalty is paid. Whether invalid UTF-8 is
matched or not depends on the regular expression. For example, with the
`RURE_FLAG_UNICODE` flag enabled, the regex `.` is guaranteed to match a
single UTF-8 encoding of a Unicode codepoint (sans LF). In particular,
it will not match invalid UTF-8 such as `\xFF`, nor will it match surrogate
codepoints or "alternate" (i.e., non-minimal) encodings of codepoints.
However, with the `RURE_FLAG_UNICODE` flag disabled, the regex `.` will match
any *single* arbitrary byte (sans LF), including `\xFF`.
This provides a useful invariant: wherever `RURE_FLAG_UNICODE` is set, the
corresponding regex is guaranteed to match valid UTF-8. Invalid UTF-8 will
always prevent a match from happening when the flag is set. Since flags can be
toggled in the regular expression itself, this allows one to pick and choose
which parts of the regular expression must match UTF-8 or not.
Some good advice is to always enable the `RURE_FLAG_UNICODE` flag (which is
enabled when using `rure_compile_must`) and selectively disable the flag when
one wants to match arbitrary bytes. The flag can be disabled in a regular
expression with `(?-u)`.
Finally, if one wants to match specific invalid UTF-8 bytes, then you can
use escape sequences. e.g., `(?-u)\\xFF` will match `\xFF`. It's not
possible to use C literal escape sequences in this case since regular
expressions must be valid UTF-8.
Aborts
------
This library will abort your process if an unwinding panic is caught in the
Rust code. Generally, a panic occurs when there is a bug in the program or
if allocation failed. It is possible to cause this behavior by passing
invalid inputs to some functions. For example, giving an invalid capture
group index to `rure_captures_at` will cause Rust's bounds checks to fail,
which will cause a panic, which will be caught and printed to stderr. The
process will then `abort`.
Missing
-------
There are a few things missing from the C API that are present in the Rust API.
There's no particular (known) reason why they are missing; they just haven't
been implemented yet.
* Splitting a string by a regex.
* Replacing regex matches in a string with some other text.

8
third_party/rust/rure/ctest/compile vendored Executable file
View File

@ -0,0 +1,8 @@
#!/bin/sh
# Builds the rure library with cargo and then compiles test.c against it,
# producing an executable named `test` in the current directory.
# Paths are relative, so this is meant to be run from the directory that
# contains this script.

# -e: stop at the first failing command; -x: echo each command as it runs.
set -ex
# Build the crate whose manifest sits one directory up (debug profile, since
# no --release flag is given).
cargo build --manifest-path ../Cargo.toml
# -DDEBUG defines the DEBUG macro that test.c consults before printing
# diagnostics. Headers come from ../include and the library is looked up in
# ../../target/debug.
gcc -DDEBUG -o test test.c -ansi -Wall -I../include -L../../target/debug -lrure
# If you're using librure.a, then you'll need to link other stuff:
# -lutil -ldl -lpthread -lgcc_s -lc -lm -lrt -lutil -lrure

591
third_party/rust/rure/ctest/test.c vendored Normal file
View File

@ -0,0 +1,591 @@
#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "rure.h"
#ifndef DEBUG
#define DEBUG false
#endif
/*
 * Checks that rure_is_match finds `\p{So}$` in a haystack whose final bytes
 * are \xE2\x98\x83, searching from offset 0.
 *
 * Returns true when a match is reported, false otherwise. A diagnostic is
 * written to stderr on failure when DEBUG is enabled.
 */
bool test_is_match() {
    const char *text = "snowman: \xE2\x98\x83";
    rure *regex = rure_compile_must("\\p{So}$");
    bool ok = rure_is_match(regex, (const uint8_t *)text, strlen(text), 0);

    if (!ok && DEBUG) {
        fprintf(stderr,
                "[test_is_match] expected match, but got no match\n");
    }

    rure_free(regex);
    return ok;
}
/*
 * Checks that rure_shortest_match on `a+` against "aaaaa" reports a match
 * and writes an end offset of 1.
 *
 * Both conditions are evaluated even if the first one fails, so a single run
 * can emit two diagnostics. Returns true only when both hold.
 */
bool test_shortest_match() {
    const char *text = "aaaaa";
    const size_t want_end = 1;
    size_t got_end = 0;
    bool ok = true;
    rure *regex = rure_compile_must("a+");

    if (!rure_shortest_match(regex, (const uint8_t *)text, strlen(text), 0,
                             &got_end)) {
        ok = false;
        if (DEBUG) {
            fprintf(stderr,
                    "[test_shortest_match] expected match, "
                    "but got no match\n");
        }
    }

    if (got_end != want_end) {
        ok = false;
        if (DEBUG) {
            fprintf(stderr,
                    "[test_shortest_match] expected match end location %zu "
                    "but got %zu\n", want_end, got_end);
        }
    }

    rure_free(regex);
    return ok;
}
/*
 * Checks that rure_find locates `\p{So}$` in the haystack and reports the
 * match span as byte offsets (9, 12).
 *
 * The span comparison runs even when no match was reported; in that case the
 * zero-initialised rure_match is what gets compared. Returns true only when
 * a match was found at the expected offsets.
 */
bool test_find() {
    const char *text = "snowman: \xE2\x98\x83";
    const size_t want_start = 9;
    const size_t want_end = 12;
    rure_match found = {0};
    bool ok = true;
    rure *regex = rure_compile_must("\\p{So}$");

    if (!rure_find(regex, (const uint8_t *)text, strlen(text), 0, &found)) {
        ok = false;
        if (DEBUG) {
            fprintf(stderr, "[test_find] expected match, but got no match\n");
        }
    }

    if (!(found.start == want_start && found.end == want_end)) {
        ok = false;
        if (DEBUG) {
            fprintf(stderr,
                    "[test_find] expected match at (%zu, %zu), but "
                    "got match at (%zu, %zu)\n",
                    want_start, want_end, found.start, found.end);
        }
    }

    rure_free(regex);
    return ok;
}
/*
 * Exercises the capture-group API on the regex
 * `.(.*(?P<snowman>\p{So}))$`:
 *
 *   - rure_find_captures must report a match,
 *   - rure_captures_len must return 3,
 *   - rure_capture_name_index must map "snowman" to index 2,
 *   - capture group 2 must span byte offsets (9, 12).
 *
 * A wrong group count or name index aborts the remaining checks, since the
 * later lookups would be meaningless. Returns true when every check holds.
 */
bool test_captures() {
    bool passed = true;
    const char *haystack = "snowman: \xE2\x98\x83";
    rure *re = rure_compile_must(".(.*(?P<snowman>\\p{So}))$");
    rure_match match = {0};
    rure_captures *caps = rure_captures_new(re);

    bool matched = rure_find_captures(re, (const uint8_t *)haystack,
                                      strlen(haystack), 0, caps);
    if (!matched) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_captures] expected match, but got no match\n");
        }
        passed = false;
    }

    size_t expect_captures_len = 3;
    size_t captures_len = rure_captures_len(caps);
    if (captures_len != expect_captures_len) {
        if (DEBUG) {
            /* size_t is unsigned, so it is printed with %zu (not %zd). */
            fprintf(stderr,
                    "[test_captures] "
                    "expected capture group length to be %zu, but "
                    "got %zu\n",
                    expect_captures_len, captures_len);
        }
        passed = false;
        goto done;
    }

    int32_t expect_capture_index = 2;
    int32_t capture_index = rure_capture_name_index(re, "snowman");
    if (capture_index != expect_capture_index) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_captures] "
                    "expected capture index %d for name 'snowman', but "
                    "got %d\n",
                    expect_capture_index, capture_index);
        }
        passed = false;
        goto done;
    }

    size_t expect_start = 9;
    size_t expect_end = 12;
    rure_captures_at(caps, 2, &match);
    if (match.start != expect_start || match.end != expect_end) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_captures] "
                    "expected capture 2 match at (%zu, %zu), "
                    "but got match at (%zu, %zu)\n",
                    expect_start, expect_end, match.start, match.end);
        }
        passed = false;
    }

done:
    rure_captures_free(caps);
    rure_free(re);
    return passed;
}
/*
 * Walks the matches of `\w+(\w)` over "abc xyz" with a rure_iter.
 *
 * The first match is fetched with rure_iter_next and must span (0, 3). The
 * second is fetched with rure_iter_next_captures, and its capture group 1
 * must span (6, 7). The first failing step ends the test.
 *
 * Returns true only when every step succeeds.
 */
bool test_iter() {
    const uint8_t *text = (const uint8_t *)"abc xyz";
    size_t text_len = strlen((const char *)text);
    size_t want_start = 0;
    size_t want_end = 3;
    /* Stays false until every check below has been passed. */
    bool ok = false;
    rure *regex = rure_compile_must("\\w+(\\w)");
    rure_match span = {0};
    rure_captures *groups = rure_captures_new(regex);
    rure_iter *cursor = rure_iter_new(regex);

    if (!rure_iter_next(cursor, text, text_len, &span)) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_iter] expected first match, but got no match\n");
        }
        goto cleanup;
    }
    if (span.start != want_start || span.end != want_end) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_iter] expected first match at (%zu, %zu), but "
                    "got match at (%zu, %zu)\n",
                    want_start, want_end, span.start, span.end);
        }
        goto cleanup;
    }

    if (!rure_iter_next_captures(cursor, text, text_len, groups)) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_iter] expected second match, but got no match\n");
        }
        goto cleanup;
    }
    rure_captures_at(groups, 1, &span);
    want_start = 6;
    want_end = 7;
    if (span.start != want_start || span.end != want_end) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_iter] expected second match at (%zu, %zu), but "
                    "got match at (%zu, %zu)\n",
                    want_start, want_end, span.start, span.end);
        }
        goto cleanup;
    }

    ok = true;

cleanup:
    rure_iter_free(cursor);
    rure_captures_free(groups);
    rure_free(regex);
    return ok;
}
/*
 * Compares a capture-group name against the expected one.
 *
 * expect: the name the caller expects.
 * given:  the name that was actually produced.
 *
 * Returns true when the two strings are equal according to strcmp. On a
 * mismatch it returns false and, when DEBUG is enabled, reports both names
 * on stderr.
 */
bool test_iter_capture_name(char *expect, char *given) {
    if (strcmp(expect, given) == 0) {
        return true;
    }

    if (DEBUG) {
        fprintf(stderr,
                "[test_iter_capture_name] expected first capture "
                "name '%s' got '%s'\n",
                expect, given);
    }
    return false;
}
/*
 * Iterates over the capture-group names of
 * `(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})`.
 *
 * The first name the iterator yields is fetched and discarded without being
 * inspected (presumably the implicit whole-match group; confirm against
 * rure.h). The next three names must be "year", "month" and "day", in that
 * order.
 *
 * Every call to rure_iter_capture_names_next is checked, so an iterator that
 * runs dry early is reported as such instead of a stale `name` being
 * compared. Returns true when all expected names are seen.
 */
bool test_iter_capture_names() {
    /* Non-const element type to match test_iter_capture_name's parameters. */
    char *expected[] = { "year", "month", "day" };
    const size_t expected_len = sizeof(expected) / sizeof(expected[0]);
    size_t i;
    bool passed = true;
    char *name = NULL;
    rure *re = rure_compile_must(
        "(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})");
    rure_iter_capture_names *it = rure_iter_capture_names_new(re);

    /* Consume the leading entry; its value is not checked. */
    if (!rure_iter_capture_names_next(it, &name)) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_iter_capture_names] expected a first name, "
                    "but got none\n");
        }
        passed = false;
        goto done;
    }

    for (i = 0; i < expected_len; ++i) {
        if (!rure_iter_capture_names_next(it, &name)) {
            if (DEBUG) {
                fprintf(stderr,
                        "[test_iter_capture_names] expected capture name "
                        "'%s', but got none\n",
                        expected[i]);
            }
            passed = false;
            goto done;
        }
        if (!test_iter_capture_name(expected[i], name)) {
            passed = false;
            goto done;
        }
    }

done:
    rure_iter_capture_names_free(it);
    rure_free(re);
    return passed;
}
/*
 * This tests whether we can set the flags correctly. In this case, we disable
 * all flags, which includes disabling Unicode mode. When we disable Unicode
 * mode, we can match arbitrary possibly invalid UTF-8 bytes, such as \xFF.
 * (When Unicode mode is enabled, \xFF won't match .)
 *
 * The pattern is compiled with rure_compile using a flags value of 0 and no
 * options or error object. A NULL result is reported as a test failure
 * rather than being passed on to rure_is_match.
 *
 * Returns true when `.` matches the single byte \xFF.
 */
bool test_flags() {
    bool passed = true;
    const char *pattern = ".";
    const char *haystack = "\xFF";
    bool matched;
    rure *re = rure_compile((const uint8_t *)pattern, strlen(pattern),
                            0, NULL, NULL);

    /* No error object was supplied, so NULL is the only failure signal. */
    if (re == NULL) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_flags] expected pattern to compile, "
                    "but got NULL regex pointer\n");
        }
        return false;
    }

    matched = rure_is_match(re, (const uint8_t *)haystack,
                            strlen(haystack), 0);
    if (!matched) {
        if (DEBUG) {
            fprintf(stderr, "[test_flags] expected match, but got no match\n");
        }
        passed = false;
    }

    rure_free(re);
    return passed;
}
/*
 * Checks that compiling the malformed pattern "(" fails: rure_compile must
 * return NULL and the error object must carry a message containing
 * "unclosed group".
 *
 * Returns true when both conditions hold.
 */
bool test_compile_error() {
    bool passed = true;
    rure_error *err = rure_error_new();
    rure *re = rure_compile((const uint8_t *)"(", 1, 0, NULL, err);

    if (re != NULL) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_compile_error] "
                    "expected NULL regex pointer, but got non-NULL pointer\n");
        }
        passed = false;
        /* Compilation unexpectedly succeeded; release the regex. */
        rure_free(re);
    }

    const char *msg = rure_error_message(err);
    if (NULL == strstr(msg, "unclosed group")) {
        if (DEBUG) {
            /* The diagnostic names the same text the check searches for. */
            fprintf(stderr,
                    "[test_compile_error] "
                    "expected an 'unclosed group' error message, but "
                    "got this instead: '%s'\n", msg);
        }
        passed = false;
    }

    rure_error_free(err);
    return passed;
}
/*
 * Checks that a size limit set through rure_options is enforced: with the
 * limit set to 0, compiling `\w{100}` must return NULL and the error message
 * must contain "exceeds size".
 *
 * The pattern length is taken with strlen so that the terminating NUL byte
 * is not handed to the compiler as part of the pattern.
 *
 * Returns true when both conditions hold.
 */
bool test_compile_error_size_limit() {
    bool passed = true;
    const char *pattern = "\\w{100}";
    rure_options *opts = rure_options_new();
    rure_options_size_limit(opts, 0);
    rure_error *err = rure_error_new();
    rure *re = rure_compile((const uint8_t *)pattern, strlen(pattern),
                            0, opts, err);

    if (re != NULL) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_compile_error_size_limit] "
                    "expected NULL regex pointer, but got non-NULL pointer\n");
        }
        passed = false;
        /* Compilation unexpectedly succeeded; release the regex. */
        rure_free(re);
    }

    const char *msg = rure_error_message(err);
    if (NULL == strstr(msg, "exceeds size")) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_compile_error_size_limit] "
                    "expected an 'exceeds size' error message, but "
                    "got this instead: '%s'\n", msg);
        }
        passed = false;
    }

    rure_options_free(opts);
    rure_error_free(err);
    return passed;
}
/*
 * Exercises a regex set built from six patterns.
 *
 * Checks, in order: the set compiles, rure_set_len returns the pattern
 * count, "foobar" matches the set, the empty haystack does not, and
 * rure_set_matches fills in the expected per-pattern flags for "foobar".
 * The first failing check ends the test.
 *
 * Returns true only when every check holds.
 */
bool test_regex_set_matches() {
#define PAT_COUNT 6
    const char *patterns[] = {
        "foo", "barfoo", "\\w+", "\\d+", "foobar", "bar"
    };
    const size_t patterns_lengths[] = {
        3, 6, 3, 3, 6, 3
    };
    /* Expected per-pattern flags for the haystack "foobar". */
    const bool want[] = {
        true, false, true, false, true, true
    };
    bool got[PAT_COUNT];
    /* Stays false until every check below has been passed. */
    bool ok = false;
    int idx = 0;
    rure_error *err = rure_error_new();
    rure_set *set = rure_compile_set((const uint8_t **) patterns,
                                     patterns_lengths,
                                     PAT_COUNT,
                                     0,
                                     NULL,
                                     err);

    if (set == NULL) {
        /* Nothing to release besides the error object. */
        goto release_error;
    }
    if (rure_set_len(set) != PAT_COUNT) {
        goto release_set;
    }
    if (!rure_set_is_match(set, (const uint8_t *) "foobar", 6, 0)) {
        goto release_set;
    }
    if (rure_set_is_match(set, (const uint8_t *) "", 0, 0)) {
        goto release_set;
    }
    if (!rure_set_matches(set, (const uint8_t *) "foobar", 6, 0, got)) {
        goto release_set;
    }

    while (idx < PAT_COUNT) {
        if (got[idx] != want[idx]) {
            goto release_set;
        }
        ++idx;
    }
    ok = true;

release_set:
    rure_set_free(set);
release_error:
    rure_error_free(err);
    return ok;
#undef PAT_COUNT
}
/*
 * Exercises the `start` argument of the regex-set search functions using the
 * patterns "foo", "bar" and "fooo".
 *
 * Checks, in order:
 *   - the set compiles and rure_set_len returns the pattern count,
 *   - the first 7 bytes of "foobiasdr", searched from offset 2, do not match,
 *   - "fooobar" searched from offset 0 matches all three patterns,
 *   - "fooobar" searched from offset 1 matches only "bar".
 *
 * Both "fooobar" searches pass the haystack's real length, so the
 * terminating NUL byte is never part of the searched input.
 *
 * Returns true only when every check holds.
 */
bool test_regex_set_match_start() {
#define PAT_COUNT 3
    bool passed = true;
    const char *patterns[] = {
        "foo", "bar", "fooo"
    };
    const size_t patterns_lengths[] = {
        3, 3, 4
    };
    const char *haystack = "fooobar";
    size_t haystack_len = strlen(haystack);
    rure_error *err = rure_error_new();
    rure_set *re = rure_compile_set((const uint8_t **) patterns,
                                    patterns_lengths,
                                    PAT_COUNT,
                                    0,
                                    NULL,
                                    err);

    if (re == NULL) {
        passed = false;
        goto done2;
    }
    if (rure_set_len(re) != PAT_COUNT) {
        passed = false;
        goto done1;
    }

    /* Only the first 7 bytes ("foobias") are searched, from offset 2. */
    if (rure_set_is_match(re, (const uint8_t *)"foobiasdr", 7, 2)) {
        passed = false;
        goto done1;
    }

    {
        /* Search from the beginning: every pattern should match. */
        bool matches[PAT_COUNT];
        if (!rure_set_matches(re, (const uint8_t *)haystack, haystack_len, 0,
                              matches)) {
            passed = false;
            goto done1;
        }
        const bool match_target[] = {
            true, true, true
        };
        int i;
        for (i = 0; i < PAT_COUNT; ++i) {
            if (matches[i] != match_target[i]) {
                passed = false;
                goto done1;
            }
        }
    }

    {
        /* Search from offset 1: only "bar" should still match. */
        bool matches[PAT_COUNT];
        if (!rure_set_matches(re, (const uint8_t *)haystack, haystack_len, 1,
                              matches)) {
            passed = false;
            goto done1;
        }
        const bool match_target[] = {
            false, true, false
        };
        int i;
        for (i = 0; i < PAT_COUNT; ++i) {
            if (matches[i] != match_target[i]) {
                passed = false;
                goto done1;
            }
        }
    }

done1:
    rure_set_free(re);
done2:
    rure_error_free(err);
    return passed;
#undef PAT_COUNT
}
/*
 * Checks that options are honoured when compiling a regex set: with the size
 * limit set to 0, compiling the one-pattern set `\w{100}` must return NULL
 * and the error message must contain "exceeds size".
 *
 * The pattern length is taken with strlen so that the terminating NUL byte
 * is not handed to the compiler as part of the pattern.
 *
 * Returns true when both conditions hold.
 */
bool test_regex_set_options() {
    bool passed = true;
    rure_options *opts = rure_options_new();
    rure_options_size_limit(opts, 0);
    rure_error *err = rure_error_new();
    const char *patterns[] = { "\\w{100}" };
    size_t patterns_lengths[1];
    patterns_lengths[0] = strlen(patterns[0]);

    rure_set *re = rure_compile_set(
        (const uint8_t **) patterns, patterns_lengths, 1, 0, opts, err);
    if (re != NULL) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_regex_set_options] "
                    "expected NULL regex pointer, but got non-NULL pointer\n");
        }
        passed = false;
        /* Compilation unexpectedly succeeded; release the set. */
        rure_set_free(re);
    }

    const char *msg = rure_error_message(err);
    if (NULL == strstr(msg, "exceeds size")) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_regex_set_options] "
                    "expected an 'exceeds size' error message, but "
                    "got this instead: '%s'\n", msg);
        }
        passed = false;
    }

    rure_options_free(opts);
    rure_error_free(err);
    return passed;
}
/*
 * Checks that rure_escape_must escapes every regex meta character in a
 * pattern.
 *
 * Returns true when the escaped string equals the expected one.
 */
bool test_escape() {
    bool passed = true;

    const char *pattern = "^[a-z]+.*$";
    const char *expected_escaped = "\\^\\[a\\-z\\]\\+\\.\\*\\$";

    const char *escaped = rure_escape_must(pattern);
    if (!escaped) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_escape] expected escaped, but got no escaped\n");
        }
        passed = false;
    } else if (strcmp(escaped, expected_escaped) != 0) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_escape] expected \"%s\", but got \"%s\"\n",
                    expected_escaped, escaped);
        }
        passed = false;
    }

    /* Only hand the string back when one was actually returned. */
    if (escaped != NULL) {
        rure_cstring_free((char *) escaped);
    }
    return passed;
}
/*
 * Runs a single test function and reports its outcome on stderr.
 *
 * test:   the test to execute; it returns true on success.
 * name:   label printed next to the PASSED/FAILED verdict.
 * passed: cleared to false when the test fails; left untouched otherwise, so
 *         one flag can accumulate the result of many tests.
 */
void run_test(bool (test)(), const char *name, bool *passed) {
    const bool ok = test();

    if (ok) {
        fprintf(stderr, "PASSED: %s\n", name);
        return;
    }

    *passed = false;
    fprintf(stderr, "FAILED: %s\n", name);
}
/*
 * Test driver: runs every test through run_test and exits with status 1 if
 * any of them failed, 0 otherwise.
 */
int main() {
    bool passed = true;

    run_test(test_is_match, "test_is_match", &passed);
    run_test(test_shortest_match, "test_shortest_match", &passed);
    run_test(test_find, "test_find", &passed);
    run_test(test_captures, "test_captures", &passed);
    run_test(test_iter, "test_iter", &passed);
    run_test(test_iter_capture_names, "test_iter_capture_names", &passed);
    run_test(test_flags, "test_flags", &passed);
    run_test(test_compile_error, "test_compile_error", &passed);
    run_test(test_compile_error_size_limit, "test_compile_error_size_limit",
             &passed);
    /* The label now matches the function name (it used to drop the "es"). */
    run_test(test_regex_set_matches, "test_regex_set_matches", &passed);
    run_test(test_regex_set_options, "test_regex_set_options", &passed);
    run_test(test_regex_set_match_start, "test_regex_set_match_start",
             &passed);
    run_test(test_escape, "test_escape", &passed);

    if (!passed) {
        exit(1);
    }
    return 0;
}

9
third_party/rust/rure/examples/compile vendored Executable file
View File

@ -0,0 +1,9 @@
#!/bin/sh
# Builds the rure library with cargo and then compiles the `iter` example
# (iter.c) against it.
# -e: stop at the first failing command; -x: echo each command before running it.
set -ex
# N.B. Add `--release` flag to `cargo build` to make the example run faster.
cargo build --manifest-path ../Cargo.toml
# Compile iter.c using the header from ../include and link against the rure
# library found in the debug build directory.
gcc -O3 -DDEBUG -o iter iter.c -ansi -Wall -I../include -L../../target/debug -lrure
# If you're using librure.a, then you'll need to link other stuff:
# -lutil -ldl -lpthread -lgcc_s -lc -lm -lrt -lutil -lrure

99
third_party/rust/rure/examples/iter.c vendored Normal file
View File

@ -0,0 +1,99 @@
/*
* This example code shows how to iterate over all regex matches in a file,
* emit the match location and print the contents of a capturing group.
*/
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include "rure.h"
/*
 * Memory-maps sherlock.txt, compiles a case-insensitive Unicode regex that
 * captures the word preceding "Holmes", and prints every match together with
 * the byte offsets of the full match.
 *
 * Exits with status 1 if the file cannot be opened, inspected or mapped, or
 * if the regex fails to compile.
 */
int main() {
    /* Open a file and mmap it. */
    int fd = open("sherlock.txt", O_RDONLY);
    if (fd == -1) {
        perror("failed to open sherlock.txt");
        exit(1);
    }
    struct stat status;
    if (fstat(fd, &status) == -1) {
        perror("failed to stat sherlock.txt");
        exit(1);
    }
    /*
     * The next two checks do not set errno, so report them with fprintf;
     * perror would append an unrelated errno description.
     */
    if ((uintmax_t)status.st_size > SIZE_MAX) {
        fprintf(stderr, "file too big\n");
        exit(1);
    }
    if (status.st_size == 0) {
        fprintf(stderr, "file empty\n");
        exit(1);
    }
    size_t sherlock_len = (size_t)status.st_size;
    const uint8_t *sherlock = (const uint8_t *)mmap(
        NULL, sherlock_len, PROT_READ, MAP_PRIVATE, fd, 0);
    /* The descriptor is closed right after mmap; the mapping is used below. */
    close(fd);
    if (sherlock == MAP_FAILED) {
        perror("could not mmap file");
        exit(1);
    }

    /*
     * Compile the regular expression. A more convenient routine,
     * rure_compile_must, is also available, which will print an error message
     * to stderr and abort the process if the regex compilation fails.
     * We show the full gory details here as an example.
     */
    const char *pattern = "(\\w+)\\s+Holmes";
    size_t pattern_len = strlen(pattern);
    rure_error *err = rure_error_new();
    rure *re = rure_compile((const uint8_t *)pattern, pattern_len,
                            RURE_FLAG_UNICODE | RURE_FLAG_CASEI, NULL, err);
    if (NULL == re) {
        /* A null regex means compilation failed and an error exists. */
        fprintf(stderr, "compilation of %s failed: %s\n",
                pattern, rure_error_message(err));
        rure_error_free(err);
        munmap((char*)sherlock, sherlock_len);
        exit(1);
    }
    rure_error_free(err);

    /*
     * Create an iterator to find all successive non-overlapping matches.
     * For each match, we extract the location of the capturing group.
     */
    rure_match group0 = {0};
    rure_match group1 = {0};
    rure_captures *caps = rure_captures_new(re);
    rure_iter *it = rure_iter_new(re);

    while (rure_iter_next_captures(it, sherlock, sherlock_len, caps)) {
        /*
         * Get the location of the full match and the capturing group.
         * We know that both accesses are successful since the body of the
         * loop only executes if there is a match and both capture groups
         * must match in order for the entire regex to match.
         *
         * N.B. The zeroth group corresponds to the full match of the regex.
         */
        rure_captures_at(caps, 0, &group0);
        rure_captures_at(caps, 1, &group1);
        printf("%.*s (match at: %zu, %zu)\n",
               (int)(group1.end - group1.start),
               sherlock + group1.start,
               group0.start, group0.end);
    }

    /* Free all our resources. */
    munmap((char*)sherlock, sherlock_len);
    rure_captures_free(caps);
    rure_iter_free(it);
    rure_free(re);
    return 0;
}

13052
third_party/rust/rure/examples/sherlock.txt vendored Normal file

File diff suppressed because it is too large Load Diff

585
third_party/rust/rure/include/rure.h vendored Normal file
View File

@ -0,0 +1,585 @@
#ifndef _RURE_H
#define _RURE_H
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* rure is the type of a compiled regular expression.
*
* An rure can be safely used from multiple threads simultaneously.
*/
typedef struct rure rure;
/*
* rure_set is the type of a set of compiled regular expressions.
*
* An rure_set can be safely used from multiple threads simultaneously.
*/
typedef struct rure_set rure_set;
/*
* rure_options is the set of non-flag configuration options for compiling
* a regular expression. Currently, only two options are available: setting
* the size limit of the compiled program and setting the size limit of the
* cache of states that the DFA uses while searching.
*
* For most uses, the default settings will work fine, and NULL can be passed
* wherever a *rure_options is expected.
*/
typedef struct rure_options rure_options;
/*
* The flags listed below can be used in rure_compile to set the default
* flags. All flags can otherwise be toggled in the expression itself using
* standard syntax, e.g., `(?i)` turns case insensitive matching on and `(?-i)`
* disables it.
*/
/* The case insensitive (i) flag. */
#define RURE_FLAG_CASEI (1 << 0)
/* The multi-line matching (m) flag. (^ and $ match new line boundaries.) */
#define RURE_FLAG_MULTI (1 << 1)
/* The any character (s) flag. (. matches new line.) */
#define RURE_FLAG_DOTNL (1 << 2)
/* The greedy swap (U) flag. (e.g., + is ungreedy and +? is greedy.) */
#define RURE_FLAG_SWAP_GREED (1 << 3)
/* The ignore whitespace (x) flag. */
#define RURE_FLAG_SPACE (1 << 4)
/* The Unicode (u) flag. */
#define RURE_FLAG_UNICODE (1 << 5)
/* The default set of flags enabled when no flags are set. */
#define RURE_DEFAULT_FLAGS RURE_FLAG_UNICODE
/*
* rure_match corresponds to the location of a single match in a haystack.
*/
typedef struct rure_match {
/* The start position. */
size_t start;
/* The end position. */
size_t end;
} rure_match;
/*
* rure_captures represents storage for sub-capture locations of a match.
*
* Computing the capture groups of a match can carry a significant performance
* penalty, so their use in the API is optional.
*
* An rure_captures value can be reused in multiple calls to rure_find_captures,
* so long as it is used with the compiled regular expression that created
* it.
*
* An rure_captures value may outlive its corresponding rure and can be freed
* independently.
*
* It is not safe to use from multiple threads simultaneously.
*/
typedef struct rure_captures rure_captures;
/*
* rure_iter is an iterator over successive non-overlapping matches in a
* particular haystack.
*
* An rure_iter value may not outlive its corresponding rure and should be freed
* before its corresponding rure is freed.
*
* It is not safe to use from multiple threads simultaneously.
*/
typedef struct rure_iter rure_iter;
/*
* rure_iter_capture_names is an iterator over the list of capture group names
* in this particular rure.
*
* An rure_iter_capture_names value may not outlive its corresponding rure,
* and should be freed before its corresponding rure is freed.
*
* It is not safe to use from multiple threads simultaneously.
*/
typedef struct rure_iter_capture_names rure_iter_capture_names;
/*
* rure_error is an error that caused compilation to fail.
*
* Most errors are syntax errors but an error can be returned if the compiled
* regular expression would be too big.
*
* Whenever a function accepts an *rure_error, it is safe to pass NULL. (But
* you will not get access to the error if one occurred.)
*
* It is not safe to use from multiple threads simultaneously.
*/
typedef struct rure_error rure_error;
/*
* rure_compile_must compiles the given pattern into a regular expression. If
* compilation fails for any reason, an error message is printed to stderr and
* the process is aborted.
*
* The pattern given should be in UTF-8. For convenience, this accepts a C
* string, which means the pattern cannot usefully contain NUL. If your pattern
* may contain NUL, consider using a regular expression escape sequence, or
* just use rure_compile.
*
* This uses RURE_DEFAULT_FLAGS.
*
* The compiled expression returned may be used from multiple threads
* simultaneously.
*/
rure *rure_compile_must(const char *pattern);
/*
* rure_compile compiles the given pattern into a regular expression. The
* pattern must be valid UTF-8 and the length corresponds to the number of
* bytes in the pattern.
*
* flags is a bitfield. Valid values are constants declared with prefix
* RURE_FLAG_.
*
* options contains non-flag configuration settings. If it's NULL, default
* settings are used. options may be freed immediately after a call to
* rure_compile.
*
* error is set if there was a problem compiling the pattern (including if the
* pattern is not valid UTF-8). If error is NULL, then no error information
* is returned. In all cases, if an error occurs, NULL is returned.
*
* The compiled expression returned may be used from multiple threads
* simultaneously.
*/
rure *rure_compile(const uint8_t *pattern, size_t length,
uint32_t flags, rure_options *options,
rure_error *error);
/*
* rure_free frees the given compiled regular expression.
*
* This must be called at most once for any rure.
*/
void rure_free(rure *re);
/*
* rure_is_match returns true if and only if re matches anywhere in haystack.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack.
*
* start is the position at which to start searching. Note that setting the
* start position is distinct from incrementing the pointer, since the regex
* engine may look at bytes before the start position to determine match
* information. For example, if the start position is greater than 0, then the
* \A ("begin text") anchor can never match.
*
* rure_is_match should be preferred to rure_find since it may be faster.
*
* N.B. The performance of this search is not impacted by the presence of
* capturing groups in your regular expression.
*/
bool rure_is_match(rure *re, const uint8_t *haystack, size_t length,
size_t start);
/*
* rure_find returns true if and only if re matches anywhere in haystack.
* If a match is found, then its start and end offsets (in bytes) are set
* on the match pointer given.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack.
*
* start is the position at which to start searching. Note that setting the
* start position is distinct from incrementing the pointer, since the regex
* engine may look at bytes before the start position to determine match
* information. For example, if the start position is greater than 0, then the
* \A ("begin text") anchor can never match.
*
* rure_find should be preferred to rure_find_captures since it may be faster.
*
* N.B. The performance of this search is not impacted by the presence of
* capturing groups in your regular expression.
*/
bool rure_find(rure *re, const uint8_t *haystack, size_t length,
size_t start, rure_match *match);
/*
* rure_find_captures returns true if and only if re matches anywhere in
* haystack. If a match is found, then all of its capture locations are stored
* in the captures pointer given.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack.
*
* start is the position at which to start searching. Note that setting the
* start position is distinct from incrementing the pointer, since the regex
* engine may look at bytes before the start position to determine match
* information. For example, if the start position is greater than 0, then the
* \A ("begin text") anchor can never match.
*
* Only use this function if you specifically need access to capture locations.
* It is not necessary to use this function just because your regular
* expression contains capturing groups.
*
* Capture locations can be accessed using the rure_captures_* functions.
*
* N.B. The performance of this search can be impacted by the number of
* capturing groups. If you're using this function, it may be beneficial to
* use non-capturing groups (e.g., `(?:re)`) where possible.
*/
bool rure_find_captures(rure *re, const uint8_t *haystack, size_t length,
size_t start, rure_captures *captures);
/*
* rure_shortest_match returns true if and only if re matches anywhere in
* haystack. If a match is found, then its end location is stored in the
* pointer given. The end location is the place at which the regex engine
* determined that a match exists, but may occur before the end of the proper
* leftmost-first match.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack.
*
* start is the position at which to start searching. Note that setting the
* start position is distinct from incrementing the pointer, since the regex
* engine may look at bytes before the start position to determine match
* information. For example, if the start position is greater than 0, then the
* \A ("begin text") anchor can never match.
*
* rure_shortest_match should be preferred to rure_find since it may be faster.
*
* N.B. The performance of this search is not impacted by the presence of
* capturing groups in your regular expression.
*/
bool rure_shortest_match(rure *re, const uint8_t *haystack, size_t length,
size_t start, size_t *end);
/*
* rure_capture_name_index returns the capture index for the name given. If
* no such named capturing group exists in re, then -1 is returned.
*
* The capture index may be used with rure_captures_at.
*
* This function never returns 0 since the first capture group always
* corresponds to the entire match and is always unnamed.
*/
int32_t rure_capture_name_index(rure *re, const char *name);
/*
* rure_iter_capture_names_new creates a new capture_names iterator.
*
* An iterator will report all successive capture group names of re.
*/
rure_iter_capture_names *rure_iter_capture_names_new(rure *re);
/*
* rure_iter_capture_names_free frees the iterator given.
*
* It must be called at most once.
*/
void rure_iter_capture_names_free(rure_iter_capture_names *it);
/*
* rure_iter_capture_names_next advances the iterator and returns true
* if and only if another capture group name exists.
*
* The value of the capture group name is written to the provided pointer.
*/
bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name);
/*
* rure_iter_new creates a new iterator.
*
* An iterator will report all successive non-overlapping matches of re.
* When calling iterator functions, the same haystack and length must be
* supplied to all invocations. (Strict pointer equality is, however, not
* required.)
*/
rure_iter *rure_iter_new(rure *re);
/*
* rure_iter_free frees the iterator given.
*
* It must be called at most once.
*/
void rure_iter_free(rure_iter *it);
/*
* rure_iter_next advances the iterator and returns true if and only if a
* match was found. If a match is found, then the match pointer is set with the
* start and end location of the match, in bytes.
*
* If no match is found, then subsequent calls will return false indefinitely.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack. The given haystack must
* be logically equivalent to all other haystacks given to this iterator.
*
* rure_iter_next should be preferred to rure_iter_next_captures since it may
* be faster.
*
* N.B. The performance of this search is not impacted by the presence of
* capturing groups in your regular expression.
*/
bool rure_iter_next(rure_iter *it, const uint8_t *haystack, size_t length,
rure_match *match);
/*
* rure_iter_next_captures advances the iterator and returns true if and only if a
* match was found. If a match is found, then all of its capture locations are
* stored in the captures pointer given.
*
* If no match is found, then subsequent calls will return false indefinitely.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack. The given haystack must
* be logically equivalent to all other haystacks given to this iterator.
*
* Only use this function if you specifically need access to capture locations.
* It is not necessary to use this function just because your regular
* expression contains capturing groups.
*
* Capture locations can be accessed using the rure_captures_* functions.
*
* N.B. The performance of this search can be impacted by the number of
* capturing groups. If you're using this function, it may be beneficial to
* use non-capturing groups (e.g., `(?:re)`) where possible.
*/
bool rure_iter_next_captures(rure_iter *it,
const uint8_t *haystack, size_t length,
rure_captures *captures);
/*
* rure_captures_new allocates storage for all capturing groups in re.
*
* An rure_captures value may be reused on subsequent calls to
* rure_find_captures or rure_iter_next_captures.
*
* An rure_captures value may be freed independently of re, although any
* particular rure_captures should be used only with the re given here.
*
* It is not safe to use an rure_captures value from multiple threads
* simultaneously.
*/
rure_captures *rure_captures_new(rure *re);
/*
* rure_captures_free frees the given captures.
*
* This must be called at most once.
*/
void rure_captures_free(rure_captures *captures);
/*
* rure_captures_at returns true if and only if the capturing group at the
* index given was part of a match. If so, the given match pointer is populated
* with the start and end location (in bytes) of the capturing group.
*
* If no capture group with the index i exists, then false is
* returned. (A capturing group exists if and only if i is less than
* rure_captures_len(captures).)
*
* Note that index 0 corresponds to the full match.
*/
bool rure_captures_at(rure_captures *captures, size_t i, rure_match *match);
/*
* rure_captures_len returns the number of capturing groups in the given
* captures.
*/
size_t rure_captures_len(rure_captures *captures);
/*
* rure_options_new allocates space for options.
*
* Options may be freed immediately after a call to rure_compile, but otherwise
* may be freely used in multiple calls to rure_compile.
*
* It is not safe to set options from multiple threads simultaneously. It is
* safe to call rure_compile from multiple threads simultaneously using the
* same options pointer.
*/
rure_options *rure_options_new();
/*
* rure_options_free frees the given options.
*
* This must be called at most once.
*/
void rure_options_free(rure_options *options);
/*
* rure_options_size_limit sets the approximate size limit of the compiled
* regular expression.
*
* This size limit roughly corresponds to the number of bytes occupied by a
* single compiled program. If the program would exceed this number, then a
* compilation error will be returned from rure_compile.
*/
void rure_options_size_limit(rure_options *options, size_t limit);
/*
* rure_options_dfa_size_limit sets the approximate size of the cache used by
* the DFA during search.
*
* This roughly corresponds to the number of bytes that the DFA will use while
* searching.
*
* Note that this is a *per thread* limit. There is no way to set a global
* limit. In particular, if a regular expression is used from multiple threads
* simultaneously, then each thread may use up to the number of bytes
* specified here.
*/
void rure_options_dfa_size_limit(rure_options *options, size_t limit);
/*
* rure_compile_set compiles the given list of patterns into a single regular
* expression which can be matched in a linear-scan. Each pattern in patterns
* must be valid UTF-8 and the length of each pattern in patterns corresponds
* to a byte length in patterns_lengths.
*
* The number of patterns to compile is specified by patterns_count. patterns
* must contain at least this many entries.
*
* flags is a bitfield. Valid values are constants declared with prefix
* RURE_FLAG_.
*
* options contains non-flag configuration settings. If it's NULL, default
* settings are used. options may be freed immediately after a call to
* rure_compile_set.
*
* error is set if there was a problem compiling the pattern.
*
* The compiled expression set returned may be used from multiple threads.
*/
rure_set *rure_compile_set(const uint8_t **patterns,
const size_t *patterns_lengths,
size_t patterns_count,
uint32_t flags,
rure_options *options,
rure_error *error);
/*
* rure_set_free frees the given compiled regular expression set.
*
* This must be called at most once for any rure_set.
*/
void rure_set_free(rure_set *re);
/*
* rure_set_is_match returns true if and only if any regexes within the set
* match anywhere in the haystack. Once a match has been located, the
* matching engine will quit immediately.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack.
*
* start is the position at which to start searching. Note that setting the
* start position is distinct from incrementing the pointer, since the regex
* engine may look at bytes before the start position to determine match
* information. For example, if the start position is greater than 0, then the
* \A ("begin text") anchor can never match.
*/
bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length,
size_t start);
/*
* rure_set_matches compares each regex in the set against the haystack and
* modifies matches with the match result of each pattern. Match results are
* ordered in the same way as the rure_set was compiled. For example,
* index 0 of matches corresponds to the first pattern passed to
* `rure_compile_set`.
*
* haystack may contain arbitrary bytes, but ASCII compatible text is more
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
* length should be the number of bytes in haystack.
*
* start is the position at which to start searching. Note that setting the
* start position is distinct from incrementing the pointer, since the regex
* engine may look at bytes before the start position to determine match
* information. For example, if the start position is greater than 0, then the
* \A ("begin text") anchor can never match.
*
* matches must point to an array whose length is greater than or equal to
* the number of patterns the rure_set was compiled with.
*
* Only use this function if you specifically need to know which regexes
* matched within the set. To determine if any of the regexes matched without
* caring which, use rure_set_is_match.
*/
bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length,
size_t start, bool *matches);
/*
* rure_set_len returns the number of patterns rure_set was compiled with.
*/
size_t rure_set_len(rure_set *re);
/*
* rure_error_new allocates space for an error.
*
* If error information is desired, then rure_error_new should be called
* to create an rure_error pointer, and that pointer can be passed to
* rure_compile. If an error occurred, then rure_compile will return NULL and
* the error pointer will be set. A message can then be extracted.
*
* It is not safe to use errors from multiple threads simultaneously. An error
* value may be reused on subsequent calls to rure_compile.
*/
rure_error *rure_error_new();
/*
* rure_error_free frees the error given.
*
* This must be called at most once.
*/
void rure_error_free(rure_error *err);
/*
* rure_error_message returns a NUL terminated string that describes the error
* message.
*
* The pointer returned must not be freed. Instead, it will be freed when
* rure_error_free is called. If err is used in subsequent calls to
* rure_compile, then this pointer may change or become invalid.
*/
const char *rure_error_message(rure_error *err);
/*
* rure_escape_must returns a NUL terminated string where all meta characters
* have been escaped. If escaping fails for any reason, an error message is
* printed to stderr and the process is aborted.
*
* The pattern given should be in UTF-8. For convenience, this accepts a C
* string, which means the pattern cannot contain a NUL byte. These correspond
* to the only two failure conditions of this function. That is, if the caller
* guarantees that the given pattern is valid UTF-8 and does not contain a
* NUL byte, then this is guaranteed to succeed (modulo out-of-memory errors).
*
* The pointer returned must not be freed directly. Instead, it should be freed
* by calling rure_cstring_free.
*/
const char *rure_escape_must(const char *pattern);
/*
* rure_cstring_free frees the string given.
*
* This must be called at most once per string.
*/
void rure_cstring_free(char *s);
#ifdef __cplusplus
}
#endif
#endif

79
third_party/rust/rure/src/error.rs vendored Normal file
View File

@ -0,0 +1,79 @@
use std::ffi;
use std::ffi::CString;
use std::fmt;
use std::str;
use libc::c_char;
use regex;
/// Error value exposed to C as `rure_error`.
///
/// It starts out as `ErrorKind::None` (see `rure_error_new`) and is
/// overwritten by the compile functions when compilation fails.
#[derive(Debug)]
pub struct Error {
// C string holding the most recently rendered message; stored here by
// `rure_error_message` so that the pointer it returns stays backed by this
// value.
message: Option<CString>,
// What went wrong, or `ErrorKind::None` when nothing has.
kind: ErrorKind,
}
/// The cause recorded inside an `Error`.
#[derive(Debug)]
pub enum ErrorKind {
// No error has been recorded; displayed as "no error".
None,
// A byte string was not valid UTF-8.
Str(str::Utf8Error),
// The regex builder rejected the pattern.
Regex(regex::Error),
// A string contained an interior NUL byte.
// NOTE(review): no producer of this variant is visible in this part of the
// crate — confirm where it is raised.
Nul(ffi::NulError),
}
impl Error {
pub fn new(kind: ErrorKind) -> Error {
Error { message: None, kind: kind }
}
pub fn is_err(&self) -> bool {
match self.kind {
ErrorKind::None => false,
ErrorKind::Str(_) | ErrorKind::Regex(_) | ErrorKind::Nul(_) => {
true
}
}
}
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self.kind {
ErrorKind::None => write!(f, "no error"),
ErrorKind::Str(ref e) => e.fmt(f),
ErrorKind::Regex(ref e) => e.fmt(f),
ErrorKind::Nul(ref e) => e.fmt(f),
}
}
}
// rure_error_new: allocates an error in the "no error" state on the heap and
// hands ownership of it to the caller as a raw pointer. It is released again
// with rure_error_free.
ffi_fn! {
    fn rure_error_new() -> *mut Error {
        let fresh = Box::new(Error::new(ErrorKind::None));
        Box::into_raw(fresh)
    }
}
// rure_error_free: releases an error previously returned by rure_error_new.
//
// rure.h states that NULL may be passed wherever an rure_error is accepted,
// so a NULL pointer is treated as a no-op here instead of being handed to
// `Box::from_raw`, which would be undefined behaviour.
ffi_fn! {
    fn rure_error_free(err: *mut Error) {
        if err.is_null() {
            return;
        }
        // SAFETY: relies on the caller passing a pointer obtained from
        // rure_error_new that has not been freed yet (rure.h: "must be called
        // at most once").
        unsafe { drop(Box::from_raw(err)); }
    }
}
// rure_error_message: renders the error as a NUL-terminated C string and
// returns a pointer to it.
//
// The string is stored inside the error itself, so the returned pointer is
// backed by `err.message` until the error is freed or the message is rendered
// again. A NULL `err` yields a NULL result; previously it was dereferenced,
// although rure.h says NULL is safe wherever an rure_error is accepted.
ffi_fn! {
    fn rure_error_message(err: *mut Error) -> *const c_char {
        if err.is_null() {
            return std::ptr::null();
        }
        // SAFETY: non-null was checked above; validity relies on the caller
        // passing a live pointer obtained from rure_error_new.
        let err = unsafe { &mut *err };
        let cmsg = match CString::new(err.to_string()) {
            Ok(msg) => msg,
            Err(nul_err) => {
                // I guess this can probably happen if the regex itself has a
                // NUL, and that NUL re-occurs in the context presented by the
                // error message. In this case, just show as much as we can.
                let nul = nul_err.nul_position();
                let mut msg = nul_err.into_vec();
                msg.truncate(nul);
                CString::new(msg)
                    .expect("message was cut off before its first NUL byte")
            }
        };
        // Take the pointer before moving the CString into the error; moving
        // the CString does not move its heap buffer.
        let p = cmsg.as_ptr();
        err.message = Some(cmsg);
        p
    }
}

7
third_party/rust/rure/src/lib.rs vendored Normal file
View File

@ -0,0 +1,7 @@
#[macro_use]
mod macros;
mod error;
mod rure;
pub use crate::error::*;
pub use crate::rure::*;

36
third_party/rust/rure/src/macros.rs vendored Normal file
View File

@ -0,0 +1,36 @@
// ffi_fn! wraps a function definition for export across the FFI boundary.
//
// The generated function is `#[no_mangle] pub extern`, and its body runs
// inside `panic::catch_unwind`. If the body panics, the panic message is
// written to stderr and the process is aborted, so no unwind ever leaves the
// generated function.
//
// Note that the body becomes a closure, so a `return` inside it returns from
// that closure (and thereby provides the function's result).
macro_rules! ffi_fn {
// Argument list with a trailing comma and an explicit return type: strip the
// trailing comma and forward to the main arm below.
(fn $name:ident($($arg:ident: $arg_ty:ty),*,) -> $ret:ty $body:block) => {
ffi_fn!(fn $name($($arg: $arg_ty),*) -> $ret $body);
};
// Main arm: emits the exported function.
(fn $name:ident($($arg:ident: $arg_ty:ty),*) -> $ret:ty $body:block) => {
#[no_mangle]
pub extern fn $name($($arg: $arg_ty),*) -> $ret {
// These imports are also visible to the wrapped body.
use ::std::io::{self, Write};
use ::std::panic::{self, AssertUnwindSafe};
use ::libc::abort;
match panic::catch_unwind(AssertUnwindSafe(move || $body)) {
Ok(v) => v,
Err(err) => {
// Panic payloads are usually a `&str` or a `String`; anything else cannot be
// shown.
let msg = if let Some(&s) = err.downcast_ref::<&str>() {
s.to_owned()
} else if let Some(s) = err.downcast_ref::<String>() {
s.to_owned()
} else {
"UNABLE TO SHOW RESULT OF PANIC.".to_owned()
};
// Best effort: a failure to write to stderr is ignored because we abort
// right after.
let _ = writeln!(
&mut io::stderr(),
"panic unwind caught, aborting: {:?}",
msg);
unsafe { abort() }
}
}
}
};
// No return type, trailing comma: forward with `()` as the return type.
(fn $name:ident($($arg:ident: $arg_ty:ty),*,) $body:block) => {
ffi_fn!(fn $name($($arg: $arg_ty),*) -> () $body);
};
// No return type: forward with `()` as the return type.
(fn $name:ident($($arg:ident: $arg_ty:ty),*) $body:block) => {
ffi_fn!(fn $name($($arg: $arg_ty),*) -> () $body);
};
}

629
third_party/rust/rure/src/rure.rs vendored Normal file
View File

@ -0,0 +1,629 @@
use std::collections::HashMap;
use std::ffi::{CStr, CString};
use std::ops::Deref;
use std::ptr;
use std::slice;
use std::str;
use libc::{c_char, size_t};
use regex::bytes;
use crate::error::{Error, ErrorKind};
// Flag bits accepted in the `flags` argument of the compile functions. The
// values must stay identical to the RURE_FLAG_* macros in include/rure.h.
// Case insensitive matching (i).
const RURE_FLAG_CASEI: u32 = 1 << 0;
// Multi-line matching (m).
const RURE_FLAG_MULTI: u32 = 1 << 1;
// `.` matches new line (s).
const RURE_FLAG_DOTNL: u32 = 1 << 2;
// Swap greediness (U).
const RURE_FLAG_SWAP_GREED: u32 = 1 << 3;
// Ignore whitespace (x).
const RURE_FLAG_SPACE: u32 = 1 << 4;
// Unicode support (u).
const RURE_FLAG_UNICODE: u32 = 1 << 5;
// Flags used by rure_compile_must, which takes no flags argument.
const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE;
/// A compiled regular expression, exposed to C as `rure`.
pub struct Regex {
// The compiled byte-oriented regex that performs the searches.
re: bytes::Regex,
// Maps each named capture group to its index; filled in by rure_compile.
capture_names: HashMap<String, i32>,
}
/// Non-flag compile settings, exposed to C as `rure_options`.
pub struct Options {
// Passed to the regex builder's `size_limit` when compiling.
size_limit: usize,
// Passed to the regex builder's `dfa_size_limit` when compiling.
dfa_size_limit: usize,
}
// The `RegexSet` is not exposed with option support or matching at an
// arbitrary position with a crate just yet. To circumvent this, we use
// the `Exec` structure directly.
// NOTE(review): the field below is a plain `bytes::RegexSet` and no `Exec`
// appears here, so the remark above looks stale — confirm and update.
/// A compiled set of regular expressions, exposed to C as `rure_set`.
pub struct RegexSet {
// The compiled byte-oriented regex set.
re: bytes::RegexSet,
}
/// Location of a single match, in bytes.
///
/// `#[repr(C)]` keeps the layout compatible with the `rure_match` struct
/// declared in include/rure.h (`start` followed by `end`).
#[repr(C)]
pub struct rure_match {
/// The start position.
pub start: size_t,
/// The end position.
pub end: size_t,
}
/// Storage for capture group locations, exposed to C as `rure_captures`;
/// rure_find_captures writes the locations of a match into the wrapped value.
pub struct Captures(bytes::Locations);
/// State of an iteration over matches, exposed to C as `rure_iter`.
pub struct Iter {
// Raw pointer to the regex being iterated. rure.h requires the iterator to
// be freed before its regex.
re: *const Regex,
// NOTE(review): presumably the offset at which the next search starts —
// the code using it is not visible here; confirm.
last_end: usize,
// NOTE(review): presumably the end of the previous match, if any, used to
// handle empty matches — confirm against the iterator functions.
last_match: Option<usize>,
}
/// Iterator over the capture group names of a regex, exposed to C as
/// `rure_iter_capture_names`.
pub struct IterCaptureNames {
// Underlying name iterator. The `'static` lifetime is nominal; rure.h
// requires this iterator to be freed before the regex it was created from.
capture_names: bytes::CaptureNames<'static>,
// NOTE(review): presumably the C strings handed out so far, kept so they
// can be released with the iterator — the code using it is not visible
// here; confirm.
name_ptrs: Vec<*mut c_char>,
}
impl Deref for Regex {
type Target = bytes::Regex;
fn deref(&self) -> &bytes::Regex {
&self.re
}
}
impl Deref for RegexSet {
type Target = bytes::RegexSet;
fn deref(&self) -> &bytes::RegexSet {
&self.re
}
}
impl Default for Options {
fn default() -> Options {
Options { size_limit: 10 * (1 << 20), dfa_size_limit: 2 * (1 << 20) }
}
}
// rure_compile_must: compiles a NUL-terminated pattern with the default
// flags and default options. On failure the error and a short notice are
// written to stderr and the process is aborted, so callers always receive a
// usable regex.
ffi_fn! {
    fn rure_compile_must(pattern: *const c_char) -> *const Regex {
        // SAFETY: relies on the caller passing a valid NUL-terminated string.
        let pattern_len = unsafe { CStr::from_ptr(pattern) }.to_bytes().len();
        let mut compile_err = Error::new(ErrorKind::None);
        let compiled = rure_compile(
            pattern as *const u8,
            pattern_len,
            RURE_DEFAULT_FLAGS,
            ptr::null(),
            &mut compile_err,
        );
        if !compile_err.is_err() {
            return compiled;
        }
        // Best effort: write failures are ignored because we abort anyway.
        let _ = writeln!(&mut io::stderr(), "{}", compile_err);
        let _ = writeln!(
            &mut io::stderr(), "aborting from rure_compile_must");
        unsafe { abort() }
    }
}
// rure_compile: compiles `length` bytes at `pattern` into a regex.
//
// `flags` is a combination of RURE_FLAG_* bits; `options` may be NULL, in
// which case the builder's own limits are kept. On failure NULL is returned
// and, when `error` is non-NULL, the cause is stored in it: a UTF-8 error if
// the pattern is not valid UTF-8, otherwise the regex builder's error.
ffi_fn! {
    fn rure_compile(
        pattern: *const u8,
        length: size_t,
        flags: u32,
        options: *const Options,
        error: *mut Error,
    ) -> *const Regex {
        // Stores `kind` in the caller's error slot (if one was supplied) and
        // yields the NULL result that signals failure.
        let fail = |kind: ErrorKind| -> *const Regex {
            if !error.is_null() {
                unsafe { *error = Error::new(kind); }
            }
            ptr::null()
        };

        // SAFETY: relies on the caller passing `length` readable bytes.
        let raw = unsafe { slice::from_raw_parts(pattern, length) };
        let pat = match str::from_utf8(raw) {
            Ok(pat) => pat,
            Err(utf8_err) => return fail(ErrorKind::Str(utf8_err)),
        };

        let mut builder = bytes::RegexBuilder::new(pat);
        if let Some(opts) = unsafe { options.as_ref() } {
            builder
                .size_limit(opts.size_limit)
                .dfa_size_limit(opts.dfa_size_limit);
        }

        let has = |flag: u32| (flags & flag) != 0;
        builder
            .case_insensitive(has(RURE_FLAG_CASEI))
            .multi_line(has(RURE_FLAG_MULTI))
            .dot_matches_new_line(has(RURE_FLAG_DOTNL))
            .swap_greed(has(RURE_FLAG_SWAP_GREED))
            .ignore_whitespace(has(RURE_FLAG_SPACE))
            .unicode(has(RURE_FLAG_UNICODE));

        match builder.build() {
            Ok(compiled) => {
                // Remember the index of every named group for later lookups.
                let capture_names: HashMap<String, i32> = compiled
                    .capture_names()
                    .enumerate()
                    .filter_map(|(i, name)| {
                        name.map(|n| (n.to_owned(), i as i32))
                    })
                    .collect();
                let boxed = Box::new(Regex { re: compiled, capture_names });
                Box::into_raw(boxed) as *const Regex
            }
            Err(build_err) => fail(ErrorKind::Regex(build_err)),
        }
    }
}
// Reclaims and drops a `Regex` that was boxed by this library.
ffi_fn! {
    fn rure_free(re: *const Regex) {
        let owned = unsafe { Box::from_raw(re as *mut Regex) };
        drop(owned);
    }
}
// Reports whether the regex matches in the `len` bytes at `haystack`, with
// the search beginning at offset `start`.
ffi_fn! {
    fn rure_is_match(
        re: *const Regex,
        haystack: *const u8,
        len: size_t,
        start: size_t,
    ) -> bool {
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        let regex = unsafe { &*re };
        regex.is_match_at(text, start)
    }
}
// Searches the haystack from offset `start`. Returns true when a match is
// found; its offsets are written to `match_info` unless that pointer is null.
ffi_fn! {
    fn rure_find(
        re: *const Regex,
        haystack: *const u8,
        len: size_t,
        start: size_t,
        match_info: *mut rure_match,
    ) -> bool {
        let regex = unsafe { &*re };
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        let found = match regex.find_at(text, start) {
            Some(found) => found,
            None => return false,
        };
        if !match_info.is_null() {
            unsafe {
                (*match_info).start = found.start();
                (*match_info).end = found.end();
            }
        }
        true
    }
}
// Searches the haystack from offset `start`, recording capture group
// locations into `captures`. Returns true when a match is found.
ffi_fn! {
    fn rure_find_captures(
        re: *const Regex,
        haystack: *const u8,
        len: size_t,
        start: size_t,
        captures: *mut Captures,
    ) -> bool {
        let regex = unsafe { &*re };
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        let caps = unsafe { &mut *captures };
        let outcome = regex.read_captures_at(&mut caps.0, text, start);
        outcome.is_some()
    }
}
// Runs `shortest_match_at` from offset `start`. Returns true on a match and
// stores the reported offset through `end` unless that pointer is null.
ffi_fn! {
    fn rure_shortest_match(
        re: *const Regex,
        haystack: *const u8,
        len: size_t,
        start: size_t,
        end: *mut usize,
    ) -> bool {
        let regex = unsafe { &*re };
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        let stop = match regex.shortest_match_at(text, start) {
            Some(stop) => stop,
            None => return false,
        };
        if !end.is_null() {
            unsafe { *end = stop; }
        }
        true
    }
}
// Looks up the index of the capture group called `name` (a NUL-terminated
// string). Returns -1 when the name is not valid UTF-8 or no such group
// exists.
ffi_fn! {
    fn rure_capture_name_index(
        re: *const Regex,
        name: *const c_char,
    ) -> i32 {
        let regex = unsafe { &*re };
        let c_name = unsafe { CStr::from_ptr(name) };
        let key = match c_name.to_str() {
            Ok(key) => key,
            Err(_) => return -1,
        };
        match regex.capture_names.get(key) {
            Some(&index) => index,
            None => -1,
        }
    }
}
// Allocates an iterator over the capture group names of `re`. The iterator
// starts with no C strings handed out.
ffi_fn! {
    fn rure_iter_capture_names_new(
        re: *const Regex,
    ) -> *mut IterCaptureNames {
        let regex = unsafe { &*re };
        let state = IterCaptureNames {
            capture_names: regex.re.capture_names(),
            name_ptrs: Vec::new(),
        };
        Box::into_raw(Box::new(state))
    }
}
// Frees a capture-name iterator along with every C string it handed out.
ffi_fn! {
    fn rure_iter_capture_names_free(it: *mut IterCaptureNames) {
        let mut owned = unsafe { Box::from_raw(it) };
        // Release the strings newest-first, then the iterator itself.
        for name in owned.name_ptrs.drain(..).rev() {
            drop(unsafe { CString::from_raw(name) });
        }
        drop(owned);
    }
}
// Advances the capture-name iterator. On success the next name is stored
// through `capture_name` as a NUL-terminated string owned by the iterator
// and true is returned. Returns false when `capture_name` is null, when the
// groups are exhausted, or when the name cannot be turned into a C string.
ffi_fn! {
    fn rure_iter_capture_names_next(
        it: *mut IterCaptureNames,
        capture_name: *mut *mut c_char,
    ) -> bool {
        if capture_name.is_null() {
            return false;
        }
        let state = unsafe { &mut *it };
        // Groups without a name are reported as the empty string.
        let name = match state.capture_names.next() {
            Some(maybe_name) => maybe_name.unwrap_or(""),
            None => return false,
        };
        let owned = match CString::new(name.as_bytes()) {
            Ok(owned) => owned,
            Err(_) => return false,
        };
        // Remember the pointer so the free function can reclaim it later.
        let raw = owned.into_raw();
        state.name_ptrs.push(raw);
        unsafe { *capture_name = raw; }
        true
    }
}
// Allocates a match iterator for `re`, positioned at offset 0 with no
// previous match recorded.
ffi_fn! {
    fn rure_iter_new(
        re: *const Regex,
    ) -> *mut Iter {
        let state = Iter {
            re,
            last_end: 0,
            last_match: None,
        };
        Box::into_raw(Box::new(state))
    }
}
// Reclaims and drops a match iterator that was boxed by `rure_iter_new`.
ffi_fn! {
    fn rure_iter_free(it: *mut Iter) {
        let owned = unsafe { Box::from_raw(it) };
        drop(owned);
    }
}
// Advances the iterator to the next match in the `len` bytes at `haystack`.
//
// Returns true and, when `match_info` is non-null, stores the match offsets
// through it. Returns false once the search position has moved past the end
// of the haystack or no further match exists.
//
// After an empty match ending at `e` the search position is moved straight
// to `e + 1`. Advancing by a single byte from the previous start instead
// (as this function used to) re-finds the same empty match once per
// intervening byte, which is quadratic work and, with the former
// self-recursion, unbounded stack depth for patterns such as `$` on a long
// haystack. The sequence of reported matches is unchanged.
ffi_fn! {
    fn rure_iter_next(
        it: *mut Iter,
        haystack: *const u8,
        len: size_t,
        match_info: *mut rure_match,
    ) -> bool {
        let it = unsafe { &mut *it };
        let re = unsafe { &*it.re };
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        loop {
            if it.last_end > text.len() {
                return false;
            }
            let (s, e) = match re.find_at(text, it.last_end) {
                None => return false,
                Some(m) => (m.start(), m.end()),
            };
            if s == e {
                // This is an empty match. To ensure we make progress, start
                // the next search just past it. `e <= text.len()`, so the
                // addition cannot overflow.
                it.last_end = e + 1;
                // Don't accept empty matches immediately following a match.
                // Just move on to the next match.
                if Some(e) == it.last_match {
                    continue;
                }
            } else {
                it.last_end = e;
            }
            it.last_match = Some(e);
            if !match_info.is_null() {
                unsafe {
                    (*match_info).start = s;
                    (*match_info).end = e;
                }
            }
            return true;
        }
    }
}
// Advances the iterator to the next match in the `len` bytes at `haystack`,
// recording capture group locations into `captures`.
//
// Returns true when a match was found and false once the search position
// has moved past the end of the haystack or no further match exists.
//
// After an empty match ending at `e` the search position is moved straight
// to `e + 1`. Advancing by a single byte from the previous start instead
// (as this function used to) re-finds the same empty match once per
// intervening byte, which is quadratic work and, with the former
// self-recursion, unbounded stack depth for patterns such as `$` on a long
// haystack. The sequence of reported matches is unchanged.
ffi_fn! {
    fn rure_iter_next_captures(
        it: *mut Iter,
        haystack: *const u8,
        len: size_t,
        captures: *mut Captures,
    ) -> bool {
        let it = unsafe { &mut *it };
        let re = unsafe { &*it.re };
        let slots = unsafe { &mut (*captures).0 };
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        loop {
            if it.last_end > text.len() {
                return false;
            }
            let (s, e) = match re.read_captures_at(slots, text, it.last_end) {
                None => return false,
                Some(m) => (m.start(), m.end()),
            };
            if s == e {
                // This is an empty match. To ensure we make progress, start
                // the next search just past it. `e <= text.len()`, so the
                // addition cannot overflow.
                it.last_end = e + 1;
                // Don't accept empty matches immediately following a match.
                // Just move on to the next match.
                if Some(e) == it.last_match {
                    continue;
                }
            } else {
                it.last_end = e;
            }
            it.last_match = Some(e);
            return true;
        }
    }
}
// Allocates a `Captures` value from the regex's `locations()`.
ffi_fn! {
    fn rure_captures_new(re: *const Regex) -> *mut Captures {
        let regex = unsafe { &*re };
        let boxed = Box::new(Captures(regex.locations()));
        Box::into_raw(boxed)
    }
}
// Reclaims and drops a `Captures` value boxed by `rure_captures_new`.
ffi_fn! {
    fn rure_captures_free(captures: *const Captures) {
        let owned = unsafe { Box::from_raw(captures as *mut Captures) };
        drop(owned);
    }
}
// Fetches the location of capture group `i`. Returns false when `pos(i)`
// has no location for that group; otherwise returns true and writes the
// offsets to `match_info` unless that pointer is null.
ffi_fn! {
    fn rure_captures_at(
        captures: *const Captures,
        i: size_t,
        match_info: *mut rure_match,
    ) -> bool {
        let locations = unsafe { &(*captures).0 };
        let (start, end) = match locations.pos(i) {
            Some(span) => span,
            None => return false,
        };
        if !match_info.is_null() {
            unsafe {
                (*match_info).start = start;
                (*match_info).end = end;
            }
        }
        true
    }
}
// Returns `len()` of the locations held by `captures`.
ffi_fn! {
    fn rure_captures_len(captures: *const Captures) -> size_t {
        let caps = unsafe { &*captures };
        caps.0.len()
    }
}
// Allocates an `Options` value holding the default limits.
ffi_fn! {
    fn rure_options_new() -> *mut Options {
        let defaults = Box::new(Options::default());
        Box::into_raw(defaults)
    }
}
// Reclaims and drops an `Options` value boxed by `rure_options_new`.
ffi_fn! {
    fn rure_options_free(options: *mut Options) {
        let owned = unsafe { Box::from_raw(options) };
        drop(owned);
    }
}
// Overwrites the `size_limit` stored in `options`.
ffi_fn! {
    fn rure_options_size_limit(options: *mut Options, limit: size_t) {
        unsafe {
            (*options).size_limit = limit;
        }
    }
}
// Overwrites the `dfa_size_limit` stored in `options`.
ffi_fn! {
    fn rure_options_dfa_size_limit(options: *mut Options, limit: size_t) {
        unsafe {
            (*options).dfa_size_limit = limit;
        }
    }
}
// Compiles `patterns_count` patterns into a heap-allocated `RegexSet`.
//
// `patterns` and `patterns_lengths` are parallel arrays giving each
// pattern's bytes and byte length; every pattern must be valid UTF-8.
// `flags` and `options` are applied as in `rure_compile`. On failure a null
// pointer is returned and, when `error` is non-null, the cause is stored
// through it.
ffi_fn! {
    fn rure_compile_set(
        patterns: *const *const u8,
        patterns_lengths: *const size_t,
        patterns_count: size_t,
        flags: u32,
        options: *const Options,
        error: *mut Error
    ) -> *const RegexSet {
        let pat_ptrs = unsafe {
            slice::from_raw_parts(patterns, patterns_count)
        };
        let pat_lens = unsafe {
            slice::from_raw_parts(patterns_lengths, patterns_count)
        };

        let mut sources = Vec::with_capacity(patterns_count);
        for (&pat_ptr, &pat_len) in pat_ptrs.iter().zip(pat_lens) {
            let raw = unsafe { slice::from_raw_parts(pat_ptr, pat_len) };
            match str::from_utf8(raw) {
                Ok(source) => sources.push(source),
                Err(utf8_err) => {
                    if !error.is_null() {
                        unsafe { *error = Error::new(ErrorKind::Str(utf8_err)); }
                    }
                    return ptr::null();
                }
            }
        }

        // True when the given flag bit is present in `flags`.
        let has = |bit: u32| (flags & bit) != 0;

        let mut builder = bytes::RegexSetBuilder::new(sources);
        if !options.is_null() {
            let limits = unsafe { &*options };
            builder.size_limit(limits.size_limit);
            builder.dfa_size_limit(limits.dfa_size_limit);
        }
        builder.case_insensitive(has(RURE_FLAG_CASEI));
        builder.multi_line(has(RURE_FLAG_MULTI));
        builder.dot_matches_new_line(has(RURE_FLAG_DOTNL));
        builder.swap_greed(has(RURE_FLAG_SWAP_GREED));
        builder.ignore_whitespace(has(RURE_FLAG_SPACE));
        builder.unicode(has(RURE_FLAG_UNICODE));

        match builder.build() {
            Ok(compiled) => {
                Box::into_raw(Box::new(RegexSet { re: compiled }))
            }
            Err(build_err) => {
                if !error.is_null() {
                    unsafe { *error = Error::new(ErrorKind::Regex(build_err)); }
                }
                ptr::null()
            }
        }
    }
}
// Reclaims and drops a `RegexSet` boxed by `rure_compile_set`.
ffi_fn! {
    fn rure_set_free(re: *const RegexSet) {
        let owned = unsafe { Box::from_raw(re as *mut RegexSet) };
        drop(owned);
    }
}
// Reports the result of the set's `is_match_at` on the `len` bytes at
// `haystack`, with the search beginning at offset `start`.
ffi_fn! {
    fn rure_set_is_match(
        re: *const RegexSet,
        haystack: *const u8,
        len: size_t,
        start: size_t
    ) -> bool {
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        let set = unsafe { &*re };
        set.is_match_at(text, start)
    }
}
// Fills `matches` (treated as an array of one bool per regex in the set)
// with per-pattern match results and returns the value of
// `read_matches_at`.
ffi_fn! {
    fn rure_set_matches(
        re: *const RegexSet,
        haystack: *const u8,
        len: size_t,
        start: size_t,
        matches: *mut bool
    ) -> bool {
        let set = unsafe { &*re };
        let results = unsafe {
            slice::from_raw_parts_mut(matches, set.len())
        };
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        // read_matches_at isn't guaranteed to set non-matches to false,
        // so clear every slot up front.
        for slot in results.iter_mut() {
            *slot = false;
        }
        set.read_matches_at(results, text, start)
    }
}
// Returns `len()` of the regex set.
ffi_fn! {
    fn rure_set_len(re: *const RegexSet) -> size_t {
        let set = unsafe { &*re };
        set.len()
    }
}
// Escapes a NUL-terminated pattern via `rure_escape`. If escaping reports an
// error, the error is written to stderr and the process is aborted;
// otherwise the pointer from `rure_escape` is returned.
ffi_fn! {
    fn rure_escape_must(pattern: *const c_char) -> *const c_char {
        // Length of the C string, not counting its terminating NUL.
        let c_pattern = unsafe { CStr::from_ptr(pattern) };
        let pat_len = c_pattern.to_bytes().len();
        let mut failure = Error::new(ErrorKind::None);
        let escaped = rure_escape(pattern as *const u8, pat_len, &mut failure);
        if failure.is_err() {
            let mut stderr = io::stderr();
            let _ = writeln!(stderr, "{}", failure);
            let _ = writeln!(stderr, "aborting from rure_escape_must");
            unsafe { abort() }
        }
        escaped
    }
}
/// A helper function that implements fallible escaping in a way that returns
/// an error if escaping failed.
///
/// This should ideally be exposed, but it needs API design work. In
/// particular, this should not return a C string, but a `const uint8_t *`
/// instead, since it may contain a NUL byte.
fn rure_escape(
pattern: *const u8,
length: size_t,
error: *mut Error,
) -> *const c_char {
let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) };
let str_pat = match str::from_utf8(pat) {
Ok(val) => val,
Err(err) => unsafe {
if !error.is_null() {
*error = Error::new(ErrorKind::Str(err));
}
return ptr::null();
},
};
let esc_pat = regex::escape(str_pat);
let c_esc_pat = match CString::new(esc_pat) {
Ok(val) => val,
Err(err) => unsafe {
if !error.is_null() {
*error = Error::new(ErrorKind::Nul(err));
}
return ptr::null();
},
};
c_esc_pat.into_raw() as *const c_char
}
// Reclaims and drops a C string that was produced by `CString::into_raw`.
ffi_fn! {
    fn rure_cstring_free(s: *mut c_char) {
        let owned = unsafe { CString::from_raw(s) };
        drop(owned);
    }
}

7
third_party/rust/rure/test vendored Executable file
View File

@ -0,0 +1,7 @@
#!/bin/sh
# Builds the crate, then compiles and runs the C test program and the `iter`
# example with the dynamic loader pointed at cargo's debug output directory.

# Stop at the first command that fails.
set -e
cargo build --verbose
# Each step runs in a subshell so its `cd` does not affect the next one.
(cd ctest && ./compile && LD_LIBRARY_PATH=../../target/debug ./test)
(cd examples && ./compile && LD_LIBRARY_PATH=../../target/debug ./iter)

View File

@ -46,3 +46,9 @@ if CONFIG["CPU_ARCH"] != "x86":
if CONFIG["MOZ_BITS_DOWNLOAD"]:
RUST_TESTS += ["bits_client"]
# Export the `rure` crate's included .h file. The symbols defined in that file
# will be exported from the `gkrust-shared` crate.
EXPORTS += [
"/third_party/rust/rure/include/rure.h",
]

View File

@ -67,6 +67,7 @@ unic-langid-ffi = { path = "../../../../intl/locale/rust/unic-langid-ffi" }
fluent-langneg = { version = "0.13", features = ["cldr"] }
fluent-langneg-ffi = { path = "../../../../intl/locale/rust/fluent-langneg-ffi" }
regex-ffi = { path = "../../../components/regex-ffi" }
rure = "0.2.2"
rust_minidump_writer_linux = { path = "../../../crashreporter/rust_minidump_writer_linux", optional = true }
gecko-profiler = { path = "../../../../tools/profiler/rust-api"}
midir_impl = { path = "../../../../dom/midi/midir_impl", optional = true }

View File

@ -74,6 +74,7 @@ extern crate fluent;
extern crate fluent_ffi;
extern crate regex_ffi;
extern crate rure;
extern crate fluent_fallback;
extern crate l10nregistry_ffi;