mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-07 18:04:46 +00:00
Bug 1794001 - Part 1: Import the rure
crate for rust regex ffi, r=xpcom-reviewers,supply-chain-reviewers,kmag
While we already have an in-tree `regex-ffi` crate which provides basic access to regex functionality for use in FormAutofillNative, the `regex` crate itself provides and maintains its own c api as the `rure` crate. This patch vendors in `rure` to allow us to use the more-fully-featured official ffi. Differential Revision: https://phabricator.services.mozilla.com/D158873
This commit is contained in:
parent
9127c66b99
commit
93a9c67b35
11
Cargo.lock
generated
11
Cargo.lock
generated
@ -2166,6 +2166,7 @@ dependencies = [
|
||||
"qcms",
|
||||
"regex-ffi",
|
||||
"rsdparsa_capi",
|
||||
"rure",
|
||||
"rusqlite",
|
||||
"rust_minidump_writer_linux",
|
||||
"static_prefs",
|
||||
@ -4501,6 +4502,16 @@ version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d79b4b604167921892e84afbbaad9d5ad74e091bf6c511d9dbfb0593f09fabd"
|
||||
|
||||
[[package]]
|
||||
name = "rure"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f3de09595e75baee10da378a1fadfb50d04334a031d69dfb74d0cee3a94aa24c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rusqlite"
|
||||
version = "0.27.0"
|
||||
|
@ -953,6 +953,33 @@ who = "Mike Hommey <mh+mozilla@glandium.org>"
|
||||
criteria = "safe-to-deploy"
|
||||
delta = "0.7.0 -> 0.7.1"
|
||||
|
||||
[[audits.rure]]
|
||||
who = "Nika Layzell <nika@thelayzells.com>"
|
||||
criteria = "safe-to-deploy"
|
||||
version = "0.2.2"
|
||||
notes = """
|
||||
This is a fairly straightforward FFI wrapper crate for `regex`, maintained by
|
||||
the `regex` developers in the same repository.
|
||||
|
||||
This crate is explicitly designed for FFI use, and should not be used directly
|
||||
by Rust code. The exported `extern \"C\"` functions are not marked as `unsafe`,
|
||||
meaning that it is technically incorrect to use them from within Rust code,
|
||||
however they are reasonable to use from C code.
|
||||
|
||||
The unsafe code in this crate heavily depends on the C caller maintaining
|
||||
invariants, however these invariants are clearly documented in the `rure.h`
|
||||
file, bundled with the crate.
|
||||
|
||||
I have checked the signatures of each function both in C++ and in the Rust to
|
||||
ensure they match. In some places, the c `rure.h` header file is missing a
|
||||
`const` qualifier which could be present given the Rust code, however this will
|
||||
have no impact on ABI, and is fairly normal for FFI crates.
|
||||
|
||||
Panics are handled in all Rust FFI methods, meaning that projects which do not
|
||||
disable unwinding will still consistently abort (using `libc::abort()`) if a
|
||||
panic occurs in the Rust code.
|
||||
"""
|
||||
|
||||
[[audits.rust_decimal]]
|
||||
who = "Mike Hommey <mh+mozilla@glandium.org>"
|
||||
criteria = "safe-to-deploy"
|
||||
|
1
third_party/rust/rure/.cargo-checksum.json
vendored
Normal file
1
third_party/rust/rure/.cargo-checksum.json
vendored
Normal file
@ -0,0 +1 @@
|
||||
{"files":{"Cargo.toml":"6bed7b80456a66969f4fe9bb5341a0b927a7cd58e036441cbb3b79d67d86c24a","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"e8462c4064a376c2b2d729cc766064cc97decd6a2bb325cf9c7b50be9b8897ce","ctest/compile":"48b692b2aca8b61dfbe372f46d3aeb242893cfa2d81b0a89a73eb2f5db6b6e27","ctest/test.c":"6565808675763c42f8f10bd95445eaab4eaa3618efcf8ec215d98c3a1cfe756d","examples/compile":"471a781860b733f9aa9c1691f33ac8e8a4e85efcb97540942432ba5b58fbb982","examples/iter.c":"ad8312b2271ee19bfaf681d1d8338afaa89e4b180174f008b8cf951a6275776f","examples/sherlock.txt":"242ec73a70f0a03dcbe007e32038e7deeaee004aaec9a09a07fa322743440fa8","include/rure.h":"ddd6056d434d4efaf6ad30b8a38798d61ad385b0c9866988f9b2d4306dc1a99a","src/error.rs":"965c0207eb6d9cf644580d13b2d2d3bd310ab5c1ff65cb1fc04abdbd08ce7fe8","src/lib.rs":"9e99e774ee2a3db507d1e2cd7142b680411d90cf2b033c19ea9a7ea59ae4ba98","src/macros.rs":"ef2d468c1babe1b2252e62ad953b14ce58afb87768dc88612a70df27456038d2","src/rure.rs":"a889bbf35ab2d0018eac1122fe69abbbe2880fb8f5da211a1f60f703fddb5c82","test":"e8b91d4378b3ba09b7dfecdfa733765569778f57bc1c72cecc718e4ad63c1537"},"package":"f3de09595e75baee10da378a1fadfb50d04334a031d69dfb74d0cee3a94aa24c"}
|
38
third_party/rust/rure/Cargo.toml
vendored
Normal file
38
third_party/rust/rure/Cargo.toml
vendored
Normal file
@ -0,0 +1,38 @@
|
||||
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
|
||||
#
|
||||
# When uploading crates to the registry Cargo will automatically
|
||||
# "normalize" Cargo.toml files for maximal compatibility
|
||||
# with all versions of Cargo and also rewrite `path` dependencies
|
||||
# to registry (e.g., crates.io) dependencies.
|
||||
#
|
||||
# If you are reading this file be aware that the original Cargo.toml
|
||||
# will likely look very different (and much more reasonable).
|
||||
# See Cargo.toml.orig for the original contents.
|
||||
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "rure"
|
||||
version = "0.2.2"
|
||||
authors = ["The Rust Project Developers"]
|
||||
description = """
|
||||
A C API for Rust's regular expression library.
|
||||
"""
|
||||
homepage = "https://github.com/rust-lang/regex"
|
||||
documentation = "https://github.com/rust-lang/regex/tree/master/regex-capi"
|
||||
readme = "README.md"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/rust-lang/regex"
|
||||
|
||||
[lib]
|
||||
name = "rure"
|
||||
crate-type = [
|
||||
"staticlib",
|
||||
"cdylib",
|
||||
"rlib",
|
||||
]
|
||||
|
||||
[dependencies.libc]
|
||||
version = "0.2"
|
||||
|
||||
[dependencies.regex]
|
||||
version = "1"
|
201
third_party/rust/rure/LICENSE-APACHE
vendored
Normal file
201
third_party/rust/rure/LICENSE-APACHE
vendored
Normal file
@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
25
third_party/rust/rure/LICENSE-MIT
vendored
Normal file
25
third_party/rust/rure/LICENSE-MIT
vendored
Normal file
@ -0,0 +1,25 @@
|
||||
Copyright (c) 2014 The Rust Project Developers
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
103
third_party/rust/rure/README.md
vendored
Normal file
103
third_party/rust/rure/README.md
vendored
Normal file
@ -0,0 +1,103 @@
|
||||
C API for RUst's REgex engine
|
||||
=============================
|
||||
rure is a C API to Rust's regex library, which guarantees linear time
|
||||
searching using finite automata. In exchange, it must give up some common
|
||||
regex features such as backreferences and arbitrary lookaround. It does
|
||||
however include capturing groups, lazy matching, Unicode support and word
|
||||
boundary assertions. Its matching semantics generally correspond to Perl's,
|
||||
or "leftmost first." Namely, the match locations reported correspond to the
|
||||
first match that would be found by a backtracking engine.
|
||||
|
||||
The header file (`include/rure.h`) serves as the primary API documentation of
|
||||
this library. Types and flags are documented first, and functions follow.
|
||||
|
||||
The syntax and possibly other useful things are documented in the Rust
|
||||
API documentation: https://docs.rs/regex
|
||||
|
||||
|
||||
Examples
|
||||
--------
|
||||
There are readable examples in the `ctest` and `examples` sub-directories.
|
||||
|
||||
Assuming you have
|
||||
[Rust and Cargo installed](https://www.rust-lang.org/downloads.html)
|
||||
(and a C compiler), then this should work to run the `iter` example:
|
||||
|
||||
```
|
||||
$ git clone https://github.com/rust-lang/regex
|
||||
$ cd regex/regex-capi/examples
|
||||
$ ./compile
|
||||
$ LD_LIBRARY_PATH=../target/release ./iter
|
||||
```
|
||||
|
||||
|
||||
Performance
|
||||
-----------
|
||||
It's fast. Its core matching engine is a lazy DFA, which is what GNU grep
|
||||
and RE2 use. Like GNU grep, this regex engine can detect multi byte literals
|
||||
in the regex and will use fast literal string searching to quickly skip
|
||||
through the input to find possible match locations.
|
||||
|
||||
All memory usage is bounded and all searching takes linear time with respect
|
||||
to the input string.
|
||||
|
||||
For more details, see the PERFORMANCE guide:
|
||||
https://github.com/rust-lang/regex/blob/master/PERFORMANCE.md
|
||||
|
||||
|
||||
Text encoding
|
||||
-------------
|
||||
All regular expressions must be valid UTF-8.
|
||||
|
||||
The text encoding of haystacks is more complicated. To a first
|
||||
approximation, haystacks should be UTF-8. In fact, UTF-8 (and, one
|
||||
supposes, ASCII) is the only well defined text encoding supported by this
|
||||
library. It is impossible to match UTF-16, UTF-32 or any other encoding
|
||||
without first transcoding it to UTF-8.
|
||||
|
||||
With that said, haystacks do not need to be valid UTF-8, and if they aren't
|
||||
valid UTF-8, no performance penalty is paid. Whether invalid UTF-8 is
|
||||
matched or not depends on the regular expression. For example, with the
|
||||
`RURE_FLAG_UNICODE` flag enabled, the regex `.` is guaranteed to match a
|
||||
single UTF-8 encoding of a Unicode codepoint (sans LF). In particular,
|
||||
it will not match invalid UTF-8 such as `\xFF`, nor will it match surrogate
|
||||
codepoints or "alternate" (i.e., non-minimal) encodings of codepoints.
|
||||
However, with the `RURE_FLAG_UNICODE` flag disabled, the regex `.` will match
|
||||
any *single* arbitrary byte (sans LF), including `\xFF`.
|
||||
|
||||
This provides a useful invariant: wherever `RURE_FLAG_UNICODE` is set, the
|
||||
corresponding regex is guaranteed to match valid UTF-8. Invalid UTF-8 will
|
||||
always prevent a match from happening when the flag is set. Since flags can be
|
||||
toggled in the regular expression itself, this allows one to pick and choose
|
||||
which parts of the regular expression must match UTF-8 or not.
|
||||
|
||||
Some good advice is to always enable the `RURE_FLAG_UNICODE` flag (which is
|
||||
enabled when using `rure_compile_must`) and selectively disable the flag when
|
||||
one wants to match arbitrary bytes. The flag can be disabled in a regular
|
||||
expression with `(?-u)`.
|
||||
|
||||
Finally, if one wants to match specific invalid UTF-8 bytes, then you can
|
||||
use escape sequences. e.g., `(?-u)\\xFF` will match `\xFF`. It's not
|
||||
possible to use C literal escape sequences in this case since regular
|
||||
expressions must be valid UTF-8.
|
||||
|
||||
|
||||
Aborts
|
||||
------
|
||||
This library will abort your process if an unwinding panic is caught in the
|
||||
Rust code. Generally, a panic occurs when there is a bug in the program or
|
||||
if allocation failed. It is possible to cause this behavior by passing
|
||||
invalid inputs to some functions. For example, giving an invalid capture
|
||||
group index to `rure_captures_at` will cause Rust's bounds checks to fail,
|
||||
which will cause a panic, which will be caught and printed to stderr. The
|
||||
process will then `abort`.
|
||||
|
||||
|
||||
Missing
|
||||
-------
|
||||
There are a few things missing from the C API that are present in the Rust API.
|
||||
There's no particular (known) reason why they're missing; they just haven't been
|
||||
implemented yet.
|
||||
|
||||
* Splitting a string by a regex.
|
||||
* Replacing regex matches in a string with some other text.
|
8
third_party/rust/rure/ctest/compile
vendored
Executable file
8
third_party/rust/rure/ctest/compile
vendored
Executable file
@ -0,0 +1,8 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Build the rure crate and compile the C test driver (test.c) against it.
# -e stops at the first failing command; -x echoes each command as it runs.
set -ex
|
||||
|
||||
# Build the crate whose manifest sits one directory up (no --release flag,
# so this is a debug build).
cargo build --manifest-path ../Cargo.toml
|
||||
# Compile test.c with the DEBUG diagnostics enabled, taking rure.h from
# ../include and linking against the rure library found in ../../target/debug.
gcc -DDEBUG -o test test.c -ansi -Wall -I../include -L../../target/debug -lrure
|
||||
# If you're using librure.a, then you'll need to link other stuff:
|
||||
# -lutil -ldl -lpthread -lgcc_s -lc -lm -lrt -lutil -lrure
|
591
third_party/rust/rure/ctest/test.c
vendored
Normal file
591
third_party/rust/rure/ctest/test.c
vendored
Normal file
@ -0,0 +1,591 @@
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "rure.h"
|
||||
|
||||
/*
 * Diagnostics switch for the tests below. When the build does not define
 * DEBUG, default it to false so the `if (DEBUG)` branches that print failure
 * details to stderr are never taken.
 */
#ifndef DEBUG
|
||||
#define DEBUG false
|
||||
#endif
|
||||
|
||||
bool test_is_match() {
|
||||
bool passed = true;
|
||||
const char *haystack = "snowman: \xE2\x98\x83";
|
||||
|
||||
rure *re = rure_compile_must("\\p{So}$");
|
||||
bool matched = rure_is_match(re, (const uint8_t *)haystack,
|
||||
strlen(haystack), 0);
|
||||
if (!matched) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_is_match] expected match, but got no match\n");
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
rure_free(re);
|
||||
return passed;
|
||||
}
|
||||
|
||||
bool test_shortest_match() {
|
||||
bool passed = true;
|
||||
const char *haystack = "aaaaa";
|
||||
|
||||
rure *re = rure_compile_must("a+");
|
||||
size_t end = 0;
|
||||
bool matched = rure_shortest_match(re, (const uint8_t *)haystack,
|
||||
strlen(haystack), 0, &end);
|
||||
if (!matched) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_shortest_match] expected match, "
|
||||
"but got no match\n");
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
size_t expect_end = 1;
|
||||
if (end != expect_end) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_shortest_match] expected match end location %zu "
|
||||
"but got %zu\n", expect_end, end);
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
rure_free(re);
|
||||
return passed;
|
||||
}
|
||||
|
||||
bool test_find() {
|
||||
bool passed = true;
|
||||
const char *haystack = "snowman: \xE2\x98\x83";
|
||||
|
||||
rure *re = rure_compile_must("\\p{So}$");
|
||||
rure_match match = {0};
|
||||
bool matched = rure_find(re, (const uint8_t *)haystack, strlen(haystack),
|
||||
0, &match);
|
||||
if (!matched) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr, "[test_find] expected match, but got no match\n");
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
size_t expect_start = 9;
|
||||
size_t expect_end = 12;
|
||||
if (match.start != expect_start || match.end != expect_end) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_find] expected match at (%zu, %zu), but "
|
||||
"got match at (%zu, %zu)\n",
|
||||
expect_start, expect_end, match.start, match.end);
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
rure_free(re);
|
||||
return passed;
|
||||
}
|
||||
|
||||
bool test_captures() {
|
||||
bool passed = true;
|
||||
const char *haystack = "snowman: \xE2\x98\x83";
|
||||
|
||||
rure *re = rure_compile_must(".(.*(?P<snowman>\\p{So}))$");
|
||||
rure_match match = {0};
|
||||
rure_captures *caps = rure_captures_new(re);
|
||||
bool matched = rure_find_captures(re, (const uint8_t *)haystack,
|
||||
strlen(haystack), 0, caps);
|
||||
if (!matched) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_captures] expected match, but got no match\n");
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
size_t expect_captures_len = 3;
|
||||
size_t captures_len = rure_captures_len(caps);
|
||||
if (captures_len != expect_captures_len) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_captures] "
|
||||
"expected capture group length to be %zd, but "
|
||||
"got %zd\n",
|
||||
expect_captures_len, captures_len);
|
||||
}
|
||||
passed = false;
|
||||
goto done;
|
||||
}
|
||||
int32_t expect_capture_index = 2;
|
||||
int32_t capture_index = rure_capture_name_index(re, "snowman");
|
||||
if (capture_index != expect_capture_index) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_captures] "
|
||||
"expected capture index %d for name 'snowman', but "
|
||||
"got %d\n",
|
||||
expect_capture_index, capture_index);
|
||||
}
|
||||
passed = false;
|
||||
goto done;
|
||||
}
|
||||
size_t expect_start = 9;
|
||||
size_t expect_end = 12;
|
||||
rure_captures_at(caps, 2, &match);
|
||||
if (match.start != expect_start || match.end != expect_end) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_captures] "
|
||||
"expected capture 2 match at (%zu, %zu), "
|
||||
"but got match at (%zu, %zu)\n",
|
||||
expect_start, expect_end, match.start, match.end);
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
done:
|
||||
rure_captures_free(caps);
|
||||
rure_free(re);
|
||||
return passed;
|
||||
}
|
||||
|
||||
bool test_iter() {
|
||||
bool passed = true;
|
||||
const uint8_t *haystack = (const uint8_t *)"abc xyz";
|
||||
size_t haystack_len = strlen((const char *)haystack);
|
||||
|
||||
rure *re = rure_compile_must("\\w+(\\w)");
|
||||
rure_match match = {0};
|
||||
rure_captures *caps = rure_captures_new(re);
|
||||
rure_iter *it = rure_iter_new(re);
|
||||
|
||||
bool matched = rure_iter_next(it, haystack, haystack_len, &match);
|
||||
if (!matched) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_iter] expected first match, but got no match\n");
|
||||
}
|
||||
passed = false;
|
||||
goto done;
|
||||
}
|
||||
size_t expect_start = 0;
|
||||
size_t expect_end = 3;
|
||||
if (match.start != expect_start || match.end != expect_end) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_iter] expected first match at (%zu, %zu), but "
|
||||
"got match at (%zu, %zu)\n",
|
||||
expect_start, expect_end, match.start, match.end);
|
||||
}
|
||||
passed = false;
|
||||
goto done;
|
||||
}
|
||||
|
||||
matched = rure_iter_next_captures(it, haystack, haystack_len, caps);
|
||||
if (!matched) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_iter] expected second match, but got no match\n");
|
||||
}
|
||||
passed = false;
|
||||
goto done;
|
||||
}
|
||||
rure_captures_at(caps, 1, &match);
|
||||
expect_start = 6;
|
||||
expect_end = 7;
|
||||
if (match.start != expect_start || match.end != expect_end) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_iter] expected second match at (%zu, %zu), but "
|
||||
"got match at (%zu, %zu)\n",
|
||||
expect_start, expect_end, match.start, match.end);
|
||||
}
|
||||
passed = false;
|
||||
goto done;
|
||||
}
|
||||
done:
|
||||
rure_iter_free(it);
|
||||
rure_captures_free(caps);
|
||||
rure_free(re);
|
||||
return passed;
|
||||
}
|
||||
|
||||
bool test_iter_capture_name(char *expect, char *given) {
|
||||
bool passed = true;
|
||||
if (strcmp(expect, given)) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_iter_capture_name] expected first capture "
|
||||
"name '%s' got '%s'\n",
|
||||
expect, given);
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
return passed;
|
||||
}
|
||||
|
||||
bool test_iter_capture_names() {
|
||||
bool passed = true;
|
||||
|
||||
char *name;
|
||||
rure *re = rure_compile_must(
|
||||
"(?P<year>\\d{4})-(?P<month>\\d{2})-(?P<day>\\d{2})");
|
||||
rure_iter_capture_names *it = rure_iter_capture_names_new(re);
|
||||
|
||||
bool result = rure_iter_capture_names_next(it, &name);
|
||||
if (!result) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_iter_capture_names] expected a second name, "
|
||||
"but got none\n");
|
||||
}
|
||||
passed = false;
|
||||
goto done;
|
||||
}
|
||||
|
||||
result = rure_iter_capture_names_next(it, &name);
|
||||
passed = test_iter_capture_name("year", name);
|
||||
if (!passed) {
|
||||
goto done;
|
||||
}
|
||||
|
||||
result = rure_iter_capture_names_next(it, &name);
|
||||
passed = test_iter_capture_name("month", name);
|
||||
if (!passed) {
|
||||
goto done;
|
||||
}
|
||||
|
||||
result = rure_iter_capture_names_next(it, &name);
|
||||
passed = test_iter_capture_name("day", name);
|
||||
if (!passed) {
|
||||
goto done;
|
||||
}
|
||||
done:
|
||||
rure_iter_capture_names_free(it);
|
||||
rure_free(re);
|
||||
return passed;
|
||||
}
|
||||
|
||||
/*
|
||||
* This tests whether we can set the flags correctly. In this case, we disable
|
||||
* all flags, which includes disabling Unicode mode. When we disable Unicode
|
||||
* mode, we can match arbitrary possibly invalid UTF-8 bytes, such as \xFF.
|
||||
* (When Unicode mode is enabled, \xFF won't match .)
|
||||
*/
|
||||
bool test_flags() {
|
||||
bool passed = true;
|
||||
const char *pattern = ".";
|
||||
const char *haystack = "\xFF";
|
||||
|
||||
rure *re = rure_compile((const uint8_t *)pattern, strlen(pattern),
|
||||
0, NULL, NULL);
|
||||
bool matched = rure_is_match(re, (const uint8_t *)haystack,
|
||||
strlen(haystack), 0);
|
||||
if (!matched) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr, "[test_flags] expected match, but got no match\n");
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
rure_free(re);
|
||||
return passed;
|
||||
}
|
||||
|
||||
bool test_compile_error() {
|
||||
bool passed = true;
|
||||
rure_error *err = rure_error_new();
|
||||
rure *re = rure_compile((const uint8_t *)"(", 1, 0, NULL, err);
|
||||
if (re != NULL) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_compile_error] "
|
||||
"expected NULL regex pointer, but got non-NULL pointer\n");
|
||||
}
|
||||
passed = false;
|
||||
rure_free(re);
|
||||
}
|
||||
const char *msg = rure_error_message(err);
|
||||
if (NULL == strstr(msg, "unclosed group")) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_compile_error] "
|
||||
"expected an 'unclosed parenthesis' error message, but "
|
||||
"got this instead: '%s'\n", msg);
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
rure_error_free(err);
|
||||
return passed;
|
||||
}
|
||||
|
||||
bool test_compile_error_size_limit() {
|
||||
bool passed = true;
|
||||
rure_options *opts = rure_options_new();
|
||||
rure_options_size_limit(opts, 0);
|
||||
rure_error *err = rure_error_new();
|
||||
rure *re = rure_compile((const uint8_t *)"\\w{100}", 8, 0, opts, err);
|
||||
if (re != NULL) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_compile_error_size_limit] "
|
||||
"expected NULL regex pointer, but got non-NULL pointer\n");
|
||||
}
|
||||
passed = false;
|
||||
rure_free(re);
|
||||
}
|
||||
const char *msg = rure_error_message(err);
|
||||
if (NULL == strstr(msg, "exceeds size")) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_compile_error] "
|
||||
"expected an 'exceeds size' error message, but "
|
||||
"got this instead: '%s'\n", msg);
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
rure_options_free(opts);
|
||||
rure_error_free(err);
|
||||
return passed;
|
||||
}
|
||||
|
||||
bool test_regex_set_matches() {
|
||||
|
||||
#define PAT_COUNT 6
|
||||
|
||||
bool passed = true;
|
||||
const char *patterns[] = {
|
||||
"foo", "barfoo", "\\w+", "\\d+", "foobar", "bar"
|
||||
};
|
||||
const size_t patterns_lengths[] = {
|
||||
3, 6, 3, 3, 6, 3
|
||||
};
|
||||
|
||||
rure_error *err = rure_error_new();
|
||||
rure_set *re = rure_compile_set((const uint8_t **) patterns,
|
||||
patterns_lengths,
|
||||
PAT_COUNT,
|
||||
0,
|
||||
NULL,
|
||||
err);
|
||||
if (re == NULL) {
|
||||
passed = false;
|
||||
goto done2;
|
||||
}
|
||||
|
||||
if (rure_set_len(re) != PAT_COUNT) {
|
||||
passed = false;
|
||||
goto done1;
|
||||
}
|
||||
|
||||
if (!rure_set_is_match(re, (const uint8_t *) "foobar", 6, 0)) {
|
||||
passed = false;
|
||||
goto done1;
|
||||
}
|
||||
|
||||
if (rure_set_is_match(re, (const uint8_t *) "", 0, 0)) {
|
||||
passed = false;
|
||||
goto done1;
|
||||
}
|
||||
|
||||
bool matches[PAT_COUNT];
|
||||
if (!rure_set_matches(re, (const uint8_t *) "foobar", 6, 0, matches)) {
|
||||
passed = false;
|
||||
goto done1;
|
||||
}
|
||||
|
||||
const bool match_target[] = {
|
||||
true, false, true, false, true, true
|
||||
};
|
||||
|
||||
int i;
|
||||
for (i = 0; i < PAT_COUNT; ++i) {
|
||||
if (matches[i] != match_target[i]) {
|
||||
passed = false;
|
||||
goto done1;
|
||||
}
|
||||
}
|
||||
|
||||
done1:
|
||||
rure_set_free(re);
|
||||
done2:
|
||||
rure_error_free(err);
|
||||
return passed;
|
||||
|
||||
#undef PAT_COUNT
|
||||
}
|
||||
|
||||
bool test_regex_set_match_start() {
|
||||
|
||||
#define PAT_COUNT 3
|
||||
|
||||
bool passed = true;
|
||||
const char *patterns[] = {
|
||||
"foo", "bar", "fooo"
|
||||
};
|
||||
const size_t patterns_lengths[] = {
|
||||
3, 3, 4
|
||||
};
|
||||
|
||||
rure_error *err = rure_error_new();
|
||||
rure_set *re = rure_compile_set((const uint8_t **) patterns,
|
||||
patterns_lengths,
|
||||
PAT_COUNT,
|
||||
0,
|
||||
NULL,
|
||||
err);
|
||||
if (re == NULL) {
|
||||
passed = false;
|
||||
goto done2;
|
||||
}
|
||||
|
||||
if (rure_set_len(re) != PAT_COUNT) {
|
||||
passed = false;
|
||||
goto done1;
|
||||
}
|
||||
|
||||
if (rure_set_is_match(re, (const uint8_t *)"foobiasdr", 7, 2)) {
|
||||
passed = false;
|
||||
goto done1;
|
||||
}
|
||||
|
||||
{
|
||||
bool matches[PAT_COUNT];
|
||||
if (!rure_set_matches(re, (const uint8_t *)"fooobar", 8, 0, matches)) {
|
||||
passed = false;
|
||||
goto done1;
|
||||
}
|
||||
|
||||
const bool match_target[] = {
|
||||
true, true, true
|
||||
};
|
||||
|
||||
int i;
|
||||
for (i = 0; i < PAT_COUNT; ++i) {
|
||||
if (matches[i] != match_target[i]) {
|
||||
passed = false;
|
||||
goto done1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
bool matches[PAT_COUNT];
|
||||
if (!rure_set_matches(re, (const uint8_t *)"fooobar", 7, 1, matches)) {
|
||||
passed = false;
|
||||
goto done1;
|
||||
}
|
||||
|
||||
const bool match_target[] = {
|
||||
false, true, false
|
||||
};
|
||||
|
||||
int i;
|
||||
for (i = 0; i < PAT_COUNT; ++i) {
|
||||
if (matches[i] != match_target[i]) {
|
||||
passed = false;
|
||||
goto done1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
done1:
|
||||
rure_set_free(re);
|
||||
done2:
|
||||
rure_error_free(err);
|
||||
return passed;
|
||||
|
||||
#undef PAT_COUNT
|
||||
}
|
||||
|
||||
bool test_regex_set_options() {
|
||||
|
||||
bool passed = true;
|
||||
rure_options *opts = rure_options_new();
|
||||
rure_options_size_limit(opts, 0);
|
||||
rure_error *err = rure_error_new();
|
||||
|
||||
const char *patterns[] = { "\\w{100}" };
|
||||
const size_t patterns_lengths[] = { 8 };
|
||||
|
||||
rure_set *re = rure_compile_set(
|
||||
(const uint8_t **) patterns, patterns_lengths, 1, 0, opts, err);
|
||||
if (re != NULL) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_compile_error_size_limit] "
|
||||
"expected NULL regex pointer, but got non-NULL pointer\n");
|
||||
}
|
||||
passed = false;
|
||||
rure_set_free(re);
|
||||
}
|
||||
const char *msg = rure_error_message(err);
|
||||
if (NULL == strstr(msg, "exceeds size")) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_compile_error] "
|
||||
"expected an 'exceeds size' error message, but "
|
||||
"got this instead: '%s'\n", msg);
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
rure_options_free(opts);
|
||||
rure_error_free(err);
|
||||
return passed;
|
||||
}
|
||||
|
||||
bool test_escape() {
|
||||
bool passed = true;
|
||||
|
||||
const char *pattern = "^[a-z]+.*$";
|
||||
const char *expected_escaped = "\\^\\[a\\-z\\]\\+\\.\\*\\$";
|
||||
|
||||
const char *escaped = rure_escape_must(pattern);
|
||||
if (!escaped) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_captures] expected escaped, but got no escaped\n");
|
||||
}
|
||||
passed = false;
|
||||
} else if (strcmp(escaped, expected_escaped) != 0) {
|
||||
if (DEBUG) {
|
||||
fprintf(stderr,
|
||||
"[test_captures] expected \"%s\", but got \"%s\"\n",
|
||||
expected_escaped, escaped);
|
||||
}
|
||||
passed = false;
|
||||
}
|
||||
rure_cstring_free((char *) escaped);
|
||||
return passed;
|
||||
}
|
||||
|
||||
/*
 * Runs a single test function and prints "PASSED: <name>" or
 * "FAILED: <name>" on stderr.
 *
 * On failure *passed is cleared; on success it is left untouched, so one
 * flag can accumulate the outcome of a whole series of tests.
 */
void run_test(bool (test)(), const char *name, bool *passed) {
    bool ok = test();

    if (ok) {
        fprintf(stderr, "PASSED: %s\n", name);
        return;
    }

    *passed = false;
    fprintf(stderr, "FAILED: %s\n", name);
}
|
||||
|
||||
int main() {
|
||||
bool passed = true;
|
||||
|
||||
run_test(test_is_match, "test_is_match", &passed);
|
||||
run_test(test_shortest_match, "test_shortest_match", &passed);
|
||||
run_test(test_find, "test_find", &passed);
|
||||
run_test(test_captures, "test_captures", &passed);
|
||||
run_test(test_iter, "test_iter", &passed);
|
||||
run_test(test_iter_capture_names, "test_iter_capture_names", &passed);
|
||||
run_test(test_flags, "test_flags", &passed);
|
||||
run_test(test_compile_error, "test_compile_error", &passed);
|
||||
run_test(test_compile_error_size_limit, "test_compile_error_size_limit",
|
||||
&passed);
|
||||
run_test(test_regex_set_matches, "test_regex_set_match", &passed);
|
||||
run_test(test_regex_set_options, "test_regex_set_options", &passed);
|
||||
run_test(test_regex_set_match_start, "test_regex_set_match_start",
|
||||
&passed);
|
||||
run_test(test_escape, "test_escape", &passed);
|
||||
|
||||
if (!passed) {
|
||||
exit(1);
|
||||
}
|
||||
return 0;
|
||||
}
|
9
third_party/rust/rure/examples/compile
vendored
Executable file
9
third_party/rust/rure/examples/compile
vendored
Executable file
@ -0,0 +1,9 @@
|
||||
#!/bin/sh

# Abort on the first failing command (-e) and echo each command before it
# runs (-x).
set -e
set -x

# N.B. Add `--release` flag to `cargo build` to make the example run faster.
cargo build --manifest-path ../Cargo.toml

# Build the example program, taking rure.h from ../include and the library
# from ../../target/debug.
gcc -O3 -DDEBUG -o iter iter.c -ansi -Wall -I../include -L../../target/debug -lrure
# If you're using librure.a, then you'll need to link other stuff:
# -lutil -ldl -lpthread -lgcc_s -lc -lm -lrt -lutil -lrure
|
99
third_party/rust/rure/examples/iter.c
vendored
Normal file
99
third_party/rust/rure/examples/iter.c
vendored
Normal file
@ -0,0 +1,99 @@
|
||||
/*
|
||||
* This example code shows how to iterate over all regex matches in a file,
|
||||
* emit the match location and print the contents of a capturing group.
|
||||
*/
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "rure.h"
|
||||
|
||||
int main() {
|
||||
/* Open a file and mmap it. */
|
||||
int fd = open("sherlock.txt", O_RDONLY);
|
||||
if (fd == -1) {
|
||||
perror("failed to open sherlock.txt");
|
||||
exit(1);
|
||||
}
|
||||
struct stat status;
|
||||
if (fstat(fd, &status) == -1) {
|
||||
perror("failed to stat sherlock.txt");
|
||||
exit(1);
|
||||
}
|
||||
if ((uintmax_t)status.st_size > SIZE_MAX) {
|
||||
perror("file too big");
|
||||
exit(1);
|
||||
}
|
||||
if (status.st_size == 0) {
|
||||
perror("file empty");
|
||||
exit(1);
|
||||
}
|
||||
size_t sherlock_len = (size_t)status.st_size;
|
||||
const uint8_t *sherlock = (const uint8_t *)mmap(
|
||||
NULL, status.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
|
||||
close(fd);
|
||||
if (sherlock == MAP_FAILED) {
|
||||
perror("could not mmap file");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Compile the regular expression. A more convenient routine,
|
||||
* rure_compile_must, is also available, which will abort the process if
|
||||
* and print an error message to stderr if the regex compilation fails.
|
||||
* We show the full gory details here as an example.
|
||||
*/
|
||||
const char *pattern = "(\\w+)\\s+Holmes";
|
||||
size_t pattern_len = strlen(pattern);
|
||||
rure_error *err = rure_error_new();
|
||||
rure *re = rure_compile((const uint8_t *)pattern, pattern_len,
|
||||
RURE_FLAG_UNICODE | RURE_FLAG_CASEI, NULL, err);
|
||||
if (NULL == re) {
|
||||
/* A null regex means compilation failed and an error exists. */
|
||||
printf("compilation of %s failed: %s\n",
|
||||
pattern, rure_error_message(err));
|
||||
rure_error_free(err);
|
||||
munmap((char*)sherlock, sherlock_len);
|
||||
exit(1);
|
||||
}
|
||||
rure_error_free(err);
|
||||
|
||||
/*
|
||||
* Create an iterator to find all successive non-overlapping matches.
|
||||
* For each match, we extract the location of the capturing group.
|
||||
*/
|
||||
rure_match group0 = {0};
|
||||
rure_match group1 = {0};
|
||||
rure_captures *caps = rure_captures_new(re);
|
||||
rure_iter *it = rure_iter_new(re);
|
||||
|
||||
while (rure_iter_next_captures(it, sherlock, sherlock_len, caps)) {
|
||||
/*
|
||||
* Get the location of the full match and the capturing group.
|
||||
* We know that both accesses are successful since the body of the
|
||||
* loop only executes if there is a match and both capture groups
|
||||
* must match in order for the entire regex to match.
|
||||
*
|
||||
* N.B. The zeroth group corresponds to the full match of the regex.
|
||||
*/
|
||||
rure_captures_at(caps, 0, &group0);
|
||||
rure_captures_at(caps, 1, &group1);
|
||||
printf("%.*s (match at: %zu, %zu)\n",
|
||||
(int)(group1.end - group1.start),
|
||||
sherlock + group1.start,
|
||||
group0.start, group0.end);
|
||||
}
|
||||
|
||||
/* Free all our resources. */
|
||||
munmap((char*)sherlock, sherlock_len);
|
||||
rure_captures_free(caps);
|
||||
rure_iter_free(it);
|
||||
rure_free(re);
|
||||
return 0;
|
||||
}
|
13052
third_party/rust/rure/examples/sherlock.txt
vendored
Normal file
13052
third_party/rust/rure/examples/sherlock.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
585
third_party/rust/rure/include/rure.h
vendored
Normal file
585
third_party/rust/rure/include/rure.h
vendored
Normal file
@ -0,0 +1,585 @@
|
||||
#ifndef _RURE_H
|
||||
#define _RURE_H
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* rure is the type of a compiled regular expression.
|
||||
*
|
||||
* An rure can be safely used from multiple threads simultaneously.
|
||||
*/
|
||||
typedef struct rure rure;
|
||||
|
||||
/*
|
||||
* rure_set is the type of a set of compiled regular expressions.
|
||||
*
|
||||
* An rure_set can be safely used from multiple threads simultaneously.
|
||||
*/
|
||||
typedef struct rure_set rure_set;
|
||||
|
||||
/*
|
||||
* rure_options is the set of non-flag configuration options for compiling
|
||||
* a regular expression. Currently, only two options are available: setting
|
||||
* the size limit of the compiled program and setting the size limit of the
|
||||
* cache of states that the DFA uses while searching.
|
||||
*
|
||||
* For most uses, the default settings will work fine, and NULL can be passed
|
||||
* wherever a *rure_options is expected.
|
||||
*/
|
||||
typedef struct rure_options rure_options;
|
||||
|
||||
/*
|
||||
* The flags listed below can be used in rure_compile to set the default
|
||||
* flags. All flags can otherwise be toggled in the expression itself using
|
||||
* standard syntax, e.g., `(?i)` turns case insensitive matching on and `(?-i)`
|
||||
* disables it.
|
||||
*/
|
||||
/* The case insensitive (i) flag. */
|
||||
#define RURE_FLAG_CASEI (1 << 0)
|
||||
/* The multi-line matching (m) flag. (^ and $ match new line boundaries.) */
|
||||
#define RURE_FLAG_MULTI (1 << 1)
|
||||
/* The any character (s) flag. (. matches new line.) */
|
||||
#define RURE_FLAG_DOTNL (1 << 2)
|
||||
/* The greedy swap (U) flag. (e.g., + is ungreedy and +? is greedy.) */
|
||||
#define RURE_FLAG_SWAP_GREED (1 << 3)
|
||||
/* The ignore whitespace (x) flag. */
|
||||
#define RURE_FLAG_SPACE (1 << 4)
|
||||
/* The Unicode (u) flag. */
|
||||
#define RURE_FLAG_UNICODE (1 << 5)
|
||||
/* The default set of flags enabled when no flags are set. */
|
||||
#define RURE_DEFAULT_FLAGS RURE_FLAG_UNICODE
|
||||
|
||||
/*
|
||||
* rure_match corresponds to the location of a single match in a haystack.
|
||||
*/
|
||||
typedef struct rure_match {
    /* Offset at which the match begins. */
    size_t start;
    /* Offset at which the match ends. */
    size_t end;
} rure_match;
|
||||
|
||||
/*
|
||||
* rure_captures represents storage for sub-capture locations of a match.
|
||||
*
|
||||
* Computing the capture groups of a match can carry a significant performance
|
||||
* penalty, so their use in the API is optional.
|
||||
*
|
||||
* An rure_captures value can be reused in multiple calls to rure_find_captures,
|
||||
* so long as it is used with the compiled regular expression that created
|
||||
* it.
|
||||
*
|
||||
* An rure_captures value may outlive its corresponding rure and can be freed
|
||||
* independently.
|
||||
*
|
||||
* It is not safe to use from multiple threads simultaneously.
|
||||
*/
|
||||
typedef struct rure_captures rure_captures;
|
||||
|
||||
/*
|
||||
* rure_iter is an iterator over successive non-overlapping matches in a
|
||||
* particular haystack.
|
||||
*
|
||||
* An rure_iter value may not outlive its corresponding rure and should be freed
|
||||
* before its corresponding rure is freed.
|
||||
*
|
||||
* It is not safe to use from multiple threads simultaneously.
|
||||
*/
|
||||
typedef struct rure_iter rure_iter;
|
||||
|
||||
/*
|
||||
* rure_iter_capture_names is an iterator over the list of capture group names
|
||||
* in this particular rure.
|
||||
*
|
||||
* An rure_iter_capture_names value may not outlive its corresponding rure,
|
||||
* and should be freed before its corresponding rure is freed.
|
||||
*
|
||||
* It is not safe to use from multiple threads simultaneously.
|
||||
*/
|
||||
typedef struct rure_iter_capture_names rure_iter_capture_names;
|
||||
|
||||
/*
|
||||
* rure_error is an error that caused compilation to fail.
|
||||
*
|
||||
* Most errors are syntax errors but an error can be returned if the compiled
|
||||
* regular expression would be too big.
|
||||
*
|
||||
* Whenever a function accepts an *rure_error, it is safe to pass NULL. (But
|
||||
* you will not get access to the error if one occurred.)
|
||||
*
|
||||
* It is not safe to use from multiple threads simultaneously.
|
||||
*/
|
||||
typedef struct rure_error rure_error;
|
||||
|
||||
/*
|
||||
* rure_compile_must compiles the given pattern into a regular expression. If
|
||||
* compilation fails for any reason, an error message is printed to stderr and
|
||||
* the process is aborted.
|
||||
*
|
||||
* The pattern given should be in UTF-8. For convenience, this accepts a C
|
||||
* string, which means the pattern cannot usefully contain NUL. If your pattern
|
||||
* may contain NUL, consider using a regular expression escape sequence, or
|
||||
* just use rure_compile.
|
||||
*
|
||||
* This uses RURE_DEFAULT_FLAGS.
|
||||
*
|
||||
* The compiled expression returned may be used from multiple threads
|
||||
* simultaneously.
|
||||
*/
|
||||
rure *rure_compile_must(const char *pattern);
|
||||
|
||||
/*
|
||||
* rure_compile compiles the given pattern into a regular expression. The
|
||||
* pattern must be valid UTF-8 and the length corresponds to the number of
|
||||
* bytes in the pattern.
|
||||
*
|
||||
* flags is a bitfield. Valid values are constants declared with prefix
|
||||
* RURE_FLAG_.
|
||||
*
|
||||
* options contains non-flag configuration settings. If it's NULL, default
|
||||
* settings are used. options may be freed immediately after a call to
|
||||
* rure_compile.
|
||||
*
|
||||
* error is set if there was a problem compiling the pattern (including if the
|
||||
* pattern is not valid UTF-8). If error is NULL, then no error information
|
||||
* is returned. In all cases, if an error occurs, NULL is returned.
|
||||
*
|
||||
* The compiled expression returned may be used from multiple threads
|
||||
* simultaneously.
|
||||
*/
|
||||
rure *rure_compile(const uint8_t *pattern, size_t length,
|
||||
uint32_t flags, rure_options *options,
|
||||
rure_error *error);
|
||||
|
||||
/*
|
||||
* rure_free frees the given compiled regular expression.
|
||||
*
|
||||
* This must be called at most once for any rure.
|
||||
*/
|
||||
void rure_free(rure *re);
|
||||
|
||||
/*
|
||||
* rure_is_match returns true if and only if re matches anywhere in haystack.
|
||||
*
|
||||
* haystack may contain arbitrary bytes, but ASCII compatible text is more
|
||||
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
|
||||
* length should be the number of bytes in haystack.
|
||||
*
|
||||
* start is the position at which to start searching. Note that setting the
|
||||
* start position is distinct from incrementing the pointer, since the regex
|
||||
* engine may look at bytes before the start position to determine match
|
||||
* information. For example, if the start position is greater than 0, then the
|
||||
* \A ("begin text") anchor can never match.
|
||||
*
|
||||
* rure_is_match should be preferred to rure_find since it may be faster.
|
||||
*
|
||||
* N.B. The performance of this search is not impacted by the presence of
|
||||
* capturing groups in your regular expression.
|
||||
*/
|
||||
bool rure_is_match(rure *re, const uint8_t *haystack, size_t length,
|
||||
size_t start);
|
||||
|
||||
/*
|
||||
* rure_find returns true if and only if re matches anywhere in haystack.
|
||||
* If a match is found, then its start and end offsets (in bytes) are set
|
||||
* on the match pointer given.
|
||||
*
|
||||
* haystack may contain arbitrary bytes, but ASCII compatible text is more
|
||||
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
|
||||
* length should be the number of bytes in haystack.
|
||||
*
|
||||
* start is the position at which to start searching. Note that setting the
|
||||
* start position is distinct from incrementing the pointer, since the regex
|
||||
* engine may look at bytes before the start position to determine match
|
||||
* information. For example, if the start position is greater than 0, then the
|
||||
* \A ("begin text") anchor can never match.
|
||||
*
|
||||
* rure_find should be preferred to rure_find_captures since it may be faster.
|
||||
*
|
||||
* N.B. The performance of this search is not impacted by the presence of
|
||||
* capturing groups in your regular expression.
|
||||
*/
|
||||
bool rure_find(rure *re, const uint8_t *haystack, size_t length,
|
||||
size_t start, rure_match *match);
|
||||
|
||||
/*
|
||||
* rure_find_captures returns true if and only if re matches anywhere in
|
||||
* haystack. If a match is found, then all of its capture locations are stored
|
||||
* in the captures pointer given.
|
||||
*
|
||||
* haystack may contain arbitrary bytes, but ASCII compatible text is more
|
||||
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
|
||||
* length should be the number of bytes in haystack.
|
||||
*
|
||||
* start is the position at which to start searching. Note that setting the
|
||||
* start position is distinct from incrementing the pointer, since the regex
|
||||
* engine may look at bytes before the start position to determine match
|
||||
* information. For example, if the start position is greater than 0, then the
|
||||
* \A ("begin text") anchor can never match.
|
||||
*
|
||||
* Only use this function if you specifically need access to capture locations.
|
||||
* It is not necessary to use this function just because your regular
|
||||
* expression contains capturing groups.
|
||||
*
|
||||
* Capture locations can be accessed using the rure_captures_* functions.
|
||||
*
|
||||
* N.B. The performance of this search can be impacted by the number of
|
||||
* capturing groups. If you're using this function, it may be beneficial to
|
||||
* use non-capturing groups (e.g., `(?:re)`) where possible.
|
||||
*/
|
||||
bool rure_find_captures(rure *re, const uint8_t *haystack, size_t length,
|
||||
size_t start, rure_captures *captures);
|
||||
|
||||
/*
|
||||
* rure_shortest_match returns true if and only if re matches anywhere in
|
||||
* haystack. If a match is found, then its end location is stored in the
|
||||
* pointer given. The end location is the place at which the regex engine
|
||||
* determined that a match exists, but may occur before the end of the proper
|
||||
* leftmost-first match.
|
||||
*
|
||||
* haystack may contain arbitrary bytes, but ASCII compatible text is more
|
||||
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
|
||||
* length should be the number of bytes in haystack.
|
||||
*
|
||||
* start is the position at which to start searching. Note that setting the
|
||||
* start position is distinct from incrementing the pointer, since the regex
|
||||
* engine may look at bytes before the start position to determine match
|
||||
* information. For example, if the start position is greater than 0, then the
|
||||
* \A ("begin text") anchor can never match.
|
||||
*
|
||||
* rure_shortest_match should be preferred to rure_find since it may be faster.
|
||||
*
|
||||
* N.B. The performance of this search is not impacted by the presence of
|
||||
* capturing groups in your regular expression.
|
||||
*/
|
||||
bool rure_shortest_match(rure *re, const uint8_t *haystack, size_t length,
|
||||
size_t start, size_t *end);
|
||||
|
||||
/*
|
||||
* rure_capture_name_index returns the capture index for the name given. If
|
||||
* no such named capturing group exists in re, then -1 is returned.
|
||||
*
|
||||
* The capture index may be used with rure_captures_at.
|
||||
*
|
||||
* This function never returns 0 since the first capture group always
|
||||
* corresponds to the entire match and is always unnamed.
|
||||
*/
|
||||
int32_t rure_capture_name_index(rure *re, const char *name);
|
||||
|
||||
/*
|
||||
* rure_iter_capture_names_new creates a new capture_names iterator.
|
||||
*
|
||||
* An iterator will report all successive capture group names of re.
|
||||
*/
|
||||
rure_iter_capture_names *rure_iter_capture_names_new(rure *re);
|
||||
|
||||
/*
|
||||
* rure_iter_capture_names_free frees the iterator given.
|
||||
*
|
||||
* It must be called at most once.
|
||||
*/
|
||||
void rure_iter_capture_names_free(rure_iter_capture_names *it);
|
||||
|
||||
/*
|
||||
* rure_iter_capture_names_next advances the iterator and returns true
|
||||
* if and only if another capture group name exists.
|
||||
*
|
||||
* The value of the capture group name is written to the provided pointer.
|
||||
*/
|
||||
bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name);
|
||||
|
||||
/*
|
||||
* rure_iter_new creates a new iterator.
|
||||
*
|
||||
* An iterator will report all successive non-overlapping matches of re.
|
||||
* When calling iterator functions, the same haystack and length must be
|
||||
* supplied to all invocations. (Strict pointer equality is, however, not
|
||||
* required.)
|
||||
*/
|
||||
rure_iter *rure_iter_new(rure *re);
|
||||
|
||||
/*
|
||||
* rure_iter_free frees the iterator given.
|
||||
*
|
||||
* It must be called at most once.
|
||||
*/
|
||||
void rure_iter_free(rure_iter *it);
|
||||
|
||||
/*
|
||||
* rure_iter_next advances the iterator and returns true if and only if a
|
||||
* match was found. If a match is found, then the match pointer is set with the
|
||||
* start and end location of the match, in bytes.
|
||||
*
|
||||
* If no match is found, then subsequent calls will return false indefinitely.
|
||||
*
|
||||
* haystack may contain arbitrary bytes, but ASCII compatible text is more
|
||||
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
|
||||
* length should be the number of bytes in haystack. The given haystack must
|
||||
* be logically equivalent to all other haystacks given to this iterator.
|
||||
*
|
||||
* rure_iter_next should be preferred to rure_iter_next_captures since it may
|
||||
* be faster.
|
||||
*
|
||||
* N.B. The performance of this search is not impacted by the presence of
|
||||
* capturing groups in your regular expression.
|
||||
*/
|
||||
bool rure_iter_next(rure_iter *it, const uint8_t *haystack, size_t length,
|
||||
rure_match *match);
|
||||
|
||||
/*
|
||||
* rure_iter_next_captures advances the iterator and returns true if and only if a
|
||||
* match was found. If a match is found, then all of its capture locations are
|
||||
* stored in the captures pointer given.
|
||||
*
|
||||
* If no match is found, then subsequent calls will return false indefinitely.
|
||||
*
|
||||
* haystack may contain arbitrary bytes, but ASCII compatible text is more
|
||||
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
|
||||
* length should be the number of bytes in haystack. The given haystack must
|
||||
* be logically equivalent to all other haystacks given to this iterator.
|
||||
*
|
||||
* Only use this function if you specifically need access to capture locations.
|
||||
* It is not necessary to use this function just because your regular
|
||||
* expression contains capturing groups.
|
||||
*
|
||||
* Capture locations can be accessed using the rure_captures_* functions.
|
||||
*
|
||||
* N.B. The performance of this search can be impacted by the number of
|
||||
* capturing groups. If you're using this function, it may be beneficial to
|
||||
* use non-capturing groups (e.g., `(?:re)`) where possible.
|
||||
*/
|
||||
bool rure_iter_next_captures(rure_iter *it,
|
||||
const uint8_t *haystack, size_t length,
|
||||
rure_captures *captures);
|
||||
|
||||
/*
|
||||
* rure_captures_new allocates storage for all capturing groups in re.
|
||||
*
|
||||
* An rure_captures value may be reused on subsequent calls to
|
||||
* rure_find_captures or rure_iter_next_captures.
|
||||
*
|
||||
* An rure_captures value may be freed independently of re, although any
|
||||
* particular rure_captures should be used only with the re given here.
|
||||
*
|
||||
* It is not safe to use an rure_captures value from multiple threads
|
||||
* simultaneously.
|
||||
*/
|
||||
rure_captures *rure_captures_new(rure *re);
|
||||
|
||||
/*
|
||||
* rure_captures_free frees the given captures.
|
||||
*
|
||||
* This must be called at most once.
|
||||
*/
|
||||
void rure_captures_free(rure_captures *captures);
|
||||
|
||||
/*
|
||||
* rure_captures_at returns true if and only if the capturing group at the
|
||||
* index given was part of a match. If so, the given match pointer is populated
|
||||
* with the start and end location (in bytes) of the capturing group.
|
||||
*
|
||||
* If no capture group with the index i exists, then false is
|
||||
* returned. (A capturing group exists if and only if i is less than
|
||||
* rure_captures_len(captures).)
|
||||
*
|
||||
* Note that index 0 corresponds to the full match.
|
||||
*/
|
||||
bool rure_captures_at(rure_captures *captures, size_t i, rure_match *match);
|
||||
|
||||
/*
|
||||
* rure_captures_len returns the number of capturing groups in the given
|
||||
* captures.
|
||||
*/
|
||||
size_t rure_captures_len(rure_captures *captures);
|
||||
|
||||
/*
|
||||
* rure_options_new allocates space for options.
|
||||
*
|
||||
* Options may be freed immediately after a call to rure_compile, but otherwise
|
||||
* may be freely used in multiple calls to rure_compile.
|
||||
*
|
||||
* It is not safe to set options from multiple threads simultaneously. It is
|
||||
* safe to call rure_compile from multiple threads simultaneously using the
|
||||
* same options pointer.
|
||||
*/
|
||||
rure_options *rure_options_new();
|
||||
|
||||
/*
|
||||
* rure_options_free frees the given options.
|
||||
*
|
||||
* This must be called at most once.
|
||||
*/
|
||||
void rure_options_free(rure_options *options);
|
||||
|
||||
/*
|
||||
* rure_options_size_limit sets the approximate size limit of the compiled
|
||||
* regular expression.
|
||||
*
|
||||
* This size limit roughly corresponds to the number of bytes occupied by a
|
||||
* single compiled program. If the program would exceed this number, then a
|
||||
* compilation error will be returned from rure_compile.
|
||||
*/
|
||||
void rure_options_size_limit(rure_options *options, size_t limit);
|
||||
|
||||
/*
|
||||
* rure_options_dfa_size_limit sets the approximate size of the cache used by
|
||||
* the DFA during search.
|
||||
*
|
||||
* This roughly corresponds to the number of bytes that the DFA will use while
|
||||
* searching.
|
||||
*
|
||||
* Note that this is a *per thread* limit. There is no way to set a global
|
||||
* limit. In particular, if a regular expression is used from multiple threads
|
||||
* simultaneously, then each thread may use up to the number of bytes
|
||||
* specified here.
|
||||
*/
|
||||
void rure_options_dfa_size_limit(rure_options *options, size_t limit);
|
||||
|
||||
/*
|
||||
* rure_compile_set compiles the given list of patterns into a single regular
|
||||
* expression which can be matched in a linear-scan. Each pattern in patterns
|
||||
* must be valid UTF-8 and the length of each pattern in patterns corresponds
|
||||
* to a byte length in patterns_lengths.
|
||||
*
|
||||
* The number of patterns to compile is specified by patterns_count. patterns
|
||||
* must contain at least this many entries.
|
||||
*
|
||||
* flags is a bitfield. Valid values are constants declared with prefix
|
||||
* RURE_FLAG_.
|
||||
*
|
||||
* options contains non-flag configuration settings. If it's NULL, default
|
||||
* settings are used. options may be freed immediately after a call to
|
||||
* rure_compile_set.
|
||||
*
|
||||
* error is set if there was a problem compiling the pattern.
|
||||
*
|
||||
* The compiled expression set returned may be used from multiple threads.
|
||||
*/
|
||||
rure_set *rure_compile_set(const uint8_t **patterns,
|
||||
const size_t *patterns_lengths,
|
||||
size_t patterns_count,
|
||||
uint32_t flags,
|
||||
rure_options *options,
|
||||
rure_error *error);
|
||||
|
||||
/*
|
||||
* rure_set_free frees the given compiled regular expression set.
|
||||
*
|
||||
* This must be called at most once for any rure_set.
|
||||
*/
|
||||
void rure_set_free(rure_set *re);
|
||||
|
||||
/*
|
||||
* rure_set_is_match returns true if and only if any regexes within the set
|
||||
* match anywhere in the haystack. Once a match has been located, the
|
||||
* matching engine will quit immediately.
|
||||
*
|
||||
* haystack may contain arbitrary bytes, but ASCII compatible text is more
|
||||
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
|
||||
* length should be the number of bytes in haystack.
|
||||
*
|
||||
* start is the position at which to start searching. Note that setting the
|
||||
* start position is distinct from incrementing the pointer, since the regex
|
||||
* engine may look at bytes before the start position to determine match
|
||||
* information. For example, if the start position is greater than 0, then the
|
||||
* \A ("begin text") anchor can never match.
|
||||
*/
|
||||
bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length,
|
||||
size_t start);
|
||||
|
||||
/*
|
||||
* rure_set_matches compares each regex in the set against the haystack and
|
||||
* modifies matches with the match result of each pattern. Match results are
|
||||
* ordered in the same way as the rure_set was compiled. For example,
|
||||
* index 0 of matches corresponds to the first pattern passed to
|
||||
* `rure_compile_set`.
|
||||
*
|
||||
* haystack may contain arbitrary bytes, but ASCII compatible text is more
|
||||
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
|
||||
* length should be the number of bytes in haystack.
|
||||
*
|
||||
* start is the position at which to start searching. Note that setting the
|
||||
* start position is distinct from incrementing the pointer, since the regex
|
||||
* engine may look at bytes before the start position to determine match
|
||||
* information. For example, if the start position is greater than 0, then the
|
||||
* \A ("begin text") anchor can never match.
|
||||
*
|
||||
* matches must point to an array whose length is greater than or equal to the number of patterns the
|
||||
* rure_set was compiled with.
|
||||
*
|
||||
* Only use this function if you specifically need to know which regexes
|
||||
* matched within the set. To determine if any of the regexes matched without
|
||||
* caring which, use rure_set_is_match.
|
||||
*/
|
||||
bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length,
|
||||
size_t start, bool *matches);
|
||||
|
||||
/*
|
||||
* rure_set_len returns the number of patterns rure_set was compiled with.
|
||||
*/
|
||||
size_t rure_set_len(rure_set *re);
|
||||
|
||||
/*
|
||||
* rure_error_new allocates space for an error.
|
||||
*
|
||||
* If error information is desired, then rure_error_new should be called
|
||||
* to create an rure_error pointer, and that pointer can be passed to
|
||||
* rure_compile. If an error occurred, then rure_compile will return NULL and
|
||||
* the error pointer will be set. A message can then be extracted.
|
||||
*
|
||||
* It is not safe to use errors from multiple threads simultaneously. An error
|
||||
* value may be reused on subsequent calls to rure_compile.
|
||||
*/
|
||||
rure_error *rure_error_new();
|
||||
|
||||
/*
|
||||
* rure_error_free frees the error given.
|
||||
*
|
||||
* This must be called at most once.
|
||||
*/
|
||||
void rure_error_free(rure_error *err);
|
||||
|
||||
/*
|
||||
* rure_error_message returns a NUL terminated string that describes the error
|
||||
* message.
|
||||
*
|
||||
* The pointer returned must not be freed. Instead, it will be freed when
|
||||
* rure_error_free is called. If err is used in subsequent calls to
|
||||
* rure_compile, then this pointer may change or become invalid.
|
||||
*/
|
||||
const char *rure_error_message(rure_error *err);
|
||||
|
||||
/*
|
||||
* rure_escape_must returns a NUL terminated string where all meta characters
|
||||
* have been escaped. If escaping fails for any reason, an error message is
|
||||
* printed to stderr and the process is aborted.
|
||||
*
|
||||
* The pattern given should be in UTF-8. For convenience, this accepts a C
|
||||
* string, which means the pattern cannot contain a NUL byte. These correspond
|
||||
* to the only two failure conditions of this function. That is, if the caller
|
||||
* guarantees that the given pattern is valid UTF-8 and does not contain a
|
||||
* NUL byte, then this is guaranteed to succeed (modulo out-of-memory errors).
|
||||
*
|
||||
* The pointer returned must not be freed directly. Instead, it should be freed
|
||||
* by calling rure_cstring_free.
|
||||
*/
|
||||
const char *rure_escape_must(const char *pattern);
|
||||
|
||||
/*
|
||||
* rure_cstring_free frees the string given.
|
||||
*
|
||||
* This must be called at most once per string.
|
||||
*/
|
||||
void rure_cstring_free(char *s);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
79
third_party/rust/rure/src/error.rs
vendored
Normal file
79
third_party/rust/rure/src/error.rs
vendored
Normal file
@ -0,0 +1,79 @@
|
||||
use std::ffi;
|
||||
use std::ffi::CString;
|
||||
use std::fmt;
|
||||
use std::str;
|
||||
|
||||
use libc::c_char;
|
||||
use regex;
|
||||
|
||||
/// Error value handed across the C API behind a raw `*mut Error` pointer.
#[derive(Debug)]
|
||||
pub struct Error {
|
||||
// Most recently rendered message. `rure_error_message` stores the C string
// here so the pointer it returns stays backed by this value.
message: Option<CString>,
|
||||
// The recorded failure, or `ErrorKind::None` when nothing went wrong.
kind: ErrorKind,
|
||||
}
|
||||
|
||||
/// The category of failure recorded in an `Error`.
#[derive(Debug)]
|
||||
pub enum ErrorKind {
|
||||
// No failure recorded; `Error::is_err` returns false for this variant.
None,
|
||||
// Input bytes failed UTF-8 validation (`str::from_utf8`).
Str(str::Utf8Error),
|
||||
// The regex builder rejected the pattern(s).
Regex(regex::Error),
|
||||
// `CString::new` failed because of an interior NUL byte.
Nul(ffi::NulError),
|
||||
}
|
||||
|
||||
impl Error {
|
||||
pub fn new(kind: ErrorKind) -> Error {
|
||||
Error { message: None, kind: kind }
|
||||
}
|
||||
|
||||
pub fn is_err(&self) -> bool {
|
||||
match self.kind {
|
||||
ErrorKind::None => false,
|
||||
ErrorKind::Str(_) | ErrorKind::Regex(_) | ErrorKind::Nul(_) => {
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self.kind {
|
||||
ErrorKind::None => write!(f, "no error"),
|
||||
ErrorKind::Str(ref e) => e.fmt(f),
|
||||
ErrorKind::Regex(ref e) => e.fmt(f),
|
||||
ErrorKind::Nul(ref e) => e.fmt(f),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Allocates an empty (`ErrorKind::None`) error on the heap and hands the raw
// pointer to the caller, who releases it with `rure_error_free`.
ffi_fn! {
    fn rure_error_new() -> *mut Error {
        let fresh = Box::new(Error::new(ErrorKind::None));
        Box::into_raw(fresh)
    }
}
|
||||
|
||||
// Takes back ownership of an error created by `rure_error_new` and drops it.
ffi_fn! {
    fn rure_error_free(err: *mut Error) {
        let owned = unsafe { Box::from_raw(err) };
        drop(owned);
    }
}
|
||||
|
||||
// Renders the error with `Display`, caches the resulting C string inside the
// error value, and returns a pointer to that cached string.
ffi_fn! {
    fn rure_error_message(err: *mut Error) -> *const c_char {
        let err = unsafe { &mut *err };
        let rendered = format!("{}", err);
        let cmsg = CString::new(rendered).unwrap_or_else(|nul_err| {
            // The rendered text contains a NUL (for instance because the
            // pattern echoed in the message had one): keep only the bytes
            // that precede it so that something can still be shown.
            let cut = nul_err.nul_position();
            let mut bytes = nul_err.into_vec();
            bytes.truncate(cut);
            CString::new(bytes).unwrap()
        });
        // The pointer targets the CString's heap buffer, which does not move
        // when the CString itself is moved into `err.message`.
        let p = cmsg.as_ptr();
        err.message = Some(cmsg);
        p
    }
}
|
7
third_party/rust/rure/src/lib.rs
vendored
Normal file
7
third_party/rust/rure/src/lib.rs
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
#[macro_use]
|
||||
mod macros;
|
||||
mod error;
|
||||
mod rure;
|
||||
|
||||
pub use crate::error::*;
|
||||
pub use crate::rure::*;
|
36
third_party/rust/rure/src/macros.rs
vendored
Normal file
36
third_party/rust/rure/src/macros.rs
vendored
Normal file
@ -0,0 +1,36 @@
|
||||
// Declares an exported, unmangled C ABI function whose body runs inside
// `catch_unwind`. If the body panics, the panic message is written to stderr
// and the process is aborted, so a panic never unwinds into the C caller.
//
// Accepted forms (a trailing comma after the last argument is optional):
//
//     ffi_fn! { fn name(arg: Ty, ...) -> Ret { body } }
//     ffi_fn! { fn name(arg: Ty, ...) { body } }
//
// Because the body is wrapped in a closure, a `return` inside it returns
// from that closure (yielding the function's result), not past the wrapper.
macro_rules! ffi_fn {
    // Function with an explicit return type.
    (fn $name:ident($($arg:ident: $arg_ty:ty),* $(,)?) -> $ret:ty $body:block) => {
        #[no_mangle]
        pub extern "C" fn $name($($arg: $arg_ty),*) -> $ret {
            // These imports live in the generated function on purpose: `use`
            // items are not hygienic, so `$body` can also refer to `io`,
            // `writeln!`'s `Write` trait and `abort` through them.
            use ::std::io::{self, Write};
            use ::std::panic::{self, AssertUnwindSafe};
            use ::libc::abort;
            match panic::catch_unwind(AssertUnwindSafe(move || $body)) {
                Ok(v) => v,
                Err(err) => {
                    // Panic payloads are usually `&str` or `String`; anything
                    // else cannot be rendered.
                    let msg = if let Some(&s) = err.downcast_ref::<&str>() {
                        s.to_owned()
                    } else if let Some(s) = err.downcast_ref::<String>() {
                        s.to_owned()
                    } else {
                        "UNABLE TO SHOW RESULT OF PANIC.".to_owned()
                    };
                    let _ = writeln!(
                        &mut io::stderr(),
                        "panic unwind caught, aborting: {:?}",
                        msg);
                    unsafe { abort() }
                }
            }
        }
    };
    // Function without a return type: forward to the arm above with `()`.
    (fn $name:ident($($arg:ident: $arg_ty:ty),* $(,)?) $body:block) => {
        ffi_fn!(fn $name($($arg: $arg_ty),*) -> () $body);
    };
}
|
629
third_party/rust/rure/src/rure.rs
vendored
Normal file
629
third_party/rust/rure/src/rure.rs
vendored
Normal file
@ -0,0 +1,629 @@
|
||||
use std::collections::HashMap;
|
||||
use std::ffi::{CStr, CString};
|
||||
use std::ops::Deref;
|
||||
use std::ptr;
|
||||
use std::slice;
|
||||
use std::str;
|
||||
|
||||
use libc::{c_char, size_t};
|
||||
use regex::bytes;
|
||||
|
||||
use crate::error::{Error, ErrorKind};
|
||||
|
||||
// Bit flags understood by the `flags` argument of `rure_compile` and
// `rure_compile_set`; each one switches on the matching builder option.
const RURE_FLAG_CASEI: u32 = 1 << 0;
|
||||
const RURE_FLAG_MULTI: u32 = 1 << 1;
|
||||
const RURE_FLAG_DOTNL: u32 = 1 << 2;
|
||||
const RURE_FLAG_SWAP_GREED: u32 = 1 << 3;
|
||||
const RURE_FLAG_SPACE: u32 = 1 << 4;
|
||||
const RURE_FLAG_UNICODE: u32 = 1 << 5;
|
||||
// Flags passed by `rure_compile_must`, which has no flags parameter.
const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE;
|
||||
|
||||
/// A compiled regex plus a name-to-index table for its named capture groups.
pub struct Regex {
|
||||
// The compiled byte-oriented regex; also reachable through `Deref`.
re: bytes::Regex,
|
||||
// Capture group name -> group index, filled in by `rure_compile` and read
// by `rure_capture_name_index`.
capture_names: HashMap<String, i32>,
|
||||
}
|
||||
|
||||
/// Non-flag compile settings forwarded to the regex builders.
pub struct Options {
|
||||
// Passed to the builder's `size_limit`; the `Default` impl uses 10 * (1 << 20).
size_limit: usize,
|
||||
// Passed to the builder's `dfa_size_limit`; the `Default` impl uses 2 * (1 << 20).
dfa_size_limit: usize,
|
||||
}
|
||||
|
||||
// The `RegexSet` is not exposed with option support or matching at an
|
||||
// arbitrary position with a crate just yet. To circumvent this, we use
|
||||
// the `Exec` structure directly.
// NOTE(review): the field below is a plain `bytes::RegexSet` and nothing in
// this file names `Exec`, so the remark above looks stale — confirm against
// upstream history.
|
||||
pub struct RegexSet {
|
||||
// The compiled set; also reachable through `Deref`.
re: bytes::RegexSet,
|
||||
}
|
||||
|
||||
/// C-layout pair of match offsets written by `rure_find`, `rure_iter_next`
/// and `rure_captures_at`.
#[repr(C)]
|
||||
pub struct rure_match {
|
||||
// Offset where the match begins.
pub start: size_t,
|
||||
// Offset where the match ends.
pub end: size_t,
|
||||
}
|
||||
|
||||
/// Capture group locations for one match; created by `rure_captures_new` and
/// filled in by `rure_find_captures` / `rure_iter_next_captures`.
pub struct Captures(bytes::Locations);
|
||||
|
||||
/// Cursor state for iterating over successive matches of one regex.
pub struct Iter {
|
||||
// Regex being iterated; stored as a raw pointer, so the iterator does not
// own it.
re: *const Regex,
|
||||
// Offset at which the next search starts.
last_end: usize,
|
||||
// End offset of the most recently reported match, used to skip an empty
// match that directly follows it.
last_match: Option<usize>,
|
||||
}
|
||||
|
||||
/// Iterator over a regex's capture group names, plus the C strings it has
/// handed out so far.
pub struct IterCaptureNames {
|
||||
// NOTE(review): the `'static` lifetime comes from dereferencing a raw
// pointer in `rure_iter_capture_names_new`; presumably the regex must
// outlive this iterator — confirm against the header contract.
capture_names: bytes::CaptureNames<'static>,
|
||||
// Every pointer returned by `rure_iter_capture_names_next`; they are
// released in `rure_iter_capture_names_free`.
name_ptrs: Vec<*mut c_char>,
|
||||
}
|
||||
|
||||
impl Deref for Regex {
|
||||
type Target = bytes::Regex;
|
||||
fn deref(&self) -> &bytes::Regex {
|
||||
&self.re
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for RegexSet {
|
||||
type Target = bytes::RegexSet;
|
||||
fn deref(&self) -> &bytes::RegexSet {
|
||||
&self.re
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Options {
|
||||
fn default() -> Options {
|
||||
Options { size_limit: 10 * (1 << 20), dfa_size_limit: 2 * (1 << 20) }
|
||||
}
|
||||
}
|
||||
|
||||
// Compiles a NUL-terminated pattern with the default flags and no options.
// On any compile error the error is printed to stderr and the process aborts.
// `writeln!`, `io` and `abort` are in scope via the `ffi_fn!` expansion.
ffi_fn! {
    fn rure_compile_must(pattern: *const c_char) -> *const Regex {
        let byte_len = unsafe { CStr::from_ptr(pattern) }.to_bytes().len();
        let mut failure = Error::new(ErrorKind::None);
        let compiled = rure_compile(
            pattern as *const u8,
            byte_len,
            RURE_DEFAULT_FLAGS,
            ptr::null(),
            &mut failure,
        );
        if failure.is_err() {
            let _ = writeln!(&mut io::stderr(), "{}", failure);
            let _ = writeln!(
                &mut io::stderr(), "aborting from rure_compile_must");
            unsafe { abort() }
        }
        compiled
    }
}
|
||||
|
||||
// Compiles `length` bytes at `pattern` into a heap-allocated `Regex`.
//
// The bytes must be valid UTF-8. `flags` is a bitfield of `RURE_FLAG_*`
// values; `options` may be null, in which case the builder's own limits are
// left untouched. On failure a null pointer is returned and, when `error` is
// non-null, it is overwritten with the reason.
ffi_fn! {
    fn rure_compile(
        pattern: *const u8,
        length: size_t,
        flags: u32,
        options: *const Options,
        error: *mut Error,
    ) -> *const Regex {
        let raw = unsafe { slice::from_raw_parts(pattern, length) };
        // Validate UTF-8 first, then build; either step may produce the
        // error kind that is reported below.
        let built = str::from_utf8(raw).map_err(ErrorKind::Str).and_then(|pat| {
            let mut builder = bytes::RegexBuilder::new(pat);
            if let Some(opts) = unsafe { options.as_ref() } {
                builder.size_limit(opts.size_limit);
                builder.dfa_size_limit(opts.dfa_size_limit);
            }
            let has = |bit: u32| (flags & bit) > 0;
            builder.case_insensitive(has(RURE_FLAG_CASEI));
            builder.multi_line(has(RURE_FLAG_MULTI));
            builder.dot_matches_new_line(has(RURE_FLAG_DOTNL));
            builder.swap_greed(has(RURE_FLAG_SWAP_GREED));
            builder.ignore_whitespace(has(RURE_FLAG_SPACE));
            builder.unicode(has(RURE_FLAG_UNICODE));
            builder.build().map_err(ErrorKind::Regex)
        });
        match built {
            Ok(re) => {
                // Record the index of every *named* group for later lookup.
                let capture_names: HashMap<String, i32> = re
                    .capture_names()
                    .enumerate()
                    .filter_map(|(i, name)| name.map(|n| (n.to_owned(), i as i32)))
                    .collect();
                Box::into_raw(Box::new(Regex { re, capture_names })) as *const Regex
            }
            Err(kind) => {
                if !error.is_null() {
                    unsafe { *error = Error::new(kind); }
                }
                ptr::null()
            }
        }
    }
}
|
||||
|
||||
// Takes back ownership of a regex returned by `rure_compile` and drops it.
ffi_fn! {
    fn rure_free(re: *const Regex) {
        let owned = unsafe { Box::from_raw(re as *mut Regex) };
        drop(owned);
    }
}
|
||||
|
||||
// Returns whether the regex matches anywhere in the `len` bytes at
// `haystack`, searching from offset `start`.
ffi_fn! {
    fn rure_is_match(
        re: *const Regex,
        haystack: *const u8,
        len: size_t,
        start: size_t,
    ) -> bool {
        let (regex, text) = unsafe {
            (&*re, slice::from_raw_parts(haystack, len))
        };
        regex.is_match_at(text, start)
    }
}
|
||||
|
||||
// Searches for a match from offset `start`. Returns true when one is found
// and, if `match_info` is non-null, writes the match's start and end to it.
ffi_fn! {
    fn rure_find(
        re: *const Regex,
        haystack: *const u8,
        len: size_t,
        start: size_t,
        match_info: *mut rure_match,
    ) -> bool {
        let regex = unsafe { &*re };
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        match regex.find_at(text, start) {
            None => false,
            Some(m) => {
                if let Some(out) = unsafe { match_info.as_mut() } {
                    out.start = m.start();
                    out.end = m.end();
                }
                true
            }
        }
    }
}
|
||||
|
||||
// Searches from offset `start` and records capture group locations into
// `captures`. Returns true when a match is found.
ffi_fn! {
    fn rure_find_captures(
        re: *const Regex,
        haystack: *const u8,
        len: size_t,
        start: size_t,
        captures: *mut Captures,
    ) -> bool {
        let regex = unsafe { &*re };
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        let locs = unsafe { &mut (*captures).0 };
        let found = regex.read_captures_at(locs, text, start);
        found.is_some()
    }
}
|
||||
|
||||
// Runs `shortest_match_at` from offset `start`. Returns true when a match is
// found and, if `end` is non-null, stores the reported end offset there.
ffi_fn! {
    fn rure_shortest_match(
        re: *const Regex,
        haystack: *const u8,
        len: size_t,
        start: size_t,
        end: *mut usize,
    ) -> bool {
        let regex = unsafe { &*re };
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        if let Some(stop) = regex.shortest_match_at(text, start) {
            if !end.is_null() {
                unsafe { *end = stop; }
            }
            true
        } else {
            false
        }
    }
}
|
||||
|
||||
// Looks up the index of the capture group called `name`. Returns -1 when the
// name is not valid UTF-8 or no group has that name.
ffi_fn! {
    fn rure_capture_name_index(
        re: *const Regex,
        name: *const c_char,
    ) -> i32 {
        let regex = unsafe { &*re };
        let key = match unsafe { CStr::from_ptr(name) }.to_str() {
            Ok(text) => text,
            Err(_) => return -1,
        };
        match regex.capture_names.get(key) {
            Some(&index) => index,
            None => -1,
        }
    }
}
|
||||
|
||||
// Creates a heap-allocated iterator over the regex's capture group names,
// starting with an empty list of handed-out name pointers.
ffi_fn! {
    fn rure_iter_capture_names_new(
        re: *const Regex,
    ) -> *mut IterCaptureNames {
        let regex = unsafe { &*re };
        let state = IterCaptureNames {
            name_ptrs: Vec::new(),
            capture_names: regex.re.capture_names(),
        };
        Box::into_raw(Box::new(state))
    }
}
|
||||
|
||||
// Frees every C string this iterator handed out, then the iterator itself.
ffi_fn! {
    fn rure_iter_capture_names_free(it: *mut IterCaptureNames) {
        unsafe {
            let mut owned = Box::from_raw(it);
            // Release the strings newest-first.
            for raw in owned.name_ptrs.drain(..).rev() {
                drop(CString::from_raw(raw));
            }
            drop(owned);
        }
    }
}
|
||||
|
||||
// Advances the capture-name iterator. On success a freshly allocated C string
// is stored in `*capture_name` (empty for an unnamed group) and true is
// returned. Returns false when `capture_name` is null, the iterator is
// exhausted, or the name cannot be turned into a C string.
ffi_fn! {
    fn rure_iter_capture_names_next(
        it: *mut IterCaptureNames,
        capture_name: *mut *mut c_char,
    ) -> bool {
        if capture_name.is_null() {
            return false;
        }

        let it = unsafe { &mut *it };
        let name = match it.capture_names.next() {
            // No capture groups left.
            None => return false,
            // Unnamed groups are reported as the empty string.
            Some(maybe_name) => maybe_name.unwrap_or(""),
        };

        match CString::new(name.as_bytes()) {
            Err(_) => false,
            Ok(owned) => {
                let raw = owned.into_raw();
                // Remember the pointer so it can be freed with the iterator.
                it.name_ptrs.push(raw);
                unsafe { *capture_name = raw; }
                true
            }
        }
    }
}
|
||||
|
||||
// Creates a heap-allocated match iterator for `re`, positioned at offset 0
// with no previous match recorded.
ffi_fn! {
    fn rure_iter_new(
        re: *const Regex,
    ) -> *mut Iter {
        let state = Iter { last_match: None, last_end: 0, re };
        Box::into_raw(Box::new(state))
    }
}
|
||||
|
||||
// Takes back ownership of an iterator from `rure_iter_new` and drops it.
ffi_fn! {
    fn rure_iter_free(it: *mut Iter) {
        let owned = unsafe { Box::from_raw(it) };
        drop(owned);
    }
}
|
||||
|
||||
// Advances the iterator to the next match in the `len` bytes at `haystack`.
//
// Returns false when there are no more matches. On success returns true and,
// if `match_info` is non-null, writes the match's start and end offsets.
// An empty match that directly follows the previously reported match is
// skipped.
ffi_fn! {
    fn rure_iter_next(
        it: *mut Iter,
        haystack: *const u8,
        len: size_t,
        match_info: *mut rure_match,
    ) -> bool {
        let it = unsafe { &mut *it };
        let re = unsafe { &*it.re };
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        loop {
            if it.last_end > text.len() {
                return false;
            }
            let (s, e) = match re.find_at(text, it.last_end) {
                None => return false,
                Some(m) => (m.start(), m.end()),
            };
            if s == e {
                // Empty match: resume strictly after it. Stepping the cursor
                // by a single byte instead would re-find this same empty
                // match once per byte between the cursor and `e`, which is
                // quadratic and (with a recursive retry) can exhaust the
                // stack on long haystacks.
                it.last_end = e + 1;
                // Don't accept empty matches immediately following a match.
                // Just move on to the next match.
                if Some(e) == it.last_match {
                    continue;
                }
            } else {
                it.last_end = e;
            }
            it.last_match = Some(e);
            if !match_info.is_null() {
                unsafe {
                    (*match_info).start = s;
                    (*match_info).end = e;
                }
            }
            return true;
        }
    }
}
|
||||
|
||||
// Advances the iterator to the next match in the `len` bytes at `haystack`,
// recording capture group locations into `captures`.
//
// Returns false when there are no more matches, true otherwise. An empty
// match that directly follows the previously reported match is skipped.
ffi_fn! {
    fn rure_iter_next_captures(
        it: *mut Iter,
        haystack: *const u8,
        len: size_t,
        captures: *mut Captures,
    ) -> bool {
        let it = unsafe { &mut *it };
        let re = unsafe { &*it.re };
        let slots = unsafe { &mut (*captures).0 };
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        loop {
            if it.last_end > text.len() {
                return false;
            }
            let (s, e) = match re.read_captures_at(slots, text, it.last_end) {
                None => return false,
                Some(m) => (m.start(), m.end()),
            };
            if s == e {
                // Empty match: resume strictly after it. Stepping the cursor
                // by a single byte instead would re-find this same empty
                // match once per byte between the cursor and `e`, which is
                // quadratic and (with a recursive retry) can exhaust the
                // stack on long haystacks.
                it.last_end = e + 1;
                // Don't accept empty matches immediately following a match.
                // Just move on to the next match.
                if Some(e) == it.last_match {
                    continue;
                }
            } else {
                it.last_end = e;
            }
            it.last_match = Some(e);
            return true;
        }
    }
}
|
||||
|
||||
// Allocates capture location storage obtained from `re.locations()` and
// returns it as a raw pointer; release it with `rure_captures_free`.
ffi_fn! {
    fn rure_captures_new(re: *const Regex) -> *mut Captures {
        let regex = unsafe { &*re };
        let boxed = Box::new(Captures(regex.locations()));
        Box::into_raw(boxed)
    }
}
|
||||
|
||||
// Takes back ownership of captures from `rure_captures_new` and drops them.
ffi_fn! {
    fn rure_captures_free(captures: *const Captures) {
        let owned = unsafe { Box::from_raw(captures as *mut Captures) };
        drop(owned);
    }
}
|
||||
|
||||
// Returns true when `locs.pos(i)` yields a location for group `i`; in that
// case, if `match_info` is non-null, the group's start and end are written
// to it. Returns false otherwise.
ffi_fn! {
    fn rure_captures_at(
        captures: *const Captures,
        i: size_t,
        match_info: *mut rure_match,
    ) -> bool {
        let locs = unsafe { &(*captures).0 };
        if let Some((start, end)) = locs.pos(i) {
            if let Some(out) = unsafe { match_info.as_mut() } {
                out.start = start;
                out.end = end;
            }
            true
        } else {
            false
        }
    }
}
|
||||
|
||||
// Returns the length reported by the wrapped capture locations.
ffi_fn! {
    fn rure_captures_len(captures: *const Captures) -> size_t {
        let locs = unsafe { &(*captures).0 };
        locs.len()
    }
}
|
||||
|
||||
// Allocates an `Options` holding the default limits and returns it as a raw
// pointer; release it with `rure_options_free`.
ffi_fn! {
    fn rure_options_new() -> *mut Options {
        let defaults = Box::new(Options::default());
        Box::into_raw(defaults)
    }
}
|
||||
|
||||
// Takes back ownership of options from `rure_options_new` and drops them.
ffi_fn! {
    fn rure_options_free(options: *mut Options) {
        let owned = unsafe { Box::from_raw(options) };
        drop(owned);
    }
}
|
||||
|
||||
// Stores `limit` as the compiled-program size limit in `options`.
ffi_fn! {
    fn rure_options_size_limit(options: *mut Options, limit: size_t) {
        unsafe { (*options).size_limit = limit; }
    }
}
|
||||
|
||||
// Stores `limit` as the DFA cache size limit in `options`.
ffi_fn! {
    fn rure_options_dfa_size_limit(options: *mut Options, limit: size_t) {
        unsafe { (*options).dfa_size_limit = limit; }
    }
}
|
||||
|
||||
// Compiles `patterns_count` patterns into a heap-allocated `RegexSet`.
//
// `patterns[i]` points at `patterns_lengths[i]` bytes, which must be valid
// UTF-8. `flags` is a bitfield of `RURE_FLAG_*` values; `options` may be
// null. On failure a null pointer is returned and, when `error` is non-null,
// it is overwritten with the reason (the first invalid pattern wins).
ffi_fn! {
    fn rure_compile_set(
        patterns: *const *const u8,
        patterns_lengths: *const size_t,
        patterns_count: size_t,
        flags: u32,
        options: *const Options,
        error: *mut Error
    ) -> *const RegexSet {
        let (raw_pats, raw_lens) = unsafe {
            (
                slice::from_raw_parts(patterns, patterns_count),
                slice::from_raw_parts(patterns_lengths, patterns_count)
            )
        };

        // Decode every pattern, stopping at the first one that is not UTF-8.
        let decoded = raw_pats
            .iter()
            .zip(raw_lens)
            .map(|(&ptr, &n)| {
                str::from_utf8(unsafe { slice::from_raw_parts(ptr, n) })
            })
            .collect::<Result<Vec<_>, _>>();

        let built = decoded.map_err(ErrorKind::Str).and_then(|pats| {
            let mut builder = bytes::RegexSetBuilder::new(pats);
            if let Some(opts) = unsafe { options.as_ref() } {
                builder.size_limit(opts.size_limit);
                builder.dfa_size_limit(opts.dfa_size_limit);
            }
            let has = |bit: u32| (flags & bit) > 0;
            builder.case_insensitive(has(RURE_FLAG_CASEI));
            builder.multi_line(has(RURE_FLAG_MULTI));
            builder.dot_matches_new_line(has(RURE_FLAG_DOTNL));
            builder.swap_greed(has(RURE_FLAG_SWAP_GREED));
            builder.ignore_whitespace(has(RURE_FLAG_SPACE));
            builder.unicode(has(RURE_FLAG_UNICODE));
            builder.build().map_err(ErrorKind::Regex)
        });

        match built {
            Ok(set) => {
                Box::into_raw(Box::new(RegexSet { re: set })) as *const RegexSet
            }
            Err(kind) => {
                if !error.is_null() {
                    unsafe { *error = Error::new(kind); }
                }
                ptr::null()
            }
        }
    }
}
|
||||
|
||||
// Takes back ownership of a set from `rure_compile_set` and drops it.
ffi_fn! {
    fn rure_set_free(re: *const RegexSet) {
        let owned = unsafe { Box::from_raw(re as *mut RegexSet) };
        drop(owned);
    }
}
|
||||
|
||||
// Returns whether any regex in the set matches the `len` bytes at
// `haystack`, searching from offset `start`.
ffi_fn! {
    fn rure_set_is_match(
        re: *const RegexSet,
        haystack: *const u8,
        len: size_t,
        start: size_t
    ) -> bool {
        let (set, text) = unsafe {
            (&*re, slice::from_raw_parts(haystack, len))
        };
        set.is_match_at(text, start)
    }
}
|
||||
|
||||
// Tests every regex in the set against the haystack from offset `start` and
// records one flag per pattern in `matches`, which is treated as an array of
// `re.len()` bools. Returns the result of `read_matches_at`.
ffi_fn! {
    fn rure_set_matches(
        re: *const RegexSet,
        haystack: *const u8,
        len: size_t,
        start: size_t,
        matches: *mut bool
    ) -> bool {
        let set = unsafe { &*re };
        let flags_out = unsafe {
            slice::from_raw_parts_mut(matches, set.len())
        };
        let text = unsafe { slice::from_raw_parts(haystack, len) };

        // read_matches_at isn't guaranteed to set non-matches to false
        flags_out.iter_mut().for_each(|slot| *slot = false);
        set.read_matches_at(flags_out, text, start)
    }
}
|
||||
|
||||
// Returns the number of patterns in the set.
ffi_fn! {
    fn rure_set_len(re: *const RegexSet) -> size_t {
        let set = unsafe { &*re };
        set.len()
    }
}
|
||||
|
||||
// Escapes a NUL-terminated pattern via `rure_escape`. If escaping reports an
// error, the error is printed to stderr and the process aborts.
// `writeln!`, `io` and `abort` are in scope via the `ffi_fn!` expansion.
ffi_fn! {
    fn rure_escape_must(pattern: *const c_char) -> *const c_char {
        let byte_len = unsafe { CStr::from_ptr(pattern) }.to_bytes().len();
        let mut failure = Error::new(ErrorKind::None);
        let escaped = rure_escape(pattern as *const u8, byte_len, &mut failure);
        if failure.is_err() {
            let _ = writeln!(&mut io::stderr(), "{}", failure);
            let _ = writeln!(
                &mut io::stderr(), "aborting from rure_escape_must");
            unsafe { abort() }
        }
        escaped
    }
}
|
||||
|
||||
/// A helper function that implements fallible escaping in a way that returns
|
||||
/// an error if escaping failed.
|
||||
///
|
||||
/// This should ideally be exposed, but it needs API design work. In
|
||||
/// particular, this should not return a C string, but a `const uint8_t *`
|
||||
/// instead, since it may contain a NUL byte.
|
||||
fn rure_escape(
|
||||
pattern: *const u8,
|
||||
length: size_t,
|
||||
error: *mut Error,
|
||||
) -> *const c_char {
|
||||
let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) };
|
||||
let str_pat = match str::from_utf8(pat) {
|
||||
Ok(val) => val,
|
||||
Err(err) => unsafe {
|
||||
if !error.is_null() {
|
||||
*error = Error::new(ErrorKind::Str(err));
|
||||
}
|
||||
return ptr::null();
|
||||
},
|
||||
};
|
||||
let esc_pat = regex::escape(str_pat);
|
||||
let c_esc_pat = match CString::new(esc_pat) {
|
||||
Ok(val) => val,
|
||||
Err(err) => unsafe {
|
||||
if !error.is_null() {
|
||||
*error = Error::new(ErrorKind::Nul(err));
|
||||
}
|
||||
return ptr::null();
|
||||
},
|
||||
};
|
||||
c_esc_pat.into_raw() as *const c_char
|
||||
}
|
||||
|
||||
// Rebuilds the `CString` behind `s` (as produced by `CString::into_raw`) and
// drops it, releasing the allocation.
ffi_fn! {
    fn rure_cstring_free(s: *mut c_char) {
        let owned = unsafe { CString::from_raw(s) };
        drop(owned);
    }
}
|
7
third_party/rust/rure/test
vendored
Executable file
7
third_party/rust/rure/test
vendored
Executable file
@ -0,0 +1,7 @@
|
||||
#!/bin/sh
|
||||
|
||||
set -e
|
||||
|
||||
cargo build --verbose
|
||||
(cd ctest && ./compile && LD_LIBRARY_PATH=../../target/debug ./test)
|
||||
(cd examples && ./compile && LD_LIBRARY_PATH=../../target/debug ./iter)
|
@ -46,3 +46,9 @@ if CONFIG["CPU_ARCH"] != "x86":
|
||||
|
||||
if CONFIG["MOZ_BITS_DOWNLOAD"]:
|
||||
RUST_TESTS += ["bits_client"]
|
||||
|
||||
# Export the `rure` crate's included .h file. The symbols defined in that file
|
||||
# will be exported from the `gkrust-shared` crate.
|
||||
EXPORTS += [
|
||||
"/third_party/rust/rure/include/rure.h",
|
||||
]
|
||||
|
@ -67,6 +67,7 @@ unic-langid-ffi = { path = "../../../../intl/locale/rust/unic-langid-ffi" }
|
||||
fluent-langneg = { version = "0.13", features = ["cldr"] }
|
||||
fluent-langneg-ffi = { path = "../../../../intl/locale/rust/fluent-langneg-ffi" }
|
||||
regex-ffi = { path = "../../../components/regex-ffi" }
|
||||
rure = "0.2.2"
|
||||
rust_minidump_writer_linux = { path = "../../../crashreporter/rust_minidump_writer_linux", optional = true }
|
||||
gecko-profiler = { path = "../../../../tools/profiler/rust-api"}
|
||||
midir_impl = { path = "../../../../dom/midi/midir_impl", optional = true }
|
||||
|
@ -74,6 +74,7 @@ extern crate fluent;
|
||||
extern crate fluent_ffi;
|
||||
|
||||
extern crate regex_ffi;
|
||||
extern crate rure;
|
||||
|
||||
extern crate fluent_fallback;
|
||||
extern crate l10nregistry_ffi;
|
||||
|
Loading…
Reference in New Issue
Block a user