mirror of
https://github.com/openharmony/third_party_re2.git
synced 2026-07-01 09:20:39 -04:00
re2 解压缩包
Signed-off-by:ganchuantao1<ganchuantao1@huawei.com> Signed-off-by: ganchuantao1 <ganchuantao1@huawei.com>
This commit is contained in:
+438
@@ -0,0 +1,438 @@
|
||||
# Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Bazel (http://bazel.build/) BUILD file for RE2.
|
||||
|
||||
licenses(["notice"])
|
||||
|
||||
exports_files(["LICENSE"])
|
||||
|
||||
cc_library(
|
||||
name = "re2",
|
||||
srcs = [
|
||||
"re2/bitmap256.cc",
|
||||
"re2/bitmap256.h",
|
||||
"re2/bitstate.cc",
|
||||
"re2/compile.cc",
|
||||
"re2/dfa.cc",
|
||||
"re2/filtered_re2.cc",
|
||||
"re2/mimics_pcre.cc",
|
||||
"re2/nfa.cc",
|
||||
"re2/onepass.cc",
|
||||
"re2/parse.cc",
|
||||
"re2/perl_groups.cc",
|
||||
"re2/pod_array.h",
|
||||
"re2/prefilter.cc",
|
||||
"re2/prefilter.h",
|
||||
"re2/prefilter_tree.cc",
|
||||
"re2/prefilter_tree.h",
|
||||
"re2/prog.cc",
|
||||
"re2/prog.h",
|
||||
"re2/re2.cc",
|
||||
"re2/regexp.cc",
|
||||
"re2/regexp.h",
|
||||
"re2/set.cc",
|
||||
"re2/simplify.cc",
|
||||
"re2/sparse_array.h",
|
||||
"re2/sparse_set.h",
|
||||
"re2/tostring.cc",
|
||||
"re2/unicode_casefold.cc",
|
||||
"re2/unicode_casefold.h",
|
||||
"re2/unicode_groups.cc",
|
||||
"re2/unicode_groups.h",
|
||||
"re2/walker-inl.h",
|
||||
"util/rune.cc",
|
||||
"util/strutil.cc",
|
||||
"util/strutil.h",
|
||||
"util/utf.h",
|
||||
],
|
||||
hdrs = [
|
||||
"re2/filtered_re2.h",
|
||||
"re2/re2.h",
|
||||
"re2/set.h",
|
||||
"re2/stringpiece.h",
|
||||
],
|
||||
copts = select({
|
||||
# WebAssembly support for threads is... fraught at every level.
|
||||
"@platforms//cpu:wasm32": [],
|
||||
"@platforms//cpu:wasm64": [],
|
||||
"@platforms//os:emscripten": [],
|
||||
"@platforms//os:wasi": [],
|
||||
"@platforms//os:windows": [],
|
||||
"//conditions:default": ["-pthread"],
|
||||
}),
|
||||
linkopts = select({
|
||||
# macOS doesn't need `-pthread' when linking and it appears that
|
||||
# older versions of Clang will warn about the unused command line
|
||||
# argument, so just don't pass it.
|
||||
"@platforms//os:macos": [],
|
||||
# WebAssembly support for threads is... fraught at every level.
|
||||
"@platforms//cpu:wasm32": [],
|
||||
"@platforms//cpu:wasm64": [],
|
||||
"@platforms//os:emscripten": [],
|
||||
"@platforms//os:wasi": [],
|
||||
"@platforms//os:windows": [],
|
||||
"//conditions:default": ["-pthread"],
|
||||
}),
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
"@abseil-cpp//absl/base",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/container:fixed_array",
|
||||
"@abseil-cpp//absl/container:flat_hash_map",
|
||||
"@abseil-cpp//absl/container:flat_hash_set",
|
||||
"@abseil-cpp//absl/container:inlined_vector",
|
||||
"@abseil-cpp//absl/hash",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@abseil-cpp//absl/strings",
|
||||
"@abseil-cpp//absl/strings:str_format",
|
||||
"@abseil-cpp//absl/synchronization",
|
||||
"@abseil-cpp//absl/types:optional",
|
||||
"@abseil-cpp//absl/types:span",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "testing",
|
||||
testonly = 1,
|
||||
srcs = [
|
||||
"re2/testing/backtrack.cc",
|
||||
"re2/testing/dump.cc",
|
||||
"re2/testing/exhaustive_tester.cc",
|
||||
"re2/testing/null_walker.cc",
|
||||
"re2/testing/regexp_generator.cc",
|
||||
"re2/testing/string_generator.cc",
|
||||
"re2/testing/tester.cc",
|
||||
"util/pcre.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"re2/testing/exhaustive_tester.h",
|
||||
"re2/testing/regexp_generator.h",
|
||||
"re2/testing/string_generator.h",
|
||||
"re2/testing/tester.h",
|
||||
"util/malloc_counter.h",
|
||||
"util/pcre.h",
|
||||
|
||||
# Exposed for testing only.
|
||||
"re2/bitmap256.h",
|
||||
"re2/pod_array.h",
|
||||
"re2/prefilter.h",
|
||||
"re2/prefilter_tree.h",
|
||||
"re2/prog.h",
|
||||
"re2/regexp.h",
|
||||
"re2/sparse_array.h",
|
||||
"re2/sparse_set.h",
|
||||
"re2/unicode_casefold.h",
|
||||
"re2/unicode_groups.h",
|
||||
"re2/walker-inl.h",
|
||||
"util/strutil.h",
|
||||
"util/utf.h",
|
||||
],
|
||||
visibility = [":__subpackages__"],
|
||||
deps = [
|
||||
":re2",
|
||||
"@abseil-cpp//absl/base",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/container:flat_hash_set",
|
||||
"@abseil-cpp//absl/flags:flag",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@abseil-cpp//absl/strings",
|
||||
"@abseil-cpp//absl/strings:str_format",
|
||||
"@googletest//:gtest",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "charclass_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/charclass_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/strings:str_format",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "compile_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/compile_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@abseil-cpp//absl/strings",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "filtered_re2_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/filtered_re2_test.cc"],
|
||||
deps = [
|
||||
":re2",
|
||||
":testing",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "mimics_pcre_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/mimics_pcre_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "parse_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/parse_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "possible_match_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/possible_match_test.cc"],
|
||||
deps = [
|
||||
":re2",
|
||||
":testing",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@abseil-cpp//absl/strings",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "re2_arg_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/re2_arg_test.cc"],
|
||||
deps = [
|
||||
":re2",
|
||||
":testing",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@abseil-cpp//absl/types:optional",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "re2_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/re2_test.cc"],
|
||||
deps = [
|
||||
":re2",
|
||||
":testing",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@abseil-cpp//absl/strings",
|
||||
"@abseil-cpp//absl/strings:str_format",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "regexp_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/regexp_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "required_prefix_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/required_prefix_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "search_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/search_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "set_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/set_test.cc"],
|
||||
deps = [
|
||||
":re2",
|
||||
":testing",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "simplify_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/simplify_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "string_generator_test",
|
||||
size = "small",
|
||||
srcs = ["re2/testing/string_generator_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@abseil-cpp//absl/strings",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "dfa_test",
|
||||
size = "large",
|
||||
srcs = ["re2/testing/dfa_test.cc"],
|
||||
deps = [
|
||||
":re2",
|
||||
":testing",
|
||||
"@abseil-cpp//absl/base:core_headers",
|
||||
"@abseil-cpp//absl/flags:flag",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@abseil-cpp//absl/strings",
|
||||
"@abseil-cpp//absl/strings:str_format",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "exhaustive1_test",
|
||||
size = "large",
|
||||
srcs = ["re2/testing/exhaustive1_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "exhaustive2_test",
|
||||
size = "large",
|
||||
srcs = ["re2/testing/exhaustive2_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "exhaustive3_test",
|
||||
size = "large",
|
||||
srcs = ["re2/testing/exhaustive3_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "exhaustive_test",
|
||||
size = "large",
|
||||
srcs = ["re2/testing/exhaustive_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "random_test",
|
||||
size = "large",
|
||||
srcs = ["re2/testing/random_test.cc"],
|
||||
deps = [
|
||||
":testing",
|
||||
"@abseil-cpp//absl/flags:flag",
|
||||
"@abseil-cpp//absl/strings:str_format",
|
||||
"@googletest//:gtest",
|
||||
"@googletest//:gtest_main",
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "regexp_benchmark",
|
||||
testonly = 1,
|
||||
srcs = ["re2/testing/regexp_benchmark.cc"],
|
||||
deps = [
|
||||
":re2",
|
||||
":testing",
|
||||
"@abseil-cpp//absl/container:flat_hash_map",
|
||||
"@abseil-cpp//absl/flags:flag",
|
||||
"@abseil-cpp//absl/log:absl_check",
|
||||
"@abseil-cpp//absl/log:absl_log",
|
||||
"@abseil-cpp//absl/strings",
|
||||
"@abseil-cpp//absl/strings:str_format",
|
||||
"@abseil-cpp//absl/synchronization",
|
||||
"@google_benchmark//:benchmark_main",
|
||||
],
|
||||
)
|
||||
@@ -16,10 +16,7 @@ import("//build/ohos.gni")
|
||||
THIRDPARTY_RE2_SUBSYS_NAME = "thirdparty"
|
||||
THIRDPARTY_RE2_PART_NAME = "re2"
|
||||
|
||||
libre2_path = rebase_path("//third_party/re2")
|
||||
exec_script("install.sh", [ "$libre2_path" ])
|
||||
|
||||
RE2_DIR = rebase_path("//third_party/re2/re2")
|
||||
RE2_DIR = rebase_path("//third_party/re2")
|
||||
|
||||
config("re2_public_config") {
|
||||
include_dirs = [ "${RE2_DIR}/" ]
|
||||
|
||||
+268
@@ -0,0 +1,268 @@
|
||||
# Copyright 2015 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# https://github.com/google/oss-policies-info/blob/main/foundational-cxx-support-matrix.md
|
||||
cmake_minimum_required(VERSION 3.13)
|
||||
|
||||
project(RE2 CXX)
|
||||
include(CMakePackageConfigHelpers)
|
||||
include(CTest)
|
||||
include(GNUInstallDirs)
|
||||
|
||||
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
|
||||
option(RE2_USE_ICU "build against ICU for full Unicode properties support" OFF)
|
||||
|
||||
# For historical reasons, this is just "USEPCRE", not "RE2_USE_PCRE".
|
||||
option(USEPCRE "build against PCRE for testing and benchmarking" OFF)
|
||||
|
||||
# See https://groups.google.com/g/re2-dev/c/P6_NM0YIWvA for details.
|
||||
# This has no effect unless RE2 is being built for an Apple platform
|
||||
# such as macOS or iOS.
|
||||
option(RE2_BUILD_FRAMEWORK "build RE2 as a framework" OFF)
|
||||
|
||||
# CMake seems to have no way to enable/disable testing per subproject,
|
||||
# so we provide an option similar to BUILD_TESTING, but just for RE2.
|
||||
option(RE2_BUILD_TESTING "enable testing for RE2" OFF)
|
||||
|
||||
# The pkg-config Requires: field.
|
||||
set(REQUIRES)
|
||||
|
||||
# ABI version
|
||||
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
|
||||
set(SONAME 11)
|
||||
|
||||
set(EXTRA_TARGET_LINK_LIBRARIES)
|
||||
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
|
||||
if(MSVC_VERSION LESS 1920)
|
||||
message(FATAL_ERROR "you need Visual Studio 2019 or later")
|
||||
endif()
|
||||
if(BUILD_SHARED_LIBS)
|
||||
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
endif()
|
||||
# CMake defaults to /W3, but some users like /W4 (or /Wall) and /WX,
|
||||
# so we disable various warnings that aren't particularly helpful.
|
||||
add_compile_options(/wd4100 /wd4201 /wd4456 /wd4457 /wd4702 /wd4815)
|
||||
# Without a byte order mark (BOM), Visual Studio assumes that the source
|
||||
# file is encoded using the current user code page, so we specify UTF-8.
|
||||
add_compile_options(/utf-8)
|
||||
endif()
|
||||
|
||||
if(WIN32)
|
||||
add_definitions(-DUNICODE -D_UNICODE -DSTRICT -DNOMINMAX)
|
||||
add_definitions(-D_CRT_SECURE_NO_WARNINGS -D_SCL_SECURE_NO_WARNINGS)
|
||||
endif()
|
||||
|
||||
if(UNIX)
|
||||
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||
find_package(Threads REQUIRED)
|
||||
endif()
|
||||
|
||||
set(ABSL_DEPS
|
||||
absl_absl_check
|
||||
absl_absl_log
|
||||
absl_base
|
||||
absl_core_headers
|
||||
absl_fixed_array
|
||||
absl_flags
|
||||
absl_flat_hash_map
|
||||
absl_flat_hash_set
|
||||
absl_hash
|
||||
absl_inlined_vector
|
||||
absl_optional
|
||||
absl_span
|
||||
absl_str_format
|
||||
absl_strings
|
||||
absl_synchronization
|
||||
)
|
||||
|
||||
# If a top-level project has called add_subdirectory(abseil-cpp) already (possibly
|
||||
# indirectly), let that take precedence over any copy of Abseil that might have
|
||||
# been installed on the system. And likewise for ICU, GoogleTest and Benchmark.
|
||||
if(NOT TARGET absl::base)
|
||||
find_package(absl REQUIRED)
|
||||
endif()
|
||||
list(APPEND REQUIRES ${ABSL_DEPS})
|
||||
|
||||
if(RE2_USE_ICU)
|
||||
if(NOT TARGET ICU::uc)
|
||||
find_package(ICU REQUIRED COMPONENTS uc)
|
||||
endif()
|
||||
add_definitions(-DRE2_USE_ICU)
|
||||
list(APPEND REQUIRES icu-uc)
|
||||
endif()
|
||||
|
||||
if(USEPCRE)
|
||||
add_definitions(-DUSEPCRE)
|
||||
list(APPEND EXTRA_TARGET_LINK_LIBRARIES pcre)
|
||||
endif()
|
||||
|
||||
list(JOIN REQUIRES " " REQUIRES)
|
||||
|
||||
set(RE2_SOURCES
|
||||
re2/bitmap256.cc
|
||||
re2/bitstate.cc
|
||||
re2/compile.cc
|
||||
re2/dfa.cc
|
||||
re2/filtered_re2.cc
|
||||
re2/mimics_pcre.cc
|
||||
re2/nfa.cc
|
||||
re2/onepass.cc
|
||||
re2/parse.cc
|
||||
re2/perl_groups.cc
|
||||
re2/prefilter.cc
|
||||
re2/prefilter_tree.cc
|
||||
re2/prog.cc
|
||||
re2/re2.cc
|
||||
re2/regexp.cc
|
||||
re2/set.cc
|
||||
re2/simplify.cc
|
||||
re2/tostring.cc
|
||||
re2/unicode_casefold.cc
|
||||
re2/unicode_groups.cc
|
||||
util/rune.cc
|
||||
util/strutil.cc
|
||||
)
|
||||
|
||||
set(RE2_HEADERS
|
||||
re2/filtered_re2.h
|
||||
re2/re2.h
|
||||
re2/set.h
|
||||
re2/stringpiece.h
|
||||
)
|
||||
|
||||
add_library(re2 ${RE2_SOURCES})
|
||||
target_compile_features(re2 PUBLIC cxx_std_14)
|
||||
target_include_directories(re2 PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
|
||||
# CMake gives "set_target_properties called with incorrect number of arguments."
|
||||
# errors if we don't quote ${RE2_HEADERS}, so quote it despite prevailing style.
|
||||
set_target_properties(re2 PROPERTIES PUBLIC_HEADER "${RE2_HEADERS}")
|
||||
set_target_properties(re2 PROPERTIES SOVERSION ${SONAME} VERSION ${SONAME}.0.0)
|
||||
add_library(re2::re2 ALIAS re2)
|
||||
|
||||
if(APPLE AND RE2_BUILD_FRAMEWORK)
|
||||
set_target_properties(re2 PROPERTIES
|
||||
FRAMEWORK TRUE
|
||||
FRAMEWORK_VERSION A
|
||||
MACOSX_FRAMEWORK_IDENTIFIER com.googlesource.code.re2)
|
||||
endif()
|
||||
|
||||
if(UNIX)
|
||||
target_link_libraries(re2 PUBLIC Threads::Threads)
|
||||
endif()
|
||||
|
||||
# Link re2 PUBLIC against every Abseil dependency. ABSL_DEPS holds names of the
# form absl_<name> (the same list is appended to REQUIRES); the matching CMake
# target is absl::<name>, so each entry is rewritten before linking.
foreach(dep ${ABSL_DEPS})
|
||||
# Work around https://gitlab.kitware.com/cmake/cmake/-/issues/16899. >:(
# A literal "^" is prepended and then matched as "\^absl_", so only the leading
# "absl_" is replaced: absl_absl_check becomes absl::absl_check.
|
||||
string(PREPEND dep "^")
|
||||
string(REGEX REPLACE "\\^absl_" "absl::" dep ${dep})
|
||||
target_link_libraries(re2 PUBLIC ${dep})
|
||||
endforeach()
|
||||
|
||||
if(RE2_USE_ICU)
|
||||
target_link_libraries(re2 PUBLIC ICU::uc)
|
||||
endif()
|
||||
|
||||
if(RE2_BUILD_TESTING)
|
||||
if(NOT TARGET GTest::gtest)
|
||||
find_package(GTest REQUIRED)
|
||||
endif()
|
||||
if(NOT TARGET benchmark::benchmark)
|
||||
find_package(benchmark REQUIRED)
|
||||
endif()
|
||||
|
||||
set(TESTING_SOURCES
|
||||
re2/testing/backtrack.cc
|
||||
re2/testing/dump.cc
|
||||
re2/testing/exhaustive_tester.cc
|
||||
re2/testing/null_walker.cc
|
||||
re2/testing/regexp_generator.cc
|
||||
re2/testing/string_generator.cc
|
||||
re2/testing/tester.cc
|
||||
util/pcre.cc
|
||||
)
|
||||
|
||||
add_library(testing ${TESTING_SOURCES})
|
||||
if(BUILD_SHARED_LIBS AND WIN32)
|
||||
target_compile_definitions(testing PRIVATE -DRE2_BUILD_TESTING_DLL)
|
||||
endif()
|
||||
target_compile_features(testing PUBLIC cxx_std_14)
|
||||
target_link_libraries(testing PUBLIC re2 GTest::gtest)
|
||||
|
||||
set(TEST_TARGETS
|
||||
charclass_test
|
||||
compile_test
|
||||
filtered_re2_test
|
||||
mimics_pcre_test
|
||||
parse_test
|
||||
possible_match_test
|
||||
re2_test
|
||||
re2_arg_test
|
||||
regexp_test
|
||||
required_prefix_test
|
||||
search_test
|
||||
set_test
|
||||
simplify_test
|
||||
string_generator_test
|
||||
|
||||
dfa_test
|
||||
exhaustive1_test
|
||||
exhaustive2_test
|
||||
exhaustive3_test
|
||||
exhaustive_test
|
||||
random_test
|
||||
)
|
||||
|
||||
set(BENCHMARK_TARGETS
|
||||
regexp_benchmark
|
||||
)
|
||||
|
||||
# For each name in TEST_TARGETS: build an executable from
# re2/testing/<name>.cc, link it against the `testing` library and
# GTest::gtest_main (plus EXTRA_TARGET_LINK_LIBRARIES, which gains `pcre` when
# USEPCRE is on), and register it as a test of the same name via add_test().
foreach(target ${TEST_TARGETS})
|
||||
add_executable(${target} re2/testing/${target}.cc)
|
||||
# Shared-library builds on Windows: the `testing` library is compiled with
# RE2_BUILD_TESTING_DLL, so its consumers define RE2_CONSUME_TESTING_DLL.
if(BUILD_SHARED_LIBS AND WIN32)
|
||||
target_compile_definitions(${target} PRIVATE -DRE2_CONSUME_TESTING_DLL)
|
||||
endif()
|
||||
target_compile_features(${target} PUBLIC cxx_std_14)
|
||||
target_link_libraries(${target} PUBLIC testing GTest::gtest_main ${EXTRA_TARGET_LINK_LIBRARIES})
|
||||
add_test(NAME ${target} COMMAND ${target})
|
||||
endforeach()
|
||||
|
||||
# For each name in BENCHMARK_TARGETS: build an executable from
# re2/testing/<name>.cc and link it against the `testing` library and
# benchmark::benchmark_main. Unlike the TEST_TARGETS loop, no add_test() call
# is made here, so these executables are built but not registered as tests.
foreach(target ${BENCHMARK_TARGETS})
|
||||
add_executable(${target} re2/testing/${target}.cc)
|
||||
# Shared-library builds on Windows: the `testing` library is compiled with
# RE2_BUILD_TESTING_DLL, so its consumers define RE2_CONSUME_TESTING_DLL.
if(BUILD_SHARED_LIBS AND WIN32)
|
||||
target_compile_definitions(${target} PRIVATE -DRE2_CONSUME_TESTING_DLL)
|
||||
endif()
|
||||
target_compile_features(${target} PUBLIC cxx_std_14)
|
||||
target_link_libraries(${target} PUBLIC testing benchmark::benchmark_main ${EXTRA_TARGET_LINK_LIBRARIES})
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
install(TARGETS re2
|
||||
EXPORT re2Targets
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
FRAMEWORK DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/re2
|
||||
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
install(EXPORT re2Targets
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2
|
||||
NAMESPACE re2::)
|
||||
|
||||
configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/re2Config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/re2Config.cmake
|
||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2)
|
||||
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/re2ConfigVersion.cmake
|
||||
VERSION ${SONAME}.0.0
|
||||
COMPATIBILITY SameMajorVersion)
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/re2Config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/re2ConfigVersion.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/re2)
|
||||
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/re2.pc.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/re2.pc
|
||||
@ONLY)
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/re2.pc
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
||||
@@ -0,0 +1,2 @@
|
||||
RE2 uses Gerrit instead of GitHub pull requests.
|
||||
See the [Contribute](https://github.com/google/re2/wiki/Contribute) wiki page.
|
||||
@@ -0,0 +1,29 @@
|
||||
# Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Bazel (http://bazel.build/) MODULE file for RE2.
|
||||
|
||||
module(
|
||||
name = "re2",
|
||||
version = "2024-07-02",
|
||||
compatibility_level = 1,
|
||||
)
|
||||
|
||||
bazel_dep(name = "platforms", version = "0.0.10")
|
||||
bazel_dep(name = "apple_support", version = "1.15.1")
|
||||
bazel_dep(name = "rules_cc", version = "0.0.9")
|
||||
bazel_dep(name = "abseil-cpp", version = "20240116.2")
|
||||
bazel_dep(name = "rules_python", version = "0.33.2")
|
||||
bazel_dep(name = "pybind11_bazel", version = "2.12.0")
|
||||
|
||||
# This is a temporary hack for `x64_x86_windows`.
|
||||
# TODO(junyer): Remove whenever no longer needed.
|
||||
cc_configure = use_extension("@bazel_tools//tools/cpp:cc_configure.bzl", "cc_configure_extension")
|
||||
use_repo(cc_configure, "local_config_cc")
|
||||
|
||||
# These dependencies will be ignored when the `re2` module is not
|
||||
# the root module (or when `--ignore_dev_dependency` is enabled).
|
||||
bazel_dep(name = "google_benchmark", version = "1.8.4", dev_dependency = True)
|
||||
bazel_dep(name = "googletest", version = "1.14.0.bcr.1", dev_dependency = True)
|
||||
bazel_dep(name = "abseil-py", version = "2.1.0", dev_dependency = True)
|
||||
@@ -0,0 +1,408 @@
|
||||
# Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Build against Abseil.
|
||||
ABSL_DEPS=\
|
||||
absl_absl_check\
|
||||
absl_absl_log\
|
||||
absl_base\
|
||||
absl_core_headers\
|
||||
absl_fixed_array\
|
||||
absl_flags\
|
||||
absl_flat_hash_map\
|
||||
absl_flat_hash_set\
|
||||
absl_hash\
|
||||
absl_inlined_vector\
|
||||
absl_optional\
|
||||
absl_span\
|
||||
absl_str_format\
|
||||
absl_strings\
|
||||
absl_synchronization\
|
||||
|
||||
PKG_CONFIG?=pkg-config
|
||||
CCABSL=$(shell $(PKG_CONFIG) $(ABSL_DEPS) --cflags)
|
||||
# GCC barfs on `-Wl` whereas Clang doesn't mind, but it's unclear what
|
||||
# causes it to manifest on Ubuntu 22.04 LTS, so filter it out for now.
|
||||
# Similar is needed for `static-testinstall` and `shared-testinstall`.
|
||||
LDABSL=$(shell $(PKG_CONFIG) $(ABSL_DEPS) --libs | sed -e 's/-Wl / /g')
|
||||
|
||||
# To build against ICU for full Unicode properties support,
|
||||
# uncomment the next two lines:
|
||||
# CCICU=$(shell $(PKG_CONFIG) icu-uc --cflags) -DRE2_USE_ICU
|
||||
# LDICU=$(shell $(PKG_CONFIG) icu-uc --libs)
|
||||
|
||||
# Build against GoogleTest and Benchmark for... testing and benchmarking.
|
||||
# Capture only the `-L` flags for now; we will pass the `-l` flags later.
|
||||
CCGTEST=$(shell $(PKG_CONFIG) gtest gtest_main --cflags)
|
||||
LDGTEST=$(shell $(PKG_CONFIG) gtest gtest_main --libs-only-L)
|
||||
CCBENCHMARK=$(shell $(PKG_CONFIG) benchmark --cflags)
|
||||
LDBENCHMARK=$(shell $(PKG_CONFIG) benchmark --libs-only-L)
|
||||
|
||||
# To build against PCRE for testing and benchmarking,
|
||||
# uncomment the next two lines:
|
||||
# CCPCRE=-I/usr/local/include -DUSEPCRE
|
||||
# LDPCRE=-L/usr/local/lib -lpcre
|
||||
|
||||
CXX?=g++
|
||||
# can override
|
||||
CXXFLAGS?=-O3 -g
|
||||
LDFLAGS?=
|
||||
# required
|
||||
RE2_CXXFLAGS?=-pthread -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCABSL) $(CCICU) $(CCGTEST) $(CCBENCHMARK) $(CCPCRE)
|
||||
RE2_LDFLAGS?=-pthread $(LDABSL) $(LDICU) $(LDGTEST) $(LDBENCHMARK) $(LDPCRE)
|
||||
AR?=ar
|
||||
ARFLAGS?=rsc
|
||||
NM?=nm
|
||||
NMFLAGS?=-p
|
||||
|
||||
# Variables mandated by GNU, the arbiter of all good taste on the internet.
|
||||
# http://www.gnu.org/prep/standards/standards.html
|
||||
prefix=/usr/local
|
||||
exec_prefix=$(prefix)
|
||||
includedir=$(prefix)/include
|
||||
libdir=$(exec_prefix)/lib
|
||||
INSTALL=install
|
||||
INSTALL_DATA=$(INSTALL) -m 644
|
||||
|
||||
# Work around the weirdness of sed(1) on Darwin. :/
|
||||
# SED_INPLACE is the in-place sed invocation for the host reported by `uname`.
# Darwin passes an explicit empty argument after -i; every other host uses a
# bare `sed -i`.
# NOTE(review): the SunOS and default branches below are currently identical.
ifeq ($(shell uname),Darwin)
|
||||
SED_INPLACE=sed -i ''
|
||||
else ifeq ($(shell uname),SunOS)
|
||||
SED_INPLACE=sed -i
|
||||
else
|
||||
SED_INPLACE=sed -i
|
||||
endif
|
||||
|
||||
# The pkg-config Requires: field.
|
||||
REQUIRES=$(ABSL_DEPS)
|
||||
ifdef LDICU
|
||||
REQUIRES+=icu-uc
|
||||
endif
|
||||
|
||||
# ABI version
|
||||
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
|
||||
SONAME=11
|
||||
|
||||
# To rebuild the tables generated by Perl and Python scripts (requires Internet
|
||||
# access for Unicode data), uncomment the following line:
|
||||
# REBUILD_TABLES=1
|
||||
|
||||
# The SunOS linker does not support wildcards. :(
|
||||
# Choose the shared-library file suffixes and link command for the host OS.
#   SOEXT       bare extension (dylib or so)
#   SOEXTVER    suffix carrying SONAME
#   SOEXTVER00  suffix carrying SONAME.0.0
#   MAKE_SHARED_LIBRARY  compiler driver invocation that produces the library
# Darwin: the version precedes the extension (libre2.$(SONAME).dylib), the
# install name points into $(libdir), and libre2.symbols.darwin is passed as
# the exported symbols list.
ifeq ($(shell uname),Darwin)
|
||||
SOEXT=dylib
|
||||
SOEXTVER=$(SONAME).$(SOEXT)
|
||||
SOEXTVER00=$(SONAME).0.0.$(SOEXT)
|
||||
MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib -Wl,-compatibility_version,$(SONAME),-current_version,$(SONAME).0.0,-install_name,$(libdir)/libre2.$(SOEXTVER),-exported_symbols_list,libre2.symbols.darwin
|
||||
# SunOS: same naming as the default branch, but only -soname is passed to the
# linker; no --version-script is used.
else ifeq ($(shell uname),SunOS)
|
||||
SOEXT=so
|
||||
SOEXTVER=$(SOEXT).$(SONAME)
|
||||
SOEXTVER00=$(SOEXT).$(SONAME).0.0
|
||||
MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER)
|
||||
# Any other host: libre2.so.$(SONAME), with libre2.symbols passed as the
# linker version script.
else
|
||||
SOEXT=so
|
||||
SOEXTVER=$(SOEXT).$(SONAME)
|
||||
SOEXTVER00=$(SOEXT).$(SONAME).0.0
|
||||
MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols
|
||||
endif
|
||||
|
||||
.PHONY: all
|
||||
all: obj/libre2.a obj/so/libre2.$(SOEXT)
|
||||
|
||||
INSTALL_HFILES=\
|
||||
re2/filtered_re2.h\
|
||||
re2/re2.h\
|
||||
re2/set.h\
|
||||
re2/stringpiece.h\
|
||||
|
||||
HFILES=\
|
||||
util/malloc_counter.h\
|
||||
util/pcre.h\
|
||||
util/strutil.h\
|
||||
util/utf.h\
|
||||
re2/bitmap256.h\
|
||||
re2/filtered_re2.h\
|
||||
re2/pod_array.h\
|
||||
re2/prefilter.h\
|
||||
re2/prefilter_tree.h\
|
||||
re2/prog.h\
|
||||
re2/re2.h\
|
||||
re2/regexp.h\
|
||||
re2/set.h\
|
||||
re2/sparse_array.h\
|
||||
re2/sparse_set.h\
|
||||
re2/stringpiece.h\
|
||||
re2/testing/exhaustive_tester.h\
|
||||
re2/testing/regexp_generator.h\
|
||||
re2/testing/string_generator.h\
|
||||
re2/testing/tester.h\
|
||||
re2/unicode_casefold.h\
|
||||
re2/unicode_groups.h\
|
||||
re2/walker-inl.h\
|
||||
|
||||
OFILES=\
|
||||
obj/util/rune.o\
|
||||
obj/util/strutil.o\
|
||||
obj/re2/bitmap256.o\
|
||||
obj/re2/bitstate.o\
|
||||
obj/re2/compile.o\
|
||||
obj/re2/dfa.o\
|
||||
obj/re2/filtered_re2.o\
|
||||
obj/re2/mimics_pcre.o\
|
||||
obj/re2/nfa.o\
|
||||
obj/re2/onepass.o\
|
||||
obj/re2/parse.o\
|
||||
obj/re2/perl_groups.o\
|
||||
obj/re2/prefilter.o\
|
||||
obj/re2/prefilter_tree.o\
|
||||
obj/re2/prog.o\
|
||||
obj/re2/re2.o\
|
||||
obj/re2/regexp.o\
|
||||
obj/re2/set.o\
|
||||
obj/re2/simplify.o\
|
||||
obj/re2/tostring.o\
|
||||
obj/re2/unicode_casefold.o\
|
||||
obj/re2/unicode_groups.o\
|
||||
|
||||
TESTOFILES=\
|
||||
obj/util/pcre.o\
|
||||
obj/re2/testing/backtrack.o\
|
||||
obj/re2/testing/dump.o\
|
||||
obj/re2/testing/exhaustive_tester.o\
|
||||
obj/re2/testing/null_walker.o\
|
||||
obj/re2/testing/regexp_generator.o\
|
||||
obj/re2/testing/string_generator.o\
|
||||
obj/re2/testing/tester.o\
|
||||
|
||||
TESTS=\
|
||||
obj/test/charclass_test\
|
||||
obj/test/compile_test\
|
||||
obj/test/filtered_re2_test\
|
||||
obj/test/mimics_pcre_test\
|
||||
obj/test/parse_test\
|
||||
obj/test/possible_match_test\
|
||||
obj/test/re2_test\
|
||||
obj/test/re2_arg_test\
|
||||
obj/test/regexp_test\
|
||||
obj/test/required_prefix_test\
|
||||
obj/test/search_test\
|
||||
obj/test/set_test\
|
||||
obj/test/simplify_test\
|
||||
obj/test/string_generator_test\
|
||||
|
||||
BIGTESTS=\
|
||||
obj/test/dfa_test\
|
||||
obj/test/exhaustive1_test\
|
||||
obj/test/exhaustive2_test\
|
||||
obj/test/exhaustive3_test\
|
||||
obj/test/exhaustive_test\
|
||||
obj/test/random_test\
|
||||
|
||||
SOFILES=$(patsubst obj/%,obj/so/%,$(OFILES))
|
||||
# The shared-library tests reuse TESTOFILES; only the way the test binaries are linked differs.
|
||||
STESTS=$(patsubst obj/%,obj/so/%,$(TESTS))
|
||||
SBIGTESTS=$(patsubst obj/%,obj/so/%,$(BIGTESTS))
|
||||
|
||||
DOFILES=$(patsubst obj/%,obj/dbg/%,$(OFILES))
|
||||
DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES))
|
||||
DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS))
|
||||
DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS))
|
||||
|
||||
# Non-debug objects: obj/<path>.o is compiled from <path>.cc with -DNDEBUG.
# Every object lists all of $(HFILES) as prerequisites, so touching any header
# in that list rebuilds every object. The output directory is created first
# because the obj/ tree mirrors the source layout.
.PRECIOUS: obj/%.o
|
||||
obj/%.o: %.cc $(HFILES)
|
||||
@mkdir -p $$(dirname $@)
|
||||
$(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc
|
||||
|
||||
# Debug objects: obj/dbg/<path>.o is compiled from <path>.cc without -DNDEBUG.
# Every object lists all of $(HFILES) as prerequisites. The output directory
# is created before compiling.
.PRECIOUS: obj/dbg/%.o
|
||||
obj/dbg/%.o: %.cc $(HFILES)
|
||||
@mkdir -p $$(dirname $@)
|
||||
$(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) $*.cc
|
||||
|
||||
# Shared-library objects: obj/so/<path>.o is compiled from <path>.cc with
# -fPIC and -DNDEBUG. Every object lists all of $(HFILES) as prerequisites.
# The output directory is created before compiling.
.PRECIOUS: obj/so/%.o
|
||||
obj/so/%.o: %.cc $(HFILES)
|
||||
@mkdir -p $$(dirname $@)
|
||||
$(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc
|
||||
|
||||
.PRECIOUS: obj/libre2.a
|
||||
obj/libre2.a: $(OFILES)
|
||||
@mkdir -p obj
|
||||
$(AR) $(ARFLAGS) obj/libre2.a $(OFILES)
|
||||
|
||||
.PRECIOUS: obj/dbg/libre2.a
|
||||
obj/dbg/libre2.a: $(DOFILES)
|
||||
@mkdir -p obj/dbg
|
||||
$(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)
|
||||
|
||||
.PRECIOUS: obj/so/libre2.$(SOEXT)
|
||||
obj/so/libre2.$(SOEXT): $(SOFILES) libre2.symbols libre2.symbols.darwin
|
||||
@mkdir -p obj/so
|
||||
$(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) $(RE2_LDFLAGS) $(LDFLAGS)
|
||||
ln -sf libre2.$(SOEXTVER) $@
|
||||
|
||||
# Debug test binaries, linked against the debug static library.
.PRECIOUS: obj/dbg/test/%
obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES)
	@mkdir -p $(@D)
	$(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) -lgtest -lgtest_main

# Regular test binaries, linked against the static library.
.PRECIOUS: obj/test/%
obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES)
	@mkdir -p $(@D)
	$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) -lgtest -lgtest_main

# Shared-library test binaries. The static library is also on the link line
# so that private symbols not available from the shared library still resolve.
.PRECIOUS: obj/so/test/%
obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES)
	@mkdir -p $(@D)
	$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) -lgtest -lgtest_main

# Benchmark binary: links -lbenchmark and -lbenchmark_main instead of -lgtest_main.
obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES)
	@mkdir -p $(@D)
	$(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) -lgtest -lbenchmark -lbenchmark_main

# Fuzzer binary: only the fuzzer object and the static library.
obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o
	@mkdir -p $(@D)
	$(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS)
|
||||
|
||||
# The rules that regenerate the table sources are only active when
# REBUILD_TABLES is defined.
ifdef REBUILD_TABLES
.PRECIOUS: re2/perl_groups.cc
re2/perl_groups.cc: re2/make_perl_groups.pl
	perl $< > $@

.PRECIOUS: re2/unicode_%.cc
re2/unicode_%.cc: re2/make_unicode_%.py re2/unicode.py
	python3 $< > $@
endif

.PHONY: distclean clean

# distclean: everything `clean` removes, plus the generated table sources.
distclean: clean
	rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc

# clean: the build tree and compiled Python files.
clean:
	rm -rf obj
	rm -f re2/*.pyc
|
||||
|
||||
.PHONY: testofiles test debug-test static-test shared-test
.PHONY: debug-bigtest static-bigtest shared-bigtest benchmark fuzz

# Builds the objects shared by the test binaries.
testofiles: $(TESTOFILES)

# Builds every test flavour first, then runs the three suites.
test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test

debug-test: $(DTESTS)
	@./runtests $(DTESTS)

static-test: $(TESTS)
	@./runtests $(TESTS)

# The shared-library suites pass obj/so to runtests as the library path.
shared-test: $(STESTS)
	@./runtests -shared-library-path obj/so $(STESTS)

debug-bigtest: $(DTESTS) $(DBIGTESTS)
	@./runtests $(DTESTS) $(DBIGTESTS)

static-bigtest: $(TESTS) $(BIGTESTS)
	@./runtests $(TESTS) $(BIGTESTS)

shared-bigtest: $(STESTS) $(SBIGTESTS)
	@./runtests -shared-library-path obj/so $(STESTS) $(SBIGTESTS)

# These two only build their binaries; nothing is run.
benchmark: obj/test/regexp_benchmark

fuzz: obj/test/re2_fuzzer
|
||||
|
||||
.PHONY: install static static-install shared shared-install common-install

install: static-install shared-install

static: obj/libre2.a

static-install: obj/libre2.a common-install
	$(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a

shared: obj/so/libre2.$(SOEXT)

# Installs the library under its $(SOEXTVER00) name and points the
# $(SOEXTVER) and $(SOEXT) names at it with symlinks.
shared-install: obj/so/libre2.$(SOEXT) common-install
	$(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00)
	ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER)
	ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT)

# Installs the headers and re2.pc, then fills in the @...@ placeholders
# copied from re2.pc.in. The expressions are applied in the order listed.
common-install:
	mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig
	$(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
	$(INSTALL_DATA) re2.pc.in $(DESTDIR)$(libdir)/pkgconfig/re2.pc
	$(SED_INPLACE) \
	  -e "s#@CMAKE_INSTALL_FULL_INCLUDEDIR@#$(includedir)#" \
	  -e "s#@CMAKE_INSTALL_FULL_LIBDIR@#$(libdir)#" \
	  -e "s#@REQUIRES@#$(REQUIRES)#" \
	  -e "s#@SONAME@#$(SONAME)#" \
	  $(DESTDIR)$(libdir)/pkgconfig/re2.pc
|
||||
|
||||
.PHONY: testinstall
testinstall: static-testinstall shared-testinstall
	@echo
	@echo Install tests passed.
	@echo

# Compiles testinstall.cc against the installed static library, using the
# installed re2.pc for flags, and runs the result. Skipped on Darwin and SunOS.
# Every step in the subshell is chained with && so that the compiler never
# runs from the wrong directory if `cd obj` fails.
.PHONY: static-testinstall
static-testinstall:
ifeq ($(shell uname),Darwin)
	@echo Skipping test for libre2.a on Darwin.
else ifeq ($(shell uname),SunOS)
	@echo Skipping test for libre2.a on SunOS.
else
	@mkdir -p obj
	@cp testinstall.cc obj/static-testinstall.cc
	(cd obj && export PKG_CONFIG_PATH="$(DESTDIR)$(libdir)/pkgconfig" && \
	  $(CXX) static-testinstall.cc -o static-testinstall $(CXXFLAGS) $(LDFLAGS) \
	  $$($(PKG_CONFIG) re2 --cflags) \
	  $$($(PKG_CONFIG) re2 --libs | sed -e 's/-Wl / /g' | sed -e 's/-lre2/-l:libre2.a/'))
	obj/static-testinstall
endif

# Same check against the installed shared library. The binary is run with the
# install directory prepended to the platform's dynamic library search path.
.PHONY: shared-testinstall
shared-testinstall:
	@mkdir -p obj
	@cp testinstall.cc obj/shared-testinstall.cc
	(cd obj && export PKG_CONFIG_PATH="$(DESTDIR)$(libdir)/pkgconfig" && \
	  $(CXX) shared-testinstall.cc -o shared-testinstall $(CXXFLAGS) $(LDFLAGS) \
	  $$($(PKG_CONFIG) re2 --cflags) \
	  $$($(PKG_CONFIG) re2 --libs | sed -e 's/-Wl / /g'))
ifeq ($(shell uname),Darwin)
	DYLD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(DYLD_LIBRARY_PATH)" obj/shared-testinstall
else
	LD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(LD_LIBRARY_PATH)" obj/shared-testinstall
endif
|
||||
|
||||
# Runs the PCRE and RE2 benchmarks and appends the output, preceded by a
# header describing the host, compiler and revision, to benchlog.<hostname>
# (hostname truncated at the first dot).
.PHONY: benchlog
benchlog: obj/test/regexp_benchmark
	(echo '==BENCHMARK==' $$(hostname) $$(date); \
	  (uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
	  echo; \
	  ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')

# Rebuilds the exhaustive tests and the search test with -DLOGGING=1 and
# records everything they print except the PASS lines.
.PHONY: log
log:
	$(MAKE) clean
	$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \
	  $(filter obj/test/exhaustive%_test,$(BIGTESTS))
	echo '#' RE2 exhaustive tests built by make $@ >re2-exhaustive.txt
	echo '#' $$(date) >>re2-exhaustive.txt
	obj/test/exhaustive_test |grep -v '^PASS$$' >>re2-exhaustive.txt
	obj/test/exhaustive1_test |grep -v '^PASS$$' >>re2-exhaustive.txt
	obj/test/exhaustive2_test |grep -v '^PASS$$' >>re2-exhaustive.txt
	obj/test/exhaustive3_test |grep -v '^PASS$$' >>re2-exhaustive.txt

	$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test
	echo '#' RE2 basic search tests built by make $@ >re2-search.txt
	echo '#' $$(date) >>re2-search.txt
	obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt
|
||||
@@ -10,6 +10,11 @@ make test
|
||||
make install
|
||||
make testinstall
|
||||
|
||||
Building RE2 requires Abseil (https://github.com/abseil/abseil-cpp)
|
||||
to be installed on your system. Building the tests for RE2 requires
|
||||
GoogleTest (https://github.com/google/googletest) and Benchmark
|
||||
(https://github.com/google/benchmark) to be installed as well.
|
||||
|
||||
There is a fair amount of documentation (including code snippets) in
|
||||
the re2.h header file.
|
||||
|
||||
@@ -27,7 +32,7 @@ under the BSD-style license found in the LICENSE file.
|
||||
|
||||
RE2's native language is C++.
|
||||
|
||||
The Python wrapper is at https://github.com/google/re2/tree/abseil/python
|
||||
The Python wrapper is at https://github.com/google/re2/tree/main/python
|
||||
and on PyPI (https://pypi.org/project/google-re2/).
|
||||
|
||||
A C wrapper is at https://github.com/marcomaggi/cre2/.
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
To report a security issue, please use https://g.co/vulnz. We use
|
||||
https://g.co/vulnz for our intake, and do coordination and disclosure here on
|
||||
GitHub (including using GitHub Security Advisory). The Google Security Team will
|
||||
respond within 5 working days of your report on https://g.co/vulnz.
|
||||
@@ -0,0 +1,7 @@
|
||||
# Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Bazel (http://bazel.build/) WORKSPACE file for RE2.
|
||||
|
||||
workspace(name = "com_googlesource_code_re2")
|
||||
@@ -0,0 +1,7 @@
|
||||
# Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Bazel (http://bazel.build/) WORKSPACE file for RE2.
|
||||
|
||||
workspace(name = "com_googlesource_code_re2")
|
||||
@@ -0,0 +1,24 @@
|
||||
# Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Bazel (http://bazel.build/) BUILD file for RE2 app.
|
||||
|
||||
# The app's compiled module: _re2.cc linked with the options listed below.
cc_binary(
    name = "_re2.js",
    testonly = True,
    srcs = ["_re2.cc"],
    linkopts = [
        "--bind",
        "-sENVIRONMENT=web",
        "-sSINGLE_FILE=1",
        "-sMODULARIZE=1",
        "-sEXPORT_ES6=1",
        # app.ts imports the default export under this name.
        "-sEXPORT_NAME=loadModule",
        "-sUSE_PTHREADS=0",
    ],
    deps = [
        "//:re2",
        "//:testing",
    ],
)
|
||||
+94
@@ -0,0 +1,94 @@
|
||||
// Copyright 2022 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include <emscripten/bind.h>
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2_app {
|
||||
|
||||
// Everything the app shows about one pattern. GetInfo() fills it in and the
// Embind registration exposes it to JavaScript field by field.
struct Info {
  std::string pattern;
  // Non-empty when GetInfo() failed to parse or compile the pattern.
  std::string error;
  std::string prefix;
  bool prefix_foldcase = false;
  std::string accel_prefix;
  bool accel_prefix_foldcase = false;
  // GetInfo() returns early on failure without assigning the three fields
  // below, so they need defaults: otherwise indeterminate values would be
  // copied out of the function and read by the bindings.
  int num_captures = 0;
  bool is_one_pass = false;
  bool can_bit_state = false;
  std::string bytecode;
  std::string bytemap;
};
|
||||
|
||||
// Parses `pattern` with the default RE2::Options, compiles it to a Prog and
// reports what was found. On a parse failure only `pattern` and `error` are
// assigned; on a compile failure the required-prefix fields may also have
// been assigned before `error` is set.
Info GetInfo(const std::string& pattern) {
  Info info;
  info.pattern = pattern;

  RE2::Options options;
  re2::RegexpStatus status;
  const auto parse_flags =
      static_cast<re2::Regexp::ParseFlags>(options.ParseFlags());
  re2::Regexp* const regexp =
      re2::Regexp::Parse(pattern, parse_flags, &status);
  if (regexp == nullptr) {
    info.error = "failed to parse pattern: " + status.Text();
    return info;
  }

  // RequiredPrefix() receives `rest` as an out-parameter. When it reports no
  // required prefix, take a second reference on the whole regexp instead so
  // that both pointers can be released the same way on every path.
  std::string literal;
  bool foldcase;
  re2::Regexp* rest;
  if (!regexp->RequiredPrefix(&literal, &foldcase, &rest)) {
    rest = regexp->Incref();
  } else {
    info.prefix = literal;
    info.prefix_foldcase = foldcase;
  }

  // Drops the references held through `rest` and `regexp`, in that order.
  const auto release = [&] {
    rest->Decref();
    regexp->Decref();
  };

  const std::unique_ptr<re2::Prog> prog(
      rest->CompileToProg(options.max_mem()));
  if (!prog) {
    info.error = "failed to compile forward Prog";
    release();
    return info;
  }

  // `literal` and `foldcase` are reused as scratch outputs here.
  if (regexp->RequiredPrefixForAccel(&literal, &foldcase)) {
    info.accel_prefix = literal;
    info.accel_prefix_foldcase = foldcase;
  }

  info.num_captures = rest->NumCaptures();
  info.is_one_pass = prog->IsOnePass();
  info.can_bit_state = prog->CanBitState();
  info.bytecode = prog->Dump();
  info.bytemap = prog->DumpByteMap();

  release();
  return info;
}
|
||||
|
||||
// Embind registrations for the app: exposes the Info struct to JavaScript as
// the value object "Info", one field per member under the member's own name,
// and exposes GetInfo() as the function "getInfo".
EMSCRIPTEN_BINDINGS(_re2) {
|
||||
emscripten::value_object<Info>("Info")
|
||||
.field("pattern", &Info::pattern)
|
||||
.field("error", &Info::error)
|
||||
.field("prefix", &Info::prefix)
|
||||
.field("prefix_foldcase", &Info::prefix_foldcase)
|
||||
.field("accel_prefix", &Info::accel_prefix)
|
||||
.field("accel_prefix_foldcase", &Info::accel_prefix_foldcase)
|
||||
.field("num_captures", &Info::num_captures)
|
||||
.field("is_one_pass", &Info::is_one_pass)
|
||||
.field("can_bit_state", &Info::can_bit_state)
|
||||
.field("bytecode", &Info::bytecode)
|
||||
.field("bytemap", &Info::bytemap);
|
||||
|
||||
emscripten::function("getInfo", &GetInfo);
|
||||
}
|
||||
|
||||
} // namespace re2_app
|
||||
Vendored
+23
@@ -0,0 +1,23 @@
|
||||
// Copyright 2022 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
/** Facts about one pattern, as returned by `MainModule.getInfo`. */
export type Info = {
  pattern: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string;
  error: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string;
  prefix: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string;
  prefix_foldcase: boolean;
  accel_prefix: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string;
  accel_prefix_foldcase: boolean;
  num_captures: number;
  is_one_pass: boolean;
  can_bit_state: boolean;
  bytecode: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string;
  bytemap: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string;
};

/** The functions exported by the loaded module. */
export interface MainModule {
  getInfo(pattern: ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string): Info;
}

/** Loads the module; the promise resolves to its exported functions. */
export default function loadModule(): Promise<MainModule>;
|
||||
+111
@@ -0,0 +1,111 @@
|
||||
// Copyright 2022 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
import {css, html, LitElement, render} from 'lit';
|
||||
import {customElement} from 'lit/decorators.js';
|
||||
|
||||
import /*default*/ loadModule from './_re2';
|
||||
import {Info, MainModule} from './_re2';
|
||||
|
||||
// Assigned once loadModule() resolves; <re2-dev> is only rendered after that.
let _re2: MainModule;

/** Stores the loaded module, then renders the <re2-dev> element into the page. */
function onModuleLoaded(module: MainModule): void {
  _re2 = module;
  render(html`<title>re2-dev</title><re2-dev></re2-dev>`, document.body);
}

loadModule().then(onModuleLoaded);
|
||||
|
||||
/**
 * The `<re2-dev>` element: a text input for a pattern and, below it, either
 * the error or the details that `_re2.getInfo()` reports for that pattern.
 * The pattern is mirrored in the URL hash.
 */
@customElement('re2-dev')
export class RE2Dev extends LitElement {
  private _pattern: string = '';
  private _info: Info|null = null;

  constructor() {
    super();
    this._setPattern(RE2Dev._decodeHash(window.location.hash));
  }

  /**
   * Returns the hash without its leading character, percent-decoded.
   * decodeURIComponent() throws URIError on malformed escapes (for example a
   * hand-edited `#%`); in that case the raw text is used instead, so that the
   * element can still be constructed.
   */
  private static _decodeHash(hash: string): string {
    const raw = hash.slice(1);
    try {
      return decodeURIComponent(raw);
    } catch {
      return raw;
    }
  }

  /** Stores `pattern`, recomputes its info and schedules a re-render. */
  private _setPattern(pattern: string) {
    this._pattern = pattern;
    // An empty pattern shows no details at all.
    this._info = this._pattern ? _re2.getInfo(this._pattern) : null;
    this.requestUpdate();
  }

  /** Handles the input's change event and mirrors the pattern in the hash. */
  private _onChange = (e: Event) => {
    this._setPattern((e.target as HTMLInputElement).value);
    window.location.hash = '#' + encodeURIComponent(this._pattern);
  };

  static override styles = css`
    .code {
      font-family: monospace;
      white-space: pre-line;
    }
  `;

  override render() {
    const fragments = [];
    fragments.push(html`
      <div>
        <input type="text" size="48" @change=${this._onChange} .value=${this._pattern}>
      </div>
    `);

    const info = this._info;
    if (info === null) {
      return html`${fragments}`;
    }

    // On failure, show only the error.
    if (info.error) {
      fragments.push(html`
        <br>
        <div>
          error:
          <span class="code">${info.error}</span>
        </div>
      `);
      return html`${fragments}`;
    }

    fragments.push(html`
      <br>
      <div>
        pattern:
        <span class="code">${info.pattern}</span>
        <br>
        prefix:
        <span class="code">${info.prefix}</span>
        ·
        _foldcase:
        <span class="code">${info.prefix_foldcase}</span>
        <br>
        accel_prefix:
        <span class="code">${info.accel_prefix}</span>
        ·
        _foldcase:
        <span class="code">${info.accel_prefix_foldcase}</span>
        <br>
        num_captures:
        <span class="code">${info.num_captures}</span>
        <br>
        is_one_pass:
        <span class="code">${info.is_one_pass}</span>
        <br>
        can_bit_state:
        <span class="code">${info.can_bit_state}</span>
        <br>
        <br>
        bytecode:
        <br>
        <span class="code">${info.bytecode}</span>
        <br>
        bytemap:
        <br>
        <span class="code">${info.bytemap}</span>
      </div>
    `);
    return html`${fragments}`;
  }
}

// Lets TypeScript resolve the 're2-dev' tag name to the RE2Dev class.
declare global {
  interface HTMLElementTagNameMap {
    're2-dev': RE2Dev;
  }
}
|
||||
@@ -0,0 +1,32 @@
|
||||
#!/bin/bash
# Builds the app: compiles _re2.js with Bazel, bundles it together with the
# TypeScript sources in a temporary directory, and copies the bundle into
# ./deploy/app next to a top-level index.html that redirects to the RE2
# repository.
#
# All path expansions are quoted so that a checkout or temporary directory
# whose path contains whitespace or glob characters does not break the script.
set -eux

SRCDIR="$(readlink --canonicalize "$(dirname "$0")")"
DSTDIR="$(mktemp --directory --tmpdir "$(basename "$0").XXXXXXXXXX")"

cd "${SRCDIR}"
# Emscripten doesn't support `-fstack-protector`.
AR=emar CC=emcc \
  bazel build \
  --copt=-fno-stack-protector \
  --compilation_mode=opt -- :all
cp ../bazel-bin/app/_re2.js "${DSTDIR}"
bazel clean --expunge
cp app.ts index.html _re2.d.ts "${DSTDIR}"
cp package.json rollup.config.js tsconfig.json "${DSTDIR}"

cd "${DSTDIR}"
npm install
npx tsc
npx rollup -c rollup.config.js -d deploy

cd "${SRCDIR}"
# With `set -e`, this stops the script if ./deploy already exists.
mkdir deploy
cat >deploy/index.html <<EOF
<html><head><meta http-equiv="refresh" content="0; url=https://github.com/google/re2"></head><body></body></html>
EOF
mkdir deploy/app
cp "${DSTDIR}"/deploy/* deploy/app
ls -lR deploy

exit 0
|
||||
@@ -0,0 +1,5 @@
|
||||
<!DOCTYPE html>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<style>:root { color-scheme: dark light; }</style>
|
||||
<script type="module" src="app.js"></script>
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"dependencies": {
|
||||
"lit": "*"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@rollup/plugin-node-resolve": "*",
|
||||
"@rollup/plugin-terser": "*",
|
||||
"@web/rollup-plugin-html": "*",
|
||||
"@web/rollup-plugin-import-meta-assets": "*",
|
||||
"rollup": "~2",
|
||||
"tslib": "*",
|
||||
"typescript": "*"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
// Copyright 2022 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
import nodeResolve from '@rollup/plugin-node-resolve';
|
||||
import terser from '@rollup/plugin-terser';
|
||||
import html from '@web/rollup-plugin-html';
|
||||
import {importMetaAssets} from '@web/rollup-plugin-import-meta-assets';
|
||||
|
||||
// Output settings: ES-module format, with every emitted file named by hash.
const output = {
  entryFileNames: '[hash].js',
  chunkFileNames: '[hash].js',
  assetFileNames: '[hash][extname]',
  format: 'es',
};

// Plugins, in the order they are handed to Rollup.
const plugins = [
  html({minify: true}),
  nodeResolve(),
  terser(),
  importMetaAssets(),
];

// The bundle's entry point is index.html.
export default {
  input: 'index.html',
  output,
  preserveEntrySignatures: false,
  plugins,
};
|
||||
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "esnext",
|
||||
"module": "esnext",
|
||||
"moduleResolution": "node",
|
||||
"noEmitOnError": true,
|
||||
"lib": ["esnext", "dom"],
|
||||
"strict": true,
|
||||
"esModuleInterop": false,
|
||||
"allowSyntheticDefaultImports": true,
|
||||
"experimentalDecorators": true,
|
||||
"importHelpers": true,
|
||||
"sourceMap": true,
|
||||
"inlineSources": true,
|
||||
"incremental": true
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,582 @@
|
||||
hw.ncpu: 2
|
||||
hw.byteorder: 1234
|
||||
hw.memsize: 4294967296
|
||||
hw.activecpu: 2
|
||||
hw.physicalcpu: 2
|
||||
hw.physicalcpu_max: 2
|
||||
hw.logicalcpu: 2
|
||||
hw.logicalcpu_max: 2
|
||||
hw.cputype: 7
|
||||
hw.cpusubtype: 4
|
||||
hw.cpu64bit_capable: 1
|
||||
hw.cpufamily: 1114597871
|
||||
hw.cacheconfig: 2 1 2 0 0 0 0 0 0 0
|
||||
hw.cachesize: 3221225472 32768 2097152 0 0 0 0 0 0 0
|
||||
hw.pagesize: 4096
|
||||
hw.busfrequency: 664000000
|
||||
hw.busfrequency_min: 664000000
|
||||
hw.busfrequency_max: 664000000
|
||||
hw.cpufrequency: 1830000000
|
||||
hw.cpufrequency_min: 1830000000
|
||||
hw.cpufrequency_max: 1830000000
|
||||
hw.cachelinesize: 64
|
||||
hw.l1icachesize: 32768
|
||||
hw.l1dcachesize: 32768
|
||||
hw.l2cachesize: 2097152
|
||||
hw.tbfrequency: 1000000000
|
||||
hw.packages: 1
|
||||
hw.optional.floatingpoint: 1
|
||||
hw.optional.mmx: 1
|
||||
hw.optional.sse: 1
|
||||
hw.optional.sse2: 1
|
||||
hw.optional.sse3: 1
|
||||
hw.optional.supplementalsse3: 1
|
||||
hw.optional.sse4_1: 0
|
||||
hw.optional.sse4_2: 0
|
||||
hw.optional.x86_64: 1
|
||||
hw.machine = i386
|
||||
hw.model = Macmini2,1
|
||||
hw.ncpu = 2
|
||||
hw.byteorder = 1234
|
||||
hw.physmem = 2147483648
|
||||
hw.usermem = 1849147392
|
||||
hw.pagesize = 4096
|
||||
hw.epoch = 0
|
||||
hw.vectorunit = 1
|
||||
hw.busfrequency = 664000000
|
||||
hw.cpufrequency = 1830000000
|
||||
hw.cachelinesize = 64
|
||||
hw.l1icachesize = 32768
|
||||
hw.l1dcachesize = 32768
|
||||
hw.l2settings = 1
|
||||
hw.l2cachesize = 2097152
|
||||
hw.tbfrequency = 1000000000
|
||||
hw.memsize = 4294967296
|
||||
hw.availcpu = 2
|
||||
|
||||
machdep.cpu.max_basic: 10
|
||||
machdep.cpu.max_ext: 2147483656
|
||||
machdep.cpu.vendor: GenuineIntel
|
||||
machdep.cpu.brand_string: Intel(R) Core(TM)2 CPU T5600 @ 1.83GHz
|
||||
machdep.cpu.family: 6
|
||||
machdep.cpu.model: 15
|
||||
machdep.cpu.extmodel: 0
|
||||
machdep.cpu.extfamily: 0
|
||||
machdep.cpu.stepping: 2
|
||||
machdep.cpu.feature_bits: 3219913727 58301
|
||||
machdep.cpu.extfeature_bits: 537921536 1
|
||||
machdep.cpu.signature: 1778
|
||||
machdep.cpu.brand: 0
|
||||
machdep.cpu.features: FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM SSE3 MON DSCPL VMX EST TM2 SSSE3 CX16 TPR PDCM
|
||||
machdep.cpu.extfeatures: SYSCALL XD EM64T
|
||||
machdep.cpu.logical_per_package: 2
|
||||
machdep.cpu.cores_per_package: 2
|
||||
machdep.cpu.microcode_version: 87
|
||||
machdep.cpu.mwait.linesize_min: 64
|
||||
machdep.cpu.mwait.linesize_max: 64
|
||||
machdep.cpu.mwait.extensions: 3
|
||||
machdep.cpu.mwait.sub_Cstates: 139808
|
||||
machdep.cpu.thermal.sensor: 1
|
||||
machdep.cpu.thermal.dynamic_acceleration: 0
|
||||
machdep.cpu.thermal.thresholds: 2
|
||||
machdep.cpu.thermal.ACNT_MCNT: 1
|
||||
machdep.cpu.arch_perf.version: 2
|
||||
machdep.cpu.arch_perf.number: 2
|
||||
machdep.cpu.arch_perf.width: 40
|
||||
machdep.cpu.arch_perf.events_number: 7
|
||||
machdep.cpu.arch_perf.events: 0
|
||||
machdep.cpu.arch_perf.fixed_number: 0
|
||||
machdep.cpu.arch_perf.fixed_width: 0
|
||||
machdep.cpu.cache.linesize: 64
|
||||
machdep.cpu.cache.L2_associativity: 6
|
||||
machdep.cpu.cache.size: 2048
|
||||
machdep.cpu.tlb.inst.small: 128
|
||||
machdep.cpu.tlb.inst.large: 8
|
||||
machdep.cpu.tlb.data.small: 16
|
||||
machdep.cpu.tlb.data.small_level1: 256
|
||||
machdep.cpu.tlb.data.large: 16
|
||||
machdep.cpu.tlb.data.large_level1: 32
|
||||
machdep.cpu.address_bits.physical: 36
|
||||
machdep.cpu.address_bits.virtual: 48
|
||||
machdep.cpu.core_count: 2
|
||||
machdep.cpu.thread_count: 2
|
||||
|
||||
|
||||
==BENCHMARK== mini.local Fri Feb 26 16:57:10 PST 2010
|
||||
# Darwin mini.local 10.2.0 Darwin Kernel Version 10.2.0: Tue Nov 3 10:37:10 PST 2009; root:xnu-1486.2.11~1/RELEASE_I386 i386
|
||||
# i686-apple-darwin10-g++-4.2.1 (GCC) 4.2.1 (Apple Inc. build 5646) (dot 1)
|
||||
# Copyright (C) 2007 Free Software Foundation, Inc.
|
||||
# This is free software; see the source for copying conditions. There is NO
|
||||
# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
#
|
||||
# a94585d91e66+ tip
|
||||
# obj/test/regexp_benchmark: Mach-O 64-bit executable x86_64
|
||||
|
||||
Search_Easy0_CachedPCRE/8 10000000 176 ns/op 45.40 MB/s
|
||||
Search_Easy0_CachedPCRE/16 10000000 209 ns/op 76.41 MB/s
|
||||
Search_Easy0_CachedPCRE/32 10000000 269 ns/op 118.53 MB/s
|
||||
Search_Easy0_CachedPCRE/64 5000000 398 ns/op 160.77 MB/s
|
||||
Search_Easy0_CachedPCRE/128 5000000 536 ns/op 238.69 MB/s
|
||||
Search_Easy0_CachedPCRE/256 2000000 897 ns/op 285.22 MB/s
|
||||
Search_Easy0_CachedPCRE/512 1000000 2161 ns/op 236.92 MB/s
|
||||
Search_Easy0_CachedPCRE/1K 500000 4769 ns/op 214.70 MB/s
|
||||
Search_Easy0_CachedPCRE/2K 200000 8031 ns/op 255.00 MB/s
|
||||
Search_Easy0_CachedPCRE/4K 100000 16208 ns/op 252.71 MB/s
|
||||
Search_Easy0_CachedPCRE/8K 50000 32219 ns/op 254.26 MB/s
|
||||
Search_Easy0_CachedPCRE/16K 50000 63347 ns/op 258.64 MB/s
|
||||
Search_Easy0_CachedPCRE/32K 10000 125875 ns/op 260.32 MB/s
|
||||
Search_Easy0_CachedPCRE/64K 10000 247829 ns/op 264.44 MB/s
|
||||
Search_Easy0_CachedPCRE/128K 5000 498699 ns/op 262.83 MB/s
|
||||
Search_Easy0_CachedPCRE/256K 2000 978021 ns/op 268.04 MB/s
|
||||
Search_Easy0_CachedPCRE/512K 1000 1975059 ns/op 265.45 MB/s
|
||||
Search_Easy0_CachedPCRE/1M 500 3994258 ns/op 262.52 MB/s
|
||||
Search_Easy0_CachedPCRE/2M 200 7959640 ns/op 263.47 MB/s
|
||||
Search_Easy0_CachedPCRE/4M 100 15950300 ns/op 262.96 MB/s
|
||||
Search_Easy0_CachedPCRE/8M 50 32435540 ns/op 258.62 MB/s
|
||||
Search_Easy0_CachedPCRE/16M 50 64686180 ns/op 259.36 MB/s
|
||||
Search_Easy0_CachedRE2/8 5000000 535 ns/op 14.95 MB/s
|
||||
Search_Easy0_CachedRE2/16 5000000 557 ns/op 28.70 MB/s
|
||||
Search_Easy0_CachedRE2/32 5000000 595 ns/op 53.75 MB/s
|
||||
Search_Easy0_CachedRE2/64 5000000 643 ns/op 99.50 MB/s
|
||||
Search_Easy0_CachedRE2/128 2000000 759 ns/op 168.64 MB/s
|
||||
Search_Easy0_CachedRE2/256 2000000 972 ns/op 263.30 MB/s
|
||||
Search_Easy0_CachedRE2/512 1000000 1458 ns/op 351.13 MB/s
|
||||
Search_Easy0_CachedRE2/1K 1000000 2544 ns/op 402.51 MB/s
|
||||
Search_Easy0_CachedRE2/2K 500000 4551 ns/op 449.99 MB/s
|
||||
Search_Easy0_CachedRE2/4K 200000 8677 ns/op 472.01 MB/s
|
||||
Search_Easy0_CachedRE2/8K 100000 17188 ns/op 476.59 MB/s
|
||||
Search_Easy0_CachedRE2/16K 50000 33869 ns/op 483.73 MB/s
|
||||
Search_Easy0_CachedRE2/32K 50000 67787 ns/op 483.39 MB/s
|
||||
Search_Easy0_CachedRE2/64K 10000 133362 ns/op 491.41 MB/s
|
||||
Search_Easy0_CachedRE2/128K 10000 266469 ns/op 491.88 MB/s
|
||||
Search_Easy0_CachedRE2/256K 5000 536980 ns/op 488.18 MB/s
|
||||
Search_Easy0_CachedRE2/512K 2000 1050843 ns/op 498.92 MB/s
|
||||
Search_Easy0_CachedRE2/1M 1000 2120649 ns/op 494.46 MB/s
|
||||
Search_Easy0_CachedRE2/2M 500 4273918 ns/op 490.69 MB/s
|
||||
Search_Easy0_CachedRE2/4M 200 8591285 ns/op 488.20 MB/s
|
||||
Search_Easy0_CachedRE2/8M 100 17197390 ns/op 487.78 MB/s
|
||||
Search_Easy0_CachedRE2/16M 50 34338780 ns/op 488.58 MB/s
|
||||
Search_Easy1_CachedPCRE/8 10000000 174 ns/op 45.74 MB/s
|
||||
Search_Easy1_CachedPCRE/16 10000000 206 ns/op 77.31 MB/s
|
||||
Search_Easy1_CachedPCRE/32 10000000 270 ns/op 118.43 MB/s
|
||||
Search_Easy1_CachedPCRE/64 5000000 402 ns/op 159.07 MB/s
|
||||
Search_Easy1_CachedPCRE/128 5000000 540 ns/op 236.84 MB/s
|
||||
Search_Easy1_CachedPCRE/256 2000000 909 ns/op 281.34 MB/s
|
||||
Search_Easy1_CachedPCRE/512 1000000 1852 ns/op 276.34 MB/s
|
||||
Search_Easy1_CachedPCRE/1K 500000 4318 ns/op 237.12 MB/s
|
||||
Search_Easy1_CachedPCRE/2K 200000 8346 ns/op 245.37 MB/s
|
||||
Search_Easy1_CachedPCRE/4K 100000 16214 ns/op 252.62 MB/s
|
||||
Search_Easy1_CachedPCRE/8K 50000 32438 ns/op 252.54 MB/s
|
||||
Search_Easy1_CachedPCRE/16K 50000 62914 ns/op 260.42 MB/s
|
||||
Search_Easy1_CachedPCRE/32K 10000 124792 ns/op 262.58 MB/s
|
||||
Search_Easy1_CachedPCRE/64K 10000 250941 ns/op 261.16 MB/s
|
||||
Search_Easy1_CachedPCRE/128K 5000 498405 ns/op 262.98 MB/s
|
||||
Search_Easy1_CachedPCRE/256K 2000 997305 ns/op 262.85 MB/s
|
||||
Search_Easy1_CachedPCRE/512K 1000 2023179 ns/op 259.14 MB/s
|
||||
Search_Easy1_CachedPCRE/1M 500 4005202 ns/op 261.80 MB/s
|
||||
Search_Easy1_CachedPCRE/2M 200 8116410 ns/op 258.38 MB/s
|
||||
Search_Easy1_CachedPCRE/4M 100 16145970 ns/op 259.77 MB/s
|
||||
Search_Easy1_CachedPCRE/8M 50 32471260 ns/op 258.34 MB/s
|
||||
Search_Easy1_CachedPCRE/16M 50 64734020 ns/op 259.17 MB/s
|
||||
Search_Easy1_CachedRE2/8 5000000 543 ns/op 14.72 MB/s
|
||||
Search_Easy1_CachedRE2/16 5000000 570 ns/op 28.07 MB/s
|
||||
Search_Easy1_CachedRE2/32 5000000 605 ns/op 52.81 MB/s
|
||||
Search_Easy1_CachedRE2/64 5000000 643 ns/op 99.39 MB/s
|
||||
Search_Easy1_CachedRE2/128 2000000 764 ns/op 167.45 MB/s
|
||||
Search_Easy1_CachedRE2/256 2000000 970 ns/op 263.85 MB/s
|
||||
Search_Easy1_CachedRE2/512 1000000 1455 ns/op 351.75 MB/s
|
||||
Search_Easy1_CachedRE2/1K 1000000 2506 ns/op 408.48 MB/s
|
||||
Search_Easy1_CachedRE2/2K 500000 4571 ns/op 447.97 MB/s
|
||||
Search_Easy1_CachedRE2/4K 200000 8812 ns/op 464.81 MB/s
|
||||
Search_Easy1_CachedRE2/8K 100000 17079 ns/op 479.65 MB/s
|
||||
Search_Easy1_CachedRE2/16K 50000 33802 ns/op 484.70 MB/s
|
||||
Search_Easy1_CachedRE2/32K 50000 67171 ns/op 487.83 MB/s
|
||||
Search_Easy1_CachedRE2/64K 10000 131505 ns/op 498.35 MB/s
|
||||
Search_Easy1_CachedRE2/128K 10000 263228 ns/op 497.94 MB/s
|
||||
Search_Easy1_CachedRE2/256K 5000 528135 ns/op 496.36 MB/s
|
||||
Search_Easy1_CachedRE2/512K 2000 1052768 ns/op 498.01 MB/s
|
||||
Search_Easy1_CachedRE2/1M 1000 2112714 ns/op 496.32 MB/s
|
||||
Search_Easy1_CachedRE2/2M 500 4289478 ns/op 488.91 MB/s
|
||||
Search_Easy1_CachedRE2/4M 200 8519430 ns/op 492.32 MB/s
|
||||
Search_Easy1_CachedRE2/8M 100 17002860 ns/op 493.36 MB/s
|
||||
Search_Easy1_CachedRE2/16M 50 34341100 ns/op 488.55 MB/s
|
||||
Search_Medium_CachedPCRE/8 10000000 175 ns/op 45.48 MB/s
|
||||
Search_Medium_CachedPCRE/16 10000000 206 ns/op 77.31 MB/s
|
||||
Search_Medium_CachedPCRE/32 10000000 273 ns/op 117.10 MB/s
|
||||
Search_Medium_CachedPCRE/64 5000000 427 ns/op 149.60 MB/s
|
||||
Search_Medium_CachedPCRE/128 200000 9382 ns/op 13.64 MB/s
|
||||
Search_Medium_CachedPCRE/256 100000 15339 ns/op 16.69 MB/s
|
||||
Search_Medium_CachedPCRE/512 50000 35837 ns/op 14.29 MB/s
|
||||
Search_Medium_CachedPCRE/1K 50000 71109 ns/op 14.40 MB/s
|
||||
Search_Medium_CachedPCRE/2K 10000 111371 ns/op 18.39 MB/s
|
||||
Search_Medium_CachedPCRE/4K 10000 264964 ns/op 15.46 MB/s
|
||||
Search_Medium_CachedPCRE/8K 5000 554964 ns/op 14.76 MB/s
|
||||
Search_Medium_CachedPCRE/16K 2000 1122116 ns/op 14.60 MB/s
|
||||
Search_Medium_CachedPCRE/32K 1000 2305129 ns/op 14.22 MB/s
|
||||
Search_Medium_CachedPCRE/64K 500 4401888 ns/op 14.89 MB/s
|
||||
Search_Medium_CachedPCRE/128K 200 8591800 ns/op 15.26 MB/s
|
||||
Search_Medium_CachedPCRE/256K 100 17534580 ns/op 14.95 MB/s
|
||||
Search_Medium_CachedRE2/8 5000000 531 ns/op 15.06 MB/s
|
||||
Search_Medium_CachedRE2/16 5000000 579 ns/op 27.60 MB/s
|
||||
Search_Medium_CachedRE2/32 5000000 666 ns/op 47.99 MB/s
|
||||
Search_Medium_CachedRE2/64 2000000 817 ns/op 78.31 MB/s
|
||||
Search_Medium_CachedRE2/128 1000000 1174 ns/op 108.94 MB/s
|
||||
Search_Medium_CachedRE2/256 1000000 1824 ns/op 140.30 MB/s
|
||||
Search_Medium_CachedRE2/512 500000 3097 ns/op 165.29 MB/s
|
||||
Search_Medium_CachedRE2/1K 500000 6101 ns/op 167.84 MB/s
|
||||
Search_Medium_CachedRE2/2K 200000 12024 ns/op 170.32 MB/s
|
||||
Search_Medium_CachedRE2/4K 100000 21483 ns/op 190.66 MB/s
|
||||
Search_Medium_CachedRE2/8K 50000 41321 ns/op 198.25 MB/s
|
||||
Search_Medium_CachedRE2/16K 20000 82227 ns/op 199.25 MB/s
|
||||
Search_Medium_CachedRE2/32K 10000 166314 ns/op 197.02 MB/s
|
||||
Search_Medium_CachedRE2/64K 5000 334190 ns/op 196.10 MB/s
|
||||
Search_Medium_CachedRE2/128K 5000 672222 ns/op 194.98 MB/s
|
||||
Search_Medium_CachedRE2/256K 2000 1335691 ns/op 196.26 MB/s
|
||||
Search_Medium_CachedRE2/512K 1000 2650973 ns/op 197.77 MB/s
|
||||
Search_Medium_CachedRE2/1M 500 5401168 ns/op 194.14 MB/s
|
||||
Search_Medium_CachedRE2/2M 100 10724160 ns/op 195.55 MB/s
|
||||
Search_Medium_CachedRE2/4M 100 21647840 ns/op 193.75 MB/s
|
||||
Search_Medium_CachedRE2/8M 50 43369000 ns/op 193.42 MB/s
|
||||
Search_Medium_CachedRE2/16M 20 85095750 ns/op 197.16 MB/s
|
||||
Search_Hard_CachedPCRE/8 10000000 178 ns/op 44.77 MB/s
|
||||
Search_Hard_CachedPCRE/16 10000000 211 ns/op 75.54 MB/s
|
||||
Search_Hard_CachedPCRE/32 10000000 274 ns/op 116.75 MB/s
|
||||
Search_Hard_CachedPCRE/64 5000000 401 ns/op 159.58 MB/s
|
||||
Search_Hard_CachedPCRE/128 5000 331833 ns/op 0.39 MB/s
|
||||
Search_Hard_CachedPCRE/256 2000 1299658 ns/op 0.20 MB/s
|
||||
Search_Hard_CachedPCRE/512 500 5361070 ns/op 0.10 MB/s
|
||||
Search_Hard_CachedPCRE/1K 100 20744900 ns/op 0.05 MB/s
|
||||
Search_Hard_CachedPCRE/2K 20 78382950 ns/op 0.03 MB/s
|
||||
Search_Hard_CachedPCRE/4K 5 335826800 ns/op 0.01 MB/s
|
||||
Search_Hard_CachedRE2/8 5000000 550 ns/op 14.53 MB/s
|
||||
Search_Hard_CachedRE2/16 5000000 600 ns/op 26.66 MB/s
|
||||
Search_Hard_CachedRE2/32 5000000 683 ns/op 46.80 MB/s
|
||||
Search_Hard_CachedRE2/64 2000000 834 ns/op 76.69 MB/s
|
||||
Search_Hard_CachedRE2/128 1000000 1168 ns/op 109.57 MB/s
|
||||
Search_Hard_CachedRE2/256 1000000 1833 ns/op 139.65 MB/s
|
||||
Search_Hard_CachedRE2/512 500000 3069 ns/op 166.81 MB/s
|
||||
Search_Hard_CachedRE2/1K 500000 5780 ns/op 177.14 MB/s
|
||||
Search_Hard_CachedRE2/2K 200000 11060 ns/op 185.17 MB/s
|
||||
Search_Hard_CachedRE2/4K 100000 21511 ns/op 190.41 MB/s
|
||||
Search_Hard_CachedRE2/8K 50000 41962 ns/op 195.22 MB/s
|
||||
Search_Hard_CachedRE2/16K 20000 82460 ns/op 198.69 MB/s
|
||||
Search_Hard_CachedRE2/32K 10000 164209 ns/op 199.55 MB/s
|
||||
Search_Hard_CachedRE2/64K 5000 326354 ns/op 200.81 MB/s
|
||||
Search_Hard_CachedRE2/128K 5000 659142 ns/op 198.85 MB/s
|
||||
Search_Hard_CachedRE2/256K 2000 1333642 ns/op 196.56 MB/s
|
||||
Search_Hard_CachedRE2/512K 1000 2687422 ns/op 195.09 MB/s
|
||||
Search_Hard_CachedRE2/1M 500 5351592 ns/op 195.94 MB/s
|
||||
Search_Hard_CachedRE2/2M 100 10581690 ns/op 198.19 MB/s
|
||||
Search_Hard_CachedRE2/4M 100 21324320 ns/op 196.69 MB/s
|
||||
Search_Hard_CachedRE2/8M 50 41892520 ns/op 200.24 MB/s
|
||||
Search_Hard_CachedRE2/16M 20 85475700 ns/op 196.28 MB/s
|
||||
Search_Parens_CachedPCRE/8 10000000 298 ns/op 26.80 MB/s
|
||||
Search_Parens_CachedRE2/8 5000000 562 ns/op 14.21 MB/s
|
||||
Search_Parens_CachedRE2/16 5000000 598 ns/op 26.71 MB/s
|
||||
Search_Parens_CachedRE2/32 5000000 676 ns/op 47.27 MB/s
|
||||
Search_Parens_CachedRE2/64 2000000 828 ns/op 77.21 MB/s
|
||||
Search_Parens_CachedRE2/128 1000000 1155 ns/op 110.73 MB/s
|
||||
Search_Parens_CachedRE2/256 1000000 1788 ns/op 143.13 MB/s
|
||||
Search_Parens_CachedRE2/512 500000 3064 ns/op 167.09 MB/s
|
||||
Search_Parens_CachedRE2/1K 500000 5698 ns/op 179.69 MB/s
|
||||
Search_Parens_CachedRE2/2K 200000 10961 ns/op 186.84 MB/s
|
||||
Search_Parens_CachedRE2/4K 100000 21527 ns/op 190.27 MB/s
|
||||
Search_Parens_CachedRE2/8K 50000 41923 ns/op 195.40 MB/s
|
||||
Search_Parens_CachedRE2/16K 20000 85505 ns/op 191.61 MB/s
|
||||
Search_Parens_CachedRE2/32K 10000 164437 ns/op 199.27 MB/s
|
||||
Search_Parens_CachedRE2/64K 5000 332654 ns/op 197.01 MB/s
|
||||
Search_Parens_CachedRE2/128K 5000 677745 ns/op 193.39 MB/s
|
||||
Search_Parens_CachedRE2/256K 2000 1331012 ns/op 196.95 MB/s
|
||||
Search_Parens_CachedRE2/512K 1000 2692594 ns/op 194.71 MB/s
|
||||
Search_Parens_CachedRE2/1M 500 5355880 ns/op 195.78 MB/s
|
||||
Search_Parens_CachedRE2/2M 100 10822340 ns/op 193.78 MB/s
|
||||
Search_Parens_CachedRE2/4M 100 21464430 ns/op 195.41 MB/s
|
||||
Search_Parens_CachedRE2/8M 50 42875940 ns/op 195.65 MB/s
|
||||
Search_Parens_CachedRE2/16M 20 84654300 ns/op 198.19 MB/s
|
||||
Search_BigFixed_CachedPCRE/8 5000000 360 ns/op 22.21 MB/s
|
||||
Search_BigFixed_CachedPCRE/16 5000000 442 ns/op 36.15 MB/s
|
||||
Search_BigFixed_CachedPCRE/32 5000000 606 ns/op 52.73 MB/s
|
||||
Search_BigFixed_CachedPCRE/64 2000000 935 ns/op 68.39 MB/s
|
||||
Search_BigFixed_CachedPCRE/128 1000000 1525 ns/op 83.91 MB/s
|
||||
Search_BigFixed_CachedPCRE/256 1000000 2718 ns/op 94.18 MB/s
|
||||
Search_BigFixed_CachedPCRE/512 500000 5020 ns/op 101.98 MB/s
|
||||
Search_BigFixed_CachedPCRE/1K 200000 9761 ns/op 104.90 MB/s
|
||||
Search_BigFixed_CachedPCRE/2K 100000 19275 ns/op 106.25 MB/s
|
||||
Search_BigFixed_CachedPCRE/4K 50000 38488 ns/op 106.42 MB/s
|
||||
Search_BigFixed_CachedPCRE/8K 20000 76229 ns/op 107.46 MB/s
|
||||
Search_BigFixed_CachedPCRE/16K 10000 155350 ns/op 105.46 MB/s
|
||||
Search_BigFixed_CachedPCRE/32K 5000 309242 ns/op 105.96 MB/s
|
||||
Search_BigFixed_CachedRE2/8 10000000 194 ns/op 41.03 MB/s
|
||||
Search_BigFixed_CachedRE2/16 5000000 589 ns/op 27.15 MB/s
|
||||
Search_BigFixed_CachedRE2/32 5000000 655 ns/op 48.83 MB/s
|
||||
Search_BigFixed_CachedRE2/64 5000000 715 ns/op 89.46 MB/s
|
||||
Search_BigFixed_CachedRE2/128 2000000 882 ns/op 145.09 MB/s
|
||||
Search_BigFixed_CachedRE2/256 1000000 1293 ns/op 197.97 MB/s
|
||||
Search_BigFixed_CachedRE2/512 1000000 1924 ns/op 266.06 MB/s
|
||||
Search_BigFixed_CachedRE2/1K 500000 3294 ns/op 310.79 MB/s
|
||||
Search_BigFixed_CachedRE2/2K 500000 6057 ns/op 338.10 MB/s
|
||||
Search_BigFixed_CachedRE2/4K 200000 11475 ns/op 356.93 MB/s
|
||||
Search_BigFixed_CachedRE2/8K 100000 22395 ns/op 365.79 MB/s
|
||||
Search_BigFixed_CachedRE2/16K 50000 44333 ns/op 369.56 MB/s
|
||||
Search_BigFixed_CachedRE2/32K 20000 88061 ns/op 372.11 MB/s
|
||||
Search_BigFixed_CachedRE2/64K 10000 173649 ns/op 377.40 MB/s
|
||||
Search_BigFixed_CachedRE2/128K 5000 347251 ns/op 377.46 MB/s
|
||||
Search_BigFixed_CachedRE2/256K 2000 702561 ns/op 373.13 MB/s
|
||||
Search_BigFixed_CachedRE2/512K 1000 1408041 ns/op 372.35 MB/s
|
||||
Search_BigFixed_CachedRE2/1M 500 3003070 ns/op 349.17 MB/s
|
||||
Search_Success_PCRE/8 500000 3891 ns/op 2.06 MB/s
|
||||
Search_Success_PCRE/16 500000 3865 ns/op 4.14 MB/s
|
||||
Search_Success_PCRE/32 500000 3861 ns/op 8.29 MB/s
|
||||
Search_Success_PCRE/64 500000 3921 ns/op 16.32 MB/s
|
||||
Search_Success_PCRE/128 500000 4677 ns/op 27.37 MB/s
|
||||
Search_Success_PCRE/256 500000 5362 ns/op 47.73 MB/s
|
||||
Search_Success_PCRE/512 500000 7125 ns/op 71.85 MB/s
|
||||
Search_Success_PCRE/1K 200000 10643 ns/op 96.21 MB/s
|
||||
Search_Success_PCRE/2K 100000 17620 ns/op 116.23 MB/s
|
||||
Search_Success_PCRE/4K 50000 31657 ns/op 129.39 MB/s
|
||||
Search_Success_PCRE/8K 50000 59290 ns/op 138.17 MB/s
|
||||
Search_Success_PCRE/16K 10000 115346 ns/op 142.04 MB/s
|
||||
Search_Success_PCRE/32K 10000 225258 ns/op 145.47 MB/s
|
||||
Search_Success_PCRE/64K 5000 452994 ns/op 144.67 MB/s
|
||||
Search_Success_PCRE/128K 2000 904745 ns/op 144.87 MB/s
|
||||
Search_Success_PCRE/256K 1000 1786683 ns/op 146.72 MB/s
|
||||
Search_Success_PCRE/512K 500 3600316 ns/op 145.62 MB/s
|
||||
Search_Success_PCRE/1M 200 7413055 ns/op 141.45 MB/s
|
||||
Search_Success_PCRE/2M 100 15261930 ns/op 137.41 MB/s
|
||||
Search_Success_PCRE/4M 50 32827960 ns/op 127.77 MB/s
|
||||
Search_Success_PCRE/8M 20 73886450 ns/op 113.53 MB/s
|
||||
Search_Success_PCRE/16M 5 247881200 ns/op 67.68 MB/s
|
||||
Search_Success_RE2/8 100000 18948 ns/op 0.42 MB/s
|
||||
Search_Success_RE2/16 50000 40076 ns/op 0.40 MB/s
|
||||
Search_Success_RE2/32 50000 40543 ns/op 0.79 MB/s
|
||||
Search_Success_RE2/64 50000 40520 ns/op 1.58 MB/s
|
||||
Search_Success_RE2/128 50000 41222 ns/op 3.11 MB/s
|
||||
Search_Success_RE2/256 50000 41361 ns/op 6.19 MB/s
|
||||
Search_Success_RE2/512 50000 42418 ns/op 12.07 MB/s
|
||||
Search_Success_RE2/1K 50000 45239 ns/op 22.64 MB/s
|
||||
Search_Success_RE2/2K 50000 50568 ns/op 40.50 MB/s
|
||||
Search_Success_RE2/4K 50000 60722 ns/op 67.45 MB/s
|
||||
Search_Success_RE2/8K 20000 82046 ns/op 99.85 MB/s
|
||||
Search_Success_RE2/16K 10000 125412 ns/op 130.64 MB/s
|
||||
Search_Success_RE2/32K 10000 211805 ns/op 154.71 MB/s
|
||||
Search_Success_RE2/64K 5000 373132 ns/op 175.64 MB/s
|
||||
Search_Success_RE2/128K 2000 710166 ns/op 184.57 MB/s
|
||||
Search_Success_RE2/256K 2000 1392231 ns/op 188.29 MB/s
|
||||
Search_Success_RE2/512K 1000 2763051 ns/op 189.75 MB/s
|
||||
Search_Success_RE2/1M 500 5547628 ns/op 189.01 MB/s
|
||||
Search_Success_RE2/2M 100 11709090 ns/op 179.10 MB/s
|
||||
Search_Success_RE2/4M 50 25220160 ns/op 166.31 MB/s
|
||||
Search_Success_RE2/8M 20 59411600 ns/op 141.19 MB/s
|
||||
Search_Success_RE2/16M 5 219468600 ns/op 76.44 MB/s
|
||||
Search_Success_CachedPCRE/8 5000000 328 ns/op 24.35 MB/s
|
||||
Search_Success_CachedPCRE/16 5000000 389 ns/op 41.06 MB/s
|
||||
Search_Success_CachedPCRE/32 5000000 507 ns/op 63.11 MB/s
|
||||
Search_Success_CachedPCRE/64 2000000 754 ns/op 84.80 MB/s
|
||||
Search_Success_CachedPCRE/128 1000000 1164 ns/op 109.89 MB/s
|
||||
Search_Success_CachedPCRE/256 1000000 2051 ns/op 124.81 MB/s
|
||||
Search_Success_CachedPCRE/512 500000 3831 ns/op 133.64 MB/s
|
||||
Search_Success_CachedPCRE/1K 500000 7280 ns/op 140.66 MB/s
|
||||
Search_Success_CachedPCRE/2K 200000 14254 ns/op 143.67 MB/s
|
||||
Search_Success_CachedPCRE/4K 100000 28223 ns/op 145.13 MB/s
|
||||
Search_Success_CachedPCRE/8K 50000 55445 ns/op 147.75 MB/s
|
||||
Search_Success_CachedPCRE/16K 10000 112739 ns/op 145.33 MB/s
|
||||
Search_Success_CachedPCRE/32K 10000 219943 ns/op 148.98 MB/s
|
||||
Search_Success_CachedPCRE/64K 5000 440884 ns/op 148.65 MB/s
|
||||
Search_Success_CachedPCRE/128K 2000 898950 ns/op 145.81 MB/s
|
||||
Search_Success_CachedPCRE/256K 1000 1775905 ns/op 147.61 MB/s
|
||||
Search_Success_CachedPCRE/512K 500 3579178 ns/op 146.48 MB/s
|
||||
Search_Success_CachedPCRE/1M 200 7278075 ns/op 144.07 MB/s
|
||||
Search_Success_CachedPCRE/2M 100 14954670 ns/op 140.23 MB/s
|
||||
Search_Success_CachedPCRE/4M 50 31865060 ns/op 131.63 MB/s
|
||||
Search_Success_CachedPCRE/8M 20 73977900 ns/op 113.39 MB/s
|
||||
Search_Success_CachedPCRE/16M 5 250587400 ns/op 66.95 MB/s
|
||||
Search_Success_CachedRE2/8 10000000 206 ns/op 38.80 MB/s
|
||||
Search_Success_CachedRE2/16 5000000 598 ns/op 26.75 MB/s
|
||||
Search_Success_CachedRE2/32 5000000 675 ns/op 47.39 MB/s
|
||||
Search_Success_CachedRE2/64 2000000 847 ns/op 75.52 MB/s
|
||||
Search_Success_CachedRE2/128 1000000 1211 ns/op 105.65 MB/s
|
||||
Search_Success_CachedRE2/256 1000000 1886 ns/op 135.73 MB/s
|
||||
Search_Success_CachedRE2/512 500000 3123 ns/op 163.90 MB/s
|
||||
Search_Success_CachedRE2/1K 500000 5754 ns/op 177.94 MB/s
|
||||
Search_Success_CachedRE2/2K 200000 10929 ns/op 187.38 MB/s
|
||||
Search_Success_CachedRE2/4K 100000 20887 ns/op 196.10 MB/s
|
||||
Search_Success_CachedRE2/8K 50000 41295 ns/op 198.37 MB/s
|
||||
Search_Success_CachedRE2/16K 20000 82338 ns/op 198.98 MB/s
|
||||
Search_Success_CachedRE2/32K 10000 168893 ns/op 194.02 MB/s
|
||||
Search_Success_CachedRE2/64K 5000 337449 ns/op 194.21 MB/s
|
||||
Search_Success_CachedRE2/128K 5000 670247 ns/op 195.56 MB/s
|
||||
Search_Success_CachedRE2/256K 2000 1342666 ns/op 195.24 MB/s
|
||||
Search_Success_CachedRE2/512K 1000 2711677 ns/op 193.34 MB/s
|
||||
Search_Success_CachedRE2/1M 500 5403052 ns/op 194.07 MB/s
|
||||
Search_Success_CachedRE2/2M 100 11697250 ns/op 179.29 MB/s
|
||||
Search_Success_CachedRE2/4M 50 24796680 ns/op 169.15 MB/s
|
||||
Search_Success_CachedRE2/8M 20 59587450 ns/op 140.78 MB/s
|
||||
Search_Success_CachedRE2/16M 5 225415400 ns/op 74.43 MB/s
|
||||
Search_Success1_PCRE/8 500000 4063 ns/op 1.97 MB/s
|
||||
Search_Success1_PCRE/16 500000 4104 ns/op 3.90 MB/s
|
||||
Search_Success1_PCRE/32 500000 4162 ns/op 7.69 MB/s
|
||||
Search_Success1_PCRE/64 500000 4284 ns/op 14.94 MB/s
|
||||
Search_Success1_PCRE/128 500000 4857 ns/op 26.35 MB/s
|
||||
Search_Success1_PCRE/256 500000 5507 ns/op 46.48 MB/s
|
||||
Search_Success1_PCRE/512 500000 7203 ns/op 71.08 MB/s
|
||||
Search_Success1_PCRE/1K 200000 10470 ns/op 97.80 MB/s
|
||||
Search_Success1_PCRE/2K 100000 17455 ns/op 117.33 MB/s
|
||||
Search_Success1_PCRE/4K 50000 31564 ns/op 129.77 MB/s
|
||||
Search_Success1_PCRE/8K 50000 59112 ns/op 138.58 MB/s
|
||||
Search_Success1_PCRE/16K 10000 115903 ns/op 141.36 MB/s
|
||||
Search_Success1_PCRE/32K 10000 223311 ns/op 146.74 MB/s
|
||||
Search_Success1_PCRE/64K 5000 447509 ns/op 146.45 MB/s
|
||||
Search_Success1_PCRE/128K 2000 874543 ns/op 149.87 MB/s
|
||||
Search_Success1_PCRE/256K 1000 1836342 ns/op 142.75 MB/s
|
||||
Search_Success1_PCRE/512K 500 3636250 ns/op 144.18 MB/s
|
||||
Search_Success1_PCRE/1M 200 7256345 ns/op 144.50 MB/s
|
||||
Search_Success1_PCRE/2M 100 15093450 ns/op 138.94 MB/s
|
||||
Search_Success1_PCRE/4M 50 32167920 ns/op 130.39 MB/s
|
||||
Search_Success1_PCRE/8M 20 74735800 ns/op 112.24 MB/s
|
||||
Search_Success1_PCRE/16M 5 252818600 ns/op 66.36 MB/s
|
||||
Search_Success1_RE2/8 50000 51778 ns/op 0.15 MB/s
|
||||
Search_Success1_RE2/16 50000 50754 ns/op 0.32 MB/s
|
||||
Search_Success1_RE2/32 50000 51127 ns/op 0.63 MB/s
|
||||
Search_Success1_RE2/64 50000 51305 ns/op 1.25 MB/s
|
||||
Search_Success1_RE2/128 50000 51580 ns/op 2.48 MB/s
|
||||
Search_Success1_RE2/256 50000 52019 ns/op 4.92 MB/s
|
||||
Search_Success1_RE2/512 50000 53145 ns/op 9.63 MB/s
|
||||
Search_Success1_RE2/1K 50000 55871 ns/op 18.33 MB/s
|
||||
Search_Success1_RE2/2K 50000 61477 ns/op 33.31 MB/s
|
||||
Search_Success1_RE2/4K 50000 71875 ns/op 56.99 MB/s
|
||||
Search_Success1_RE2/8K 20000 94822 ns/op 86.39 MB/s
|
||||
Search_Success1_RE2/16K 10000 137021 ns/op 119.57 MB/s
|
||||
Search_Success1_RE2/32K 10000 220596 ns/op 148.54 MB/s
|
||||
Search_Success1_RE2/64K 5000 377808 ns/op 173.46 MB/s
|
||||
Search_Success1_RE2/128K 5000 707546 ns/op 185.25 MB/s
|
||||
Search_Success1_RE2/256K 2000 1367308 ns/op 191.72 MB/s
|
||||
Search_Success1_RE2/512K 1000 2729291 ns/op 192.10 MB/s
|
||||
Search_Success1_RE2/1M 500 5439634 ns/op 192.77 MB/s
|
||||
Search_Success1_RE2/2M 100 11626860 ns/op 180.37 MB/s
|
||||
Search_Success1_RE2/4M 50 24603160 ns/op 170.48 MB/s
|
||||
Search_Success1_RE2/8M 20 59001300 ns/op 142.18 MB/s
|
||||
Search_Success1_RE2/16M 5 219520200 ns/op 76.43 MB/s
|
||||
Search_Success1_Cached_PCRE/8 5000000 373 ns/op 21.41 MB/s
|
||||
Search_Success1_Cached_PCRE/16 5000000 437 ns/op 36.61 MB/s
|
||||
Search_Success1_Cached_PCRE/32 5000000 543 ns/op 58.84 MB/s
|
||||
Search_Success1_Cached_PCRE/64 2000000 784 ns/op 81.60 MB/s
|
||||
Search_Success1_Cached_PCRE/128 1000000 1193 ns/op 107.29 MB/s
|
||||
Search_Success1_Cached_PCRE/256 1000000 2044 ns/op 125.23 MB/s
|
||||
Search_Success1_Cached_PCRE/512 500000 3734 ns/op 137.10 MB/s
|
||||
Search_Success1_Cached_PCRE/1K 500000 7121 ns/op 143.78 MB/s
|
||||
Search_Success1_Cached_PCRE/2K 200000 13767 ns/op 148.76 MB/s
|
||||
Search_Success1_Cached_PCRE/4K 100000 27176 ns/op 150.72 MB/s
|
||||
Search_Success1_Cached_PCRE/8K 50000 54155 ns/op 151.27 MB/s
|
||||
Search_Success1_Cached_PCRE/16K 10000 109309 ns/op 149.89 MB/s
|
||||
Search_Success1_Cached_PCRE/32K 10000 215890 ns/op 151.78 MB/s
|
||||
Search_Success1_Cached_PCRE/64K 5000 432550 ns/op 151.51 MB/s
|
||||
Search_Success1_Cached_PCRE/128K 2000 870568 ns/op 150.56 MB/s
|
||||
Search_Success1_Cached_PCRE/256K 1000 1756215 ns/op 149.27 MB/s
|
||||
Search_Success1_Cached_PCRE/512K 500 3671994 ns/op 142.78 MB/s
|
||||
Search_Success1_Cached_PCRE/1M 200 7134810 ns/op 146.97 MB/s
|
||||
Search_Success1_Cached_PCRE/2M 100 14672580 ns/op 142.93 MB/s
|
||||
Search_Success1_Cached_PCRE/4M 50 31146040 ns/op 134.67 MB/s
|
||||
Search_Success1_Cached_PCRE/8M 20 72224500 ns/op 116.15 MB/s
|
||||
Search_Success1_Cached_PCRE/16M 5 243683800 ns/op 68.85 MB/s
|
||||
Search_Success1_Cached_RE2/8 5000000 544 ns/op 14.69 MB/s
|
||||
Search_Success1_Cached_RE2/16 5000000 583 ns/op 27.43 MB/s
|
||||
Search_Success1_Cached_RE2/32 5000000 661 ns/op 48.37 MB/s
|
||||
Search_Success1_Cached_RE2/64 2000000 818 ns/op 78.23 MB/s
|
||||
Search_Success1_Cached_RE2/128 1000000 1148 ns/op 111.40 MB/s
|
||||
Search_Success1_Cached_RE2/256 1000000 1778 ns/op 143.95 MB/s
|
||||
Search_Success1_Cached_RE2/512 500000 3036 ns/op 168.64 MB/s
|
||||
Search_Success1_Cached_RE2/1K 500000 5549 ns/op 184.53 MB/s
|
||||
Search_Success1_Cached_RE2/2K 200000 10580 ns/op 193.56 MB/s
|
||||
Search_Success1_Cached_RE2/4K 100000 20645 ns/op 198.39 MB/s
|
||||
Search_Success1_Cached_RE2/8K 50000 40775 ns/op 200.90 MB/s
|
||||
Search_Success1_Cached_RE2/16K 20000 81030 ns/op 202.20 MB/s
|
||||
Search_Success1_Cached_RE2/32K 10000 162338 ns/op 201.85 MB/s
|
||||
Search_Success1_Cached_RE2/64K 5000 324387 ns/op 202.03 MB/s
|
||||
Search_Success1_Cached_RE2/128K 5000 648468 ns/op 202.13 MB/s
|
||||
Search_Success1_Cached_RE2/256K 2000 1299439 ns/op 201.74 MB/s
|
||||
Search_Success1_Cached_RE2/512K 1000 2608958 ns/op 200.96 MB/s
|
||||
Search_Success1_Cached_RE2/1M 500 5263964 ns/op 199.20 MB/s
|
||||
Search_Success1_Cached_RE2/2M 200 10793175 ns/op 194.30 MB/s
|
||||
Search_Success1_Cached_RE2/4M 50 24138120 ns/op 173.76 MB/s
|
||||
Search_Success1_Cached_RE2/8M 20 58223300 ns/op 144.08 MB/s
|
||||
Search_Success1_Cached_RE2/16M 5 215741400 ns/op 77.77 MB/s
|
||||
Search_Digits_PCRE 500000 7534 ns/op
|
||||
Search_Digits_RE2 50000 44162 ns/op
|
||||
Parse_Digits_PCRE 200000 7664 ns/op
|
||||
Parse_Digits_RE2 100000 22595 ns/op
|
||||
Parse_CachedDigits_PCRE 5000000 721 ns/op
|
||||
Parse_CachedDigits_RE2 5000000 413 ns/op
|
||||
Parse_DigitDs_PCRE 500000 7095 ns/op
|
||||
Parse_DigitDs_RE2 100000 22259 ns/op
|
||||
Parse_CachedDigitDs_PCRE 5000000 704 ns/op
|
||||
Parse_CachedDigitDs_RE2 5000000 415 ns/op
|
||||
Parse_Split_PCRE 500000 5540 ns/op
|
||||
Parse_Split_RE2 100000 23817 ns/op
|
||||
Parse_CachedSplit_PCRE 5000000 490 ns/op
|
||||
Parse_CachedSplit_RE2 10000000 251 ns/op
|
||||
Parse_SplitHard_PCRE 500000 5410 ns/op
|
||||
Parse_SplitHard_RE2 100000 28518 ns/op
|
||||
Parse_CachedSplitHard_PCRE 5000000 488 ns/op
|
||||
Parse_CachedSplitHard_RE2 1000000 2489 ns/op
|
||||
Parse_CachedSplitBig1_PCRE 500 7171752 ns/op
|
||||
Parse_CachedSplitBig1_RE2 2000 990722 ns/op
|
||||
Parse_CachedSplitBig2_PCRE 5000 658331 ns/op
|
||||
Parse_CachedSplitBig2_RE2 20 81205250 ns/op
|
||||
BM_PCRE_Compile 500000 6443 ns/op
|
||||
BM_RE2_Compile 100000 24103 ns/op
|
||||
SearchPhone_CachedPCRE/8 1000000 2010 ns/op 3.98 MB/s
|
||||
SearchPhone_CachedPCRE/16 500000 3286 ns/op 4.87 MB/s
|
||||
SearchPhone_CachedPCRE/32 500000 5953 ns/op 5.37 MB/s
|
||||
SearchPhone_CachedPCRE/64 200000 11181 ns/op 5.72 MB/s
|
||||
SearchPhone_CachedPCRE/128 100000 21634 ns/op 5.92 MB/s
|
||||
SearchPhone_CachedPCRE/256 50000 42315 ns/op 6.05 MB/s
|
||||
SearchPhone_CachedPCRE/512 20000 83969 ns/op 6.10 MB/s
|
||||
SearchPhone_CachedPCRE/1K 10000 166005 ns/op 6.17 MB/s
|
||||
SearchPhone_CachedPCRE/2K 5000 327433 ns/op 6.25 MB/s
|
||||
SearchPhone_CachedPCRE/4K 5000 654794 ns/op 6.26 MB/s
|
||||
SearchPhone_CachedPCRE/8K 2000 1302747 ns/op 6.29 MB/s
|
||||
SearchPhone_CachedPCRE/16K 1000 2601137 ns/op 6.30 MB/s
|
||||
SearchPhone_CachedPCRE/32K 500 5170166 ns/op 6.34 MB/s
|
||||
SearchPhone_CachedPCRE/64K 100 10378910 ns/op 6.31 MB/s
|
||||
SearchPhone_CachedPCRE/128K 100 20783360 ns/op 6.31 MB/s
|
||||
SearchPhone_CachedPCRE/256K 50 41632940 ns/op 6.30 MB/s
|
||||
SearchPhone_CachedPCRE/512K 20 83663300 ns/op 6.27 MB/s
|
||||
SearchPhone_CachedPCRE/1M 10 167093400 ns/op 6.28 MB/s
|
||||
SearchPhone_CachedPCRE/2M 5 335078800 ns/op 6.26 MB/s
|
||||
SearchPhone_CachedPCRE/4M 5 673405400 ns/op 6.23 MB/s
|
||||
SearchPhone_CachedPCRE/8M 1 1335761000 ns/op 6.28 MB/s
|
||||
SearchPhone_CachedPCRE/16M 1 2682908000 ns/op 6.25 MB/s
|
||||
SearchPhone_CachedRE2/8 1000000 1470 ns/op 5.44 MB/s
|
||||
SearchPhone_CachedRE2/16 1000000 1496 ns/op 10.69 MB/s
|
||||
SearchPhone_CachedRE2/32 1000000 1570 ns/op 20.38 MB/s
|
||||
SearchPhone_CachedRE2/64 1000000 1770 ns/op 36.15 MB/s
|
||||
SearchPhone_CachedRE2/128 1000000 2082 ns/op 61.46 MB/s
|
||||
SearchPhone_CachedRE2/256 1000000 2701 ns/op 94.78 MB/s
|
||||
SearchPhone_CachedRE2/512 500000 3963 ns/op 129.19 MB/s
|
||||
SearchPhone_CachedRE2/1K 500000 6487 ns/op 157.85 MB/s
|
||||
SearchPhone_CachedRE2/2K 200000 11527 ns/op 177.67 MB/s
|
||||
SearchPhone_CachedRE2/4K 100000 21579 ns/op 189.81 MB/s
|
||||
SearchPhone_CachedRE2/8K 50000 41804 ns/op 195.96 MB/s
|
||||
SearchPhone_CachedRE2/16K 20000 82228 ns/op 199.25 MB/s
|
||||
SearchPhone_CachedRE2/32K 10000 163444 ns/op 200.48 MB/s
|
||||
SearchPhone_CachedRE2/64K 5000 325307 ns/op 201.46 MB/s
|
||||
SearchPhone_CachedRE2/128K 5000 648559 ns/op 202.10 MB/s
|
||||
SearchPhone_CachedRE2/256K 2000 1295574 ns/op 202.34 MB/s
|
||||
SearchPhone_CachedRE2/512K 1000 2591267 ns/op 202.33 MB/s
|
||||
SearchPhone_CachedRE2/1M 500 5178738 ns/op 202.48 MB/s
|
||||
SearchPhone_CachedRE2/2M 100 10389680 ns/op 201.85 MB/s
|
||||
SearchPhone_CachedRE2/4M 100 20851510 ns/op 201.15 MB/s
|
||||
SearchPhone_CachedRE2/8M 50 41763800 ns/op 200.86 MB/s
|
||||
SearchPhone_CachedRE2/16M 20 83492800 ns/op 200.94 MB/s
|
||||
EmptyPartialMatchPCRE 10000000 195 ns/op
|
||||
EmptyPartialMatchRE2 5000000 497 ns/op
|
||||
SimplePartialMatchPCRE 10000000 276 ns/op
|
||||
SimplePartialMatchRE2 5000000 548 ns/op
|
||||
HTTPPartialMatchPCRE 2000000 826 ns/op
|
||||
HTTPPartialMatchRE2 2000000 894 ns/op
|
||||
SmallHTTPPartialMatchPCRE 2000000 825 ns/op
|
||||
SmallHTTPPartialMatchRE2 2000000 895 ns/op
|
||||
DotMatchPCRE 2000000 810 ns/op
|
||||
DotMatchRE2 2000000 976 ns/op
|
||||
ASCIIMatchPCRE 5000000 604 ns/op
|
||||
ASCIIMatchRE2 2000000 976 ns/op
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import argparse # for ArgumentParser
|
||||
import subprocess # for Popen
|
||||
import tempfile # for NamedTemporaryFile
|
||||
import os # for remove
|
||||
|
||||
class gnuplot(object):
    """Plot the MB/s columns of an RE2 benchlog as a PNG by driving gnuplot.

    Intended use is as a context manager: set ``output``, call
    ``parse_re2_benchlog`` and then ``run``; leaving the ``with`` block
    removes the temporary data files written by ``gen_csv``.
    """

    # Base name of the image.  The script header appends ".png" to it, so the
    # default below produces "result.png.png"; callers normally override it.
    output = "result.png"

    # gnuplot preamble; "{}" is filled with ``output`` in run().  gen_csv()
    # appends one plot clause per benchmark after the trailing "plot ".
    script = """
set terminal png size 1024, 768
set output "{}.png"
set title "re2 benchlog"
set datafile separator ";"
set grid x y
set ylabel "MB/s"
set autoscale
plot """

    # One plot clause: data file name, then curve title.  It ends with the
    # three characters comma, backslash, newline, which run() strips from the
    # last clause.
    template = """'{}' using 1:5:xticlabels(2) with linespoints linewidth 3 title "{}",\\\n"""

    def __init__(self):
        # Keep parsed data and temp-file names per instance; as class
        # attributes these mutable containers were shared by all instances.
        self.benchdata = dict()
        self.tempfiles = []

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        """
        remove all temporary files
        """
        for filename in self.tempfiles:
            try:
                os.remove(filename)
            except OSError:
                # Best effort: one missing file must not stop the cleanup
                # of the remaining ones.
                pass
        del self.tempfiles[:]

    def parse_re2_benchlog(self, filename):
        """
        parse the input benchlog and return a dictionary contain bench data

        Only lines made of exactly four tab-separated fields are used; the
        first field is split on '/' into benchmark name and size.  The result
        maps each benchmark name to a list of rows, each row being the
        stripped remaining fields (size, then the other three columns).
        Other lines are ignored.
        """
        benchdata = self.benchdata

        with open(filename) as f:
            for raw in f:
                data = raw.split('\t')
                if len(data) != 4:
                    continue

                data = data[0].split('/') + data[1:]
                data = [field.strip() for field in data]
                benchdata.setdefault(data[0], []).append(data[1:])

        return benchdata

    def gen_csv(self):
        """
        generate temporary csv files

        Writes one ';'-separated file per benchmark (row index first, then
        the parsed fields) and appends the matching plot clause to ``script``.
        """
        for name, data in self.benchdata.items():
            with tempfile.NamedTemporaryFile(delete=False) as f:
                # Register the file before writing so __exit__ removes it
                # even when a write fails half-way.
                self.tempfiles.append(f.name)
                for index, line in enumerate(data):
                    f.write('{};{}\n'.format(index, ';'.join(line)).encode())

            self.script = self.script + self.template.format(f.name, name)

    def run(self):
        """Generate the data files and feed the finished script to gnuplot."""
        self.gen_csv()

        if not self.benchdata:
            # Without any clause the slice below would cut into the "plot "
            # keyword and gnuplot would receive a broken script.
            print('no plottable benchmark rows found; nothing to plot')
            return

        # Drop the trailing ",\\\n" continuation of the last plot clause.
        script = self.script[:-3].format(self.output)
        command = subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE)
        command.communicate(script.encode())
|
||||
|
||||
|
||||
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='generate plots for benchlog')
    parser.add_argument('benchlog', type=str, help='benchlog generated by re2')
    args = parser.parse_args()

    try:
        # Probe for the gnuplot binary.  communicate() closes the pipe and
        # waits, so the probe process exits instead of lingering on stdin.
        subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE).communicate()
    except OSError:
        # OSError also covers FileNotFoundError (its Python 3 subclass) and
        # is what Python 2 raises for a missing executable.
        print('you can install "gnuplot" to generate plots automatically')
        raise SystemExit(1)

    # The image is written next to the log as "<benchlog>.png".
    with gnuplot() as plot:
        plot.output = args.benchlog
        plot.parse_re2_benchlog(args.benchlog)
        plot.run()
|
||||
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/perl
|
||||
# XXX
|
||||
|
||||
# table($name): print an HTML table comparing PCRE and RE2 timings.
#
# $name is a sprintf format naming a benchmark; it is expanded once with
# "PCRE" and once with "RE2".  One row is printed per entry of @sys, showing
# $sysname{$sys} and the two 'ns/op' values from %data converted to
# microseconds.
#
# Declared without a prototype: the former empty prototype "()" claimed the
# sub takes no arguments although it reads one from @_.  Calls made through
# the \&table code reference ignore prototypes, so they are unaffected.
sub table {
    my ($name) = @_;
    print <<'EOF';
<table border=0>
<tr><th>System</th><th>PCRE</th><th>RE2</th></tr>
EOF
    foreach my $sys (@sys) {
        my $ns_pcre = $data{$sys}->{sprintf($name, "PCRE")}->{'ns/op'};
        my $ns_re2 = $data{$sys}->{sprintf($name, "RE2")}->{'ns/op'};
        printf "<tr><td>%s</td><td>%.1f µs</td><td>%.1f µs</td></tr>\n", $sysname{$sys}, $ns_pcre/1000., $ns_re2/1000.;
    }
    print <<'EOF';
<tr height=5><td colspan=3></td></tr>
</table>
EOF
}
|
||||
|
||||
# Text-size suffixes, in plotting order; graph() appends each one to a
# benchmark name to build the lookup key.
@sizes = qw(
    8 16 32 64 128 256 512
    1K 2K 4K 8K 16K 32K 64K 128K 256K 512K
    1M 2M 4M 8M 16M
);

# Value written after "color" on each engine's jgraph curve line.
%color = (
    PCRE => "0.7 0 0",
    RE2  => "0 0 1",
);

# Number of graphs generated so far; used to number the output files.
$ngraph = 0;
|
||||
|
||||
# graph($name): render one throughput chart and print an <img> tag for it.
#
# $name is a benchmark-name prefix.  For each engine ("PCRE", "RE2") and each
# entry of @sizes the 'MB/s' value of "$name$engine/$size" is looked up in
# the "wreck" data set only; missing entries are skipped.  The chart
# description goes to regexp3gN.jgr (N counts up in $ngraph), which is turned
# into regexp3gN.eps by jgraph and into regexp3gN.png by gs.
#
# Declared without the former empty prototype "()", which contradicted the
# argument read from @_; calls through \&graph ignore prototypes anyway.
sub graph {
    my ($name) = @_;

    my $sys = "wreck";
    my $base = sprintf("regexp3g%d", ++$ngraph);

    # Lexical handle and three-argument open: nothing leaks into the package
    # and the file name can never be parsed as an open mode.
    open(my $jgr, ">", "$base.jgr") || die "open >$base.jgr: $!";
    print $jgr "bbox -20 -12 392 95\n";
    print $jgr "newgraph clip x_translate 0.25 y_translate 0.25\n";

    my $ymax = 0;
    my (%lastx, %lasty);
    foreach my $who ("PCRE", "RE2") {
        print $jgr "newcurve pts\n";
        for(my $i = 0; $i < @sizes; $i++) {
            my $key = sprintf("%s%s/%s", $name, $who, $sizes[$i]);
            my $val = $data{$sys}->{$key}->{'MB/s'};
            next if !defined($val);
            $ymax = $val if $val > $ymax;
            # Remember the right-most point so the curve label can sit there.
            $lastx{$who} = $i;
            $lasty{$who} = $val;
            printf $jgr "%d %f (* %s *)\n", $i, $val, $key;
        }
        print $jgr "marktype none color $color{$who} linethickness 2 linetype solid label : $who\n";
    }

    my $n = @sizes;
    print $jgr "xaxis min -1 max $n size 5 label : text size (bytes)\n";
    print $jgr " no_auto_hash_marks hash_labels fontsize 9\n";
    # Label every third size to keep the axis readable.
    for(my $i = 0; $i < @sizes; $i += 3) {
        print $jgr " hash_at $i hash_label at $i : $sizes[$i]\n";
    }

    # Axis maximum: the smallest k * 10^m (k in 2..10) strictly above $ymax.
    my $y = 1;
    $y *= 10 while 10*$y <= $ymax;
    for(my $i = 2; $i <= 10; $i++) {
        if($i*$y > $ymax) {
            $y = $i*$y;
            last;
        }
    }

    foreach my $who ("PCRE", "RE2") {
        my $x1 = $lastx{$who} * 1.01;
        my $y1 = $lasty{$who};
        my $v = "vjc";
        # Lift labels that would otherwise sit on the x axis.
        if($y1 < 0.05 * $y) {
            $v = "vjb";
            $y1 = 0.05 * $y;
        }
        print $jgr "newstring x $x1 y $y1 hjl $v : $who\n";
    }
    print $jgr "yaxis min 0 max $y size 1 label : speed (MB/s)\n";
    print $jgr " hash_labels fontsize 9\n";
    # print $jgr "legend defaults font Times-Roman fontsize 10 x 0 y $y hjl vjt\n";

    # Finish the file before jgraph reads it.
    close($jgr) || die "close $base.jgr: $!";

    # A failed conversion is reported but does not stop the page generation.
    system("jgraph $base.jgr >$base.eps") == 0
        or warn "jgraph $base.jgr failed: $?\n";
    system("gs -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dEPSCrop -sDEVICE=png16m -r100 -sOutputFile=$base.png -dBATCH -dQUIT -dQUIET -dNOPAUSE $base.eps") == 0
        or warn "gs $base.eps failed: $?\n";

    print "<img src=$base.png>\n";
}
|
||||
|
||||
# skip(): read and discard input lines until one starts with "<!-- -->";
# that marker line is echoed to stdout and reading stops there.  Reaching
# end of input first simply ends the loop.
sub skip() {
    while(<>) {
        next unless /^<!-- -->/;
        print;
        last;
    }
}
|
||||
|
||||
# Systems whose logs are read from "benchlog.<key>", in table row order.
@sys = ("r70", "c2", "wreck", "mini");
%sysname = (
    "r70" => "AMD Opteron 8214 HE, 2.2 GHz",
    "c2" => "Intel Core2 Duo E7200, 2.53 GHz",
    "wreck" => "Intel Xeon 5150, 2.66 GHz (Mac Pro)",
    "mini" => "Intel Core2 T5600, 1.83 GHz (Mac Mini)",
);

# Directive name (as written in "<!-- benchlog NAME ... -->") to generator.
%func = (
    "table" => \&table,
    "graph" => \&graph,
);

# Load every benchlog into $data{$sys}{$benchmark}{name, iter, ns/op, MB/s}.
foreach my $sys (@sys) {
    open(my $log, "<", "benchlog.$sys") || die "open benchlog.$sys: $!";
    my %sysdat;
    while(<$log>) {
        if(/^([A-Za-z0-9_\/]+)\s+(\d+)\s+(\d+) ns\/op/) {
            my %row = (
                "name" => $1,
                "iter" => $2,
                "ns/op" => $3,
            );
            # The throughput column is optional.
            if(/([\d.]+) MB\/s/) {
                $row{"MB/s"} = $1;
            }
            $sysdat{$row{"name"}} = \%row;
        }
    }
    close($log);
    $data{$sys} = \%sysdat;
}

# Copy the input through.  After each "<!-- benchlog NAME [ARG] -->" line,
# print freshly generated content and drop the old content up to the next
# "<!-- -->" marker.
while(<>) {
    print;
    if(/^<!-- benchlog (\w+)(?: ([%\w]+))? -->/) {
        # Copy the captures before calling out; the generators may match.
        my ($which, $arg) = ($1, $2);
        my $generate = $func{$which}
            or die "unknown benchlog directive '$which'";
        if(defined($arg)) {
            $generate->($arg);
        } else {
            $generate->();
        }
        skip();
        next;
    }
}
|
||||
|
||||
@@ -0,0 +1,42 @@
|
||||
#!/bin/sh
# mksyntaxgo: regenerate the Go regexp/syntax package documentation from
# syntax.txt.
#
# syntax.txt is copied to $GOROOT/src/regexp/syntax/doc.go and that copy is
# then edited in place by sam, which reads its commands from the
# here-document below (everything up to the closing "!").  The commands
# delete the NOT SUPPORTED entries and everything from the Unicode character
# class section on, strip the «» markers, rewrite the flag lines, re-align
# the two columns through awk, and finally replace lines 1-2 with a Go
# header and append the closing package clause.
#
# "set -e" makes the script stop at the first failing command.
# GOROOT must be set by the caller; it is not validated here.
# NOTE(review): the sam patterns depend on exact whitespace (column
# separators, leading indent) — confirm it survived in this copy.
|
||||
|
||||
set -e
|
||||
out=$GOROOT/src/regexp/syntax/doc.go
|
||||
cp syntax.txt $out
|
||||
sam -d $out <<'!'
|
||||
,x g/NOT SUPPORTED/d
|
||||
/^Unicode character class/,$d
|
||||
,s/[«»]//g
|
||||
,x g/^Possessive repetitions:/d
|
||||
,x g/\\C/d
|
||||
,x g/Flag syntax/d
|
||||
,s/.=(true|false)/flag &/g
|
||||
,s/^Flags:/ Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:\n/
|
||||
,s/\n\n\n+/\n\n/g
|
||||
,x/(^.* .*\n)+/ | awk -F' ' '{printf(" %-14s %s\n", $1, $2)}'
|
||||
1,2c
|
||||
// Copyright 2012 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Code generated by mksyntaxgo from the RE2 distribution. DO NOT EDIT.
|
||||
|
||||
/*
|
||||
Package syntax parses regular expressions into parse trees and compiles
|
||||
parse trees into programs. Most clients of regular expressions will use the
|
||||
facilities of package [regexp] (such as [regexp.Compile] and [regexp.Match]) instead of this package.
|
||||
|
||||
# Syntax
|
||||
|
||||
The regular expression syntax understood by this package when parsing with the [Perl] flag is as follows.
|
||||
Parts of the syntax can be disabled by passing alternate flags to [Parse].
|
||||
|
||||
.
|
||||
$a
|
||||
Unicode character classes are those in [unicode.Categories] and [unicode.Scripts].
|
||||
*/
|
||||
package syntax
|
||||
.
|
||||
w
|
||||
q
|
||||
!
|
||||
@@ -0,0 +1,42 @@
|
||||
#!/bin/sh
# mksyntaxhtml: build syntax.html, an HTML table version of syntax.txt.
#
# syntax.txt is copied to syntax.html and the copy is edited in place by
# sam, which reads its commands from the here-document below (everything up
# to the closing "!").  The commands wrap syntax examples in <code>, tag the
# vim/pcre/perl markers, grey out NOT SUPPORTED entries, turn every line
# into a table row, and finally replace lines 1-2 with the page header and
# append the closing tags.
#
# NOTE(review): the first three substitutions replace "&", "<" and ">" with
# themselves as written here; they look like HTML escaping whose entity
# replacements (&amp; &lt; &gt;) were decoded somewhere — confirm against
# the upstream script.
# NOTE(review): the sam patterns depend on exact whitespace (the separator
# between the two columns) — confirm it survived in this copy.
|
||||
|
||||
cp syntax.txt syntax.html
|
||||
sam -d syntax.html <<'!'
|
||||
,s/\&/\&/g
|
||||
,s/</\</g
|
||||
,s/>/\>/g
|
||||
,s!== (([^()]|\([^()]*\))*)!≡ <code>\1</code>!g
|
||||
,s!«!<code>!g
|
||||
,s!»!</code>!g
|
||||
,s! vim$! <font size=-2>VIM</font>!g
|
||||
,s! pcre$! <font size=-2>PCRE</font>!g
|
||||
,s! perl$! <font size=-2>PERL</font>!g
|
||||
,x g/NOT SUPPORTED/ s!^[^ ]+!<font color=#808080>&</font>!
|
||||
,s!NOT SUPPORTED!!g
|
||||
,s!(^[^ ]+) (.*)\n!<tr><td><code>\1</code></td><td>\2</td></tr>\n!g
|
||||
,s!.*:$!<b>&</b>!g
|
||||
,s!^$!<tr><td></td></tr>!g
|
||||
,x v/<tr>/ s!.*!<tr><td colspan=2>&</td></tr>!
|
||||
1,2c
|
||||
<html>
|
||||
<!-- AUTOMATICALLY GENERATED by mksyntaxhtml -->
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
|
||||
<title>RE2 regular expression syntax reference</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>RE2 regular expression syntax reference</h1>
|
||||
|
||||
<table border=0 cellpadding=2 cellspacing=2>
|
||||
<tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr>
|
||||
<tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
|
||||
<tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr>
|
||||
.
|
||||
$a
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
||||
.
|
||||
w
|
||||
q
|
||||
!
|
||||
@@ -0,0 +1,36 @@
|
||||
#!/bin/sh
# mksyntaxwiki: build syntax.wiki, a wiki-markup table version of syntax.txt.
#
# syntax.txt is copied to syntax.wiki and the copy is edited in place by
# sam, which reads its commands from the here-document below (everything up
# to the closing "!").  The commands quote backticks, wrap syntax examples
# in backticks, tag the vim/pcre/perl markers, grey out and annotate
# NOT SUPPORTED entries, turn every line into a table row, and finally
# replace lines 1-2 with the wiki header and append the closing </table>.
#
# The "#summary" line inside the here-document is wiki text handed to sam,
# not a shell comment.
# NOTE(review): the sam patterns depend on exact whitespace (the separator
# between the two columns) — confirm it survived in this copy.
|
||||
|
||||
cp syntax.txt syntax.wiki
|
||||
sam -d syntax.wiki <<'!'
|
||||
,s!`!`````!g
|
||||
,s!== (([^()]|\([^()]*\))*)!≡ `\1`!g
|
||||
,s!«!`!g
|
||||
,s!»!`!g
|
||||
,s! vim$! <font size="1">VIM</font>!g
|
||||
,s! pcre$! <font size="1">PCRE</font>!g
|
||||
,s! perl$! <font size="1">PERL</font>!g
|
||||
,s!(^[^ ]+) (.*)\n!`\1` \2\n!g
|
||||
,x g/NOT SUPPORTED/ s!^[^ ]+!<font color="#808080">&</font>!
|
||||
,s!NOT SUPPORTED!<font size="1">(&)</font>!g
|
||||
,s!(^[^ ]+) (.*)\n!<tr><td>\1</td><td>\2</td></tr>\n!g
|
||||
,s!.*:$!<b>&</b>!g
|
||||
,s!^$!<tr><td></td></tr>!g
|
||||
,x v/<tr>/ s!.*!<tr><td colspan="2">&</td></tr>!
|
||||
1,2c
|
||||
#summary I define UNIX as “30 definitions of regular expressions living under one roof.” —Don Knuth
|
||||
|
||||
<wiki:comment>
|
||||
GENERATED BY mksyntaxwiki. DO NOT EDIT
|
||||
</wiki:comment>
|
||||
|
||||
<table border="0" cellpadding="2" cellspacing="2">
|
||||
<tr><td colspan="2">This page lists the regular expression syntax accepted by RE2.</td></tr>
|
||||
<tr><td colspan="2">It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
|
||||
<tr><td colspan="2">Grayed out expressions are not supported by RE2.</td></tr>
|
||||
.
|
||||
$a
|
||||
</table>
|
||||
.
|
||||
w
|
||||
q
|
||||
!
|
||||
+477
@@ -0,0 +1,477 @@
|
||||
<html>
|
||||
<!-- AUTOMATICALLY GENERATED by mksyntaxhtml -->
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
|
||||
<title>RE2 regular expression syntax reference</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>RE2 regular expression syntax reference</h1>
|
||||
|
||||
<table border=0 cellpadding=2 cellspacing=2>
|
||||
<tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr>
|
||||
<tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
|
||||
<tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Single characters:</b></td></tr>
|
||||
<tr><td><code>.</code></td><td>any character, possibly including newline (s=true)</td></tr>
|
||||
<tr><td><code>[xyz]</code></td><td>character class</td></tr>
|
||||
<tr><td><code>[^xyz]</code></td><td>negated character class</td></tr>
|
||||
<tr><td><code>\d</code></td><td>Perl character class</td></tr>
|
||||
<tr><td><code>\D</code></td><td>negated Perl character class</td></tr>
|
||||
<tr><td><code>[[:alpha:]]</code></td><td>ASCII character class</td></tr>
|
||||
<tr><td><code>[[:^alpha:]]</code></td><td>negated ASCII character class</td></tr>
|
||||
<tr><td><code>\pN</code></td><td>Unicode character class (one-letter name)</td></tr>
|
||||
<tr><td><code>\p{Greek}</code></td><td>Unicode character class</td></tr>
|
||||
<tr><td><code>\PN</code></td><td>negated Unicode character class (one-letter name)</td></tr>
|
||||
<tr><td><code>\P{Greek}</code></td><td>negated Unicode character class</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Composites:</b></td></tr>
|
||||
<tr><td><code>xy</code></td><td><code>x</code> followed by <code>y</code></td></tr>
|
||||
<tr><td><code>x|y</code></td><td><code>x</code> or <code>y</code> (prefer <code>x</code>)</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Repetitions:</b></td></tr>
|
||||
<tr><td><code>x*</code></td><td>zero or more <code>x</code>, prefer more</td></tr>
|
||||
<tr><td><code>x+</code></td><td>one or more <code>x</code>, prefer more</td></tr>
|
||||
<tr><td><code>x?</code></td><td>zero or one <code>x</code>, prefer one</td></tr>
|
||||
<tr><td><code>x{n,m}</code></td><td><code>n</code> or <code>n</code>+1 or ... or <code>m</code> <code>x</code>, prefer more</td></tr>
|
||||
<tr><td><code>x{n,}</code></td><td><code>n</code> or more <code>x</code>, prefer more</td></tr>
|
||||
<tr><td><code>x{n}</code></td><td>exactly <code>n</code> <code>x</code></td></tr>
|
||||
<tr><td><code>x*?</code></td><td>zero or more <code>x</code>, prefer fewer</td></tr>
|
||||
<tr><td><code>x+?</code></td><td>one or more <code>x</code>, prefer fewer</td></tr>
|
||||
<tr><td><code>x??</code></td><td>zero or one <code>x</code>, prefer zero</td></tr>
|
||||
<tr><td><code>x{n,m}?</code></td><td><code>n</code> or <code>n</code>+1 or ... or <code>m</code> <code>x</code>, prefer fewer</td></tr>
|
||||
<tr><td><code>x{n,}?</code></td><td><code>n</code> or more <code>x</code>, prefer fewer</td></tr>
|
||||
<tr><td><code>x{n}?</code></td><td>exactly <code>n</code> <code>x</code></td></tr>
|
||||
<tr><td><code><font color=#808080>x{}</font></code></td><td>(≡ <code>x*</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>x{-}</font></code></td><td>(≡ <code>x*?</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>x{-n}</font></code></td><td>(≡ <code>x{n}?</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>x=</font></code></td><td>(≡ <code>x?</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2>Implementation restriction: The counting forms <code>x{n,m}</code>, <code>x{n,}</code>, and <code>x{n}</code></td></tr>
|
||||
<tr><td colspan=2>reject forms that create a minimum or maximum repetition count above 1000.</td></tr>
|
||||
<tr><td colspan=2>Unlimited repetitions are not subject to this restriction.</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Possessive repetitions:</b></td></tr>
|
||||
<tr><td><code><font color=#808080>x*+</font></code></td><td>zero or more <code>x</code>, possessive </td></tr>
|
||||
<tr><td><code><font color=#808080>x++</font></code></td><td>one or more <code>x</code>, possessive </td></tr>
|
||||
<tr><td><code><font color=#808080>x?+</font></code></td><td>zero or one <code>x</code>, possessive </td></tr>
|
||||
<tr><td><code><font color=#808080>x{n,m}+</font></code></td><td><code>n</code> or ... or <code>m</code> <code>x</code>, possessive </td></tr>
|
||||
<tr><td><code><font color=#808080>x{n,}+</font></code></td><td><code>n</code> or more <code>x</code>, possessive </td></tr>
|
||||
<tr><td><code><font color=#808080>x{n}+</font></code></td><td>exactly <code>n</code> <code>x</code>, possessive </td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Grouping:</b></td></tr>
|
||||
<tr><td><code>(re)</code></td><td>numbered capturing group (submatch)</td></tr>
|
||||
<tr><td><code>(?P&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
|
||||
<tr><td><code>(?&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
|
||||
<tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named &amp; numbered capturing group (submatch) </td></tr>
|
||||
<tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr>
|
||||
<tr><td><code>(?flags)</code></td><td>set flags within current group; non-capturing</td></tr>
|
||||
<tr><td><code>(?flags:re)</code></td><td>set flags during re; non-capturing</td></tr>
|
||||
<tr><td><code><font color=#808080>(?#text)</font></code></td><td>comment </td></tr>
|
||||
<tr><td><code><font color=#808080>(?|x|y|z)</font></code></td><td>branch numbering reset </td></tr>
|
||||
<tr><td><code><font color=#808080>(?&gt;re)</font></code></td><td>possessive match of <code>re</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>re@&gt;</font></code></td><td>possessive match of <code>re</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>%(re)</font></code></td><td>non-capturing group <font size=-2>VIM</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Flags:</b></td></tr>
|
||||
<tr><td><code>i</code></td><td>case-insensitive (default false)</td></tr>
|
||||
<tr><td><code>m</code></td><td>multi-line mode: <code>^</code> and <code>$</code> match begin/end line in addition to begin/end text (default false)</td></tr>
|
||||
<tr><td><code>s</code></td><td>let <code>.</code> match <code>\n</code> (default false)</td></tr>
|
||||
<tr><td><code>U</code></td><td>ungreedy: swap meaning of <code>x*</code> and <code>x*?</code>, <code>x+</code> and <code>x+?</code>, etc (default false)</td></tr>
|
||||
<tr><td colspan=2>Flag syntax is <code>xyz</code> (set) or <code>-xyz</code> (clear) or <code>xy-z</code> (set <code>xy</code>, clear <code>z</code>).</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Empty strings:</b></td></tr>
|
||||
<tr><td><code>^</code></td><td>at beginning of text or line (<code>m</code>=true)</td></tr>
|
||||
<tr><td><code>$</code></td><td>at end of text (like <code>\z</code> not <code>\Z</code>) or line (<code>m</code>=true)</td></tr>
|
||||
<tr><td><code>\A</code></td><td>at beginning of text</td></tr>
|
||||
<tr><td><code>\b</code></td><td>at ASCII word boundary (<code>\w</code> on one side and <code>\W</code>, <code>\A</code>, or <code>\z</code> on the other)</td></tr>
|
||||
<tr><td><code>\B</code></td><td>not at ASCII word boundary</td></tr>
|
||||
<tr><td><code><font color=#808080>\G</font></code></td><td>at beginning of subtext being searched <font size=-2>PCRE</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\G</font></code></td><td>at end of last match <font size=-2>PERL</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\Z</font></code></td><td>at end of text, or before newline at end of text </td></tr>
|
||||
<tr><td><code>\z</code></td><td>at end of text</td></tr>
|
||||
<tr><td><code><font color=#808080>(?=re)</font></code></td><td>before text matching <code>re</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>(?!re)</font></code></td><td>before text not matching <code>re</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>(?&lt;=re)</font></code></td><td>after text matching <code>re</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>(?&lt;!re)</font></code></td><td>after text not matching <code>re</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>re&amp;</font></code></td><td>before text matching <code>re</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>re@=</font></code></td><td>before text matching <code>re</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>re@!</font></code></td><td>before text not matching <code>re</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>re@&lt;=</font></code></td><td>after text matching <code>re</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>re@&lt;!</font></code></td><td>after text not matching <code>re</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\zs</font></code></td><td>sets start of match (= \K) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\ze</font></code></td><td>sets end of match <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%^</font></code></td><td>beginning of file <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%$</font></code></td><td>end of file <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%V</font></code></td><td>on screen <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%#</font></code></td><td>cursor position <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%'m</font></code></td><td>mark <code>m</code> position <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%23l</font></code></td><td>in line 23 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%23c</font></code></td><td>in column 23 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%23v</font></code></td><td>in virtual column 23 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Escape sequences:</b></td></tr>
|
||||
<tr><td><code>\a</code></td><td>bell (≡ <code>\007</code>)</td></tr>
|
||||
<tr><td><code>\f</code></td><td>form feed (≡ <code>\014</code>)</td></tr>
|
||||
<tr><td><code>\t</code></td><td>horizontal tab (≡ <code>\011</code>)</td></tr>
|
||||
<tr><td><code>\n</code></td><td>newline (≡ <code>\012</code>)</td></tr>
|
||||
<tr><td><code>\r</code></td><td>carriage return (≡ <code>\015</code>)</td></tr>
|
||||
<tr><td><code>\v</code></td><td>vertical tab character (≡ <code>\013</code>)</td></tr>
|
||||
<tr><td><code>\*</code></td><td>literal <code>*</code>, for any punctuation character <code>*</code></td></tr>
|
||||
<tr><td><code>\123</code></td><td>octal character code (up to three digits)</td></tr>
|
||||
<tr><td><code>\x7F</code></td><td>hex character code (exactly two digits)</td></tr>
|
||||
<tr><td><code>\x{10FFFF}</code></td><td>hex character code</td></tr>
|
||||
<tr><td><code>\C</code></td><td>match a single byte even in UTF-8 mode</td></tr>
|
||||
<tr><td><code>\Q...\E</code></td><td>literal text <code>...</code> even if <code>...</code> has punctuation</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td><code><font color=#808080>\1</font></code></td><td>backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\b</font></code></td><td>backspace (use <code>\010</code>)</td></tr>
|
||||
<tr><td><code><font color=#808080>\cK</font></code></td><td>control char ^K (use <code>\001</code> etc)</td></tr>
|
||||
<tr><td><code><font color=#808080>\e</font></code></td><td>escape (use <code>\033</code>)</td></tr>
|
||||
<tr><td><code><font color=#808080>\g1</font></code></td><td>backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\g{1}</font></code></td><td>backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\g{+1}</font></code></td><td>backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\g{-1}</font></code></td><td>backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\g{name}</font></code></td><td>named backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\g&lt;name&gt;</font></code></td><td>subroutine call </td></tr>
|
||||
<tr><td><code><font color=#808080>\g'name'</font></code></td><td>subroutine call </td></tr>
|
||||
<tr><td><code><font color=#808080>\k&lt;name&gt;</font></code></td><td>named backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\k'name'</font></code></td><td>named backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\lX</font></code></td><td>lowercase <code>X</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>\ux</font></code></td><td>uppercase <code>x</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>\L...\E</font></code></td><td>lowercase text <code>...</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>\K</font></code></td><td>reset beginning of <code>$0</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>\N{name}</font></code></td><td>named Unicode character </td></tr>
|
||||
<tr><td><code><font color=#808080>\R</font></code></td><td>line break </td></tr>
|
||||
<tr><td><code><font color=#808080>\U...\E</font></code></td><td>upper case text <code>...</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>\X</font></code></td><td>extended Unicode sequence </td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td><code><font color=#808080>\%d123</font></code></td><td>decimal character 123 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%xFF</font></code></td><td>hex character FF <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%o123</font></code></td><td>octal character 123 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%u1234</font></code></td><td>Unicode character 0x1234 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%U12345678</font></code></td><td>Unicode character 0x12345678 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Character class elements:</b></td></tr>
|
||||
<tr><td><code>x</code></td><td>single character</td></tr>
|
||||
<tr><td><code>A-Z</code></td><td>character range (inclusive)</td></tr>
|
||||
<tr><td><code>\d</code></td><td>Perl character class</td></tr>
|
||||
<tr><td><code>[:foo:]</code></td><td>ASCII character class <code>foo</code></td></tr>
|
||||
<tr><td><code>\p{Foo}</code></td><td>Unicode character class <code>Foo</code></td></tr>
|
||||
<tr><td><code>\pF</code></td><td>Unicode character class <code>F</code> (one-letter name)</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Named character classes as character class elements:</b></td></tr>
|
||||
<tr><td><code>[\d]</code></td><td>digits (≡ <code>\d</code>)</td></tr>
|
||||
<tr><td><code>[^\d]</code></td><td>not digits (≡ <code>\D</code>)</td></tr>
|
||||
<tr><td><code>[\D]</code></td><td>not digits (≡ <code>\D</code>)</td></tr>
|
||||
<tr><td><code>[^\D]</code></td><td>not not digits (≡ <code>\d</code>)</td></tr>
|
||||
<tr><td><code>[[:name:]]</code></td><td>named ASCII class inside character class (≡ <code>[:name:]</code>)</td></tr>
|
||||
<tr><td><code>[^[:name:]]</code></td><td>named ASCII class inside negated character class (≡ <code>[:^name:]</code>)</td></tr>
|
||||
<tr><td><code>[\p{Name}]</code></td><td>named Unicode property inside character class (≡ <code>\p{Name}</code>)</td></tr>
|
||||
<tr><td><code>[^\p{Name}]</code></td><td>named Unicode property inside negated character class (≡ <code>\P{Name}</code>)</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Perl character classes (all ASCII-only):</b></td></tr>
|
||||
<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
|
||||
<tr><td><code>\D</code></td><td>not digits (≡ <code>[^0-9]</code>)</td></tr>
|
||||
<tr><td><code>\s</code></td><td>whitespace (≡ <code>[\t\n\f\r ]</code>)</td></tr>
|
||||
<tr><td><code>\S</code></td><td>not whitespace (≡ <code>[^\t\n\f\r ]</code>)</td></tr>
|
||||
<tr><td><code>\w</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr>
|
||||
<tr><td><code>\W</code></td><td>not word characters (≡ <code>[^0-9A-Za-z_]</code>)</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td><code><font color=#808080>\h</font></code></td><td>horizontal space </td></tr>
|
||||
<tr><td><code><font color=#808080>\H</font></code></td><td>not horizontal space </td></tr>
|
||||
<tr><td><code><font color=#808080>\v</font></code></td><td>vertical space </td></tr>
|
||||
<tr><td><code><font color=#808080>\V</font></code></td><td>not vertical space </td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>ASCII character classes:</b></td></tr>
|
||||
<tr><td><code>[[:alnum:]]</code></td><td>alphanumeric (≡ <code>[0-9A-Za-z]</code>)</td></tr>
|
||||
<tr><td><code>[[:alpha:]]</code></td><td>alphabetic (≡ <code>[A-Za-z]</code>)</td></tr>
|
||||
<tr><td><code>[[:ascii:]]</code></td><td>ASCII (≡ <code>[\x00-\x7F]</code>)</td></tr>
|
||||
<tr><td><code>[[:blank:]]</code></td><td>blank (≡ <code>[\t ]</code>)</td></tr>
|
||||
<tr><td><code>[[:cntrl:]]</code></td><td>control (≡ <code>[\x00-\x1F\x7F]</code>)</td></tr>
|
||||
<tr><td><code>[[:digit:]]</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
|
||||
<tr><td><code>[[:graph:]]</code></td><td>graphical (≡ <code>[!-~] == [A-Za-z0-9!"#$%&amp;'()*+,\-./:;&lt;=&gt;?@[\\\]^_`{|}~]</code>)</td></tr>
|
||||
<tr><td><code>[[:lower:]]</code></td><td>lower case (≡ <code>[a-z]</code>)</td></tr>
|
||||
<tr><td><code>[[:print:]]</code></td><td>printable (≡ <code>[ -~] == [ [:graph:]]</code>)</td></tr>
|
||||
<tr><td><code>[[:punct:]]</code></td><td>punctuation (≡ <code>[!-/:-@[-`{-~]</code>)</td></tr>
|
||||
<tr><td><code>[[:space:]]</code></td><td>whitespace (≡ <code>[\t\n\v\f\r ]</code>)</td></tr>
|
||||
<tr><td><code>[[:upper:]]</code></td><td>upper case (≡ <code>[A-Z]</code>)</td></tr>
|
||||
<tr><td><code>[[:word:]]</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr>
|
||||
<tr><td><code>[[:xdigit:]]</code></td><td>hex digit (≡ <code>[0-9A-Fa-f]</code>)</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Unicode character class names--general category:</b></td></tr>
|
||||
<tr><td><code>C</code></td><td>other</td></tr>
|
||||
<tr><td><code>Cc</code></td><td>control</td></tr>
|
||||
<tr><td><code>Cf</code></td><td>format</td></tr>
|
||||
<tr><td><code><font color=#808080>Cn</font></code></td><td>unassigned code points </td></tr>
|
||||
<tr><td><code>Co</code></td><td>private use</td></tr>
|
||||
<tr><td><code>Cs</code></td><td>surrogate</td></tr>
|
||||
<tr><td><code>L</code></td><td>letter</td></tr>
|
||||
<tr><td><code><font color=#808080>LC</font></code></td><td>cased letter </td></tr>
|
||||
<tr><td><code><font color=#808080>L&amp;</font></code></td><td>cased letter </td></tr>
|
||||
<tr><td><code>Ll</code></td><td>lowercase letter</td></tr>
|
||||
<tr><td><code>Lm</code></td><td>modifier letter</td></tr>
|
||||
<tr><td><code>Lo</code></td><td>other letter</td></tr>
|
||||
<tr><td><code>Lt</code></td><td>titlecase letter</td></tr>
|
||||
<tr><td><code>Lu</code></td><td>uppercase letter</td></tr>
|
||||
<tr><td><code>M</code></td><td>mark</td></tr>
|
||||
<tr><td><code>Mc</code></td><td>spacing mark</td></tr>
|
||||
<tr><td><code>Me</code></td><td>enclosing mark</td></tr>
|
||||
<tr><td><code>Mn</code></td><td>non-spacing mark</td></tr>
|
||||
<tr><td><code>N</code></td><td>number</td></tr>
|
||||
<tr><td><code>Nd</code></td><td>decimal number</td></tr>
|
||||
<tr><td><code>Nl</code></td><td>letter number</td></tr>
|
||||
<tr><td><code>No</code></td><td>other number</td></tr>
|
||||
<tr><td><code>P</code></td><td>punctuation</td></tr>
|
||||
<tr><td><code>Pc</code></td><td>connector punctuation</td></tr>
|
||||
<tr><td><code>Pd</code></td><td>dash punctuation</td></tr>
|
||||
<tr><td><code>Pe</code></td><td>close punctuation</td></tr>
|
||||
<tr><td><code>Pf</code></td><td>final punctuation</td></tr>
|
||||
<tr><td><code>Pi</code></td><td>initial punctuation</td></tr>
|
||||
<tr><td><code>Po</code></td><td>other punctuation</td></tr>
|
||||
<tr><td><code>Ps</code></td><td>open punctuation</td></tr>
|
||||
<tr><td><code>S</code></td><td>symbol</td></tr>
|
||||
<tr><td><code>Sc</code></td><td>currency symbol</td></tr>
|
||||
<tr><td><code>Sk</code></td><td>modifier symbol</td></tr>
|
||||
<tr><td><code>Sm</code></td><td>math symbol</td></tr>
|
||||
<tr><td><code>So</code></td><td>other symbol</td></tr>
|
||||
<tr><td><code>Z</code></td><td>separator</td></tr>
|
||||
<tr><td><code>Zl</code></td><td>line separator</td></tr>
|
||||
<tr><td><code>Zp</code></td><td>paragraph separator</td></tr>
|
||||
<tr><td><code>Zs</code></td><td>space separator</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Unicode character class names--scripts:</b></td></tr>
|
||||
<tr><td colspan=2>Adlam</td></tr>
|
||||
<tr><td colspan=2>Ahom</td></tr>
|
||||
<tr><td colspan=2>Anatolian_Hieroglyphs</td></tr>
|
||||
<tr><td colspan=2>Arabic</td></tr>
|
||||
<tr><td colspan=2>Armenian</td></tr>
|
||||
<tr><td colspan=2>Avestan</td></tr>
|
||||
<tr><td colspan=2>Balinese</td></tr>
|
||||
<tr><td colspan=2>Bamum</td></tr>
|
||||
<tr><td colspan=2>Bassa_Vah</td></tr>
|
||||
<tr><td colspan=2>Batak</td></tr>
|
||||
<tr><td colspan=2>Bengali</td></tr>
|
||||
<tr><td colspan=2>Bhaiksuki</td></tr>
|
||||
<tr><td colspan=2>Bopomofo</td></tr>
|
||||
<tr><td colspan=2>Brahmi</td></tr>
|
||||
<tr><td colspan=2>Braille</td></tr>
|
||||
<tr><td colspan=2>Buginese</td></tr>
|
||||
<tr><td colspan=2>Buhid</td></tr>
|
||||
<tr><td colspan=2>Canadian_Aboriginal</td></tr>
|
||||
<tr><td colspan=2>Carian</td></tr>
|
||||
<tr><td colspan=2>Caucasian_Albanian</td></tr>
|
||||
<tr><td colspan=2>Chakma</td></tr>
|
||||
<tr><td colspan=2>Cham</td></tr>
|
||||
<tr><td colspan=2>Cherokee</td></tr>
|
||||
<tr><td colspan=2>Chorasmian</td></tr>
|
||||
<tr><td colspan=2>Common</td></tr>
|
||||
<tr><td colspan=2>Coptic</td></tr>
|
||||
<tr><td colspan=2>Cuneiform</td></tr>
|
||||
<tr><td colspan=2>Cypriot</td></tr>
|
||||
<tr><td colspan=2>Cypro_Minoan</td></tr>
|
||||
<tr><td colspan=2>Cyrillic</td></tr>
|
||||
<tr><td colspan=2>Deseret</td></tr>
|
||||
<tr><td colspan=2>Devanagari</td></tr>
|
||||
<tr><td colspan=2>Dives_Akuru</td></tr>
|
||||
<tr><td colspan=2>Dogra</td></tr>
|
||||
<tr><td colspan=2>Duployan</td></tr>
|
||||
<tr><td colspan=2>Egyptian_Hieroglyphs</td></tr>
|
||||
<tr><td colspan=2>Elbasan</td></tr>
|
||||
<tr><td colspan=2>Elymaic</td></tr>
|
||||
<tr><td colspan=2>Ethiopic</td></tr>
|
||||
<tr><td colspan=2>Georgian</td></tr>
|
||||
<tr><td colspan=2>Glagolitic</td></tr>
|
||||
<tr><td colspan=2>Gothic</td></tr>
|
||||
<tr><td colspan=2>Grantha</td></tr>
|
||||
<tr><td colspan=2>Greek</td></tr>
|
||||
<tr><td colspan=2>Gujarati</td></tr>
|
||||
<tr><td colspan=2>Gunjala_Gondi</td></tr>
|
||||
<tr><td colspan=2>Gurmukhi</td></tr>
|
||||
<tr><td colspan=2>Han</td></tr>
|
||||
<tr><td colspan=2>Hangul</td></tr>
|
||||
<tr><td colspan=2>Hanifi_Rohingya</td></tr>
|
||||
<tr><td colspan=2>Hanunoo</td></tr>
|
||||
<tr><td colspan=2>Hatran</td></tr>
|
||||
<tr><td colspan=2>Hebrew</td></tr>
|
||||
<tr><td colspan=2>Hiragana</td></tr>
|
||||
<tr><td colspan=2>Imperial_Aramaic</td></tr>
|
||||
<tr><td colspan=2>Inherited</td></tr>
|
||||
<tr><td colspan=2>Inscriptional_Pahlavi</td></tr>
|
||||
<tr><td colspan=2>Inscriptional_Parthian</td></tr>
|
||||
<tr><td colspan=2>Javanese</td></tr>
|
||||
<tr><td colspan=2>Kaithi</td></tr>
|
||||
<tr><td colspan=2>Kannada</td></tr>
|
||||
<tr><td colspan=2>Katakana</td></tr>
|
||||
<tr><td colspan=2>Kawi</td></tr>
|
||||
<tr><td colspan=2>Kayah_Li</td></tr>
|
||||
<tr><td colspan=2>Kharoshthi</td></tr>
|
||||
<tr><td colspan=2>Khitan_Small_Script</td></tr>
|
||||
<tr><td colspan=2>Khmer</td></tr>
|
||||
<tr><td colspan=2>Khojki</td></tr>
|
||||
<tr><td colspan=2>Khudawadi</td></tr>
|
||||
<tr><td colspan=2>Lao</td></tr>
|
||||
<tr><td colspan=2>Latin</td></tr>
|
||||
<tr><td colspan=2>Lepcha</td></tr>
|
||||
<tr><td colspan=2>Limbu</td></tr>
|
||||
<tr><td colspan=2>Linear_A</td></tr>
|
||||
<tr><td colspan=2>Linear_B</td></tr>
|
||||
<tr><td colspan=2>Lisu</td></tr>
|
||||
<tr><td colspan=2>Lycian</td></tr>
|
||||
<tr><td colspan=2>Lydian</td></tr>
|
||||
<tr><td colspan=2>Mahajani</td></tr>
|
||||
<tr><td colspan=2>Makasar</td></tr>
|
||||
<tr><td colspan=2>Malayalam</td></tr>
|
||||
<tr><td colspan=2>Mandaic</td></tr>
|
||||
<tr><td colspan=2>Manichaean</td></tr>
|
||||
<tr><td colspan=2>Marchen</td></tr>
|
||||
<tr><td colspan=2>Masaram_Gondi</td></tr>
|
||||
<tr><td colspan=2>Medefaidrin</td></tr>
|
||||
<tr><td colspan=2>Meetei_Mayek</td></tr>
|
||||
<tr><td colspan=2>Mende_Kikakui</td></tr>
|
||||
<tr><td colspan=2>Meroitic_Cursive</td></tr>
|
||||
<tr><td colspan=2>Meroitic_Hieroglyphs</td></tr>
|
||||
<tr><td colspan=2>Miao</td></tr>
|
||||
<tr><td colspan=2>Modi</td></tr>
|
||||
<tr><td colspan=2>Mongolian</td></tr>
|
||||
<tr><td colspan=2>Mro</td></tr>
|
||||
<tr><td colspan=2>Multani</td></tr>
|
||||
<tr><td colspan=2>Myanmar</td></tr>
|
||||
<tr><td colspan=2>Nabataean</td></tr>
|
||||
<tr><td colspan=2>Nag_Mundari</td></tr>
|
||||
<tr><td colspan=2>Nandinagari</td></tr>
|
||||
<tr><td colspan=2>New_Tai_Lue</td></tr>
|
||||
<tr><td colspan=2>Newa</td></tr>
|
||||
<tr><td colspan=2>Nko</td></tr>
|
||||
<tr><td colspan=2>Nushu</td></tr>
|
||||
<tr><td colspan=2>Nyiakeng_Puachue_Hmong</td></tr>
|
||||
<tr><td colspan=2>Ogham</td></tr>
|
||||
<tr><td colspan=2>Ol_Chiki</td></tr>
|
||||
<tr><td colspan=2>Old_Hungarian</td></tr>
|
||||
<tr><td colspan=2>Old_Italic</td></tr>
|
||||
<tr><td colspan=2>Old_North_Arabian</td></tr>
|
||||
<tr><td colspan=2>Old_Permic</td></tr>
|
||||
<tr><td colspan=2>Old_Persian</td></tr>
|
||||
<tr><td colspan=2>Old_Sogdian</td></tr>
|
||||
<tr><td colspan=2>Old_South_Arabian</td></tr>
|
||||
<tr><td colspan=2>Old_Turkic</td></tr>
|
||||
<tr><td colspan=2>Old_Uyghur</td></tr>
|
||||
<tr><td colspan=2>Oriya</td></tr>
|
||||
<tr><td colspan=2>Osage</td></tr>
|
||||
<tr><td colspan=2>Osmanya</td></tr>
|
||||
<tr><td colspan=2>Pahawh_Hmong</td></tr>
|
||||
<tr><td colspan=2>Palmyrene</td></tr>
|
||||
<tr><td colspan=2>Pau_Cin_Hau</td></tr>
|
||||
<tr><td colspan=2>Phags_Pa</td></tr>
|
||||
<tr><td colspan=2>Phoenician</td></tr>
|
||||
<tr><td colspan=2>Psalter_Pahlavi</td></tr>
|
||||
<tr><td colspan=2>Rejang</td></tr>
|
||||
<tr><td colspan=2>Runic</td></tr>
|
||||
<tr><td colspan=2>Samaritan</td></tr>
|
||||
<tr><td colspan=2>Saurashtra</td></tr>
|
||||
<tr><td colspan=2>Sharada</td></tr>
|
||||
<tr><td colspan=2>Shavian</td></tr>
|
||||
<tr><td colspan=2>Siddham</td></tr>
|
||||
<tr><td colspan=2>SignWriting</td></tr>
|
||||
<tr><td colspan=2>Sinhala</td></tr>
|
||||
<tr><td colspan=2>Sogdian</td></tr>
|
||||
<tr><td colspan=2>Sora_Sompeng</td></tr>
|
||||
<tr><td colspan=2>Soyombo</td></tr>
|
||||
<tr><td colspan=2>Sundanese</td></tr>
|
||||
<tr><td colspan=2>Syloti_Nagri</td></tr>
|
||||
<tr><td colspan=2>Syriac</td></tr>
|
||||
<tr><td colspan=2>Tagalog</td></tr>
|
||||
<tr><td colspan=2>Tagbanwa</td></tr>
|
||||
<tr><td colspan=2>Tai_Le</td></tr>
|
||||
<tr><td colspan=2>Tai_Tham</td></tr>
|
||||
<tr><td colspan=2>Tai_Viet</td></tr>
|
||||
<tr><td colspan=2>Takri</td></tr>
|
||||
<tr><td colspan=2>Tamil</td></tr>
|
||||
<tr><td colspan=2>Tangsa</td></tr>
|
||||
<tr><td colspan=2>Tangut</td></tr>
|
||||
<tr><td colspan=2>Telugu</td></tr>
|
||||
<tr><td colspan=2>Thaana</td></tr>
|
||||
<tr><td colspan=2>Thai</td></tr>
|
||||
<tr><td colspan=2>Tibetan</td></tr>
|
||||
<tr><td colspan=2>Tifinagh</td></tr>
|
||||
<tr><td colspan=2>Tirhuta</td></tr>
|
||||
<tr><td colspan=2>Toto</td></tr>
|
||||
<tr><td colspan=2>Ugaritic</td></tr>
|
||||
<tr><td colspan=2>Vai</td></tr>
|
||||
<tr><td colspan=2>Vithkuqi</td></tr>
|
||||
<tr><td colspan=2>Wancho</td></tr>
|
||||
<tr><td colspan=2>Warang_Citi</td></tr>
|
||||
<tr><td colspan=2>Yezidi</td></tr>
|
||||
<tr><td colspan=2>Yi</td></tr>
|
||||
<tr><td colspan=2>Zanabazar_Square</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Vim character classes:</b></td></tr>
|
||||
<tr><td><code><font color=#808080>\i</font></code></td><td>identifier character <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\I</font></code></td><td><code>\i</code> except digits <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\k</font></code></td><td>keyword character <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\K</font></code></td><td><code>\k</code> except digits <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\f</font></code></td><td>file name character <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\F</font></code></td><td><code>\f</code> except digits <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\p</font></code></td><td>printable character <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\P</font></code></td><td><code>\p</code> except digits <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\s</font></code></td><td>whitespace character (≡ <code>[ \t]</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\S</font></code></td><td>non-white space character (≡ <code>[^ \t]</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code>\D</code></td><td>not <code>\d</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\x</font></code></td><td>hex digits (≡ <code>[0-9A-Fa-f]</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\X</font></code></td><td>not <code>\x</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\o</font></code></td><td>octal digits (≡ <code>[0-7]</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\O</font></code></td><td>not <code>\o</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code>\w</code></td><td>word character <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code>\W</code></td><td>not <code>\w</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\h</font></code></td><td>head of word character <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\H</font></code></td><td>not <code>\h</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\a</font></code></td><td>alphabetic <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\A</font></code></td><td>not <code>\a</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\l</font></code></td><td>lowercase <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\L</font></code></td><td>not lowercase <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\u</font></code></td><td>uppercase <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\U</font></code></td><td>not uppercase <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\_x</font></code></td><td><code>\x</code> plus newline, for any <code>x</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Vim flags:</b></td></tr>
|
||||
<tr><td><code><font color=#808080>\c</font></code></td><td>ignore case <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\C</font></code></td><td>match case <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\m</font></code></td><td>magic <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\M</font></code></td><td>nomagic <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\v</font></code></td><td>verymagic <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\V</font></code></td><td>verynomagic <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\Z</font></code></td><td>ignore differences in Unicode combining characters <font size=-2>VIM</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Magic:</b></td></tr>
|
||||
<tr><td><code><font color=#808080>(?{code})</font></code></td><td>arbitrary Perl code <font size=-2>PERL</font></td></tr>
|
||||
<tr><td><code><font color=#808080>(??{code})</font></code></td><td>postponed arbitrary Perl code <font size=-2>PERL</font></td></tr>
|
||||
<tr><td><code><font color=#808080>(?n)</font></code></td><td>recursive call to regexp capturing group <code>n</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>(?+n)</font></code></td><td>recursive call to relative group <code>+n</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>(?-n)</font></code></td><td>recursive call to relative group <code>-n</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>(?C)</font></code></td><td>PCRE callout <font size=-2>PCRE</font></td></tr>
|
||||
<tr><td><code><font color=#808080>(?R)</font></code></td><td>recursive call to entire regexp (≡ <code>(?0)</code>) </td></tr>
|
||||
<tr><td><code><font color=#808080>(?&name)</font></code></td><td>recursive call to named group </td></tr>
|
||||
<tr><td><code><font color=#808080>(?P=name)</font></code></td><td>named backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>(?P>name)</font></code></td><td>recursive call to named group </td></tr>
|
||||
<tr><td><code><font color=#808080>(?(cond)true|false)</font></code></td><td>conditional branch </td></tr>
|
||||
<tr><td><code><font color=#808080>(?(cond)true)</font></code></td><td>conditional branch </td></tr>
|
||||
<tr><td><code><font color=#808080>(*ACCEPT)</font></code></td><td>make regexps more like Prolog </td></tr>
|
||||
<tr><td><code><font color=#808080>(*COMMIT)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*F)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*FAIL)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*MARK)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*PRUNE)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*SKIP)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*THEN)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*ANY)</font></code></td><td>set newline convention </td></tr>
|
||||
<tr><td><code><font color=#808080>(*ANYCRLF)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*CR)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*CRLF)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*LF)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*BSR_ANYCRLF)</font></code></td><td>set \R convention <font size=-2>PCRE</font></td></tr>
|
||||
<tr><td><code><font color=#808080>(*BSR_UNICODE)</font></code></td><td> <font size=-2>PCRE</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
||||
+463
@@ -0,0 +1,463 @@
|
||||
RE2 regular expression syntax reference
|
||||
-------------------------------------
|
||||
|
||||
Single characters:
|
||||
. any character, possibly including newline (s=true)
|
||||
[xyz] character class
|
||||
[^xyz] negated character class
|
||||
\d Perl character class
|
||||
\D negated Perl character class
|
||||
[[:alpha:]] ASCII character class
|
||||
[[:^alpha:]] negated ASCII character class
|
||||
\pN Unicode character class (one-letter name)
|
||||
\p{Greek} Unicode character class
|
||||
\PN negated Unicode character class (one-letter name)
|
||||
\P{Greek} negated Unicode character class
|
||||
|
||||
Composites:
|
||||
xy «x» followed by «y»
|
||||
x|y «x» or «y» (prefer «x»)
|
||||
|
||||
Repetitions:
|
||||
x* zero or more «x», prefer more
|
||||
x+ one or more «x», prefer more
|
||||
x? zero or one «x», prefer one
|
||||
x{n,m} «n» or «n»+1 or ... or «m» «x», prefer more
|
||||
x{n,} «n» or more «x», prefer more
|
||||
x{n} exactly «n» «x»
|
||||
x*? zero or more «x», prefer fewer
|
||||
x+? one or more «x», prefer fewer
|
||||
x?? zero or one «x», prefer zero
|
||||
x{n,m}? «n» or «n»+1 or ... or «m» «x», prefer fewer
|
||||
x{n,}? «n» or more «x», prefer fewer
|
||||
x{n}? exactly «n» «x»
|
||||
x{} (== x*) NOT SUPPORTED vim
|
||||
x{-} (== x*?) NOT SUPPORTED vim
|
||||
x{-n} (== x{n}?) NOT SUPPORTED vim
|
||||
x= (== x?) NOT SUPPORTED vim
|
||||
|
||||
Implementation restriction: The counting forms «x{n,m}», «x{n,}», and «x{n}»
|
||||
reject forms that create a minimum or maximum repetition count above 1000.
|
||||
Unlimited repetitions are not subject to this restriction.
|
||||
|
||||
Possessive repetitions:
|
||||
x*+ zero or more «x», possessive NOT SUPPORTED
|
||||
x++ one or more «x», possessive NOT SUPPORTED
|
||||
x?+ zero or one «x», possessive NOT SUPPORTED
|
||||
x{n,m}+ «n» or ... or «m» «x», possessive NOT SUPPORTED
|
||||
x{n,}+ «n» or more «x», possessive NOT SUPPORTED
|
||||
x{n}+ exactly «n» «x», possessive NOT SUPPORTED
|
||||
|
||||
Grouping:
|
||||
(re) numbered capturing group (submatch)
|
||||
(?P<name>re) named & numbered capturing group (submatch)
|
||||
(?<name>re) named & numbered capturing group (submatch)
|
||||
(?'name're) named & numbered capturing group (submatch) NOT SUPPORTED
|
||||
(?:re) non-capturing group
|
||||
(?flags) set flags within current group; non-capturing
|
||||
(?flags:re) set flags during re; non-capturing
|
||||
(?#text) comment NOT SUPPORTED
|
||||
(?|x|y|z) branch numbering reset NOT SUPPORTED
|
||||
(?>re) possessive match of «re» NOT SUPPORTED
|
||||
re@> possessive match of «re» NOT SUPPORTED vim
|
||||
%(re) non-capturing group NOT SUPPORTED vim
|
||||
|
||||
Flags:
|
||||
i case-insensitive (default false)
|
||||
m multi-line mode: «^» and «$» match begin/end line in addition to begin/end text (default false)
|
||||
s let «.» match «\n» (default false)
|
||||
U ungreedy: swap meaning of «x*» and «x*?», «x+» and «x+?», etc (default false)
|
||||
Flag syntax is «xyz» (set) or «-xyz» (clear) or «xy-z» (set «xy», clear «z»).
|
||||
|
||||
Empty strings:
|
||||
^ at beginning of text or line («m»=true)
|
||||
$ at end of text (like «\z» not «\Z») or line («m»=true)
|
||||
\A at beginning of text
|
||||
\b at ASCII word boundary («\w» on one side and «\W», «\A», or «\z» on the other)
|
||||
\B not at ASCII word boundary
|
||||
\G at beginning of subtext being searched NOT SUPPORTED pcre
|
||||
\G at end of last match NOT SUPPORTED perl
|
||||
\Z at end of text, or before newline at end of text NOT SUPPORTED
|
||||
\z at end of text
|
||||
(?=re) before text matching «re» NOT SUPPORTED
|
||||
(?!re) before text not matching «re» NOT SUPPORTED
|
||||
(?<=re) after text matching «re» NOT SUPPORTED
|
||||
(?<!re) after text not matching «re» NOT SUPPORTED
|
||||
re& before text matching «re» NOT SUPPORTED vim
|
||||
re@= before text matching «re» NOT SUPPORTED vim
|
||||
re@! before text not matching «re» NOT SUPPORTED vim
|
||||
re@<= after text matching «re» NOT SUPPORTED vim
|
||||
re@<! after text not matching «re» NOT SUPPORTED vim
|
||||
\zs sets start of match (= \K) NOT SUPPORTED vim
|
||||
\ze sets end of match NOT SUPPORTED vim
|
||||
\%^ beginning of file NOT SUPPORTED vim
|
||||
\%$ end of file NOT SUPPORTED vim
|
||||
\%V on screen NOT SUPPORTED vim
|
||||
\%# cursor position NOT SUPPORTED vim
|
||||
\%'m mark «m» position NOT SUPPORTED vim
|
||||
\%23l in line 23 NOT SUPPORTED vim
|
||||
\%23c in column 23 NOT SUPPORTED vim
|
||||
\%23v in virtual column 23 NOT SUPPORTED vim
|
||||
|
||||
Escape sequences:
|
||||
\a bell (== \007)
|
||||
\f form feed (== \014)
|
||||
\t horizontal tab (== \011)
|
||||
\n newline (== \012)
|
||||
\r carriage return (== \015)
|
||||
\v vertical tab character (== \013)
|
||||
\* literal «*», for any punctuation character «*»
|
||||
\123 octal character code (up to three digits)
|
||||
\x7F hex character code (exactly two digits)
|
||||
\x{10FFFF} hex character code
|
||||
\C match a single byte even in UTF-8 mode
|
||||
\Q...\E literal text «...» even if «...» has punctuation
|
||||
|
||||
\1 backreference NOT SUPPORTED
|
||||
\b backspace NOT SUPPORTED (use «\010»)
|
||||
\cK control char ^K NOT SUPPORTED (use «\001» etc)
|
||||
\e escape NOT SUPPORTED (use «\033»)
|
||||
\g1 backreference NOT SUPPORTED
|
||||
\g{1} backreference NOT SUPPORTED
|
||||
\g{+1} backreference NOT SUPPORTED
|
||||
\g{-1} backreference NOT SUPPORTED
|
||||
\g{name} named backreference NOT SUPPORTED
|
||||
\g<name> subroutine call NOT SUPPORTED
|
||||
\g'name' subroutine call NOT SUPPORTED
|
||||
\k<name> named backreference NOT SUPPORTED
|
||||
\k'name' named backreference NOT SUPPORTED
|
||||
\lX lowercase «X» NOT SUPPORTED
|
||||
\ux uppercase «x» NOT SUPPORTED
|
||||
\L...\E lowercase text «...» NOT SUPPORTED
|
||||
\K reset beginning of «$0» NOT SUPPORTED
|
||||
\N{name} named Unicode character NOT SUPPORTED
|
||||
\R line break NOT SUPPORTED
|
||||
\U...\E uppercase text «...» NOT SUPPORTED
|
||||
\X extended Unicode sequence NOT SUPPORTED
|
||||
|
||||
\%d123 decimal character 123 NOT SUPPORTED vim
|
||||
\%xFF hex character FF NOT SUPPORTED vim
|
||||
\%o123 octal character 123 NOT SUPPORTED vim
|
||||
\%u1234 Unicode character 0x1234 NOT SUPPORTED vim
|
||||
\%U12345678 Unicode character 0x12345678 NOT SUPPORTED vim
|
||||
|
||||
Character class elements:
|
||||
x single character
|
||||
A-Z character range (inclusive)
|
||||
\d Perl character class
|
||||
[:foo:] ASCII character class «foo»
|
||||
\p{Foo} Unicode character class «Foo»
|
||||
\pF Unicode character class «F» (one-letter name)
|
||||
|
||||
Named character classes as character class elements:
|
||||
[\d] digits (== \d)
|
||||
[^\d] not digits (== \D)
|
||||
[\D] not digits (== \D)
|
||||
[^\D] not not digits (== \d)
|
||||
[[:name:]] named ASCII class inside character class (== [:name:])
|
||||
[^[:name:]] named ASCII class inside negated character class (== [:^name:])
|
||||
[\p{Name}] named Unicode property inside character class (== \p{Name})
|
||||
[^\p{Name}] named Unicode property inside negated character class (== \P{Name})
|
||||
|
||||
Perl character classes (all ASCII-only):
|
||||
\d digits (== [0-9])
|
||||
\D not digits (== [^0-9])
|
||||
\s whitespace (== [\t\n\f\r ])
|
||||
\S not whitespace (== [^\t\n\f\r ])
|
||||
\w word characters (== [0-9A-Za-z_])
|
||||
\W not word characters (== [^0-9A-Za-z_])
|
||||
|
||||
\h horizontal space NOT SUPPORTED
|
||||
\H not horizontal space NOT SUPPORTED
|
||||
\v vertical space NOT SUPPORTED
|
||||
\V not vertical space NOT SUPPORTED
|
||||
|
||||
ASCII character classes:
|
||||
[[:alnum:]] alphanumeric (== [0-9A-Za-z])
|
||||
[[:alpha:]] alphabetic (== [A-Za-z])
|
||||
[[:ascii:]] ASCII (== [\x00-\x7F])
|
||||
[[:blank:]] blank (== [\t ])
|
||||
[[:cntrl:]] control (== [\x00-\x1F\x7F])
|
||||
[[:digit:]] digits (== [0-9])
|
||||
[[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
|
||||
[[:lower:]] lower case (== [a-z])
|
||||
[[:print:]] printable (== [ -~] == [ [:graph:]])
|
||||
[[:punct:]] punctuation (== [!-/:-@[-`{-~])
|
||||
[[:space:]] whitespace (== [\t\n\v\f\r ])
|
||||
[[:upper:]] upper case (== [A-Z])
|
||||
[[:word:]] word characters (== [0-9A-Za-z_])
|
||||
[[:xdigit:]] hex digit (== [0-9A-Fa-f])
|
||||
|
||||
Unicode character class names--general category:
|
||||
C other
|
||||
Cc control
|
||||
Cf format
|
||||
Cn unassigned code points NOT SUPPORTED
|
||||
Co private use
|
||||
Cs surrogate
|
||||
L letter
|
||||
LC cased letter NOT SUPPORTED
|
||||
L& cased letter NOT SUPPORTED
|
||||
Ll lowercase letter
|
||||
Lm modifier letter
|
||||
Lo other letter
|
||||
Lt titlecase letter
|
||||
Lu uppercase letter
|
||||
M mark
|
||||
Mc spacing mark
|
||||
Me enclosing mark
|
||||
Mn non-spacing mark
|
||||
N number
|
||||
Nd decimal number
|
||||
Nl letter number
|
||||
No other number
|
||||
P punctuation
|
||||
Pc connector punctuation
|
||||
Pd dash punctuation
|
||||
Pe close punctuation
|
||||
Pf final punctuation
|
||||
Pi initial punctuation
|
||||
Po other punctuation
|
||||
Ps open punctuation
|
||||
S symbol
|
||||
Sc currency symbol
|
||||
Sk modifier symbol
|
||||
Sm math symbol
|
||||
So other symbol
|
||||
Z separator
|
||||
Zl line separator
|
||||
Zp paragraph separator
|
||||
Zs space separator
|
||||
|
||||
Unicode character class names--scripts:
|
||||
Adlam
|
||||
Ahom
|
||||
Anatolian_Hieroglyphs
|
||||
Arabic
|
||||
Armenian
|
||||
Avestan
|
||||
Balinese
|
||||
Bamum
|
||||
Bassa_Vah
|
||||
Batak
|
||||
Bengali
|
||||
Bhaiksuki
|
||||
Bopomofo
|
||||
Brahmi
|
||||
Braille
|
||||
Buginese
|
||||
Buhid
|
||||
Canadian_Aboriginal
|
||||
Carian
|
||||
Caucasian_Albanian
|
||||
Chakma
|
||||
Cham
|
||||
Cherokee
|
||||
Chorasmian
|
||||
Common
|
||||
Coptic
|
||||
Cuneiform
|
||||
Cypriot
|
||||
Cypro_Minoan
|
||||
Cyrillic
|
||||
Deseret
|
||||
Devanagari
|
||||
Dives_Akuru
|
||||
Dogra
|
||||
Duployan
|
||||
Egyptian_Hieroglyphs
|
||||
Elbasan
|
||||
Elymaic
|
||||
Ethiopic
|
||||
Georgian
|
||||
Glagolitic
|
||||
Gothic
|
||||
Grantha
|
||||
Greek
|
||||
Gujarati
|
||||
Gunjala_Gondi
|
||||
Gurmukhi
|
||||
Han
|
||||
Hangul
|
||||
Hanifi_Rohingya
|
||||
Hanunoo
|
||||
Hatran
|
||||
Hebrew
|
||||
Hiragana
|
||||
Imperial_Aramaic
|
||||
Inherited
|
||||
Inscriptional_Pahlavi
|
||||
Inscriptional_Parthian
|
||||
Javanese
|
||||
Kaithi
|
||||
Kannada
|
||||
Katakana
|
||||
Kawi
|
||||
Kayah_Li
|
||||
Kharoshthi
|
||||
Khitan_Small_Script
|
||||
Khmer
|
||||
Khojki
|
||||
Khudawadi
|
||||
Lao
|
||||
Latin
|
||||
Lepcha
|
||||
Limbu
|
||||
Linear_A
|
||||
Linear_B
|
||||
Lisu
|
||||
Lycian
|
||||
Lydian
|
||||
Mahajani
|
||||
Makasar
|
||||
Malayalam
|
||||
Mandaic
|
||||
Manichaean
|
||||
Marchen
|
||||
Masaram_Gondi
|
||||
Medefaidrin
|
||||
Meetei_Mayek
|
||||
Mende_Kikakui
|
||||
Meroitic_Cursive
|
||||
Meroitic_Hieroglyphs
|
||||
Miao
|
||||
Modi
|
||||
Mongolian
|
||||
Mro
|
||||
Multani
|
||||
Myanmar
|
||||
Nabataean
|
||||
Nag_Mundari
|
||||
Nandinagari
|
||||
New_Tai_Lue
|
||||
Newa
|
||||
Nko
|
||||
Nushu
|
||||
Nyiakeng_Puachue_Hmong
|
||||
Ogham
|
||||
Ol_Chiki
|
||||
Old_Hungarian
|
||||
Old_Italic
|
||||
Old_North_Arabian
|
||||
Old_Permic
|
||||
Old_Persian
|
||||
Old_Sogdian
|
||||
Old_South_Arabian
|
||||
Old_Turkic
|
||||
Old_Uyghur
|
||||
Oriya
|
||||
Osage
|
||||
Osmanya
|
||||
Pahawh_Hmong
|
||||
Palmyrene
|
||||
Pau_Cin_Hau
|
||||
Phags_Pa
|
||||
Phoenician
|
||||
Psalter_Pahlavi
|
||||
Rejang
|
||||
Runic
|
||||
Samaritan
|
||||
Saurashtra
|
||||
Sharada
|
||||
Shavian
|
||||
Siddham
|
||||
SignWriting
|
||||
Sinhala
|
||||
Sogdian
|
||||
Sora_Sompeng
|
||||
Soyombo
|
||||
Sundanese
|
||||
Syloti_Nagri
|
||||
Syriac
|
||||
Tagalog
|
||||
Tagbanwa
|
||||
Tai_Le
|
||||
Tai_Tham
|
||||
Tai_Viet
|
||||
Takri
|
||||
Tamil
|
||||
Tangsa
|
||||
Tangut
|
||||
Telugu
|
||||
Thaana
|
||||
Thai
|
||||
Tibetan
|
||||
Tifinagh
|
||||
Tirhuta
|
||||
Toto
|
||||
Ugaritic
|
||||
Vai
|
||||
Vithkuqi
|
||||
Wancho
|
||||
Warang_Citi
|
||||
Yezidi
|
||||
Yi
|
||||
Zanabazar_Square
|
||||
|
||||
Vim character classes:
|
||||
\i identifier character NOT SUPPORTED vim
|
||||
\I «\i» except digits NOT SUPPORTED vim
|
||||
\k keyword character NOT SUPPORTED vim
|
||||
\K «\k» except digits NOT SUPPORTED vim
|
||||
\f file name character NOT SUPPORTED vim
|
||||
\F «\f» except digits NOT SUPPORTED vim
|
||||
\p printable character NOT SUPPORTED vim
|
||||
\P «\p» except digits NOT SUPPORTED vim
|
||||
\s whitespace character (== [ \t]) NOT SUPPORTED vim
|
||||
\S non-whitespace character (== [^ \t]) NOT SUPPORTED vim
|
||||
\d digits (== [0-9]) vim
|
||||
\D not «\d» vim
|
||||
\x hex digits (== [0-9A-Fa-f]) NOT SUPPORTED vim
|
||||
\X not «\x» NOT SUPPORTED vim
|
||||
\o octal digits (== [0-7]) NOT SUPPORTED vim
|
||||
\O not «\o» NOT SUPPORTED vim
|
||||
\w word character vim
|
||||
\W not «\w» vim
|
||||
\h head of word character NOT SUPPORTED vim
|
||||
\H not «\h» NOT SUPPORTED vim
|
||||
\a alphabetic NOT SUPPORTED vim
|
||||
\A not «\a» NOT SUPPORTED vim
|
||||
\l lowercase NOT SUPPORTED vim
|
||||
\L not lowercase NOT SUPPORTED vim
|
||||
\u uppercase NOT SUPPORTED vim
|
||||
\U not uppercase NOT SUPPORTED vim
|
||||
\_x «\x» plus newline, for any «x» NOT SUPPORTED vim
|
||||
|
||||
Vim flags:
|
||||
\c ignore case NOT SUPPORTED vim
|
||||
\C match case NOT SUPPORTED vim
|
||||
\m magic NOT SUPPORTED vim
|
||||
\M nomagic NOT SUPPORTED vim
|
||||
\v verymagic NOT SUPPORTED vim
|
||||
\V verynomagic NOT SUPPORTED vim
|
||||
\Z ignore differences in Unicode combining characters NOT SUPPORTED vim
|
||||
|
||||
Magic:
|
||||
(?{code}) arbitrary Perl code NOT SUPPORTED perl
|
||||
(??{code}) postponed arbitrary Perl code NOT SUPPORTED perl
|
||||
(?n) recursive call to regexp capturing group «n» NOT SUPPORTED
|
||||
(?+n) recursive call to relative group «+n» NOT SUPPORTED
|
||||
(?-n) recursive call to relative group «-n» NOT SUPPORTED
|
||||
(?C) PCRE callout NOT SUPPORTED pcre
|
||||
(?R) recursive call to entire regexp (== (?0)) NOT SUPPORTED
|
||||
(?&name) recursive call to named group NOT SUPPORTED
|
||||
(?P=name) named backreference NOT SUPPORTED
|
||||
(?P>name) recursive call to named group NOT SUPPORTED
|
||||
(?(cond)true|false) conditional branch NOT SUPPORTED
|
||||
(?(cond)true) conditional branch NOT SUPPORTED
|
||||
(*ACCEPT) make regexps more like Prolog NOT SUPPORTED
|
||||
(*COMMIT) NOT SUPPORTED
|
||||
(*F) NOT SUPPORTED
|
||||
(*FAIL) NOT SUPPORTED
|
||||
(*MARK) NOT SUPPORTED
|
||||
(*PRUNE) NOT SUPPORTED
|
||||
(*SKIP) NOT SUPPORTED
|
||||
(*THEN) NOT SUPPORTED
|
||||
(*ANY) set newline convention NOT SUPPORTED
|
||||
(*ANYCRLF) NOT SUPPORTED
|
||||
(*CR) NOT SUPPORTED
|
||||
(*CRLF) NOT SUPPORTED
|
||||
(*LF) NOT SUPPORTED
|
||||
(*BSR_ANYCRLF) set \R convention NOT SUPPORTED pcre
|
||||
(*BSR_UNICODE) NOT SUPPORTED pcre
|
||||
|
||||
-16
@@ -1,16 +0,0 @@
|
||||
#!/bin/bash
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation version 2.1
# of the License.
#
# Copyright(c) 2023 Huawei Device Co., Ltd.

# Unpacks re2-2024-07-02.tar.gz inside the directory given as $1 and
# renames the extracted tree to "re2", replacing any previous "re2" directory.
#
# Usage: <this script> <directory containing re2-2024-07-02.tar.gz>

# Abort on the first failing command so a failed cd or tar never leads to
# the later steps running in the wrong place.
set -e

# Quote the argument so paths containing spaces or glob characters work, and
# refuse to run without it: an unquoted empty "$1" would make `cd` jump to
# $HOME, where the `rm -rf re2` below would then be executed.
cd "${1:?usage: $0 <directory containing re2-2024-07-02.tar.gz>}"

# Remove the result of a previous extraction so that `mv` below does not
# move the fresh tree *into* an existing re2/ directory.
if [ -d "re2" ]; then
    rm -rf re2
fi

tar zxvf re2-2024-07-02.tar.gz
mv re2-2024-07-02 re2
exit 0
|
||||
@@ -0,0 +1,104 @@
|
||||
#!/bin/sh
|
||||
# From Gerrit Code Review 2.2.1
|
||||
#
|
||||
# Part of Gerrit Code Review (http://code.google.com/p/gerrit/)
|
||||
#
|
||||
# Copyright (C) 2009 The Android Open Source Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
CHANGE_ID_AFTER="Bug|Issue"
|
||||
MSG="$1"
|
||||
|
||||
# Check for, and add if missing, a unique Change-Id
|
||||
#
|
||||
# add_ChangeId: insert a "Change-Id: I<id>" footer line into the commit
# message file named by $MSG.
#
# Does nothing when the cleaned-up message is empty or when the file already
# contains a line starting with "Change-Id:" (matched case-insensitively).
# Sets the global $clean_message (read by _gen_ChangeIdInput) and $id.
add_ChangeId() {
|
||||
# Message text used for hashing: stop at the first "diff --git a/" line,
# drop Signed-off-by lines and "#" comment lines, then let
# `git stripspace` normalise the whitespace.
clean_message=`sed -e '
|
||||
/^diff --git a\/.*/{
|
||||
s///
|
||||
q
|
||||
}
|
||||
/^Signed-off-by:/d
|
||||
/^#/d
|
||||
' "$MSG" | git stripspace`
|
||||
# Nothing left after clean-up: leave the message file untouched.
if test -z "$clean_message"
|
||||
then
|
||||
return
|
||||
fi
|
||||
|
||||
# An existing Change-Id is kept as is; never add a second one.
if grep -i '^Change-Id:' "$MSG" >/dev/null
|
||||
then
|
||||
return
|
||||
fi
|
||||
|
||||
# The id is whatever _gen_ChangeId prints.
id=`_gen_ChangeId`
|
||||
# The perl program below rewrites $MSG in place: it splits the message into
# body and footer, skips leading footer lines matching $CHANGE_ID_AFTER
# (so the Change-Id lands after them), and splices in "Change-Id: I$id".
perl -e '
|
||||
$MSG = shift;
|
||||
$id = shift;
|
||||
$CHANGE_ID_AFTER = shift;
|
||||
|
||||
undef $/;
|
||||
open(I, $MSG); $_ = <I>; close I;
|
||||
s|^diff --git a/.*||ms;
|
||||
s|^#.*$||mg;
|
||||
exit unless $_;
|
||||
|
||||
@message = split /\n/;
|
||||
$haveFooter = 0;
|
||||
$startFooter = @message;
|
||||
for($line = @message - 1; $line >= 0; $line--) {
|
||||
$_ = $message[$line];
|
||||
|
||||
if (/^[a-zA-Z0-9-]+:/ && !m,^[a-z0-9-]+://,) {
|
||||
$haveFooter++;
|
||||
next;
|
||||
}
|
||||
next if /^[ []/;
|
||||
$startFooter = $line if ($haveFooter && /^\r?$/);
|
||||
last;
|
||||
}
|
||||
|
||||
@footer = @message[$startFooter+1..@message];
|
||||
@message = @message[0..$startFooter];
|
||||
push(@footer, "") unless @footer;
|
||||
|
||||
for ($line = 0; $line < @footer; $line++) {
|
||||
$_ = $footer[$line];
|
||||
next if /^($CHANGE_ID_AFTER):/i;
|
||||
last;
|
||||
}
|
||||
splice(@footer, $line, 0, "Change-Id: I$id");
|
||||
|
||||
$_ = join("\n", @message, @footer);
|
||||
open(O, ">$MSG"); print O; close O;
|
||||
' "$MSG" "$id" "$CHANGE_ID_AFTER"
|
||||
}
|
||||
# Print to stdout the text that gets hashed into the Change-Id: a
# commit-object-like header (tree, optional parent, author, committer),
# a blank line, and then $clean_message without a trailing newline.
_gen_ChangeIdInput() {
    echo "tree $(git write-tree)"
    # The parent line is only emitted when HEAD resolves to a commit;
    # rev-parse's error output is discarded.
    if parent=$(git rev-parse HEAD^0 2>/dev/null); then
        echo "parent $parent"
    fi
    echo "author $(git var GIT_AUTHOR_IDENT)"
    echo "committer $(git var GIT_COMMITTER_IDENT)"
    echo
    printf '%s' "$clean_message"
}
|
||||
# Print the Change-Id hash: the output of _gen_ChangeIdInput fed to
# `git hash-object`, treated as a commit object read from stdin.
_gen_ChangeId() {
    _gen_ChangeIdInput | git hash-object -t commit --stdin
}
|
||||
|
||||
|
||||
add_ChangeId
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
global:
|
||||
# re2::RE2*
|
||||
_ZN3re23RE2*;
|
||||
_ZNK3re23RE2*;
|
||||
# re2::operator<<*
|
||||
_ZN3re2ls*;
|
||||
# re2::FilteredRE2*
|
||||
_ZN3re211FilteredRE2*;
|
||||
_ZNK3re211FilteredRE2*;
|
||||
# re2::re2_internal*
|
||||
_ZN3re212re2_internal*;
|
||||
_ZNK3re212re2_internal*;
|
||||
local:
|
||||
*;
|
||||
};
|
||||
@@ -0,0 +1,12 @@
|
||||
# Linker doesn't like these unmangled:
|
||||
# re2::RE2*
|
||||
__ZN3re23RE2*
|
||||
__ZNK3re23RE2*
|
||||
# re2::operator<<*
|
||||
__ZN3re2ls*
|
||||
# re2::FilteredRE2*
|
||||
__ZN3re211FilteredRE2*
|
||||
__ZNK3re211FilteredRE2*
|
||||
# re2::re2_internal*
|
||||
__ZN3re212re2_internal*
|
||||
__ZNK3re212re2_internal*
|
||||
@@ -0,0 +1,72 @@
|
||||
# Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Bazel (http://bazel.build/) BUILD file for RE2 Python.
|
||||
|
||||
load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
|
||||
load("@rules_python//python:defs.bzl", "py_library", "py_test")
|
||||
|
||||
pybind_extension(
|
||||
name = "_re2",
|
||||
srcs = ["_re2.cc"],
|
||||
deps = [
|
||||
"//:re2",
|
||||
"@abseil-cpp//absl/strings",
|
||||
],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "re2",
|
||||
srcs = ["re2.py"],
|
||||
data = [":_re2"],
|
||||
imports = ["."],
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "re2_test",
|
||||
size = "small",
|
||||
srcs = ["re2_test.py"],
|
||||
deps = [
|
||||
":re2",
|
||||
"@abseil-py//absl/testing:absltest",
|
||||
"@abseil-py//absl/testing:parameterized",
|
||||
],
|
||||
)
|
||||
|
||||
# These are implementation details for `setup.py`, so they can be
|
||||
# named however we want. For now, they are named to be consistent
|
||||
# with the `--cpu` flag values that they will eventually replace.
|
||||
|
||||
platform(
|
||||
name = "darwin_x86_64",
|
||||
constraint_values = [
|
||||
"@platforms//cpu:x86_64",
|
||||
"@platforms//os:macos",
|
||||
],
|
||||
)
|
||||
|
||||
platform(
|
||||
name = "darwin_arm64",
|
||||
constraint_values = [
|
||||
"@platforms//cpu:arm64",
|
||||
"@platforms//os:macos",
|
||||
],
|
||||
)
|
||||
|
||||
platform(
|
||||
name = "x64_x86_windows",
|
||||
constraint_values = [
|
||||
"@platforms//cpu:x86_32",
|
||||
"@platforms//os:windows",
|
||||
],
|
||||
)
|
||||
|
||||
platform(
|
||||
name = "x64_windows",
|
||||
constraint_values = [
|
||||
"@platforms//cpu:x86_64",
|
||||
"@platforms//os:windows",
|
||||
],
|
||||
)
|
||||
@@ -0,0 +1 @@
|
||||
Building requires Python 3 and pybind11 to be installed on your system.
|
||||
+352
@@ -0,0 +1,352 @@
|
||||
// Copyright 2019 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "pybind11/buffer_info.h"
|
||||
#include "pybind11/gil.h"
|
||||
#include "pybind11/pybind11.h"
|
||||
#include "pybind11/pytypes.h"
|
||||
#include "pybind11/stl.h" // IWYU pragma: keep
|
||||
#include "re2/filtered_re2.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/set.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <basetsd.h>
|
||||
#define ssize_t SSIZE_T
|
||||
#endif
|
||||
|
||||
namespace re2_python {
|
||||
|
||||
// This is conventional.
|
||||
namespace py = pybind11;
|
||||
|
||||
// In terms of the pybind11 API, a py::buffer is merely a py::object that
|
||||
// supports the buffer interface/protocol and you must explicitly request
|
||||
// a py::buffer_info in order to access the actual bytes. Under the hood,
|
||||
// the py::buffer_info manages a reference count to the py::buffer, so it
|
||||
// must be constructed and subsequently destructed while holding the GIL.
|
||||
// Builds a non-owning absl::string_view over the memory described by
// `bytes`: it starts at `bytes.ptr` and has length `bytes.size`. Nothing is
// copied, so the view refers directly to the buffer's storage.
static inline absl::string_view FromBytes(const py::buffer_info& bytes) {
  char* const begin = reinterpret_cast<char*>(bytes.ptr);
  const ssize_t length = bytes.size;
  return absl::string_view(begin, length);
}
|
||||
|
||||
// Returns how many bytes to advance past the character starting at `ptr`,
// decided solely by the high nibble of the first byte: 0x0-0xB -> 1,
// 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4. No validation of the following bytes
// is performed.
static inline int OneCharLen(const char* ptr) {
  static constexpr unsigned char kLenByHighNibble[16] = {
      1, 1, 1, 1, 1, 1, 1, 1,  // 0x0-0x7
      1, 1, 1, 1,              // 0x8-0xB
      2, 2,                    // 0xC-0xD
      3,                       // 0xE
      4,                       // 0xF
  };
  // Go through unsigned char so a byte >= 0x80 never yields a negative index.
  const unsigned char lead = static_cast<unsigned char>(*ptr);
  return kLenByHighNibble[lead >> 4];
}
|
||||
|
||||
// Helper function for when Python encodes str to bytes and then needs to
|
||||
// convert str offsets to bytes offsets. Assumes that text is valid UTF-8.
|
||||
// Starting at byte offset `pos` in `buffer`, steps over up to `len`
// characters (each sized by OneCharLen()) and returns the number of bytes
// covered. Stepping also stops once the cursor reaches the end of the buffer.
ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) {
  auto bytes = buffer.request();
  auto text = FromBytes(bytes);
  const char* const start = text.data() + pos;
  const char* const limit = text.data() + text.size();
  const char* cursor = start;
  for (ssize_t remaining = len; cursor < limit && remaining > 0; --remaining) {
    cursor += OneCharLen(cursor);
  }
  return cursor - start;
}
|
||||
|
||||
// Helper function for when Python decodes bytes to str and then needs to
|
||||
// convert bytes offsets to str offsets. Assumes that text is valid UTF-8.
|
||||
// Counts the characters (each sized by OneCharLen()) encountered while
// walking `buffer` from byte offset `pos` until the cursor reaches or passes
// byte offset `endpos`.
ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) {
  auto bytes = buffer.request();
  auto text = FromBytes(bytes);
  const char* const limit = text.data() + endpos;
  ssize_t count = 0;
  for (const char* cursor = text.data() + pos; cursor < limit;
       cursor += OneCharLen(cursor)) {
    ++count;
  }
  return count;
}
|
||||
|
||||
// Constructs a heap-allocated RE2 from the pattern bytes held in `buffer`
// and the given `options`, returning ownership to the caller.
std::unique_ptr<RE2> RE2InitShim(py::buffer buffer,
                                 const RE2::Options& options) {
  // Keep the buffer_info in a named local so it outlives the view built on it.
  auto bytes = buffer.request();
  return std::make_unique<RE2>(FromBytes(bytes), options);
}
|
||||
|
||||
// Exposes self.error() to Python as a bytes object, i.e. the std::string is
// handed over undecoded rather than being turned into a str.
py::bytes RE2ErrorShim(const RE2& self) {
  return py::bytes(self.error());
}
|
||||
|
||||
// Returns the (name, index) pairs of self.NamedCapturingGroups(), with each
// name converted to py::bytes (undecoded), in the map's iteration order.
std::vector<std::pair<py::bytes, int>> RE2NamedCapturingGroupsShim(
    const RE2& self) {
  const auto& named_groups = self.NamedCapturingGroups();
  std::vector<std::pair<py::bytes, int>> groups;
  // Reserve exactly as many slots as entries are emplaced below. Sizing this
  // from NumberOfCapturingGroups() instead would count groups that are not in
  // the map, and a negative count would turn into a huge reserve() request.
  groups.reserve(named_groups.size());
  for (const auto& entry : named_groups) {
    groups.emplace_back(entry.first, entry.second);
  }
  return groups;
}
|
||||
|
||||
// Adapts the out-parameter API self.ProgramFanout(&v) to a function that
// returns the filled vector by value; the call's own result is not used.
std::vector<int> RE2ProgramFanoutShim(const RE2& self) {
  std::vector<int> fanout;
  self.ProgramFanout(&fanout);
  return fanout;
}
|
||||
|
||||
// Adapts the out-parameter API self.ReverseProgramFanout(&v) to a function
// that returns the filled vector by value; the call's own result is not used.
std::vector<int> RE2ReverseProgramFanoutShim(const RE2& self) {
  std::vector<int> fanout;
  self.ReverseProgramFanout(&fanout);
  return fanout;
}
|
||||
|
||||
std::tuple<bool, py::bytes, py::bytes> RE2PossibleMatchRangeShim(
|
||||
const RE2& self, int maxlen) {
|
||||
std::string min, max;
|
||||
// Return std::string as bytes. That is, without decoding to str.
|
||||
return {self.PossibleMatchRange(&min, &max, maxlen), min, max};
|
||||
}
|
||||
|
||||
std::vector<std::pair<ssize_t, ssize_t>> RE2MatchShim(const RE2& self,
|
||||
RE2::Anchor anchor,
|
||||
py::buffer buffer,
|
||||
ssize_t pos,
|
||||
ssize_t endpos) {
|
||||
auto bytes = buffer.request();
|
||||
auto text = FromBytes(bytes);
|
||||
const int num_groups = self.NumberOfCapturingGroups() + 1; // need $0
|
||||
std::vector<absl::string_view> groups;
|
||||
groups.resize(num_groups);
|
||||
py::gil_scoped_release release_gil;
|
||||
if (!self.Match(text, pos, endpos, anchor, groups.data(), groups.size())) {
|
||||
// Ensure that groups are null before converting to spans!
|
||||
for (auto& it : groups) {
|
||||
it = absl::string_view();
|
||||
}
|
||||
}
|
||||
std::vector<std::pair<ssize_t, ssize_t>> spans;
|
||||
spans.reserve(num_groups);
|
||||
for (const auto& it : groups) {
|
||||
if (it.data() == NULL) {
|
||||
spans.emplace_back(-1, -1);
|
||||
} else {
|
||||
spans.emplace_back(it.data() - text.data(),
|
||||
it.data() - text.data() + it.size());
|
||||
}
|
||||
}
|
||||
return spans;
|
||||
}
|
||||
|
||||
// Applies RE2::QuoteMeta() to the bytes held in `buffer` and returns the
// resulting std::string to Python as bytes, without decoding it to str.
py::bytes RE2QuoteMetaShim(py::buffer buffer) {
  // Named local: the buffer_info must stay alive while the view is in use.
  auto bytes = buffer.request();
  return py::bytes(RE2::QuoteMeta(FromBytes(bytes)));
}
|
||||
|
||||
class Set {
|
||||
public:
|
||||
Set(RE2::Anchor anchor, const RE2::Options& options)
|
||||
: set_(options, anchor) {}
|
||||
|
||||
~Set() = default;
|
||||
|
||||
// Not copyable or movable.
|
||||
Set(const Set&) = delete;
|
||||
Set& operator=(const Set&) = delete;
|
||||
|
||||
int Add(py::buffer buffer) {
|
||||
auto bytes = buffer.request();
|
||||
auto pattern = FromBytes(bytes);
|
||||
int index = set_.Add(pattern, /*error=*/NULL); // -1 on error
|
||||
return index;
|
||||
}
|
||||
|
||||
bool Compile() {
|
||||
// Compiling can fail.
|
||||
return set_.Compile();
|
||||
}
|
||||
|
||||
std::vector<int> Match(py::buffer buffer) const {
|
||||
auto bytes = buffer.request();
|
||||
auto text = FromBytes(bytes);
|
||||
std::vector<int> matches;
|
||||
py::gil_scoped_release release_gil;
|
||||
set_.Match(text, &matches);
|
||||
return matches;
|
||||
}
|
||||
|
||||
private:
|
||||
RE2::Set set_;
|
||||
};
|
||||
|
||||
class Filter {
|
||||
public:
|
||||
Filter() = default;
|
||||
~Filter() = default;
|
||||
|
||||
// Not copyable or movable.
|
||||
Filter(const Filter&) = delete;
|
||||
Filter& operator=(const Filter&) = delete;
|
||||
|
||||
int Add(py::buffer buffer, const RE2::Options& options) {
|
||||
auto bytes = buffer.request();
|
||||
auto pattern = FromBytes(bytes);
|
||||
int index = -1; // not clobbered on error
|
||||
filter_.Add(pattern, options, &index);
|
||||
return index;
|
||||
}
|
||||
|
||||
bool Compile() {
|
||||
std::vector<std::string> atoms;
|
||||
filter_.Compile(&atoms);
|
||||
RE2::Options options;
|
||||
options.set_literal(true);
|
||||
options.set_case_sensitive(false);
|
||||
set_ = std::make_unique<RE2::Set>(options, RE2::UNANCHORED);
|
||||
for (int i = 0; i < static_cast<int>(atoms.size()); ++i) {
|
||||
if (set_->Add(atoms[i], /*error=*/NULL) != i) {
|
||||
// Should never happen: the atom is a literal!
|
||||
py::pybind11_fail("set_->Add() failed");
|
||||
}
|
||||
}
|
||||
// Compiling can fail.
|
||||
return set_->Compile();
|
||||
}
|
||||
|
||||
std::vector<int> Match(py::buffer buffer, bool potential) const {
|
||||
if (set_ == nullptr) {
|
||||
py::pybind11_fail("Match() called before compiling");
|
||||
}
|
||||
|
||||
auto bytes = buffer.request();
|
||||
auto text = FromBytes(bytes);
|
||||
std::vector<int> atoms;
|
||||
py::gil_scoped_release release_gil;
|
||||
set_->Match(text, &atoms);
|
||||
std::vector<int> matches;
|
||||
if (potential) {
|
||||
filter_.AllPotentials(atoms, &matches);
|
||||
} else {
|
||||
filter_.AllMatches(text, atoms, &matches);
|
||||
}
|
||||
return matches;
|
||||
}
|
||||
|
||||
const RE2& GetRE2(int index) const {
|
||||
return filter_.GetRE2(index);
|
||||
}
|
||||
|
||||
private:
|
||||
re2::FilteredRE2 filter_;
|
||||
std::unique_ptr<RE2::Set> set_;
|
||||
};
|
||||
|
||||
// Defines the `_re2` extension module: two module-level functions, the RE2
// class with its nested Anchor enum and Options class, and the Set and Filter
// wrapper classes.
PYBIND11_MODULE(_re2, module) {
  // Translate exceptions thrown by py::pybind11_fail() into Python.
  py::register_local_exception<std::runtime_error>(module, "Error");

  module.def("CharLenToBytes", &CharLenToBytes);
  module.def("BytesToCharLen", &BytesToCharLen);

  // CLASSES
  //     class RE2
  //         enum Anchor
  //         class Options
  //             enum Encoding
  //     class Set
  //     class Filter
  //
  // Every class and enum is declared here, before any member is defined.
  py::class_<RE2> re2(module, "RE2");
  py::enum_<RE2::Anchor> anchor(re2, "Anchor");
  py::class_<RE2::Options> options(re2, "Options");
  py::enum_<RE2::Options::Encoding> encoding(options, "Encoding");
  py::class_<Set> set(module, "Set");
  py::class_<Filter> filter(module, "Filter");

  anchor.value("UNANCHORED", RE2::Anchor::UNANCHORED);
  anchor.value("ANCHOR_START", RE2::Anchor::ANCHOR_START);
  anchor.value("ANCHOR_BOTH", RE2::Anchor::ANCHOR_BOTH);

  encoding.value("UTF8", RE2::Options::Encoding::EncodingUTF8);
  encoding.value("LATIN1", RE2::Options::Encoding::EncodingLatin1);

  // Each option becomes a read/write property backed by the corresponding
  // getter and setter of RE2::Options.
  options.def(py::init<>())
      .def_property("max_mem",                   //
                    &RE2::Options::max_mem,      //
                    &RE2::Options::set_max_mem)  //
      .def_property("encoding",                   //
                    &RE2::Options::encoding,      //
                    &RE2::Options::set_encoding)  //
      .def_property("posix_syntax",                   //
                    &RE2::Options::posix_syntax,      //
                    &RE2::Options::set_posix_syntax)  //
      .def_property("longest_match",                   //
                    &RE2::Options::longest_match,      //
                    &RE2::Options::set_longest_match)  //
      .def_property("log_errors",                   //
                    &RE2::Options::log_errors,      //
                    &RE2::Options::set_log_errors)  //
      .def_property("literal",                   //
                    &RE2::Options::literal,      //
                    &RE2::Options::set_literal)  //
      .def_property("never_nl",                   //
                    &RE2::Options::never_nl,      //
                    &RE2::Options::set_never_nl)  //
      .def_property("dot_nl",                   //
                    &RE2::Options::dot_nl,      //
                    &RE2::Options::set_dot_nl)  //
      .def_property("never_capture",                   //
                    &RE2::Options::never_capture,      //
                    &RE2::Options::set_never_capture)  //
      .def_property("case_sensitive",                   //
                    &RE2::Options::case_sensitive,      //
                    &RE2::Options::set_case_sensitive)  //
      .def_property("perl_classes",                   //
                    &RE2::Options::perl_classes,      //
                    &RE2::Options::set_perl_classes)  //
      .def_property("word_boundary",                   //
                    &RE2::Options::word_boundary,      //
                    &RE2::Options::set_word_boundary)  //
      .def_property("one_line",                   //
                    &RE2::Options::one_line,      //
                    &RE2::Options::set_one_line);  //

  // Methods bound to a *Shim function go through a free-function adapter;
  // the others bind the RE2 member function directly.
  re2.def(py::init(&RE2InitShim))
      .def("ok", &RE2::ok)
      .def("error", &RE2ErrorShim)
      .def("options", &RE2::options)
      .def("NumberOfCapturingGroups", &RE2::NumberOfCapturingGroups)
      .def("NamedCapturingGroups", &RE2NamedCapturingGroupsShim)
      .def("ProgramSize", &RE2::ProgramSize)
      .def("ReverseProgramSize", &RE2::ReverseProgramSize)
      .def("ProgramFanout", &RE2ProgramFanoutShim)
      .def("ReverseProgramFanout", &RE2ReverseProgramFanoutShim)
      .def("PossibleMatchRange", &RE2PossibleMatchRangeShim)
      .def("Match", &RE2MatchShim)
      .def_static("QuoteMeta", &RE2QuoteMetaShim);

  set.def(py::init<RE2::Anchor, const RE2::Options&>())
      .def("Add", &Set::Add)
      .def("Compile", &Set::Compile)
      .def("Match", &Set::Match);

  // GetRE2() returns a reference to an object held by the Filter, hence the
  // reference_internal return value policy.
  filter.def(py::init<>())
      .def("Add", &Filter::Add)
      .def("Compile", &Filter::Compile)
      .def("Match", &Filter::Match)
      .def("GetRE2", &Filter::GetRE2,
           py::return_value_policy::reference_internal);
}
|
||||
|
||||
} // namespace re2_python
|
||||
+583
@@ -0,0 +1,583 @@
|
||||
# Copyright 2019 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
r"""A drop-in replacement for the re module.
|
||||
|
||||
It uses RE2 under the hood, of course, so various PCRE features
|
||||
(e.g. backreferences, look-around assertions) are not supported.
|
||||
See https://github.com/google/re2/wiki/Syntax for the canonical
|
||||
reference, but known syntactic "gotchas" relative to Python are:
|
||||
|
||||
* PCRE supports \Z and \z; RE2 supports \z; Python supports \z,
|
||||
but calls it \Z. You must rewrite \Z to \z in pattern strings.
|
||||
|
||||
Known differences between this module's API and the re module's API:
|
||||
|
||||
* The error class does not provide any error information as attributes.
|
||||
* The Options class replaces the re module's flags with RE2's options as
|
||||
gettable/settable properties. Please see re2.h for their documentation.
|
||||
* The pattern string and the input string do not have to be the same type.
|
||||
Any str will be encoded to UTF-8.
|
||||
* The pattern string cannot be str if the options specify Latin-1 encoding.
|
||||
|
||||
This module's LRU cache contains a maximum of 128 regular expression objects.
|
||||
Each regular expression object's underlying RE2 object uses a maximum of 8MiB
|
||||
of memory (by default). Hence, this module's LRU cache uses a maximum of 1GiB
|
||||
of memory (by default), but in most cases, it should use much less than that.
|
||||
"""
|
||||
|
||||
import codecs
|
||||
import functools
|
||||
import itertools
|
||||
|
||||
import _re2
|
||||
|
||||
|
||||
# pybind11 translates C++ exceptions to Python exceptions.
|
||||
# We use that same Python exception class for consistency.
|
||||
error = _re2.Error
|
||||
|
||||
|
||||
class Options(_re2.RE2.Options):
  """RE2's options as gettable/settable properties.

  Please see re2.h for the documentation of the individual options.
  """

  __slots__ = ()

  # The name of every option property. compile() reads the options in this
  # order to build a tuple of values and _Regexp._make() zips the same names
  # with those values in order to rebuild an Options object.
  NAMES = (
      'max_mem',
      'encoding',
      'posix_syntax',
      'longest_match',
      'log_errors',
      'literal',
      'never_nl',
      'dot_nl',
      'never_capture',
      'case_sensitive',
      'perl_classes',
      'word_boundary',
      'one_line',
  )
|
||||
|
||||
|
||||
def compile(pattern, options=None):
  """Returns the regular expression object for pattern.

  Args:
    pattern: a str or bytes pattern, or an already compiled regular
      expression object.
    options: the Options to compile with; defaults to Options().

  Returns:
    The regular expression object. An already compiled object is returned
    unchanged; anything else goes through the LRU cache of _Regexp._make().

  Raises:
    error: if pattern is already compiled and options are specified.
  """
  if isinstance(pattern, _Regexp):
    if options:
      raise error('pattern is already compiled, so '
                  'options may not be specified')
    # Hand back the object itself. Rebuilding it from its pattern string with
    # default options would silently discard the options that it was compiled
    # with (e.g. case_sensitive=False).
    return pattern
  options = options or Options()
  # The cache needs hashable arguments, so pass the option values as a tuple.
  values = tuple(getattr(options, name) for name in Options.NAMES)
  return _Regexp._make(pattern, values)
|
||||
|
||||
|
||||
def search(pattern, text, options=None):
  """Compiles pattern and returns the result of its search() on text."""
  regexp = compile(pattern, options=options)
  return regexp.search(text)
|
||||
|
||||
|
||||
def match(pattern, text, options=None):
  """Compiles pattern and returns the result of its match() on text."""
  regexp = compile(pattern, options=options)
  return regexp.match(text)
|
||||
|
||||
|
||||
def fullmatch(pattern, text, options=None):
  """Compiles pattern and returns the result of its fullmatch() on text."""
  regexp = compile(pattern, options=options)
  return regexp.fullmatch(text)
|
||||
|
||||
|
||||
def finditer(pattern, text, options=None):
  """Compiles pattern and returns the result of its finditer() on text."""
  regexp = compile(pattern, options=options)
  return regexp.finditer(text)
|
||||
|
||||
|
||||
def findall(pattern, text, options=None):
  """Compiles pattern and returns the result of its findall() on text."""
  regexp = compile(pattern, options=options)
  return regexp.findall(text)
|
||||
|
||||
|
||||
def split(pattern, text, maxsplit=0, options=None):
  """Compiles pattern and returns the result of its split() on text."""
  regexp = compile(pattern, options=options)
  return regexp.split(text, maxsplit)
|
||||
|
||||
|
||||
def subn(pattern, repl, text, count=0, options=None):
  """Compiles pattern and returns the result of its subn() on text."""
  regexp = compile(pattern, options=options)
  return regexp.subn(repl, text, count)
|
||||
|
||||
|
||||
def sub(pattern, repl, text, count=0, options=None):
  """Compiles pattern and returns the result of its sub() on text."""
  regexp = compile(pattern, options=options)
  return regexp.sub(repl, text, count)
|
||||
|
||||
|
||||
def _encode(t):
|
||||
return t.encode(encoding='utf-8')
|
||||
|
||||
|
||||
def _decode(b):
|
||||
return b.decode(encoding='utf-8')
|
||||
|
||||
|
||||
def escape(pattern):
  """Escapes pattern with RE2's QuoteMeta().

  The result has the same string type as pattern: a str is encoded to UTF-8
  before quoting and decoded again afterwards, whereas anything else is
  handed to QuoteMeta() as is.
  """
  if not isinstance(pattern, str):
    return _re2.RE2.QuoteMeta(pattern)
  quoted = _re2.RE2.QuoteMeta(_encode(pattern))
  return _decode(quoted)
|
||||
|
||||
|
||||
def purge():
  """Clears the LRU cache of regular expression objects."""
  clear_cache = _Regexp._make.cache_clear
  return clear_cache()
|
||||
|
||||
|
||||
# Short alias for the anchor enum of the extension module.
_Anchor = _re2.RE2.Anchor

# The span that stands for "did not match" / "did not participate".
_NULL_SPAN = (-1, -1)
|
||||
|
||||
|
||||
class _Regexp(object):
  """A compiled regular expression object.

  Holds the pattern as it was given (str or bytes) together with the
  underlying _re2.RE2 object. Instances normally come from compile(), which
  goes through the LRU-cached _make() factory.
  """

  __slots__ = ('_pattern', '_regexp')

  @classmethod
  @functools.lru_cache(typed=True)
  def _make(cls, pattern, values):
    """Creates the object for pattern and the given option values (cached).

    Args:
      pattern: the str or bytes pattern.
      values: a tuple of option values in the order of Options.NAMES.

    Returns:
      The regular expression object.
    """
    options = Options()
    for name, value in zip(Options.NAMES, values):
      setattr(options, name, value)
    return cls(pattern, options)

  def __init__(self, pattern, options):
    """Compiles pattern with options.

    Raises:
      error: if pattern is a str but the options specify LATIN1 encoding, or
        if the underlying RE2 object reports that it is not ok.
    """
    self._pattern = pattern
    if isinstance(self._pattern, str):
      if options.encoding == Options.Encoding.LATIN1:
        raise error('string type of pattern is str, but '
                    'encoding specified in options is LATIN1')
      encoded_pattern = _encode(self._pattern)
      self._regexp = _re2.RE2(encoded_pattern, options)
    else:
      self._regexp = _re2.RE2(self._pattern, options)
    if not self._regexp.ok():
      raise error(self._regexp.error())

  def __getstate__(self):
    """Returns the picklable state: the pattern and a dict of the options."""
    options = {name: getattr(self.options, name) for name in Options.NAMES}
    return self._pattern, options

  def __setstate__(self, state):
    """Restores the state by borrowing the fields of the equivalent object."""
    pattern, options = state
    values = tuple(options[name] for name in Options.NAMES)
    other = _Regexp._make(pattern, values)
    self._pattern = other._pattern
    self._regexp = other._regexp

  def _match(self, anchor, text, pos=None, endpos=None):
    """Yields a _Match for each successive match in text[pos:endpos].

    Args:
      anchor: the _Anchor that is passed on to the underlying Match().
      text: the str or bytes to match against.
      pos: the offset at which to start; clamped to [0, len(text)].
      endpos: the offset at which to stop; clamped to [0, len(text)].

    Yields:
      _Match objects whose spans are offsets into text (str offsets for str,
      bytes offsets for bytes). Every one of them reports the clamped pos and
      endpos that the iteration started with.
    """
    pos = 0 if pos is None else max(0, min(pos, len(text)))
    endpos = len(text) if endpos is None else max(0, min(endpos, len(text)))
    if pos > endpos:
      return
    if isinstance(text, str):
      # Matching happens on the UTF-8 encoding, so convert the str offsets to
      # bytes offsets going in and the bytes offsets to str offsets going out.
      encoded_text = _encode(text)
      encoded_pos = _re2.CharLenToBytes(encoded_text, 0, pos)
      if endpos == len(text):
        # This is the common case.
        encoded_endpos = len(encoded_text)
      else:
        encoded_endpos = encoded_pos + _re2.CharLenToBytes(
            encoded_text, encoded_pos, endpos - pos)
      decoded_offsets = {0: 0}
      last_offset = 0
      while True:
        spans = self._regexp.Match(anchor, encoded_text, encoded_pos,
                                   encoded_endpos)
        if spans[0] == _NULL_SPAN:
          break

        # This algorithm is linear in the length of encoded_text. Specifically,
        # no matter how many groups there are for a given regular expression or
        # how many iterations through the loop there are for a given generator,
        # this algorithm uses a single, straightforward pass over encoded_text.
        offsets = sorted(set(itertools.chain(*spans)))
        if offsets[0] == -1:
          offsets = offsets[1:]
        # Discard the rest of the items because they are useless now - and we
        # could accumulate one item per str offset in the pathological case!
        decoded_offsets = {last_offset: decoded_offsets[last_offset]}
        for offset in offsets:
          decoded_offsets[offset] = (
              decoded_offsets[last_offset] +
              _re2.BytesToCharLen(encoded_text, last_offset, offset))
          last_offset = offset

        def decode(span):
          if span == _NULL_SPAN:
            return span
          return decoded_offsets[span[0]], decoded_offsets[span[1]]

        decoded_spans = [decode(span) for span in spans]
        yield _Match(self, text, pos, endpos, decoded_spans)
        if encoded_pos == encoded_endpos:
          break
        elif encoded_pos == spans[0][1]:
          # We matched the empty string at encoded_pos and would be stuck, so
          # in order to make forward progress, increment the str offset.
          encoded_pos += _re2.CharLenToBytes(encoded_text, encoded_pos, 1)
        else:
          encoded_pos = spans[0][1]
    else:
      # Advance a separate cursor rather than pos itself so that every match
      # reports the pos that the iteration started with, exactly as the str
      # branch above does.
      cursor = pos
      while True:
        spans = self._regexp.Match(anchor, text, cursor, endpos)
        if spans[0] == _NULL_SPAN:
          break
        yield _Match(self, text, pos, endpos, spans)
        if cursor == endpos:
          break
        elif cursor == spans[0][1]:
          # We matched the empty string at cursor and would be stuck, so in
          # order to make forward progress, increment the bytes offset.
          cursor += 1
        else:
          cursor = spans[0][1]

  def search(self, text, pos=None, endpos=None):
    """Returns the first unanchored match in text, or None."""
    return next(self._match(_Anchor.UNANCHORED, text, pos, endpos), None)

  def match(self, text, pos=None, endpos=None):
    """Returns the match anchored at the start of the range, or None."""
    return next(self._match(_Anchor.ANCHOR_START, text, pos, endpos), None)

  def fullmatch(self, text, pos=None, endpos=None):
    """Returns the match anchored at both ends of the range, or None."""
    return next(self._match(_Anchor.ANCHOR_BOTH, text, pos, endpos), None)

  def finditer(self, text, pos=None, endpos=None):
    """Returns a generator over the unanchored matches in text."""
    return self._match(_Anchor.UNANCHORED, text, pos, endpos)

  def findall(self, text, pos=None, endpos=None):
    """Returns a list with one item per match in text.

    The item is the whole match if there are no groups, the group if there is
    exactly one and a tuple of all the groups otherwise. Groups that did not
    participate are reported as the empty str/bytes.
    """
    empty = type(text)()
    items = []
    for match in self.finditer(text, pos, endpos):
      if not self.groups:
        item = match.group()
      elif self.groups == 1:
        item = match.groups(default=empty)[0]
      else:
        item = match.groups(default=empty)
      items.append(item)
    return items

  def _split(self, cb, text, maxsplit=0):
    """Cuts text at the matches; the shared engine of split() and subn().

    Args:
      cb: called with each match; returns the pieces to put in its place.
      text: the str or bytes to cut.
      maxsplit: the maximum number of matches to use; zero means no limit
        and a negative value means no matches at all.

    Returns:
      A (pieces, number of matches used) tuple.
    """
    if maxsplit < 0:
      return [text], 0
    elif maxsplit > 0:
      matchiter = itertools.islice(self.finditer(text), maxsplit)
    else:
      matchiter = self.finditer(text)
    pieces = []
    end = 0
    numsplit = 0
    for match in matchiter:
      pieces.append(text[end:match.start()])
      pieces.extend(cb(match))
      end = match.end()
      numsplit += 1
    pieces.append(text[end:])
    return pieces, numsplit

  def split(self, text, maxsplit=0):
    """Splits text at the matches, keeping the contents of the groups."""
    cb = lambda match: [match[group] for group in range(1, self.groups + 1)]
    pieces, _ = self._split(cb, text, maxsplit)
    return pieces

  def subn(self, repl, text, count=0):
    """Replaces the matches in text with repl.

    repl is either a callable that takes the match or a template for
    _Match.expand(). Returns a (new text, number of replacements) tuple.
    """
    cb = lambda match: [repl(match) if callable(repl) else match.expand(repl)]
    empty = type(text)()
    pieces, numsplit = self._split(cb, text, count)
    joined_pieces = empty.join(pieces)
    return joined_pieces, numsplit

  def sub(self, repl, text, count=0):
    """Like subn(), but returns only the new text."""
    joined_pieces, _ = self.subn(repl, text, count)
    return joined_pieces

  @property
  def pattern(self):
    """The pattern as it was given."""
    return self._pattern

  @property
  def options(self):
    """The options reported by the underlying RE2 object."""
    return self._regexp.options()

  @property
  def groups(self):
    """The number of capturing groups."""
    return self._regexp.NumberOfCapturingGroups()

  @property
  def groupindex(self):
    """A dict from group name to group index.

    The names are str if the pattern is a str; otherwise they are whatever
    the underlying RE2 object returns.
    """
    groups = self._regexp.NamedCapturingGroups()
    if isinstance(self._pattern, str):
      decoded_groups = [(_decode(group), index) for group, index in groups]
      return dict(decoded_groups)
    else:
      return dict(groups)

  @property
  def programsize(self):
    """The ProgramSize() of the underlying RE2 object."""
    return self._regexp.ProgramSize()

  @property
  def reverseprogramsize(self):
    """The ReverseProgramSize() of the underlying RE2 object."""
    return self._regexp.ReverseProgramSize()

  @property
  def programfanout(self):
    """The ProgramFanout() of the underlying RE2 object."""
    return self._regexp.ProgramFanout()

  @property
  def reverseprogramfanout(self):
    """The ReverseProgramFanout() of the underlying RE2 object."""
    return self._regexp.ReverseProgramFanout()

  def possiblematchrange(self, maxlen):
    """Returns the (min, max) pair from PossibleMatchRange(maxlen).

    Raises:
      error: if the underlying RE2 object fails to compute the range.
    """
    ok, range_min, range_max = self._regexp.PossibleMatchRange(maxlen)
    if not ok:
      raise error('failed to compute match range')
    return range_min, range_max
|
||||
|
||||
|
||||
class _Match(object):
  """The result of a successful match.

  Holds the regular expression object, the text, the pos and endpos of the
  matching operation and one (start, end) span per group, where index 0 is
  the whole match and _NULL_SPAN marks a group that did not participate.
  """

  __slots__ = ('_regexp', '_text', '_pos', '_endpos', '_spans')

  def __init__(self, regexp, text, pos, endpos, spans):
    self._regexp = regexp
    self._text = text
    self._pos = pos
    self._endpos = endpos
    self._spans = spans

  # Python prioritises three-digit octal numbers over group escapes.
  # For example, \100 should not be handled the same way as \g<10>0.
  _OCTAL_RE = compile('\\\\[0-7][0-7][0-7]')

  # Python supports \1 through \99 (inclusive) and \g<...> syntax.
  _GROUP_RE = compile('\\\\[1-9][0-9]?|\\\\g<\\w+>')

  @classmethod
  @functools.lru_cache(typed=True)
  def _split(cls, template):
    """Splits template into alternating literal and group pieces (cached).

    Returns:
      A list in which the pieces at even indices are literal text (possibly
      containing non-group escapes) and the pieces at odd indices are group
      escapes. The list is cached, so callers must not modify it.
    """
    if isinstance(template, str):
      backslash = '\\'
    else:
      backslash = b'\\'
    empty = type(template)()
    pieces = [empty]
    index = template.find(backslash)
    while index != -1:
      piece, template = template[:index], template[index:]
      pieces[-1] += piece
      octal_match = cls._OCTAL_RE.match(template)
      group_match = cls._GROUP_RE.match(template)
      if (not octal_match) and group_match:
        index = group_match.end()
        piece, template = template[:index], template[index:]
        pieces.extend((piece, empty))
      else:
        # 2 isn't enough for \o, \x, \N, \u and \U escapes, but none of those
        # should contain backslashes, so break them here and then fix them at
        # the beginning of the next loop iteration or right before returning.
        index = 2
        piece, template = template[:index], template[index:]
        pieces[-1] += piece
      index = template.find(backslash)
    pieces[-1] += template
    return pieces

  @staticmethod
  def _unescape(piece):
    """Processes the backslash escapes in a literal piece of a template."""
    if isinstance(piece, str):
      # Do not hand the str to codecs.unicode_escape_decode(): it would encode
      # the str to UTF-8 and then read those bytes as Latin-1, which mangles
      # every non-ASCII character. Going through Latin-1 with backslashreplace
      # keeps U+0080..U+00FF as single bytes and turns everything beyond that
      # into escapes, which the codec then decodes back.
      encoded_piece = piece.encode('latin-1', 'backslashreplace')
      return encoded_piece.decode('unicode_escape')
    unescaped, _ = codecs.escape_decode(piece)
    return unescaped

  def expand(self, template):
    """Returns template with its escapes and group references substituted.

    Group references (\\1 through \\99 and \\g<...>) are replaced with the
    contents of the group, or with the empty str/bytes if the group did not
    participate; other backslash escapes are processed.
    """
    empty = type(template)()
    # Make a copy so that we don't clobber the cached pieces!
    pieces = list(self._split(template))
    for index, piece in enumerate(pieces):
      if not index % 2:
        pieces[index] = self._unescape(piece)
      else:
        if len(piece) <= 3:  # \1 through \99 (inclusive)
          group = int(piece[1:])
        else:  # \g<...>
          group = piece[3:-1]
          try:
            group = int(group)
          except ValueError:
            pass
        pieces[index] = self.__getitem__(group) or empty
    joined_pieces = empty.join(pieces)
    return joined_pieces

  def _index(self, group):
    """Resolves a group index or a group name to a valid group index.

    Raises:
      IndexError: if the name is unknown or the index is out of range.
    """
    if not isinstance(group, int):
      try:
        group = self._regexp.groupindex[group]
      except KeyError:
        raise IndexError('bad group name')
    if not 0 <= group <= self._regexp.groups:
      raise IndexError('bad group index')
    return group

  def __getitem__(self, group):
    """Returns the contents of group, or None if it did not participate."""
    span = self._spans[self._index(group)]
    if span == _NULL_SPAN:
      return None
    return self._text[span[0]:span[1]]

  def group(self, *groups):
    """Returns the contents of one group, or a tuple for several groups.

    Without arguments, returns the whole match.
    """
    if not groups:
      groups = (0,)
    items = (self.__getitem__(group) for group in groups)
    return next(items) if len(groups) == 1 else tuple(items)

  def groups(self, default=None):
    """Returns a tuple with the contents of every capturing group.

    default is used for the groups that did not participate.
    """
    items = []
    for group in range(1, self._regexp.groups + 1):
      item = self.__getitem__(group)
      items.append(default if item is None else item)
    return tuple(items)

  def groupdict(self, default=None):
    """Returns a dict from group name to the contents of the group.

    default is used for the groups that did not participate.
    """
    items = []
    for group, index in self._regexp.groupindex.items():
      item = self.__getitem__(index)
      items.append((group, default) if item is None else (group, item))
    return dict(items)

  def start(self, group=0):
    """Returns the start of group (index or name); -1 if not participating."""
    return self._spans[self._index(group)][0]

  def end(self, group=0):
    """Returns the end of group (index or name); -1 if not participating."""
    return self._spans[self._index(group)][1]

  def span(self, group=0):
    """Returns the span of group (index or name); (-1, -1) if not participating."""
    return self._spans[self._index(group)]

  @property
  def re(self):
    """The regular expression object that produced this match."""
    return self._regexp

  @property
  def string(self):
    """The text that was matched against."""
    return self._text

  @property
  def pos(self):
    """The pos of the matching operation."""
    return self._pos

  @property
  def endpos(self):
    """The endpos of the matching operation."""
    return self._endpos

  @property
  def lastindex(self):
    """The index of the group that ends furthest right, or None."""
    max_end = -1
    max_group = None
    # We look for the rightmost right parenthesis by keeping the first group
    # that ends at max_end because that is the leftmost/outermost group when
    # there are nested groups!
    for group in range(1, self._regexp.groups + 1):
      end = self._spans[group][1]
      if max_end < end:
        max_end = end
        max_group = group
    return max_group

  @property
  def lastgroup(self):
    """The name of the group identified by lastindex, or None."""
    max_group = self.lastindex
    if not max_group:
      return None
    for group, index in self._regexp.groupindex.items():
      if max_group == index:
        return group
    return None
|
||||
|
||||
|
||||
class Set(object):
  """A Pythonic wrapper around RE2::Set."""

  # A one-tuple: without the trailing comma this would be a bare string.
  __slots__ = ('_set',)

  def __init__(self, anchor, options=None):
    """Creates a set with the given anchor and options (default Options())."""
    options = options or Options()
    self._set = _re2.Set(anchor, options)

  @classmethod
  def SearchSet(cls, options=None):
    """Creates an unanchored set."""
    return cls(_Anchor.UNANCHORED, options=options)

  @classmethod
  def MatchSet(cls, options=None):
    """Creates a set that is anchored at the start."""
    return cls(_Anchor.ANCHOR_START, options=options)

  @classmethod
  def FullMatchSet(cls, options=None):
    """Creates a set that is anchored at both ends."""
    return cls(_Anchor.ANCHOR_BOTH, options=options)

  def Add(self, pattern):
    """Adds pattern (str or bytes) to the set and returns its index.

    Raises:
      error: if the pattern could not be added.
    """
    if isinstance(pattern, str):
      encoded_pattern = _encode(pattern)
      index = self._set.Add(encoded_pattern)
    else:
      index = self._set.Add(pattern)
    if index == -1:
      raise error('failed to add %r to Set' % pattern)
    return index

  def Compile(self):
    """Compiles the set.

    Raises:
      error: if compiling failed.
    """
    if not self._set.Compile():
      raise error('failed to compile Set')

  def Match(self, text):
    """Returns the list of indices of the matching patterns, or None."""
    if isinstance(text, str):
      encoded_text = _encode(text)
      matches = self._set.Match(encoded_text)
    else:
      matches = self._set.Match(text)
    return matches or None
|
||||
|
||||
|
||||
class Filter(object):
  """A Pythonic wrapper around FilteredRE2."""

  __slots__ = ('_filter', '_patterns')

  def __init__(self):
    self._filter = _re2.Filter()
    # The patterns as they were given, in the order in which they were added.
    self._patterns = []

  def Add(self, pattern, options=None):
    """Adds pattern (str or bytes) to the filter and returns its index.

    Raises:
      error: if the pattern could not be added.
    """
    options = options or Options()
    to_add = _encode(pattern) if isinstance(pattern, str) else pattern
    index = self._filter.Add(to_add, options)
    if index == -1:
      raise error('failed to add %r to Filter' % pattern)
    self._patterns.append(pattern)
    return index

  def Compile(self):
    """Compiles the filter.

    Raises:
      error: if compiling failed.
    """
    compiled = self._filter.Compile()
    if not compiled:
      raise error('failed to compile Filter')

  def Match(self, text, potential=False):
    """Returns the list of indices that the filter reports for text, or None.

    potential is passed on to the underlying Match().
    """
    to_match = _encode(text) if isinstance(text, str) else text
    matches = self._filter.Match(to_match, potential)
    return matches or None

  def re(self, index):
    """Returns a regular expression object for the pattern at index.

    Raises:
      IndexError: if index does not identify an added pattern.
    """
    if index < 0 or index >= len(self._patterns):
      raise IndexError('bad index')
    # Bypass _Regexp.__init__() so that the RE2 object owned by the filter is
    # used instead of compiling the pattern again.
    regexp = object.__new__(_Regexp)
    regexp._pattern = self._patterns[index]
    regexp._regexp = self._filter.GetRE2(index)
    return regexp
|
||||
@@ -0,0 +1,489 @@
|
||||
# Copyright 2019 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
"""Tests for google3.third_party.re2.python.re2."""
|
||||
|
||||
import collections
|
||||
import pickle
|
||||
import re
|
||||
|
||||
from absl.testing import absltest
|
||||
from absl.testing import parameterized
|
||||
import re2
|
||||
|
||||
|
||||
class OptionsTest(parameterized.TestCase):
  """Checks that every option can be read, changed and read back."""

  @parameterized.parameters(*re2.Options.NAMES)
  def test_option(self, name):
    options = re2.Options()
    original = getattr(options, name)
    # Pick a value that differs from the current one, according to its type.
    if isinstance(original, re2.Options.Encoding):
      members = type(original).__members__.values()
      changed = next(v for v in members if v != original)
    elif isinstance(original, bool):
      changed = not original
    elif isinstance(original, int):
      changed = original + 1
    else:
      raise TypeError('option {!r}: {!r} {!r}'.format(name, type(original),
                                                      original))
    setattr(options, name, changed)
    self.assertEqual(changed, getattr(options, name))
|
||||
|
||||
|
||||
class Re2CompileTest(parameterized.TestCase):
  """Contains tests that apply to the re2 module only.

  We disagree with Python on the string types of group names,
  so there is no point attempting to verify consistency.
  """

  @parameterized.parameters(
      (u'(foo*)(?P<bar>qux+)', 2, [(u'bar', 2)]),
      (b'(foo*)(?P<bar>qux+)', 2, [(b'bar', 2)]),
      (u'(foo*)(?P<中文>qux+)', 2, [(u'中文', 2)]),
  )
  def test_compile(self, pattern, expected_groups, expected_groupindex):
    compiled = re2.compile(pattern)
    self.assertIs(compiled, re2.compile(pattern))  # cached
    self.assertIs(compiled, re2.compile(compiled))  # cached
    message = ('pattern is already compiled, so '
               'options may not be specified')
    with self.assertRaisesRegex(re2.error, message):
      options = re2.Options()
      options.log_errors = not options.log_errors
      re2.compile(compiled, options=options)
    self.assertIsNotNone(compiled.options)
    self.assertEqual(expected_groups, compiled.groups)
    self.assertDictEqual(dict(expected_groupindex), compiled.groupindex)

  def test_compile_with_options(self):
    # The memory budget is far too small for the pattern.
    tiny = re2.Options()
    tiny.max_mem = 100
    with self.assertRaisesRegex(re2.error, 'pattern too large'):
      re2.compile('.{1000}', options=tiny)

  def test_programsize_reverseprogramsize(self):
    compiled = re2.compile('a+b')
    self.assertEqual(7, compiled.programsize)
    self.assertEqual(7, compiled.reverseprogramsize)

  def test_programfanout_reverseprogramfanout(self):
    compiled = re2.compile('a+b')
    self.assertListEqual([1, 1], compiled.programfanout)
    self.assertListEqual([3], compiled.reverseprogramfanout)

  @parameterized.parameters(
      (u'abc', 0, None),
      (b'abc', 0, None),
      (u'abc', 10, (b'abc', b'abc')),
      (b'abc', 10, (b'abc', b'abc')),
      (u'ab*c', 10, (b'ab', b'ac')),
      (b'ab*c', 10, (b'ab', b'ac')),
      (u'ab+c', 10, (b'abb', b'abc')),
      (b'ab+c', 10, (b'abb', b'abc')),
      (u'ab?c', 10, (b'abc', b'ac')),
      (b'ab?c', 10, (b'abc', b'ac')),
      (u'.*', 10, (b'', b'\xf4\xbf\xbf\xc0')),
      (b'.*', 10, None),
      (u'\\C*', 10, None),
      (b'\\C*', 10, None),
  )
  def test_possiblematchrange(self, pattern, maxlen, expected_min_max):
    # For brevity, the string type of pattern determines the encoding.
    # It would otherwise be possible to have bytes with UTF8, but as per
    # the module docstring, it isn't permitted to have str with LATIN1.
    options = re2.Options()
    if isinstance(pattern, str):
      options.encoding = re2.Options.Encoding.UTF8
    else:
      options.encoding = re2.Options.Encoding.LATIN1
    compiled = re2.compile(pattern, options=options)
    if not expected_min_max:
      # No expectation means that computing the range is expected to fail.
      with self.assertRaisesRegex(re2.error, 'failed to compute match range'):
        compiled.possiblematchrange(maxlen)
      return
    self.assertEqual(expected_min_max, compiled.possiblematchrange(maxlen))
|
||||
|
||||
|
||||
# One test case: a pattern, a text, the expected span of each group (index 0
# is the whole match) and whether search, match and fullmatch are expected to
# succeed. spans is None when nothing is expected to match.
Params = collections.namedtuple(
    'Params', ('pattern', 'text', 'spans', 'search', 'match', 'fullmatch'))

# Every case appears twice: once with str and once with bytes.
PARAMS = [
    Params(u'\\d+', u'Hello, world.', None, False, False, False),
    Params(b'\\d+', b'Hello, world.', None, False, False, False),
    Params(u'\\s+', u'Hello, world.', [(6, 7)], True, False, False),
    Params(b'\\s+', b'Hello, world.', [(6, 7)], True, False, False),
    Params(u'\\w+', u'Hello, world.', [(0, 5)], True, True, False),
    Params(b'\\w+', b'Hello, world.', [(0, 5)], True, True, False),
    Params(u'(\\d+)?', u'Hello, world.', [(0, 0), (-1, -1)], True, True, False),
    Params(b'(\\d+)?', b'Hello, world.', [(0, 0), (-1, -1)], True, True, False),
    Params(u'youtube(_device|_md|_gaia|_multiday|_multiday_gaia)?',
           u'youtube_ads', [(0, 7), (-1, -1)], True, True, False),
    Params(b'youtube(_device|_md|_gaia|_multiday|_multiday_gaia)?',
           b'youtube_ads', [(0, 7), (-1, -1)], True, True, False),
]
|
||||
|
||||
|
||||
def upper(match):
  """Returns the whole match converted to upper case."""
  matched = match.group()
  return matched.upper()
|
||||
|
||||
|
||||
class ReRegexpTest(parameterized.TestCase):
  """Regexp-level tests that are valid for both the re and re2 modules.

  Every test goes through self.MODULE, so a subclass can rebind MODULE to run
  the same tests against another implementation.
  """

  MODULE = re

  def _assert_group_spans(self, match, expected_spans):
    """Checks match against the expected per-group spans.

    Args:
      match: the match object (or None) returned by the module under test.
      expected_spans: None when no match is expected; otherwise the list of
        (start, end) spans for group 0 and every capturing group.
    """
    if expected_spans is None:
      self.assertIsNone(match)
      return
    # Group 0 is the whole match, hence the + 1.
    actual_spans = [
        match.span(index) for index in range(match.re.groups + 1)
    ]
    self.assertListEqual(expected_spans, actual_spans)

  @parameterized.parameters((p.pattern,) for p in PARAMS)
  def test_pickle(self, pattern):
    compiled = self.MODULE.compile(pattern)
    roundtripped = pickle.loads(pickle.dumps(compiled))
    self.assertEqual(compiled.pattern, roundtripped.pattern)

  @parameterized.parameters(
      (p.pattern, p.text, (p.spans if p.search else None)) for p in PARAMS)
  def test_search(self, pattern, text, expected_spans):
    self._assert_group_spans(self.MODULE.search(pattern, text), expected_spans)

  def test_search_with_pos_and_endpos(self):
    regexp = self.MODULE.compile(u'.+')  # empty string NOT allowed
    text = u'I \u2665 RE2!'
    # len(text) is the position of the empty string at the end of text, so
    # both ranges run to len(text) + 1 in order to include len(text) itself.
    stop = len(text) + 1
    for pos in range(stop):
      for endpos in range(pos, stop):
        match = regexp.search(text, pos=pos, endpos=endpos)
        if pos == endpos:
          # An empty window cannot satisfy `.+`.
          self.assertIsNone(match)
          continue
        self.assertEqual(pos, match.pos)
        self.assertEqual(endpos, match.endpos)
        self.assertEqual(pos, match.start())
        self.assertEqual(endpos, match.end())
        self.assertTupleEqual((pos, endpos), match.span())

  def test_search_with_bogus_pos_and_endpos(self):
    regexp = self.MODULE.compile(u'.*')  # empty string allowed
    text = u'I \u2665 RE2!'

    # Out-of-range positions are expected to be clamped to [0, len(text)].
    below = regexp.search(text, pos=-100)
    self.assertEqual(0, below.pos)
    above = regexp.search(text, pos=100)
    self.assertEqual(8, above.pos)

    below = regexp.search(text, endpos=-100)
    self.assertEqual(0, below.endpos)
    above = regexp.search(text, endpos=100)
    self.assertEqual(8, above.endpos)

    # A window whose start lies beyond its end yields no match at all.
    inverted = regexp.search(text, pos=100, endpos=-100)
    self.assertIsNone(inverted)

  @parameterized.parameters(
      (p.pattern, p.text, (p.spans if p.match else None)) for p in PARAMS)
  def test_match(self, pattern, text, expected_spans):
    self._assert_group_spans(self.MODULE.match(pattern, text), expected_spans)

  @parameterized.parameters(
      (p.pattern, p.text, (p.spans if p.fullmatch else None)) for p in PARAMS)
  def test_fullmatch(self, pattern, text, expected_spans):
    self._assert_group_spans(
        self.MODULE.fullmatch(pattern, text), expected_spans)

  @parameterized.parameters(
      (u'', u'', [(0, 0)]),
      (b'', b'', [(0, 0)]),
      (u'', u'x', [(0, 0), (1, 1)]),
      (b'', b'x', [(0, 0), (1, 1)]),
      (u'', u'xy', [(0, 0), (1, 1), (2, 2)]),
      (b'', b'xy', [(0, 0), (1, 1), (2, 2)]),
      (u'.', u'xy', [(0, 1), (1, 2)]),
      (b'.', b'xy', [(0, 1), (1, 2)]),
      (u'x', u'xy', [(0, 1)]),
      (b'x', b'xy', [(0, 1)]),
      (u'y', u'xy', [(1, 2)]),
      (b'y', b'xy', [(1, 2)]),
      (u'z', u'xy', []),
      (b'z', b'xy', []),
      (u'\\w*', u'Hello, world.', [(0, 5), (5, 5), (6, 6), (7, 12), (12, 12),
                                   (13, 13)]),
      (b'\\w*', b'Hello, world.', [(0, 5), (5, 5), (6, 6), (7, 12), (12, 12),
                                   (13, 13)]),
  )
  def test_finditer(self, pattern, text, expected_matches):
    found_spans = [m.span() for m in self.MODULE.finditer(pattern, text)]
    self.assertListEqual(expected_matches, found_spans)

  @parameterized.parameters(
      (u'\\w\\w+', u'Hello, world.', [u'Hello', u'world']),
      (b'\\w\\w+', b'Hello, world.', [b'Hello', b'world']),
      (u'(\\w)\\w+', u'Hello, world.', [u'H', u'w']),
      (b'(\\w)\\w+', b'Hello, world.', [b'H', b'w']),
      (u'(\\w)(\\w+)', u'Hello, world.', [(u'H', u'ello'), (u'w', u'orld')]),
      (b'(\\w)(\\w+)', b'Hello, world.', [(b'H', b'ello'), (b'w', b'orld')]),
      (u'(\\w)(\\w+)?', u'Hello, w.', [(u'H', u'ello'), (u'w', u'')]),
      (b'(\\w)(\\w+)?', b'Hello, w.', [(b'H', b'ello'), (b'w', b'')]),
  )
  def test_findall(self, pattern, text, expected_matches):
    found = self.MODULE.findall(pattern, text)
    self.assertListEqual(expected_matches, found)

  @parameterized.parameters(
      (u'\\W+', u'Hello, world.', -1, [u'Hello, world.']),
      (b'\\W+', b'Hello, world.', -1, [b'Hello, world.']),
      (u'\\W+', u'Hello, world.', 0, [u'Hello', u'world', u'']),
      (b'\\W+', b'Hello, world.', 0, [b'Hello', b'world', b'']),
      (u'\\W+', u'Hello, world.', 1, [u'Hello', u'world.']),
      (b'\\W+', b'Hello, world.', 1, [b'Hello', b'world.']),
      (u'(\\W+)', u'Hello, world.', -1, [u'Hello, world.']),
      (b'(\\W+)', b'Hello, world.', -1, [b'Hello, world.']),
      (u'(\\W+)', u'Hello, world.', 0, [u'Hello', u', ', u'world', u'.', u'']),
      (b'(\\W+)', b'Hello, world.', 0, [b'Hello', b', ', b'world', b'.', b'']),
      (u'(\\W+)', u'Hello, world.', 1, [u'Hello', u', ', u'world.']),
      (b'(\\W+)', b'Hello, world.', 1, [b'Hello', b', ', b'world.']),
  )
  def test_split(self, pattern, text, maxsplit, expected_pieces):
    actual_pieces = self.MODULE.split(pattern, text, maxsplit)
    self.assertListEqual(expected_pieces, actual_pieces)

  @parameterized.parameters(
      (u'\\w+', upper, u'Hello, world.', -1, u'Hello, world.', 0),
      (b'\\w+', upper, b'Hello, world.', -1, b'Hello, world.', 0),
      (u'\\w+', upper, u'Hello, world.', 0, u'HELLO, WORLD.', 2),
      (b'\\w+', upper, b'Hello, world.', 0, b'HELLO, WORLD.', 2),
      (u'\\w+', upper, u'Hello, world.', 1, u'HELLO, world.', 1),
      (b'\\w+', upper, b'Hello, world.', 1, b'HELLO, world.', 1),
      (u'\\w+', u'MEEP', u'Hello, world.', -1, u'Hello, world.', 0),
      (b'\\w+', b'MEEP', b'Hello, world.', -1, b'Hello, world.', 0),
      (u'\\w+', u'MEEP', u'Hello, world.', 0, u'MEEP, MEEP.', 2),
      (b'\\w+', b'MEEP', b'Hello, world.', 0, b'MEEP, MEEP.', 2),
      (u'\\w+', u'MEEP', u'Hello, world.', 1, u'MEEP, world.', 1),
      (b'\\w+', b'MEEP', b'Hello, world.', 1, b'MEEP, world.', 1),
      (u'\\\\', u'\\\\\\\\', u'Hello,\\world.', 0, u'Hello,\\\\world.', 1),
      (b'\\\\', b'\\\\\\\\', b'Hello,\\world.', 0, b'Hello,\\\\world.', 1),
  )
  def test_subn_sub(self, pattern, repl, text, count, expected_joined_pieces,
                    expected_numsplit):
    # subn() reports both the rewritten text and the number of replacements.
    subn_text, subn_count = self.MODULE.subn(pattern, repl, text, count)
    self.assertEqual(expected_joined_pieces, subn_text)
    self.assertEqual(expected_numsplit, subn_count)

    # sub() must produce the same rewritten text.
    sub_text = self.MODULE.sub(pattern, repl, text, count)
    self.assertEqual(expected_joined_pieces, sub_text)
|
||||
|
||||
|
||||
class Re2RegexpTest(ReRegexpTest):
  """Reruns the inherited tests against re2 and adds re2-only tests."""

  MODULE = re2

  def test_compile_with_latin1_encoding(self):
    latin1_options = re2.Options()
    latin1_options.encoding = re2.Options.Encoding.LATIN1

    # A str pattern combined with Latin-1 options must be rejected...
    expected_error = ('string type of pattern is str, but '
                      'encoding specified in options is LATIN1')
    with self.assertRaisesRegex(re2.error, expected_error):
      re2.compile(u'.?', options=latin1_options)

    # ... whereas this is fine, of course.
    re2.compile(b'.?', options=latin1_options)

  @parameterized.parameters(
      (u'\\p{Lo}', u'\u0ca0_\u0ca0', [(0, 1), (2, 3)]),
      (b'\\p{Lo}', b'\xe0\xb2\xa0_\xe0\xb2\xa0', [(0, 3), (4, 7)]),
  )
  def test_finditer_with_utf8(self, pattern, text, expected_matches):
    found_spans = [m.span() for m in self.MODULE.finditer(pattern, text)]
    self.assertListEqual(expected_matches, found_spans)

  def test_purge(self):
    # Compiling populates the cache behind re2._Regexp._make ...
    re2.compile('Goodbye, world.')
    self.assertGreater(re2._Regexp._make.cache_info().currsize, 0)
    # ... and purge() must leave it empty.
    re2.purge()
    self.assertEqual(re2._Regexp._make.cache_info().currsize, 0)
|
||||
|
||||
|
||||
class Re2EscapeTest(parameterized.TestCase):
  """Escaping tests that are run against the re2 module only.

  re2 and Python disagree on the escaping of some characters, so comparing
  the two modules' output would not be meaningful.
  """

  @parameterized.parameters(
      (u'a*b+c?', u'a\\*b\\+c\\?'),
      (b'a*b+c?', b'a\\*b\\+c\\?'),
  )
  def test_escape(self, pattern, expected_escaped):
    actual_escaped = re2.escape(pattern)
    self.assertEqual(expected_escaped, actual_escaped)
|
||||
|
||||
|
||||
class ReMatchTest(parameterized.TestCase):
  """Match-object tests that are valid for both the re and re2 modules.

  Every test goes through self.MODULE, so a subclass can rebind MODULE to run
  the same tests against another implementation.
  """

  MODULE = re

  def test_expand(self):
    pattern = u'(?P<S>[\u2600-\u26ff]+).*?(?P<P>[^\\s\\w]+)'
    text = u'I \u2665 RE2!\n'
    match = self.MODULE.search(pattern, text)

    # (expected expansion, template) pairs, checked in order.
    cases = (
        (u'\u2665\n!', u'\\1\\n\\2'),
        (u'\u2665\n!', u'\\g<1>\\n\\g<2>'),
        (u'\u2665\n!', u'\\g<S>\\n\\g<P>'),
        (u'\\1\\2\n\u2665!', u'\\\\1\\\\2\\n\\1\\2'),
    )
    for expected, template in cases:
      self.assertEqual(expected, match.expand(template))

  def test_expand_with_octal(self):
    pattern = u'()()()()()()()()()(\\w+)'
    text = u'Hello, world.'
    match = self.MODULE.search(pattern, text)

    # (expected expansion, template) pairs, checked in order.
    cases = (
        # Explicit \g<N> group references.
        (u'Hello\n', u'\\g<0>\\n'),
        (u'Hello\n', u'\\g<10>\\n'),
        # A leading zero always introduces an octal escape.
        (u'\x00\n', u'\\0\\n'),
        (u'\x00\n', u'\\00\\n'),
        (u'\x00\n', u'\\000\\n'),
        (u'\x000\n', u'\\0000\\n'),
        # One or two digits name a group; three octal digits name a character.
        (u'\n', u'\\1\\n'),
        (u'Hello\n', u'\\10\\n'),
        (u'@\n', u'\\100\\n'),
        (u'@0\n', u'\\1000\\n'),
    )
    for expected, template in cases:
      self.assertEqual(expected, match.expand(template))

  def test_getitem_group_groups_groupdict(self):
    pattern = u'(?P<S>[\u2600-\u26ff]+).*?(?P<P>[^\\s\\w]+)'
    text = u'Hello, world.\nI \u2665 RE2!\nGoodbye, world.\n'
    match = self.MODULE.search(pattern, text)

    # The same (group key, expected text) pairs apply to match[key] and to
    # match.group(key).
    by_key = (
        (0, u'\u2665 RE2!'),
        (1, u'\u2665'),
        (2, u'!'),
        (u'S', u'\u2665'),
        (u'P', u'!'),
    )
    for key, expected in by_key:
      self.assertEqual(expected, match[key])

    self.assertEqual(u'\u2665 RE2!', match.group())
    for key, expected in by_key:
      self.assertEqual(expected, match.group(key))

    both_groups = (u'\u2665', u'!')
    self.assertTupleEqual(both_groups, match.group(1, 2))
    self.assertTupleEqual(both_groups, match.group(u'S', u'P'))
    self.assertTupleEqual(both_groups, match.groups())
    self.assertDictEqual({u'S': u'\u2665', u'P': u'!'}, match.groupdict())

  def test_bogus_group_start_end_and_span(self):
    pattern = u'(?P<S>[\u2600-\u26ff]+).*?(?P<P>[^\\s\\w]+)'
    text = u'I \u2665 RE2!\n'
    match = self.MODULE.search(pattern, text)

    # The pattern has two groups, so -1, 3 and the name 'X' are all invalid.
    bogus_lookups = (
        (match.group, -1),
        (match.group, 3),
        (match.group, 'X'),
        (match.start, -1),
        (match.start, 3),
        (match.end, -1),
        (match.end, 3),
        (match.span, -1),
        (match.span, 3),
    )
    for accessor, bogus_group in bogus_lookups:
      self.assertRaises(IndexError, accessor, bogus_group)

  @parameterized.parameters(
      (u'((a)(b))((c)(d))', u'foo bar qux', None, None),
      (u'(?P<one>(a)(b))((c)(d))', u'foo abcd qux', 4, None),
      (u'(?P<one>(a)(b))(?P<four>(c)(d))', u'foo abcd qux', 4, 'four'),
  )
  def test_lastindex_lastgroup(self, pattern, text, expected_lastindex,
                               expected_lastgroup):
    match = self.MODULE.search(pattern, text)
    if expected_lastindex is None:
      # No expected index means that no match is expected either.
      self.assertIsNone(match)
      return
    self.assertEqual(expected_lastindex, match.lastindex)
    self.assertEqual(expected_lastgroup, match.lastgroup)
|
||||
|
||||
|
||||
class Re2MatchTest(ReMatchTest):
  """Reruns the inherited match-object tests against the re2 module."""

  MODULE = re2
|
||||
|
||||
|
||||
class SetTest(absltest.TestCase):
  """Tests re2.Set with each of its three anchoring flavours."""

  def _add_patterns_and_compile(self, s):
    """Adds the three shared patterns (plus one invalid pattern) and compiles.

    Add() is expected to return consecutive indices starting at zero and to
    raise re2.error for a malformed pattern.
    """
    for index, pattern in enumerate(('\\d+', '\\s+', '\\w+')):
      self.assertEqual(index, s.Add(pattern))
    self.assertRaises(re2.error, s.Add, '(MEEP')
    s.Compile()

  def test_search(self):
    searching = re2.Set.SearchSet()
    self._add_patterns_and_compile(searching)
    self.assertItemsEqual([1, 2], searching.Match('Hello, world.'))

  def test_match(self):
    matching = re2.Set.MatchSet()
    self._add_patterns_and_compile(matching)
    self.assertItemsEqual([2], matching.Match('Hello, world.'))

  def test_fullmatch(self):
    fullmatching = re2.Set.FullMatchSet()
    self._add_patterns_and_compile(fullmatching)
    self.assertIsNone(fullmatching.Match('Hello, world.'))
|
||||
|
||||
|
||||
class FilterTest(absltest.TestCase):
  """Tests re2.Filter."""

  def test_match(self):
    prefilter = re2.Filter()
    patterns = ('Hello, \\w+\\.', '\\w+, world\\.', 'Goodbye, \\w+\\.')
    # Add() is expected to hand out consecutive indices starting at zero.
    for index, pattern in enumerate(patterns):
      self.assertEqual(index, prefilter.Add(pattern))
    self.assertRaises(re2.error, prefilter.Add, '(MEEP')
    prefilter.Compile()

    # With potential=True the upper-case text is reported as well; without
    # it, the upper-case text yields None.
    self.assertItemsEqual(
        [0, 1], prefilter.Match('Hello, world.', potential=True))
    self.assertItemsEqual(
        [0, 1], prefilter.Match('HELLO, WORLD.', potential=True))
    self.assertItemsEqual([0, 1], prefilter.Match('Hello, world.'))
    self.assertIsNone(prefilter.Match('HELLO, WORLD.'))

    # re() gives access to the individual regexps by index.
    self.assertRaises(IndexError, prefilter.re, -1)
    self.assertRaises(IndexError, prefilter.re, 3)
    last_regexp = prefilter.re(2)
    self.assertEqual('Goodbye, \\w+\\.', last_regexp.pattern)
    # Verify whether the underlying RE2 object is usable.
    self.assertEqual(0, prefilter.re(2).groups)

  def test_issue_484(self):
    # Previously, the shim would dereference a null pointer and crash.
    uncompiled = re2.Filter()
    with self.assertRaisesRegex(re2.error,
                                r'Match\(\) called before compiling'):
      uncompiled.Match('')
|
||||
|
||||
|
||||
# Run every test in this file when it is executed as a script.
if __name__ == '__main__':
  absltest.main()
|
||||
+158
@@ -0,0 +1,158 @@
|
||||
# Copyright 2019 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
import os
|
||||
import re
|
||||
import setuptools
|
||||
import setuptools.command.build_ext
|
||||
import shutil
|
||||
|
||||
long_description = r"""A drop-in replacement for the re module.
|
||||
|
||||
It uses RE2 under the hood, of course, so various PCRE features
|
||||
(e.g. backreferences, look-around assertions) are not supported.
|
||||
See https://github.com/google/re2/wiki/Syntax for the canonical
|
||||
reference, but known syntactic "gotchas" relative to Python are:
|
||||
|
||||
* PCRE supports \Z and \z; RE2 supports \z; Python supports \z,
|
||||
but calls it \Z. You must rewrite \Z to \z in pattern strings.
|
||||
|
||||
Known differences between this module's API and the re module's API:
|
||||
|
||||
* The error class does not provide any error information as attributes.
|
||||
* The Options class replaces the re module's flags with RE2's options as
|
||||
gettable/settable properties. Please see re2.h for their documentation.
|
||||
* The pattern string and the input string do not have to be the same type.
|
||||
Any str will be encoded to UTF-8.
|
||||
* The pattern string cannot be str if the options specify Latin-1 encoding.
|
||||
|
||||
Known issues with regard to building the C++ extension:
|
||||
|
||||
* Building requires RE2 to be installed on your system.
|
||||
On Debian, for example, install the libre2-dev package.
|
||||
* Building requires pybind11 to be installed on your system OR venv.
|
||||
On Debian, for example, install the pybind11-dev package.
|
||||
For a venv, install the pybind11 package from PyPI.
|
||||
* Building on macOS is known to work, but has been known to fail.
|
||||
For example, the system Python may not know which compiler flags
|
||||
to set when building bindings for software installed by Homebrew;
|
||||
see https://docs.brew.sh/Homebrew-and-Python#brewed-python-modules.
|
||||
* Building on Windows has not been tested yet and will probably fail.
|
||||
"""
|
||||
|
||||
|
||||
class BuildExt(setuptools.command.build_ext.build_ext):
  """build_ext command that builds the extension with Bazel on GitHub Actions.

  Outside GitHub Actions it defers to the stock setuptools implementation.
  """

  def build_extension(self, ext):
    """Builds ext, either through setuptools or through `bazel build`."""
    if 'GITHUB_ACTIONS' not in os.environ:
      return super().build_extension(ext)

    build_cmd = ['bazel', 'build']

    # Optional cross-compilation target.
    cpu = os.environ.get('BAZEL_CPU')
    if cpu is not None:
      build_cmd.extend([
          f'--cpu={cpu}',
          f'--platforms=//python:{cpu}',
      ])
      if cpu == 'x64_x86_windows':
        # Register the local 32-bit C++ toolchain with highest priority.
        # (This is likely to break in some release of Bazel after 7.0.0,
        # but this special case can hopefully be entirely removed then.)
        build_cmd.append(
            f'--extra_toolchains=@local_config_cc//:cc-toolchain-{cpu}')

    # Optional minimum macOS version.
    macos_target = os.environ.get('MACOSX_DEPLOYMENT_TARGET')
    if macos_target is not None:
      build_cmd.append(f'--macos_minimum_os={macos_target}')

    # Register the local Python toolchains with highest priority.
    build_cmd.append('--extra_toolchains=//python/toolchains:all')
    build_cmd.extend(['--compilation_mode=opt', '--', ':all'])
    self.spawn(build_cmd)

    # This ensures that f'_re2.{importlib.machinery.EXTENSION_SUFFIXES[0]}'
    # is the filename in the destination directory, which is what's needed.
    destination = self.get_ext_fullpath(ext.name)
    shutil.copyfile('../bazel-bin/python/_re2.so', destination)

    # Discard Bazel's outputs now that the extension has been copied out.
    self.spawn(['bazel', 'clean', '--expunge'])
|
||||
|
||||
|
||||
def options():
  """Returns the per-command options dict that is passed to setup().

  The bdist_wheel options carry 'plat_name' only when the PLAT_NAME
  environment variable is set.
  """
  bdist_wheel = {}
  if 'PLAT_NAME' in os.environ:
    bdist_wheel['plat_name'] = os.environ['PLAT_NAME']
  return {'bdist_wheel': bdist_wheel}
|
||||
|
||||
|
||||
def include_dirs():
  """Yields pybind11's include directory if pybind11 can be imported.

  Yields nothing when the pybind11 module is not installed.
  """
  try:
    import pybind11
    pybind11_include = pybind11.get_include()
    yield pybind11_include
  except ModuleNotFoundError:
    # No importable pybind11: contribute no extra include directory.
    pass
|
||||
|
||||
|
||||
# The native extension: built from _re2.cc and linked against the re2 library.
ext_module = setuptools.Extension(
    name='_re2',
    sources=['_re2.cc'],
    include_dirs=list(include_dirs()),
    libraries=['re2'],
    extra_compile_args=['-fvisibility=hidden'],
)

# We need `re2` to be a package, not a module, because it appears that
# modules can't have `.pyi` files, so munge the module into a package.
PACKAGE = 're2'

# If we are building from the sdist, we are already in package form.
if not os.path.exists('PKG-INFO'):
  os.makedirs(PACKAGE)
  for source_name in (
      're2.py',
      # TODO(junyer): Populate as per https://github.com/google/re2/issues/496.
      # 're2.pyi',
      # '_re2.pyi',
  ):
    with open(source_name, 'r') as source:
      contents = source.read()
    # re2.py becomes the package's __init__.py, and its top-level imports of
    # the underscore-prefixed extension module become relative imports.
    target_name = re.sub(r'^re2(?=\.py)', '__init__', source_name)
    contents = re.sub(r'^(?=import _)', 'from . ', contents, flags=re.MULTILINE)
    with open(f'{PACKAGE}/{target_name}', 'x') as target:
      target.write(contents)
  # TODO(junyer): Populate as per https://github.com/google/re2/issues/496.
  # with open(f'{PACKAGE}/py.typed', 'x') as file:
  #   pass

CLASSIFIERS = [
    'Development Status :: 5 - Production/Stable',
    'Intended Audience :: Developers',
    'License :: OSI Approved :: BSD License',
    'Programming Language :: C++',
    'Programming Language :: Python :: 3.8',
]

setuptools.setup(
    name='google-re2',
    version='1.1.20240702',
    description='RE2 Python bindings',
    long_description=long_description,
    long_description_content_type='text/plain',
    author='The RE2 Authors',
    author_email='re2-dev@googlegroups.com',
    url='https://github.com/google/re2',
    packages=[PACKAGE],
    ext_package=PACKAGE,
    ext_modules=[ext_module],
    classifiers=CLASSIFIERS,
    options=options(),
    cmdclass={'build_ext': BuildExt},
    python_requires='~=3.8',
)

# Only reached when everything above succeeded.
# NOTE(review): a failed build therefore leaves the staged directory behind,
# and the next run will fail in os.makedirs() - confirm that this is intended.
# If we are building from the sdist, we are already in package form.
if not os.path.exists('PKG-INFO'):
  shutil.rmtree(PACKAGE)
|
||||
@@ -0,0 +1,98 @@
|
||||
# Copyright 2019 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import sysconfig
|
||||
|
||||
|
||||
def generate():
  """Creates a local Python toolchain package next to this script.

  Copies the running interpreter's include directory (and its sibling `libs`
  directory, when there is one) and writes a BUILD.bazel file that describes
  the running interpreter. Both the copies and the file creation fail if the
  destination already exists.
  """
  include_dir = sysconfig.get_path('include')
  libs_dir = os.path.join(include_dir, '../libs')

  mydir = os.path.dirname(sys.argv[0]) or '.'
  shutil.copytree(include_dir, f'{mydir}/include')
  try:
    shutil.copytree(libs_dir, f'{mydir}/libs')
  except FileNotFoundError:
    # We must not be running on Windows. :)
    pass

  # Doubled braces are literal braces; single braces are format() fields.
  build_template = """\
load("@rules_python//python/cc:py_cc_toolchain.bzl", "py_cc_toolchain")
load("@rules_python//python:py_runtime.bzl", "py_runtime")
load("@rules_python//python:py_runtime_pair.bzl", "py_runtime_pair")

package(default_visibility = ["//visibility:public"])

toolchain(
name = "py",
toolchain = ":py_toolchain",
toolchain_type = "@rules_python//python:toolchain_type",
)

py_runtime_pair(
name = "py_toolchain",
py3_runtime = ":interpreter",
)

py_runtime(
name = "interpreter",
interpreter_path = "{interpreter_path}",
interpreter_version_info = {{
"major": "{major}",
"minor": "{minor}",
}},
python_version = "PY3",
)

toolchain(
name = "py_cc",
toolchain = ":py_cc_toolchain",
toolchain_type = "@rules_python//python/cc:toolchain_type",
)

py_cc_toolchain(
name = "py_cc_toolchain",
headers = ":headers",
libs = ":libraries",
python_version = "{major}.{minor}",
)

cc_library(
name = "headers",
hdrs = glob(["include/**/*.h"]),
includes = ["include"],
deps = select({{
"@platforms//os:windows": [":interface_library"],
"//conditions:default": [],
}}),
)

cc_import(
name = "interface_library",
interface_library = select({{
"@platforms//os:windows": "libs/python{major}{minor}.lib",
"//conditions:default": None,
}}),
system_provided = True,
)

# Not actually necessary for our purposes. :)
cc_library(
name = "libraries",
)
"""
  build_contents = build_template.format(
      # Backslashes are turned into forward slashes before the path is
      # embedded in the generated file.
      interpreter_path=sys.executable.replace('\\', '/'),
      major=sys.version_info.major,
      minor=sys.version_info.minor,
  )

  with open(f'{mydir}/BUILD.bazel', 'x') as build_file:
    build_file.write(build_contents)
|
||||
|
||||
|
||||
# Generate the toolchain package when this file is executed as a script.
if __name__ == '__main__':
  generate()
|
||||
Binary file not shown.
@@ -0,0 +1,9 @@
|
||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||
|
||||
Name: re2
|
||||
Description: RE2 is a fast, safe, thread-friendly regular expression engine.
|
||||
Requires: @REQUIRES@
|
||||
Version: @SONAME@.0.0
|
||||
Cflags: -pthread -I${includedir}
|
||||
Libs: -pthread -L${libdir} -lre2
|
||||
@@ -0,0 +1,44 @@
|
||||
// Copyright 2023 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re2/bitmap256.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "absl/base/attributes.h"
|
||||
#include "absl/log/absl_check.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
int Bitmap256::FindNextSetBit(int c) const {
|
||||
ABSL_DCHECK_GE(c, 0);
|
||||
ABSL_DCHECK_LE(c, 255);
|
||||
|
||||
// Check the word that contains the bit. Mask out any lower bits.
|
||||
int i = c / 64;
|
||||
uint64_t word = words_[i] & (~uint64_t{0} << (c % 64));
|
||||
if (word != 0)
|
||||
return (i * 64) + FindLSBSet(word);
|
||||
|
||||
// Check any following words.
|
||||
i++;
|
||||
switch (i) {
|
||||
case 1:
|
||||
if (words_[1] != 0)
|
||||
return (1 * 64) + FindLSBSet(words_[1]);
|
||||
ABSL_FALLTHROUGH_INTENDED;
|
||||
case 2:
|
||||
if (words_[2] != 0)
|
||||
return (2 * 64) + FindLSBSet(words_[2]);
|
||||
ABSL_FALLTHROUGH_INTENDED;
|
||||
case 3:
|
||||
if (words_[3] != 0)
|
||||
return (3 * 64) + FindLSBSet(words_[3]);
|
||||
ABSL_FALLTHROUGH_INTENDED;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,88 @@
|
||||
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_BITMAP256_H_
|
||||
#define RE2_BITMAP256_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class Bitmap256 {
|
||||
public:
|
||||
Bitmap256() {
|
||||
Clear();
|
||||
}
|
||||
|
||||
// Clears all of the bits.
|
||||
void Clear() {
|
||||
memset(words_, 0, sizeof words_);
|
||||
}
|
||||
|
||||
// Tests the bit with index c.
|
||||
bool Test(int c) const {
|
||||
ABSL_DCHECK_GE(c, 0);
|
||||
ABSL_DCHECK_LE(c, 255);
|
||||
|
||||
return (words_[c / 64] & (uint64_t{1} << (c % 64))) != 0;
|
||||
}
|
||||
|
||||
// Sets the bit with index c.
|
||||
void Set(int c) {
|
||||
ABSL_DCHECK_GE(c, 0);
|
||||
ABSL_DCHECK_LE(c, 255);
|
||||
|
||||
words_[c / 64] |= (uint64_t{1} << (c % 64));
|
||||
}
|
||||
|
||||
// Finds the next non-zero bit with index >= c.
|
||||
// Returns -1 if no such bit exists.
|
||||
int FindNextSetBit(int c) const;
|
||||
|
||||
private:
|
||||
// Finds the least significant non-zero bit in n.
|
||||
static int FindLSBSet(uint64_t n) {
|
||||
ABSL_DCHECK_NE(n, uint64_t{0});
|
||||
#if defined(__GNUC__)
|
||||
return __builtin_ctzll(n);
|
||||
#elif defined(_MSC_VER) && defined(_M_X64)
|
||||
unsigned long c;
|
||||
_BitScanForward64(&c, n);
|
||||
return static_cast<int>(c);
|
||||
#elif defined(_MSC_VER) && defined(_M_IX86)
|
||||
unsigned long c;
|
||||
if (static_cast<uint32_t>(n) != 0) {
|
||||
_BitScanForward(&c, static_cast<uint32_t>(n));
|
||||
return static_cast<int>(c);
|
||||
} else {
|
||||
_BitScanForward(&c, static_cast<uint32_t>(n >> 32));
|
||||
return static_cast<int>(c) + 32;
|
||||
}
|
||||
#else
|
||||
int c = 63;
|
||||
for (int shift = 1 << 5; shift != 0; shift >>= 1) {
|
||||
uint64_t word = n << shift;
|
||||
if (word != 0) {
|
||||
n = word;
|
||||
c -= shift;
|
||||
}
|
||||
}
|
||||
return c;
|
||||
#endif
|
||||
}
|
||||
|
||||
uint64_t words_[4];
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_BITMAP256_H_
|
||||
+384
@@ -0,0 +1,384 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
|
||||
|
||||
// Prog::SearchBitState is a regular expression search with submatch
|
||||
// tracking for small regular expressions and texts. Similarly to
|
||||
// testing/backtrack.cc, it allocates a bitmap with (count of
|
||||
// lists) * (length of text) bits to make sure it never explores the
|
||||
// same (instruction list, character position) multiple times. This
|
||||
// limits the search to run in time linear in the length of the text.
|
||||
//
|
||||
// Unlike testing/backtrack.cc, SearchBitState is not recursive
|
||||
// on the text.
|
||||
//
|
||||
// SearchBitState is a fast replacement for the NFA code on small
|
||||
// regexps and texts when SearchOnePass cannot be used.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "re2/pod_array.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// One entry on the BitState work stack.
struct Job {
  int id;         // instruction id; negative for an "undo Capture" entry
  int rle;        // run length encoding
  const char* p;  // text position (or the saved capture value for an undo)
};
|
||||
|
||||
class BitState {
|
||||
public:
|
||||
explicit BitState(Prog* prog);
|
||||
|
||||
// The usual Search prototype.
|
||||
// Can only call Search once per BitState.
|
||||
bool Search(absl::string_view text, absl::string_view context, bool anchored,
|
||||
bool longest, absl::string_view* submatch, int nsubmatch);
|
||||
|
||||
private:
|
||||
inline bool ShouldVisit(int id, const char* p);
|
||||
void Push(int id, const char* p);
|
||||
void GrowStack();
|
||||
bool TrySearch(int id, const char* p);
|
||||
|
||||
// Search parameters
|
||||
Prog* prog_; // program being run
|
||||
absl::string_view text_; // text being searched
|
||||
absl::string_view context_; // greater context of text being searched
|
||||
bool anchored_; // whether search is anchored at text.begin()
|
||||
bool longest_; // whether search wants leftmost-longest match
|
||||
bool endmatch_; // whether match must end at text.end()
|
||||
absl::string_view* submatch_; // submatches to fill in
|
||||
int nsubmatch_; // # of submatches to fill in
|
||||
|
||||
// Search state
|
||||
static constexpr int kVisitedBits = 64;
|
||||
PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited
|
||||
PODArray<const char*> cap_; // capture registers
|
||||
PODArray<Job> job_; // stack of text positions to explore
|
||||
int njob_; // stack size
|
||||
|
||||
BitState(const BitState&) = delete;
|
||||
BitState& operator=(const BitState&) = delete;
|
||||
};
|
||||
|
||||
// Binds the state to prog; every other search parameter starts out cleared
// and the job stack starts out empty.
BitState::BitState(Prog* prog)
    : prog_(prog),
      anchored_(false),
      longest_(false),
      endmatch_(false),
      submatch_(nullptr),
      nsubmatch_(0),
      njob_(0) {}
|
||||
|
||||
// Given id, which *must* be a list head, we can look up its list ID.
|
||||
// Then the question is: Should the search visit the (list ID, p) pair?
|
||||
// If so, remember that it was visited so that the next time,
|
||||
// we don't repeat the visit.
|
||||
bool BitState::ShouldVisit(int id, const char* p) {
|
||||
int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
|
||||
static_cast<int>(p-text_.data());
|
||||
if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
|
||||
return false;
|
||||
visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Grow the stack.
|
||||
void BitState::GrowStack() {
|
||||
PODArray<Job> tmp(2*job_.size());
|
||||
memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
|
||||
job_ = std::move(tmp);
|
||||
}
|
||||
|
||||
// Push (id, p) onto the stack, growing it if necessary.
|
||||
void BitState::Push(int id, const char* p) {
|
||||
if (njob_ >= job_.size()) {
|
||||
GrowStack();
|
||||
if (njob_ >= job_.size()) {
|
||||
ABSL_LOG(DFATAL) << "GrowStack() failed: "
|
||||
<< "njob_ = " << njob_ << ", "
|
||||
<< "job_.size() = " << job_.size();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// If id < 0, it's undoing a Capture,
|
||||
// so we mustn't interfere with that.
|
||||
if (id >= 0 && njob_ > 0) {
|
||||
Job* top = &job_[njob_-1];
|
||||
if (id == top->id &&
|
||||
p == top->p + top->rle + 1 &&
|
||||
top->rle < std::numeric_limits<int>::max()) {
|
||||
++top->rle;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
Job* top = &job_[njob_++];
|
||||
top->id = id;
|
||||
top->rle = 0;
|
||||
top->p = p;
|
||||
}
|
||||
|
||||
// Try a search from instruction id0 in state p0.
|
||||
// Return whether it succeeded.
|
||||
bool BitState::TrySearch(int id0, const char* p0) {
|
||||
bool matched = false;
|
||||
const char* end = text_.data() + text_.size();
|
||||
njob_ = 0;
|
||||
// Push() no longer checks ShouldVisit(),
|
||||
// so we must perform the check ourselves.
|
||||
if (ShouldVisit(id0, p0))
|
||||
Push(id0, p0);
|
||||
while (njob_ > 0) {
|
||||
// Pop job off stack.
|
||||
--njob_;
|
||||
int id = job_[njob_].id;
|
||||
int& rle = job_[njob_].rle;
|
||||
const char* p = job_[njob_].p;
|
||||
|
||||
if (id < 0) {
|
||||
// Undo the Capture.
|
||||
cap_[prog_->inst(-id)->cap()] = p;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (rle > 0) {
|
||||
p += rle;
|
||||
// Revivify job on stack.
|
||||
--rle;
|
||||
++njob_;
|
||||
}
|
||||
|
||||
Loop:
|
||||
// Visit id, p.
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
ABSL_LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
|
||||
return false;
|
||||
|
||||
case kInstFail:
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
if (ip->greedy(prog_)) {
|
||||
// out1 is the Match instruction.
|
||||
id = ip->out1();
|
||||
p = end;
|
||||
goto Loop;
|
||||
}
|
||||
if (longest_) {
|
||||
// ip must be non-greedy...
|
||||
// out is the Match instruction.
|
||||
id = ip->out();
|
||||
p = end;
|
||||
goto Loop;
|
||||
}
|
||||
goto Next;
|
||||
|
||||
case kInstByteRange: {
|
||||
int c = -1;
|
||||
if (p < end)
|
||||
c = *p & 0xFF;
|
||||
if (!ip->Matches(c))
|
||||
goto Next;
|
||||
|
||||
if (ip->hint() != 0)
|
||||
Push(id+ip->hint(), p); // try the next when we're done
|
||||
id = ip->out();
|
||||
p++;
|
||||
goto CheckAndLoop;
|
||||
}
|
||||
|
||||
case kInstCapture:
|
||||
if (!ip->last())
|
||||
Push(id+1, p); // try the next when we're done
|
||||
|
||||
if (0 <= ip->cap() && ip->cap() < cap_.size()) {
|
||||
// Capture p to register, but save old value first.
|
||||
Push(-id, cap_[ip->cap()]); // undo when we're done
|
||||
cap_[ip->cap()] = p;
|
||||
}
|
||||
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
|
||||
goto Next;
|
||||
|
||||
if (!ip->last())
|
||||
Push(id+1, p); // try the next when we're done
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case kInstNop:
|
||||
if (!ip->last())
|
||||
Push(id+1, p); // try the next when we're done
|
||||
id = ip->out();
|
||||
|
||||
CheckAndLoop:
|
||||
// Sanity check: id is the head of its list, which must
|
||||
// be the case if id-1 is the last of *its* list. :)
|
||||
ABSL_DCHECK(id == 0 || prog_->inst(id-1)->last());
|
||||
if (ShouldVisit(id, p))
|
||||
goto Loop;
|
||||
break;
|
||||
|
||||
case kInstMatch: {
|
||||
if (endmatch_ && p != end)
|
||||
goto Next;
|
||||
|
||||
// We found a match. If the caller doesn't care
|
||||
// where the match is, no point going further.
|
||||
if (nsubmatch_ == 0)
|
||||
return true;
|
||||
|
||||
// Record best match so far.
|
||||
// Only need to check end point, because this entire
|
||||
// call is only considering one start position.
|
||||
matched = true;
|
||||
cap_[1] = p;
|
||||
if (submatch_[0].data() == NULL ||
|
||||
(longest_ && p > submatch_[0].data() + submatch_[0].size())) {
|
||||
for (int i = 0; i < nsubmatch_; i++)
|
||||
submatch_[i] = absl::string_view(
|
||||
cap_[2 * i],
|
||||
static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
|
||||
}
|
||||
|
||||
// If going for first match, we're done.
|
||||
if (!longest_)
|
||||
return true;
|
||||
|
||||
// If we used the entire text, no longer match is possible.
|
||||
if (p == end)
|
||||
return true;
|
||||
|
||||
// Otherwise, continue on in hope of a longer match.
|
||||
// Note the absence of the ShouldVisit() check here
|
||||
// due to execution remaining in the same list.
|
||||
Next:
|
||||
if (!ip->last()) {
|
||||
id++;
|
||||
goto Loop;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return matched;
|
||||
}
|
||||
|
||||
// Search text (within context) for prog_.
|
||||
bool BitState::Search(absl::string_view text, absl::string_view context,
|
||||
bool anchored, bool longest, absl::string_view* submatch,
|
||||
int nsubmatch) {
|
||||
// Search parameters.
|
||||
text_ = text;
|
||||
context_ = context;
|
||||
if (context_.data() == NULL)
|
||||
context_ = text;
|
||||
if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text))
|
||||
return false;
|
||||
if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text))
|
||||
return false;
|
||||
anchored_ = anchored || prog_->anchor_start();
|
||||
longest_ = longest || prog_->anchor_end();
|
||||
endmatch_ = prog_->anchor_end();
|
||||
submatch_ = submatch;
|
||||
nsubmatch_ = nsubmatch;
|
||||
for (int i = 0; i < nsubmatch_; i++)
|
||||
submatch_[i] = absl::string_view();
|
||||
|
||||
// Allocate scratch space.
|
||||
int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
|
||||
nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
|
||||
visited_ = PODArray<uint64_t>(nvisited);
|
||||
memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
|
||||
|
||||
int ncap = 2*nsubmatch;
|
||||
if (ncap < 2)
|
||||
ncap = 2;
|
||||
cap_ = PODArray<const char*>(ncap);
|
||||
memset(cap_.data(), 0, ncap*sizeof cap_[0]);
|
||||
|
||||
// When sizeof(Job) == 16, we start with a nice round 1KiB. :)
|
||||
job_ = PODArray<Job>(64);
|
||||
|
||||
// Anchored search must start at text.begin().
|
||||
if (anchored_) {
|
||||
cap_[0] = text.data();
|
||||
return TrySearch(prog_->start(), text.data());
|
||||
}
|
||||
|
||||
// Unanchored search, starting from each possible text position.
|
||||
// Notice that we have to try the empty string at the end of
|
||||
// the text, so the loop condition is p <= text.end(), not p < text.end().
|
||||
// This looks like it's quadratic in the size of the text,
|
||||
// but we are not clearing visited_ between calls to TrySearch,
|
||||
// so no work is duplicated and it ends up still being linear.
|
||||
const char* etext = text.data() + text.size();
|
||||
for (const char* p = text.data(); p <= etext; p++) {
|
||||
// Try to use prefix accel (e.g. memchr) to skip ahead.
|
||||
if (p < etext && prog_->can_prefix_accel()) {
|
||||
p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p));
|
||||
if (p == NULL)
|
||||
p = etext;
|
||||
}
|
||||
|
||||
cap_[0] = p;
|
||||
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
|
||||
return true;
|
||||
// Avoid invoking undefined behavior (arithmetic on a null pointer)
|
||||
// by simply not continuing the loop.
|
||||
if (p == NULL)
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Bit-state search.
|
||||
bool Prog::SearchBitState(absl::string_view text, absl::string_view context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
absl::string_view* match, int nmatch) {
|
||||
// If full match, we ask for an anchored longest match
|
||||
// and then check that match[0] == text.
|
||||
// So make sure match[0] exists.
|
||||
absl::string_view sp0;
|
||||
if (kind == kFullMatch) {
|
||||
anchor = kAnchored;
|
||||
if (nmatch < 1) {
|
||||
match = &sp0;
|
||||
nmatch = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Run the search.
|
||||
BitState b(this);
|
||||
bool anchored = anchor == kAnchored;
|
||||
bool longest = kind != kFirstMatch;
|
||||
if (!b.Search(text, context, anchored, longest, match, nmatch))
|
||||
return false;
|
||||
if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
+1265
File diff suppressed because it is too large
Load Diff
+2136
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,138 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re2/filtered_re2.h"
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "re2/prefilter.h"
|
||||
#include "re2/prefilter_tree.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Creates an empty, not-yet-compiled filter backed by a
// default-constructed PrefilterTree.
FilteredRE2::FilteredRE2()
    : compiled_(false), prefilter_tree_(new PrefilterTree()) {}
|
||||
|
||||
// Creates an empty, not-yet-compiled filter whose PrefilterTree is
// constructed with min_atom_len.
FilteredRE2::FilteredRE2(int min_atom_len)
    : compiled_(false), prefilter_tree_(new PrefilterTree(min_atom_len)) {}
|
||||
|
||||
// Deletes every RE2 object held in re2_vec_ (they were allocated by Add()).
FilteredRE2::~FilteredRE2() {
  for (RE2* re : re2_vec_)
    delete re;
}
|
||||
|
||||
// Move constructor: takes over other's regexps, compiled flag and prefilter
// tree, then resets other to an empty, uncompiled filter with a fresh
// default-constructed PrefilterTree so that it remains usable.
FilteredRE2::FilteredRE2(FilteredRE2&& other)
    : re2_vec_(std::move(other.re2_vec_)),
      compiled_(other.compiled_),
      prefilter_tree_(std::move(other.prefilter_tree_)) {
  other.compiled_ = false;
  other.prefilter_tree_.reset(new PrefilterTree());
  // Make sure the moved-from vector is empty and holds no storage.
  other.re2_vec_.clear();
  other.re2_vec_.shrink_to_fit();
}
|
||||
|
||||
// Move assignment: destroys the current contents of *this and rebuilds it
// in place from other via the move constructor, which also resets other to
// an empty, uncompiled filter.
FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) {
  // Self-move must be a no-op: running the destructor first would delete the
  // very RE2 objects and prefilter tree that the move constructor would then
  // read from.
  if (this != &other) {
    this->~FilteredRE2();
    (void) new (this) FilteredRE2(std::move(other));
  }
  return *this;
}
|
||||
|
||||
// Compiles pattern with options into a new RE2 object and returns its error
// code. On success the object is appended to re2_vec_ and *id receives its
// index. On failure the object is deleted, *id is left untouched, and the
// problem is logged if options.log_errors() is set.
RE2::ErrorCode FilteredRE2::Add(absl::string_view pattern,
                                const RE2::Options& options, int* id) {
  RE2* re = new RE2(pattern, options);
  const RE2::ErrorCode code = re->error_code();

  if (re->ok()) {
    *id = static_cast<int>(re2_vec_.size());
    re2_vec_.push_back(re);
    return code;
  }

  if (options.log_errors()) {
    ABSL_LOG(ERROR) << "Couldn't compile regular expression, skipping: "
                    << pattern << " due to error " << re->error();
  }
  delete re;
  return code;
}
|
||||
|
||||
void FilteredRE2::Compile(std::vector<std::string>* atoms) {
|
||||
if (compiled_) {
|
||||
ABSL_LOG(ERROR) << "Compile called already.";
|
||||
return;
|
||||
}
|
||||
|
||||
// Similarly to PrefilterTree::Compile(), make compiling
|
||||
// a no-op if it's attempted before adding any patterns.
|
||||
if (re2_vec_.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < re2_vec_.size(); i++) {
|
||||
Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]);
|
||||
prefilter_tree_->Add(prefilter);
|
||||
}
|
||||
atoms->clear();
|
||||
prefilter_tree_->Compile(atoms);
|
||||
compiled_ = true;
|
||||
}
|
||||
|
||||
// Tries every regexp in order with RE2::PartialMatch, without any
// prefiltering. Returns the index of the first one that matches text,
// or -1 if none does.
int FilteredRE2::SlowFirstMatch(absl::string_view text) const {
  const size_t count = re2_vec_.size();
  for (size_t idx = 0; idx < count; ++idx) {
    if (RE2::PartialMatch(text, *re2_vec_[idx]))
      return static_cast<int>(idx);
  }
  return -1;
}
|
||||
|
||||
// Returns the index of the first regexp that passes the prefilter for the
// given matched atoms and then actually matches text (RE2::PartialMatch),
// or -1 if there is none. Must be called after Compile(); otherwise it logs
// DFATAL and returns -1.
int FilteredRE2::FirstMatch(absl::string_view text,
                            const std::vector<int>& atoms) const {
  if (!compiled_) {
    ABSL_LOG(DFATAL) << "FirstMatch called before Compile.";
    return -1;
  }
  // Candidates: the regexps that the prefilter tree lets through.
  std::vector<int> regexps;
  prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
  for (size_t i = 0; i < regexps.size(); i++)
    if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
      return regexps[i];
  return -1;
}
|
||||
|
||||
// Clears *matching_regexps, then fills it with the index of every regexp
// that passes the prefilter for the given matched atoms and actually matches
// text (RE2::PartialMatch). Returns true if at least one regexp matched.
bool FilteredRE2::AllMatches(absl::string_view text,
                             const std::vector<int>& atoms,
                             std::vector<int>* matching_regexps) const {
  matching_regexps->clear();
  // Candidates: the regexps that the prefilter tree lets through.
  std::vector<int> regexps;
  prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
  for (size_t i = 0; i < regexps.size(); i++)
    if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
      matching_regexps->push_back(regexps[i]);
  return !matching_regexps->empty();
}
|
||||
|
||||
void FilteredRE2::AllPotentials(const std::vector<int>& atoms,
|
||||
std::vector<int>* potential_regexps) const {
|
||||
prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps);
|
||||
}
|
||||
|
||||
// Forwards directly to PrefilterTree::RegexpsGivenStrings(); kept as a
// separate entry point for testing and debugging.
void FilteredRE2::RegexpsGivenStrings(const std::vector<int>& matched_atoms,
                                      std::vector<int>* passed_regexps) {
  prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
}
|
||||
|
||||
// Forwards to PrefilterTree::PrintPrefilter() for the regexp with the
// given index.
void FilteredRE2::PrintPrefilter(int regexpid) {
  prefilter_tree_->PrintPrefilter(regexpid);
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,115 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_FILTERED_RE2_H_
|
||||
#define RE2_FILTERED_RE2_H_
|
||||
|
||||
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
|
||||
// It provides a prefilter mechanism that helps in cutting down the
|
||||
// number of regexps that need to be actually searched.
|
||||
//
|
||||
// By design, it does not include a string matching engine. This is to
|
||||
// allow the user of the class to use their favorite string matching
|
||||
// engine. The overall flow is: Add all the regexps using Add, then
|
||||
// Compile the FilteredRE2. Compile returns strings that need to be
|
||||
// matched. Note that the returned strings are lowercased and distinct.
|
||||
// For applying regexps to a search text, the caller does the string
|
||||
// matching using the returned strings. When doing the string match,
|
||||
// note that the caller has to do that in a case-insensitive way or
|
||||
// on a lowercased version of the search text. Then call FirstMatch
|
||||
// or AllMatches with a vector of indices of strings that were found
|
||||
// in the text to get the actual regexp matches.
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class PrefilterTree;
|
||||
|
||||
class FilteredRE2 {
|
||||
public:
|
||||
FilteredRE2();
|
||||
explicit FilteredRE2(int min_atom_len);
|
||||
~FilteredRE2();
|
||||
|
||||
// Not copyable.
|
||||
FilteredRE2(const FilteredRE2&) = delete;
|
||||
FilteredRE2& operator=(const FilteredRE2&) = delete;
|
||||
// Movable.
|
||||
FilteredRE2(FilteredRE2&& other);
|
||||
FilteredRE2& operator=(FilteredRE2&& other);
|
||||
|
||||
// Uses RE2 constructor to create a RE2 object (re). Returns
|
||||
// re->error_code(). If error_code is other than NoError, then re is
|
||||
// deleted and not added to re2_vec_.
|
||||
RE2::ErrorCode Add(absl::string_view pattern,
|
||||
const RE2::Options& options,
|
||||
int* id);
|
||||
|
||||
// Prepares the regexps added by Add for filtering. Returns a set
|
||||
// of strings that the caller should check for in candidate texts.
|
||||
// The returned strings are lowercased and distinct. When doing
|
||||
// string matching, it should be performed in a case-insensitive
|
||||
// way or the search text should be lowercased first. Call after
|
||||
// all Add calls are done.
|
||||
void Compile(std::vector<std::string>* strings_to_match);
|
||||
|
||||
// Returns the index of the first matching regexp.
|
||||
// Returns -1 on no match. Can be called prior to Compile.
|
||||
// Does not do any filtering: simply tries to Match the
|
||||
// regexps in a loop.
|
||||
int SlowFirstMatch(absl::string_view text) const;
|
||||
|
||||
// Returns the index of the first matching regexp.
|
||||
// Returns -1 on no match. Compile has to be called before
|
||||
// calling this.
|
||||
int FirstMatch(absl::string_view text,
|
||||
const std::vector<int>& atoms) const;
|
||||
|
||||
// Returns the indices of all matching regexps, after first clearing
|
||||
// matched_regexps.
|
||||
bool AllMatches(absl::string_view text,
|
||||
const std::vector<int>& atoms,
|
||||
std::vector<int>* matching_regexps) const;
|
||||
|
||||
// Returns the indices of all potentially matching regexps after first
|
||||
// clearing potential_regexps.
|
||||
// A regexp is potentially matching if it passes the filter.
|
||||
// If a regexp passes the filter it may still not match.
|
||||
// A regexp that does not pass the filter is guaranteed to not match.
|
||||
void AllPotentials(const std::vector<int>& atoms,
|
||||
std::vector<int>* potential_regexps) const;
|
||||
|
||||
// The number of regexps added.
|
||||
int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
|
||||
|
||||
// Get the individual RE2 objects.
|
||||
const RE2& GetRE2(int regexpid) const { return *re2_vec_[regexpid]; }
|
||||
|
||||
private:
|
||||
// Print prefilter.
|
||||
void PrintPrefilter(int regexpid);
|
||||
|
||||
// Useful for testing and debugging.
|
||||
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
|
||||
std::vector<int>* passed_regexps);
|
||||
|
||||
// All the regexps in the FilteredRE2.
|
||||
std::vector<RE2*> re2_vec_;
|
||||
|
||||
// Has the FilteredRE2 been compiled using Compile()
|
||||
bool compiled_;
|
||||
|
||||
// An AND-OR tree of string atoms used for filtering regexps.
|
||||
std::unique_ptr<PrefilterTree> prefilter_tree_;
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_FILTERED_RE2_H_
|
||||
@@ -0,0 +1,284 @@
|
||||
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <fuzzer/FuzzedDataProvider.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "re2/filtered_re2.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/set.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
// NOT static, NOT signed.
|
||||
uint8_t dummy = 0;
|
||||
|
||||
// Walks kRegexpConcat and kRegexpAlternate subexpressions
|
||||
// to determine their maximum length.
|
||||
class SubexpressionWalker : public re2::Regexp::Walker<int> {
|
||||
public:
|
||||
SubexpressionWalker() = default;
|
||||
~SubexpressionWalker() override = default;
|
||||
|
||||
int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
|
||||
int* child_args, int nchild_args) override {
|
||||
switch (re->op()) {
|
||||
case re2::kRegexpConcat:
|
||||
case re2::kRegexpAlternate: {
|
||||
int max = nchild_args;
|
||||
for (int i = 0; i < nchild_args; i++)
|
||||
max = std::max(max, child_args[i]);
|
||||
return max;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Should never be called: we use Walk(), not WalkExponential().
|
||||
int ShortVisit(re2::Regexp* re, int parent_arg) override {
|
||||
return parent_arg;
|
||||
}
|
||||
|
||||
private:
|
||||
SubexpressionWalker(const SubexpressionWalker&) = delete;
|
||||
SubexpressionWalker& operator=(const SubexpressionWalker&) = delete;
|
||||
};
|
||||
|
||||
// Walks substrings (i.e. kRegexpLiteralString subexpressions)
|
||||
// to determine their maximum length... in runes, but avoiding
|
||||
// overheads due to UTF-8 encoding is worthwhile when fuzzing.
|
||||
class SubstringWalker : public re2::Regexp::Walker<int> {
|
||||
public:
|
||||
SubstringWalker() = default;
|
||||
~SubstringWalker() override = default;
|
||||
|
||||
int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
|
||||
int* child_args, int nchild_args) override {
|
||||
switch (re->op()) {
|
||||
case re2::kRegexpConcat:
|
||||
case re2::kRegexpAlternate:
|
||||
case re2::kRegexpStar:
|
||||
case re2::kRegexpPlus:
|
||||
case re2::kRegexpQuest:
|
||||
case re2::kRegexpRepeat:
|
||||
case re2::kRegexpCapture: {
|
||||
int max = -1;
|
||||
for (int i = 0; i < nchild_args; i++)
|
||||
max = std::max(max, child_args[i]);
|
||||
return max;
|
||||
}
|
||||
|
||||
case re2::kRegexpLiteralString:
|
||||
return re->nrunes();
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Should never be called: we use Walk(), not WalkExponential().
|
||||
int ShortVisit(re2::Regexp* re, int parent_arg) override {
|
||||
return parent_arg;
|
||||
}
|
||||
|
||||
private:
|
||||
SubstringWalker(const SubstringWalker&) = delete;
|
||||
SubstringWalker& operator=(const SubstringWalker&) = delete;
|
||||
};
|
||||
|
||||
// Runs one fuzzing iteration: rejects patterns that are likely to be slow,
// compiles the pattern, and then exercises the RE2, RE2::Set and FilteredRE2
// APIs against text.
void TestOneInput(absl::string_view pattern, const RE2::Options& options,
                  RE2::Anchor anchor, absl::string_view text) {
  // Crude limit on the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
  // Without it, time is wasted on inputs with long runs of character
  // classes: the fuzzer readily produces patterns that pass all the other
  // limits yet still time out. The marginal cost is high (more so with
  // counted repetition) and the marginal benefit is zero.
  // 'k', 'K', 's' and 'S' are limited as well, since they turn into
  // three-element character classes when case-insensitive and using UTF-8.
  // TODO(junyer): Handle [[:alnum:]] et al. when they start to cause pain.
  int char_class = 0;
  int backslash_p = 0;  // very expensive, so counted separately
  for (size_t pos = 0; pos < pattern.size(); ++pos) {
    const char ch = pattern[pos];
    switch (ch) {
      case '.':
      case 'k': case 'K':
      case 's': case 'S':
        ++char_class;
        break;
      default:
        break;
    }
    if (ch != '\\')
      continue;
    // Step onto the escaped character, if the pattern has one.
    if (++pos >= pattern.size())
      break;
    switch (pattern[pos]) {
      case 'p': case 'P':
        ++char_class;
        ++backslash_p;
        break;
      case 'd': case 'D':
      case 's': case 'S':
      case 'w': case 'W':
        ++char_class;
        break;
      default:
        break;
    }
  }
  if (char_class > 9)
    return;
  if (backslash_p > 1)
    return;

  // One iteration is enough when fuzzing; more just bogs things down
  // without any likely gain in coverage.
  RE2::FUZZING_ONLY_set_maximum_global_replace_count(1);
  // The default is 1000. Empirically even 100 was too generous
  // for fuzzing, so use 10.
  re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10);

  RE2 re(pattern, options);
  if (!re.ok())
    return;

  // Skip programs with large subexpressions: they produce bug reports
  // caused by fuzzer timeouts and are not interesting to fuzz.
  if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9)
    return;

  // Skip programs with large substrings: when these are repetitions
  // (e.g. hundreds of NUL bytes) and matching is unanchored, they produce
  // bug reports caused by fuzzer timeouts. They are not interesting either.
  if (SubstringWalker().Walk(re.Regexp(), -1) > 9)
    return;

  // Skip high-size programs, which also lead to timeout reports.
  const int size = re.ProgramSize();
  if (size > 9999)
    return;
  const int rsize = re.ReverseProgramSize();
  if (rsize > 9999)
    return;

  // Skip high-fanout programs, which also lead to timeout reports.
  std::vector<int> histogram;
  const int fanout = re.ProgramFanout(&histogram);
  if (fanout > 9)
    return;
  const int rfanout = re.ReverseProgramFanout(&histogram);
  if (rfanout > 9)
    return;

  if (re.NumberOfCapturingGroups() == 0) {
    // Pass no arguments, which avoids the early
    // return taken when there are too many.
    absl::string_view sp = text;
    RE2::FullMatch(sp, re);
    RE2::PartialMatch(sp, re);
    RE2::Consume(&sp, re);
    sp = text;  // Reset.
    RE2::FindAndConsume(&sp, re);
  } else {
    // With at least one capturing group available,
    // exercise conversion into variously typed arguments.
    absl::string_view sp = text;
    short as_short;
    RE2::FullMatch(sp, re, &as_short);
    long as_long;
    RE2::PartialMatch(sp, re, &as_long);
    float as_float;
    RE2::Consume(&sp, re, &as_float);
    sp = text;  // Reset.
    double as_double;
    RE2::FindAndConsume(&sp, re, &as_double);
  }

  std::string scratch = std::string(text);
  RE2::Replace(&scratch, re, "");
  scratch = std::string(text);  // Reset.
  RE2::GlobalReplace(&scratch, re, "");

  std::string min, max;
  re.PossibleMatchRange(&min, &max, /*maxlen=*/9);

  // Touch some further API surface; the results are
  // accumulated into the global dummy.
  dummy += re.NamedCapturingGroups().size();
  dummy += re.CapturingGroupNames().size();
  dummy += RE2::QuoteMeta(pattern).size();
  dummy += re.Regexp()->ToString().size();

  RE2::Set set(options, anchor);
  int index = set.Add(pattern, /*error=*/NULL);  // -1 on error
  if (index != -1 && set.Compile()) {
    std::vector<int> matches;
    set.Match(text, &matches);
  }

  re2::FilteredRE2 filter;
  index = -1;  // Add() leaves it untouched on error
  filter.Add(pattern, options, &index);
  if (index != -1) {
    std::vector<std::string> atoms;
    filter.Compile(&atoms);
    // Claim that every atom matched, so that the
    // AND-OR tree is exercised as much as possible.
    std::vector<int> matched_atoms;
    matched_atoms.reserve(atoms.size());
    for (size_t k = 0; k < atoms.size(); ++k)
      matched_atoms.push_back(static_cast<int>(k));
    std::vector<int> matches;
    filter.AllMatches(text, matched_atoms, &matches);
  }
}
|
||||
|
||||
// Entry point for libFuzzer.
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
|
||||
// An input larger than 4 KiB probably isn't interesting. (This limit
|
||||
// allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.)
|
||||
if (size == 0 || size > 4096)
|
||||
return 0;
|
||||
|
||||
FuzzedDataProvider fdp(data, size);
|
||||
|
||||
// The convention here is that fdp.ConsumeBool() returning false sets
|
||||
// the default value whereas returning true sets the alternate value:
|
||||
// most options default to false and so can be set directly; encoding
|
||||
// defaults to UTF-8; case_sensitive defaults to true. We do NOT want
|
||||
// to log errors. max_mem is 64 MiB because we can afford to use more
|
||||
// RAM in exchange for (hopefully) faster fuzzing.
|
||||
RE2::Options options;
|
||||
options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1
|
||||
: RE2::Options::EncodingUTF8);
|
||||
options.set_posix_syntax(fdp.ConsumeBool());
|
||||
options.set_longest_match(fdp.ConsumeBool());
|
||||
options.set_log_errors(false);
|
||||
options.set_max_mem(64 << 20);
|
||||
options.set_literal(fdp.ConsumeBool());
|
||||
options.set_never_nl(fdp.ConsumeBool());
|
||||
options.set_dot_nl(fdp.ConsumeBool());
|
||||
options.set_never_capture(fdp.ConsumeBool());
|
||||
options.set_case_sensitive(!fdp.ConsumeBool());
|
||||
options.set_perl_classes(fdp.ConsumeBool());
|
||||
options.set_word_boundary(fdp.ConsumeBool());
|
||||
options.set_one_line(fdp.ConsumeBool());
|
||||
|
||||
// ConsumeEnum<RE2::Anchor>() would require RE2::Anchor to specify
|
||||
// kMaxValue, so just use PickValueInArray<RE2::Anchor>() instead.
|
||||
RE2::Anchor anchor = fdp.PickValueInArray<RE2::Anchor>({
|
||||
RE2::UNANCHORED,
|
||||
RE2::ANCHOR_START,
|
||||
RE2::ANCHOR_BOTH,
|
||||
});
|
||||
|
||||
std::string pattern = fdp.ConsumeRandomLengthString(999);
|
||||
std::string text = fdp.ConsumeRandomLengthString(999);
|
||||
|
||||
TestOneInput(pattern, options, anchor, text);
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/perl
|
||||
# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Generate table entries giving character ranges
|
||||
# for POSIX/Perl character classes. Rather than
|
||||
# figure out what the definition is, it is easier to ask
|
||||
# Perl about each letter from 0-128 and write down
|
||||
# its answer.
|
||||
|
||||
@posixclasses = (
|
||||
"[:alnum:]",
|
||||
"[:alpha:]",
|
||||
"[:ascii:]",
|
||||
"[:blank:]",
|
||||
"[:cntrl:]",
|
||||
"[:digit:]",
|
||||
"[:graph:]",
|
||||
"[:lower:]",
|
||||
"[:print:]",
|
||||
"[:punct:]",
|
||||
"[:space:]",
|
||||
"[:upper:]",
|
||||
"[:word:]",
|
||||
"[:xdigit:]",
|
||||
);
|
||||
|
||||
@perlclasses = (
|
||||
"\\d",
|
||||
"\\s",
|
||||
"\\w",
|
||||
);
|
||||
|
||||
%overrides = (
|
||||
# Prior to Perl 5.18, \s did not match vertical tab.
|
||||
# RE2 preserves that original behaviour.
|
||||
"\\s:11" => 0,
|
||||
);
|
||||
|
||||
# Returns the list of [lo, hi] code ranges that belong to class $cname.
# Each code 0..128 is tested against the class (an entry in %overrides
# for "$cname:$code" takes precedence over Perl's own answer); the final
# sentinel value 256 never belongs to the class and closes any range
# that is still open.
sub ComputeClass($) {
  my ($cname) = @_;
  my @ranges;
  my $regexp = qr/[$cname]/;
  my $start = -1;
  foreach my $i (0..128, 256) {
    my $member = $i <= 128 &&
                 ($overrides{"$cname:$i"} // (chr($i) =~ $regexp));
    if ($member) {
      $start = $i if $start < 0;
      next;
    }
    push @ranges, [$start, $i-1] if $start >= 0;
    $start = -1;
  }
  return @ranges;
}
|
||||
|
||||
# Prints the URange16 table "code<cnum>" for class $cname, one row per
# [lo, hi] pair in @ranges, and returns two UGroup initializer strings:
# one for the class itself (+1) and one for its negation (-1).
# The negated name is "[:^name:]" for POSIX classes and the upper-cased
# escape (e.g. \D) for Perl classes.
sub PrintClass($$@) {
  my ($cnum, $cname, @ranges) = @_;
  print "static const URange16 code${cnum}[] = {  /* $cname */\n";
  for (my $i=0; $i<@ranges; $i++) {
    my @a = @{$ranges[$i]};
    printf "\t{ 0x%x, 0x%x },\n", $a[0], $a[1];
  }
  print "};\n";
  my $n = @ranges;
  # Double the backslashes so that the name survives as a C++ string literal.
  my $escname = $cname;
  $escname =~ s/\\/\\\\/g;
  # Lexical on purpose: without "my" this would create a package global.
  my $negname = $escname;
  if ($negname =~ /:/) {
    $negname =~ s/:/:^/;
  } else {
    $negname =~ y/a-z/A-Z/;
  }
  return "{ \"$escname\", +1, code$cnum, $n, 0, 0 }", "{ \"$negname\", -1, code$cnum, $n, 0, 0 }";
}
|
||||
|
||||
my $cnum = 0;
|
||||
|
||||
# Emits the code tables for every class in @classes, followed by the
# "<pname>_groups" UGroup array that references them and the matching
# "num_<pname>_groups" count. Table numbers come from the file-level
# counter $cnum, which is incremented once per class.
sub PrintClasses($@) {
  my ($pname, @classes) = @_;
  my @entries;
  for my $cname (@classes) {
    my @ranges = ComputeClass($cname);
    push @entries, PrintClass(++$cnum, $cname, @ranges);
  }
  print "const UGroup ${pname}_groups[] = {\n";
  print "\t$_,\n" for @entries;
  print "};\n";
  my $count = @entries;
  print "const int num_${pname}_groups = $count;\n";
}
|
||||
|
||||
print <<EOF;
|
||||
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
|
||||
// make_perl_groups.pl >perl_groups.cc
|
||||
|
||||
#include "re2/unicode_groups.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
EOF
|
||||
|
||||
PrintClasses("perl", @perlclasses);
|
||||
PrintClasses("posix", @posixclasses);
|
||||
|
||||
print <<EOF;
|
||||
|
||||
} // namespace re2
|
||||
EOF
|
||||
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/python3
|
||||
# coding=utf-8
|
||||
#
|
||||
# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# See unicode_casefold.h for description of case folding tables.
|
||||
|
||||
"""Generate C++ table for Unicode case folding."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import unicode
|
||||
|
||||
# Text printed by main() before the generated tables: the banner, the
# include of unicode_casefold.h and the opening of namespace re2.
_header = """
|
||||
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
|
||||
// make_unicode_casefold.py >unicode_casefold.cc
|
||||
|
||||
#include "re2/unicode_casefold.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
"""
|
||||
|
||||
# Text printed by main() after the generated tables: closes namespace re2.
_trailer = """
|
||||
|
||||
} // namespace re2
|
||||
|
||||
"""
|
||||
|
||||
def _Delta(a, b):
|
||||
"""Compute the delta for b - a. Even/odd and odd/even
|
||||
are handled specially, as described above."""
|
||||
if a+1 == b:
|
||||
if a%2 == 0:
|
||||
return 'EvenOdd'
|
||||
else:
|
||||
return 'OddEven'
|
||||
if a == b+1:
|
||||
if a%2 == 0:
|
||||
return 'OddEven'
|
||||
else:
|
||||
return 'EvenOdd'
|
||||
return b - a
|
||||
|
||||
def _AddDelta(a, delta):
|
||||
"""Return a + delta, handling EvenOdd and OddEven specially."""
|
||||
if type(delta) == int:
|
||||
return a+delta
|
||||
if delta == 'EvenOdd':
|
||||
if a%2 == 0:
|
||||
return a+1
|
||||
else:
|
||||
return a-1
|
||||
if delta == 'OddEven':
|
||||
if a%2 == 1:
|
||||
return a+1
|
||||
else:
|
||||
return a-1
|
||||
print("Bad Delta:", delta, file=sys.stderr)
|
||||
raise unicode.Error("Bad Delta")
|
||||
|
||||
def _MakeRanges(pairs):
  """Collapse (code, folded) pairs into a list of [lo, hi, delta] ranges.

  For example [(65,97), (66,98), ..., (90,122)] becomes [[65, 90, 32]].
  A pair is merged into the most recent range when it continues that
  range; otherwise it starts a new single-element range whose delta is
  _Delta(code, folded).
  """
  ranges = []
  # Sentinel for "no previous code"; assumes codes are non-negative so the
  # first pair never looks adjacent to it.
  prev = -100

  def absorb_adjacent(prev, a, b, rng):
    """Extend rng to a when a directly follows prev and maps by rng's delta."""
    # NOTE(review): when rng[2] already ends in 'Skip' and a == prev+1,
    # _AddDelta raises; presumably the input never does this -- confirm.
    if a == prev + 1 and b == _AddDelta(a, rng[2]):
      rng[1] = a
      return True
    return False

  def absorb_alternate(prev, a, b, rng):
    """Extend rng to a when a is two past prev and rng has a symbolic delta.

    On success rng's delta carries the 'Skip' suffix.
    """
    if a != prev + 2:
      return False
    delta = rng[2]
    if type(delta) is not str:
      return False
    if delta.endswith('Skip'):
      base, skipping = delta[:-4], delta
    else:
      base, skipping = delta, delta + 'Skip'
    if b != _AddDelta(a, base):
      return False
    rng[1] = a
    rng[2] = skipping
    return True

  for a, b in pairs:
    merged = bool(ranges) and (
        absorb_adjacent(prev, a, b, ranges[-1]) or
        absorb_alternate(prev, a, b, ranges[-1]))
    if not merged:
      ranges.append([a, a, _Delta(a, b)])
    prev = a
  return ranges
|
||||
|
||||
# The maximum size of a case-folding group.
|
||||
# Case folding is implemented in parse.cc by a recursive process
|
||||
# with a recursion depth equal to the size of the largest
|
||||
# case-folding group, so it is important that this bound be small.
|
||||
# The current tables have no group bigger than 4.
|
||||
# If there are ever groups bigger than 10 or so, it will be
|
||||
# time to rework the code in parse.cc.
|
||||
MaxCasefoldGroup = 4
|
||||
|
||||
def main():
  """Print the C++ source defining unicode_casefold and unicode_tolower.

  The case groups come from unicode.CaseGroups().  A group longer than
  MaxCasefoldGroup, or a code point that would receive two fold targets,
  raises unicode.Error.
  """
  lowergroups, casegroups = unicode.CaseGroups()

  # Each member of a group folds to the next one, cyclically: index -1
  # wraps around, so the last member is paired with the first.
  foldpairs = []
  seen = set()
  for group in casegroups:
    if len(group) > MaxCasefoldGroup:
      raise unicode.Error("casefold group too long: %s" % (group,))
    for idx, target in enumerate(group):
      source = group[idx - 1]
      if source in seen:
        raise unicode.Error("bad casegroups %d -> %d" % (source, target))
      seen.add(source)
      foldpairs.append([source, target])

  # Every non-lowercase member of a group maps to the group's lowercase.
  lowerpairs = [[member, lower]
                for lower, members in lowergroups.items()
                for member in members
                if member != lower]

  def emit_table(name, table_pairs):
    # Sorts table_pairs in place, then prints it as a CaseFold array
    # followed by its length.
    table_pairs.sort()
    table_ranges = _MakeRanges(table_pairs)
    print("// %d groups, %d pairs, %d ranges" % (len(casegroups), len(table_pairs), len(table_ranges)))
    print("const CaseFold unicode_%s[] = {" % (name,))
    for lo, hi, delta in table_ranges:
      print("\t{ %d, %d, %s }," % (lo, hi, delta))
    print("};")
    print("const int num_unicode_%s = %d;" % (name, len(table_ranges)))
    print("")

  print(_header)
  emit_table("casefold", foldpairs)
  emit_table("tolower", lowerpairs)
  print(_trailer)
|
||||
|
||||
# Generate the tables only when this file is run as a script.
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/python3
|
||||
# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
"""Generate C++ tables for Unicode Script and Category groups."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import unicode
|
||||
|
||||
# Text printed by main() before the generated tables: the banner, the
# include of unicode_groups.h and the opening of namespace re2.
_header = """
|
||||
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
|
||||
// make_unicode_groups.py >unicode_groups.cc
|
||||
|
||||
#include "re2/unicode_groups.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
"""
|
||||
|
||||
# Text printed by main() after the generated tables: closes namespace re2.
_trailer = """
|
||||
|
||||
} // namespace re2
|
||||
|
||||
"""
|
||||
|
||||
# Running totals of the 16-bit and 32-bit ranges emitted so far.  PrintGroup
# adds to them and main() reports them in a comment in the generated file.
n16 = 0
|
||||
n32 = 0
|
||||
|
||||
def MakeRanges(codes):
  """Group consecutive codes into [lo, hi] ranges.

  For example [1,2,3,7,8,9] becomes [[1,3], [7,9]].  A code extends the
  current range only when it is exactly one more than the code before it.
  """
  result = []
  previous = -100  # sentinel meaning "no previous code yet"
  for code in codes:
    if code != previous + 1:
      result.append([code, code])
    else:
      result[-1][1] = code
    previous = code
  return result
|
||||
|
||||
def PrintRanges(type, name, ranges):
  """Print ranges as a static const C++ array.

  type is the C++ element type and name the array identifier; each
  (lo, hi) pair in ranges becomes one "{ lo, hi }," row.
  """
  print("static const %s %s[] = {" % (type, name))
  for bounds in ranges:
    lo, hi = bounds
    print("\t{ %d, %d }," % (lo, hi))
  print("};")
|
||||
|
||||
# def PrintCodes(type, name, codes):
|
||||
# """Print the codes as an array of type named name."""
|
||||
# print("static %s %s[] = {" % (type, name))
|
||||
# for c in codes:
|
||||
# print("\t%d," % (c,))
|
||||
# print("};")
|
||||
|
||||
def PrintGroup(name, codes):
  """Print the range tables for one group and return its UGroup literal.

  codes are split at 65536: smaller values go to a URange16 table named
  <name>_range16, the rest to a URange32 table named <name>_range32.  An
  empty table is not printed and appears as "0, 0" in the literal.  The
  module counters n16 and n32 are increased by the number of ranges.

  See unicode_groups.h for a description of the data structure.
  """
  global n16
  global n32

  narrow = MakeRanges([c for c in codes if c < 65536])
  wide = MakeRanges([c for c in codes if c >= 65536])
  n16 += len(narrow)
  n32 += len(wide)

  # (A separate table of singleton 16-bit codes was once considered here;
  # it is not generated.)
  pieces = ["{ \"%s\", +1" % (name,)]
  for ctype, suffix, table in (("URange16", "_range16", narrow),
                               ("URange32", "_range32", wide)):
    if len(table) > 0:
      PrintRanges(ctype, name + suffix, table)
      pieces.append(", %s%s, %d" % (name, suffix, len(table)))
    else:
      pieces.append(", 0, 0")
  pieces.append(" }")
  return "".join(pieces)
|
||||
|
||||
def main():
  """Print the C++ source defining unicode_groups.

  Range tables are printed for every category and then every script, each
  in sorted name order.  The UGroup literals are then printed in sorted
  order as unicode_groups[], followed by num_unicode_groups.
  """
  categories = unicode.Categories()
  scripts = unicode.Scripts()
  print(_header)

  literals = []
  for table in (categories, scripts):
    for name in sorted(table):
      literals.append(PrintGroup(name, table[name]))

  print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
  print("const UGroup unicode_groups[] = {")
  for literal in sorted(literals):
    print("\t%s," % (literal,))
  print("};")
  print("const int num_unicode_groups = %d;" % (len(literals),))
  print(_trailer)
|
||||
|
||||
# Generate the tables only when this file is run as a script.
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,196 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Determine whether this library should match PCRE exactly
|
||||
// for a particular Regexp. (If so, the testing framework can
|
||||
// check that it does.)
|
||||
//
|
||||
// This library matches PCRE except in these cases:
|
||||
// * the regexp contains a repetition of an empty string,
|
||||
// like (a*)* or (a*)+. In this case, PCRE will treat
|
||||
// the repetition sequence as ending with an empty string,
|
||||
// while this library does not.
|
||||
// * Perl and PCRE differ on whether \v matches \n.
|
||||
// For historical reasons, this library implements the Perl behavior.
|
||||
// * Perl and PCRE allow $ in one-line mode to match either the very
|
||||
// end of the text or just before a \n at the end of the text.
|
||||
// This library requires it to match only the end of the text.
|
||||
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
|
||||
// match the end of the text if the last character is a \n.
|
||||
// This library does allow it.
|
||||
//
|
||||
// Regexp::MimicsPCRE checks for any of these conditions.
|
||||
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Returns whether re might match an empty string.
|
||||
static bool CanBeEmptyString(Regexp *re);
|
||||
|
||||
// Walker class to compute whether library handles a regexp
|
||||
// exactly as PCRE would. See comment at top for conditions.
|
||||
|
||||
class PCREWalker : public Regexp::Walker<bool> {
|
||||
public:
|
||||
PCREWalker() {}
|
||||
|
||||
virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args);
|
||||
|
||||
virtual bool ShortVisit(Regexp* re, bool a) {
|
||||
// Should never be called: we use Walk(), not WalkExponential().
|
||||
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
||||
ABSL_LOG(DFATAL) << "PCREWalker::ShortVisit called";
|
||||
#endif
|
||||
return a;
|
||||
}
|
||||
|
||||
private:
|
||||
PCREWalker(const PCREWalker&) = delete;
|
||||
PCREWalker& operator=(const PCREWalker&) = delete;
|
||||
};
|
||||
|
||||
// Called after visiting each of re's children and accumulating
|
||||
// the return values in child_args. So child_args contains whether
|
||||
// this library mimics PCRE for those subexpressions.
|
||||
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args) {
|
||||
// If children failed, so do we.
|
||||
for (int i = 0; i < nchild_args; i++)
|
||||
if (!child_args[i])
|
||||
return false;
|
||||
|
||||
// Otherwise look for other reasons to fail.
|
||||
switch (re->op()) {
|
||||
// Look for repeated empty string.
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
if (CanBeEmptyString(re->sub()[0]))
|
||||
return false;
|
||||
break;
|
||||
case kRegexpRepeat:
|
||||
if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
|
||||
return false;
|
||||
break;
|
||||
|
||||
// Look for \v
|
||||
case kRegexpLiteral:
|
||||
if (re->rune() == '\v')
|
||||
return false;
|
||||
break;
|
||||
|
||||
// Look for $ in single-line mode.
|
||||
case kRegexpEndText:
|
||||
case kRegexpEmptyMatch:
|
||||
if (re->parse_flags() & Regexp::WasDollar)
|
||||
return false;
|
||||
break;
|
||||
|
||||
// Look for ^ in multi-line mode.
|
||||
case kRegexpBeginLine:
|
||||
// No condition: in single-line mode ^ becomes kRegexpBeginText.
|
||||
return false;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
// Not proven guilty.
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns whether this regexp's behavior will mimic PCRE's exactly.
|
||||
bool Regexp::MimicsPCRE() {
|
||||
PCREWalker w;
|
||||
return w.Walk(this, true);
|
||||
}
|
||||
|
||||
|
||||
// Walker class to compute whether a Regexp can match an empty string.
|
||||
// It is okay to overestimate. For example, \b\B cannot match an empty
|
||||
// string, because \b and \B are mutually exclusive, but this isn't
|
||||
// that smart and will say it can. Spurious empty strings
|
||||
// will reduce the number of regexps we sanity check against PCRE,
|
||||
// but they won't break anything.
|
||||
|
||||
class EmptyStringWalker : public Regexp::Walker<bool> {
|
||||
public:
|
||||
EmptyStringWalker() {}
|
||||
|
||||
virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args);
|
||||
|
||||
virtual bool ShortVisit(Regexp* re, bool a) {
|
||||
// Should never be called: we use Walk(), not WalkExponential().
|
||||
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
||||
ABSL_LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
|
||||
#endif
|
||||
return a;
|
||||
}
|
||||
|
||||
private:
|
||||
EmptyStringWalker(const EmptyStringWalker&) = delete;
|
||||
EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
|
||||
};
|
||||
|
||||
// Called after visiting re's children. child_args contains the return
|
||||
// value from each of the children's PostVisits (i.e., whether each child
|
||||
// can match an empty string). Returns whether this clause can match an
|
||||
// empty string.
|
||||
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args) {
|
||||
switch (re->op()) {
|
||||
case kRegexpNoMatch: // never empty
|
||||
case kRegexpLiteral:
|
||||
case kRegexpAnyChar:
|
||||
case kRegexpAnyByte:
|
||||
case kRegexpCharClass:
|
||||
case kRegexpLiteralString:
|
||||
return false;
|
||||
|
||||
case kRegexpEmptyMatch: // always empty
|
||||
case kRegexpBeginLine: // always empty, when they match
|
||||
case kRegexpEndLine:
|
||||
case kRegexpNoWordBoundary:
|
||||
case kRegexpWordBoundary:
|
||||
case kRegexpBeginText:
|
||||
case kRegexpEndText:
|
||||
case kRegexpStar: // can always be empty
|
||||
case kRegexpQuest:
|
||||
case kRegexpHaveMatch:
|
||||
return true;
|
||||
|
||||
case kRegexpConcat: // can be empty if all children can
|
||||
for (int i = 0; i < nchild_args; i++)
|
||||
if (!child_args[i])
|
||||
return false;
|
||||
return true;
|
||||
|
||||
case kRegexpAlternate: // can be empty if any child can
|
||||
for (int i = 0; i < nchild_args; i++)
|
||||
if (child_args[i])
|
||||
return true;
|
||||
return false;
|
||||
|
||||
case kRegexpPlus: // can be empty if the child can
|
||||
case kRegexpCapture:
|
||||
return child_args[0];
|
||||
|
||||
case kRegexpRepeat: // can be empty if child can or is x{0}
|
||||
return child_args[0] || re->min() == 0;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns whether re can match an empty string.
|
||||
static bool CanBeEmptyString(Regexp* re) {
|
||||
EmptyStringWalker w;
|
||||
return w.Walk(re, true);
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
+714
@@ -0,0 +1,714 @@
|
||||
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Tested by search_test.cc.
|
||||
//
|
||||
// Prog::SearchNFA, an NFA search.
|
||||
// This is an actual NFA like the theorists talk about,
|
||||
// not the pseudo-NFA found in backtracking regexp implementations.
|
||||
//
|
||||
// IMPLEMENTATION
|
||||
//
|
||||
// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
|
||||
// which is a variant of the one described in Thompson's 1968 CACM paper.
|
||||
// See http://swtch.com/~rsc/regexp/ for various history. The main feature
|
||||
// over the DFA implementation is that it tracks submatch boundaries.
|
||||
//
|
||||
// When the choice of submatch boundaries is ambiguous, this particular
|
||||
// implementation makes the same choices that traditional backtracking
|
||||
// implementations (in particular, Perl and PCRE) do.
|
||||
// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
|
||||
// time in the length of the input.
|
||||
//
|
||||
// Like Thompson's original machine and like the DFA implementation, this
|
||||
// implementation notices a match only once it is one byte past it.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <deque>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "re2/pod_array.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/sparse_array.h"
|
||||
#include "re2/sparse_set.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// When true, the NFA code prints a trace of its thread queues and
// capture state to stderr.
static const bool ExtraDebug = false;
|
||||
|
||||
class NFA {
|
||||
public:
|
||||
NFA(Prog* prog);
|
||||
~NFA();
|
||||
|
||||
// Searches for a matching string.
|
||||
// * If anchored is true, only considers matches starting at offset.
|
||||
// Otherwise finds lefmost match at or after offset.
|
||||
// * If longest is true, returns the longest match starting
|
||||
// at the chosen start point. Otherwise returns the so-called
|
||||
// left-biased match, the one traditional backtracking engines
|
||||
// (like Perl and PCRE) find.
|
||||
// Records submatch boundaries in submatch[1..nsubmatch-1].
|
||||
// Submatch[0] is the entire match. When there is a choice in
|
||||
// which text matches each subexpression, the submatch boundaries
|
||||
// are chosen to match what a backtracking implementation would choose.
|
||||
bool Search(absl::string_view text, absl::string_view context, bool anchored,
|
||||
bool longest, absl::string_view* submatch, int nsubmatch);
|
||||
|
||||
private:
|
||||
struct Thread {
|
||||
union {
|
||||
int ref;
|
||||
Thread* next; // when on free list
|
||||
};
|
||||
const char** capture;
|
||||
};
|
||||
|
||||
// State for explicit stack in AddToThreadq.
|
||||
struct AddState {
|
||||
int id; // Inst to process
|
||||
Thread* t; // if not null, set t0 = t before processing id
|
||||
};
|
||||
|
||||
// Threadq is a list of threads. The list is sorted by the order
|
||||
// in which Perl would explore that particular state -- the earlier
|
||||
// choices appear earlier in the list.
|
||||
typedef SparseArray<Thread*> Threadq;
|
||||
|
||||
inline Thread* AllocThread();
|
||||
inline Thread* Incref(Thread* t);
|
||||
inline void Decref(Thread* t);
|
||||
|
||||
// Follows all empty arrows from id0 and enqueues all the states reached.
|
||||
// Enqueues only the ByteRange instructions that match byte c.
|
||||
// context is used (with p) for evaluating empty-width specials.
|
||||
// p is the current input position, and t0 is the current thread.
|
||||
void AddToThreadq(Threadq* q, int id0, int c, absl::string_view context,
|
||||
const char* p, Thread* t0);
|
||||
|
||||
// Run runq on byte c, appending new states to nextq.
|
||||
// Updates matched_ and match_ as new, better matches are found.
|
||||
// context is used (with p) for evaluating empty-width specials.
|
||||
// p is the position of byte c in the input string for AddToThreadq;
|
||||
// p-1 will be used when processing Match instructions.
|
||||
// Frees all the threads on runq.
|
||||
// If there is a shortcut to the end, returns that shortcut.
|
||||
int Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context,
|
||||
const char* p);
|
||||
|
||||
// Returns text version of capture information, for debugging.
|
||||
std::string FormatCapture(const char** capture);
|
||||
|
||||
void CopyCapture(const char** dst, const char** src) {
|
||||
memmove(dst, src, ncapture_*sizeof src[0]);
|
||||
}
|
||||
|
||||
Prog* prog_; // underlying program
|
||||
int start_; // start instruction in program
|
||||
int ncapture_; // number of submatches to track
|
||||
bool longest_; // whether searching for longest match
|
||||
bool endmatch_; // whether match must end at text.end()
|
||||
const char* btext_; // beginning of text (for FormatSubmatch)
|
||||
const char* etext_; // end of text (for endmatch_)
|
||||
Threadq q0_, q1_; // pre-allocated for Search.
|
||||
PODArray<AddState> stack_; // pre-allocated for AddToThreadq
|
||||
std::deque<Thread> arena_; // thread arena
|
||||
Thread* freelist_; // thread freelist
|
||||
const char** match_; // best match so far
|
||||
bool matched_; // any match so far?
|
||||
|
||||
NFA(const NFA&) = delete;
|
||||
NFA& operator=(const NFA&) = delete;
|
||||
};
|
||||
|
||||
// Binds the NFA to prog and preallocates the two thread queues and the
// AddToThreadq work stack.  The search parameters are set to neutral
// values here.
NFA::NFA(Prog* prog)
    : prog_(prog),
      start_(prog->start()),
      ncapture_(0),
      longest_(false),
      endmatch_(false),
      btext_(NULL),
      etext_(NULL),
      freelist_(NULL),
      match_(NULL),
      matched_(false) {
  // Both queues are indexed by instruction id.
  q0_.resize(prog_->size());
  q1_.resize(prog_->size());

  // See NFA::AddToThreadq() for why this is so.
  const int capture_slots = 2*prog_->inst_count(kInstCapture);
  const int empty_slots = prog_->inst_count(kInstEmptyWidth);
  const int nop_slots = prog_->inst_count(kInstNop);
  int nstack = capture_slots + empty_slots + nop_slots + 1;  // + 1 for start inst
  stack_ = PODArray<AddState>(nstack);
}
|
||||
|
||||
// Frees the capture array of every thread in the arena and the
// best-match array.
NFA::~NFA() {
  for (const Thread& thread : arena_) {
    delete[] thread.capture;
  }
  delete[] match_;
}
|
||||
|
||||
// Returns a thread with ref == 1.  A thread from the free list is reused
// when one is available; otherwise a new one is added to the arena with
// a fresh capture array of ncapture_ entries.
NFA::Thread* NFA::AllocThread() {
  if (freelist_ == NULL) {
    arena_.emplace_back();
    Thread* fresh = &arena_.back();
    fresh->ref = 1;
    fresh->capture = new const char*[ncapture_];
    return fresh;
  }

  Thread* reused = freelist_;
  freelist_ = reused->next;
  reused->ref = 1;
  // reused->capture is left as is because
  // the caller will immediately overwrite it.
  return reused;
}
|
||||
|
||||
// Adds one reference to t and returns t.
NFA::Thread* NFA::Incref(Thread* t) {
  ABSL_DCHECK(t != NULL);
  ++t->ref;
  return t;
}
|
||||
|
||||
// Drops one reference to t.  When the count reaches zero the thread is
// pushed onto the free list.
void NFA::Decref(Thread* t) {
  ABSL_DCHECK(t != NULL);
  if (--t->ref > 0)
    return;
  ABSL_DCHECK_EQ(t->ref, 0);
  // The union field now holds the free-list link instead of the count.
  t->next = freelist_;
  freelist_ = t;
}
|
||||
|
||||
// Follows all empty arrows from id0 and enqueues all the states reached.
|
||||
// Enqueues only the ByteRange instructions that match byte c.
|
||||
// context is used (with p) for evaluating empty-width specials.
|
||||
// p is the current input position, and t0 is the current thread.
|
||||
void NFA::AddToThreadq(Threadq* q, int id0, int c, absl::string_view context,
|
||||
const char* p, Thread* t0) {
|
||||
if (id0 == 0)
|
||||
return;
|
||||
|
||||
// Use stack_ to hold our stack of instructions yet to process.
|
||||
// It was preallocated as follows:
|
||||
// two entries per Capture;
|
||||
// one entry per EmptyWidth; and
|
||||
// one entry per Nop.
|
||||
// This reflects the maximum number of stack pushes that each can
|
||||
// perform. (Each instruction can be processed at most once.)
|
||||
AddState* stk = stack_.data();
|
||||
int nstk = 0;
|
||||
|
||||
stk[nstk++] = {id0, NULL};
|
||||
while (nstk > 0) {
|
||||
ABSL_DCHECK_LE(nstk, stack_.size());
|
||||
AddState a = stk[--nstk];
|
||||
|
||||
Loop:
|
||||
if (a.t != NULL) {
|
||||
// t0 was a thread that we allocated and copied in order to
|
||||
// record the capture, so we must now decref it.
|
||||
Decref(t0);
|
||||
t0 = a.t;
|
||||
}
|
||||
|
||||
int id = a.id;
|
||||
if (id == 0)
|
||||
continue;
|
||||
if (q->has_index(id)) {
|
||||
if (ExtraDebug)
|
||||
absl::FPrintF(stderr, " [%d%s]\n", id, FormatCapture(t0->capture));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create entry in q no matter what. We might fill it in below,
|
||||
// or we might not. Even if not, it is necessary to have it,
|
||||
// so that we don't revisit id0 during the recursion.
|
||||
q->set_new(id, NULL);
|
||||
Thread** tp = &q->get_existing(id);
|
||||
int j;
|
||||
Thread* t;
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
ABSL_LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
|
||||
break;
|
||||
|
||||
case kInstFail:
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
// Save state; will pick up at next byte.
|
||||
t = Incref(t0);
|
||||
*tp = t;
|
||||
|
||||
ABSL_DCHECK(!ip->last());
|
||||
a = {id+1, NULL};
|
||||
goto Loop;
|
||||
|
||||
case kInstNop:
|
||||
if (!ip->last())
|
||||
stk[nstk++] = {id+1, NULL};
|
||||
|
||||
// Continue on.
|
||||
a = {ip->out(), NULL};
|
||||
goto Loop;
|
||||
|
||||
case kInstCapture:
|
||||
if (!ip->last())
|
||||
stk[nstk++] = {id+1, NULL};
|
||||
|
||||
if ((j=ip->cap()) < ncapture_) {
|
||||
// Push a dummy whose only job is to restore t0
|
||||
// once we finish exploring this possibility.
|
||||
stk[nstk++] = {0, t0};
|
||||
|
||||
// Record capture.
|
||||
t = AllocThread();
|
||||
CopyCapture(t->capture, t0->capture);
|
||||
t->capture[j] = p;
|
||||
t0 = t;
|
||||
}
|
||||
a = {ip->out(), NULL};
|
||||
goto Loop;
|
||||
|
||||
case kInstByteRange:
|
||||
if (!ip->Matches(c))
|
||||
goto Next;
|
||||
|
||||
// Save state; will pick up at next byte.
|
||||
t = Incref(t0);
|
||||
*tp = t;
|
||||
if (ExtraDebug)
|
||||
absl::FPrintF(stderr, " + %d%s\n", id, FormatCapture(t0->capture));
|
||||
|
||||
if (ip->hint() == 0)
|
||||
break;
|
||||
a = {id+ip->hint(), NULL};
|
||||
goto Loop;
|
||||
|
||||
case kInstMatch:
|
||||
// Save state; will pick up at next byte.
|
||||
t = Incref(t0);
|
||||
*tp = t;
|
||||
if (ExtraDebug)
|
||||
absl::FPrintF(stderr, " ! %d%s\n", id, FormatCapture(t0->capture));
|
||||
|
||||
Next:
|
||||
if (ip->last())
|
||||
break;
|
||||
a = {id+1, NULL};
|
||||
goto Loop;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
if (!ip->last())
|
||||
stk[nstk++] = {id+1, NULL};
|
||||
|
||||
// Continue on if we have all the right flag bits.
|
||||
if (ip->empty() & ~Prog::EmptyFlags(context, p))
|
||||
break;
|
||||
a = {ip->out(), NULL};
|
||||
goto Loop;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Run runq on byte c, appending new states to nextq.
|
||||
// Updates matched_ and match_ as new, better matches are found.
|
||||
// context is used (with p) for evaluating empty-width specials.
|
||||
// p is the position of byte c in the input string for AddToThreadq;
|
||||
// p-1 will be used when processing Match instructions.
|
||||
// Frees all the threads on runq.
|
||||
// If there is a shortcut to the end, returns that shortcut.
|
||||
int NFA::Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context,
|
||||
const char* p) {
|
||||
nextq->clear();
|
||||
|
||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
|
||||
Thread* t = i->value();
|
||||
if (t == NULL)
|
||||
continue;
|
||||
|
||||
if (longest_) {
|
||||
// Can skip any threads started after our current best match.
|
||||
if (matched_ && match_[0] < t->capture[0]) {
|
||||
Decref(t);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
int id = i->index();
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
// Should only see the values handled below.
|
||||
ABSL_LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
|
||||
break;
|
||||
|
||||
case kInstByteRange:
|
||||
AddToThreadq(nextq, ip->out(), c, context, p, t);
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
if (i != runq->begin())
|
||||
break;
|
||||
// The match is ours if we want it.
|
||||
if (ip->greedy(prog_) || longest_) {
|
||||
CopyCapture(match_, t->capture);
|
||||
matched_ = true;
|
||||
|
||||
Decref(t);
|
||||
for (++i; i != runq->end(); ++i) {
|
||||
if (i->value() != NULL)
|
||||
Decref(i->value());
|
||||
}
|
||||
runq->clear();
|
||||
if (ip->greedy(prog_))
|
||||
return ip->out1();
|
||||
return ip->out();
|
||||
}
|
||||
break;
|
||||
|
||||
case kInstMatch: {
|
||||
// Avoid invoking undefined behavior (arithmetic on a null pointer)
|
||||
// by storing p instead of p-1. (What would the latter even mean?!)
|
||||
// This complements the special case in NFA::Search().
|
||||
if (p == NULL) {
|
||||
CopyCapture(match_, t->capture);
|
||||
match_[1] = p;
|
||||
matched_ = true;
|
||||
break;
|
||||
}
|
||||
|
||||
if (endmatch_ && p-1 != etext_)
|
||||
break;
|
||||
|
||||
if (longest_) {
|
||||
// Leftmost-longest mode: save this match only if
|
||||
// it is either farther to the left or at the same
|
||||
// point but longer than an existing match.
|
||||
if (!matched_ || t->capture[0] < match_[0] ||
|
||||
(t->capture[0] == match_[0] && p-1 > match_[1])) {
|
||||
CopyCapture(match_, t->capture);
|
||||
match_[1] = p-1;
|
||||
matched_ = true;
|
||||
}
|
||||
} else {
|
||||
// Leftmost-biased mode: this match is by definition
|
||||
// better than what we've already found (see next line).
|
||||
CopyCapture(match_, t->capture);
|
||||
match_[1] = p-1;
|
||||
matched_ = true;
|
||||
|
||||
// Cut off the threads that can only find matches
|
||||
// worse than the one we just found: don't run the
|
||||
// rest of the current Threadq.
|
||||
Decref(t);
|
||||
for (++i; i != runq->end(); ++i) {
|
||||
if (i->value() != NULL)
|
||||
Decref(i->value());
|
||||
}
|
||||
runq->clear();
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
Decref(t);
|
||||
}
|
||||
runq->clear();
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::string NFA::FormatCapture(const char** capture) {
|
||||
std::string s;
|
||||
for (int i = 0; i < ncapture_; i+=2) {
|
||||
if (capture[i] == NULL)
|
||||
s += "(?,?)";
|
||||
else if (capture[i+1] == NULL)
|
||||
s += absl::StrFormat("(%d,?)",
|
||||
capture[i] - btext_);
|
||||
else
|
||||
s += absl::StrFormat("(%d,%d)",
|
||||
capture[i] - btext_,
|
||||
capture[i+1] - btext_);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
bool NFA::Search(absl::string_view text, absl::string_view context,
|
||||
bool anchored, bool longest, absl::string_view* submatch,
|
||||
int nsubmatch) {
|
||||
if (start_ == 0)
|
||||
return false;
|
||||
|
||||
if (context.data() == NULL)
|
||||
context = text;
|
||||
|
||||
// Sanity check: make sure that text lies within context.
|
||||
if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
|
||||
ABSL_LOG(DFATAL) << "context does not contain text";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (prog_->anchor_start() && BeginPtr(context) != BeginPtr(text))
|
||||
return false;
|
||||
if (prog_->anchor_end() && EndPtr(context) != EndPtr(text))
|
||||
return false;
|
||||
anchored |= prog_->anchor_start();
|
||||
if (prog_->anchor_end()) {
|
||||
longest = true;
|
||||
endmatch_ = true;
|
||||
}
|
||||
|
||||
if (nsubmatch < 0) {
|
||||
ABSL_LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Save search parameters.
|
||||
ncapture_ = 2*nsubmatch;
|
||||
longest_ = longest;
|
||||
|
||||
if (nsubmatch == 0) {
|
||||
// We need to maintain match[0], both to distinguish the
|
||||
// longest match (if longest is true) and also to tell
|
||||
// whether we've seen any matches at all.
|
||||
ncapture_ = 2;
|
||||
}
|
||||
|
||||
match_ = new const char*[ncapture_];
|
||||
memset(match_, 0, ncapture_*sizeof match_[0]);
|
||||
matched_ = false;
|
||||
|
||||
// For debugging prints.
|
||||
btext_ = context.data();
|
||||
// For convenience.
|
||||
etext_ = text.data() + text.size();
|
||||
|
||||
if (ExtraDebug)
|
||||
absl::FPrintF(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
|
||||
text, context, anchored, longest);
|
||||
|
||||
// Set up search.
|
||||
Threadq* runq = &q0_;
|
||||
Threadq* nextq = &q1_;
|
||||
runq->clear();
|
||||
nextq->clear();
|
||||
|
||||
// Loop over the text, stepping the machine.
|
||||
for (const char* p = text.data();; p++) {
|
||||
if (ExtraDebug) {
|
||||
int c = 0;
|
||||
if (p == btext_)
|
||||
c = '^';
|
||||
else if (p > etext_)
|
||||
c = '$';
|
||||
else if (p < etext_)
|
||||
c = p[0] & 0xFF;
|
||||
|
||||
absl::FPrintF(stderr, "%c:", c);
|
||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
|
||||
Thread* t = i->value();
|
||||
if (t == NULL)
|
||||
continue;
|
||||
absl::FPrintF(stderr, " %d%s", i->index(), FormatCapture(t->capture));
|
||||
}
|
||||
absl::FPrintF(stderr, "\n");
|
||||
}
|
||||
|
||||
// This is a no-op the first time around the loop because runq is empty.
|
||||
int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
|
||||
ABSL_DCHECK_EQ(runq->size(), 0);
|
||||
using std::swap;
|
||||
swap(nextq, runq);
|
||||
nextq->clear();
|
||||
if (id != 0) {
|
||||
// We're done: full match ahead.
|
||||
p = etext_;
|
||||
for (;;) {
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
ABSL_LOG(DFATAL) << "Unexpected opcode in short circuit: "
|
||||
<< ip->opcode();
|
||||
break;
|
||||
|
||||
case kInstCapture:
|
||||
if (ip->cap() < ncapture_)
|
||||
match_[ip->cap()] = p;
|
||||
id = ip->out();
|
||||
continue;
|
||||
|
||||
case kInstNop:
|
||||
id = ip->out();
|
||||
continue;
|
||||
|
||||
case kInstMatch:
|
||||
match_[1] = p;
|
||||
matched_ = true;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (p > etext_)
|
||||
break;
|
||||
|
||||
// Start a new thread if there have not been any matches.
|
||||
// (No point in starting a new thread if there have been
|
||||
// matches, since it would be to the right of the match
|
||||
// we already found.)
|
||||
if (!matched_ && (!anchored || p == text.data())) {
|
||||
// Try to use prefix accel (e.g. memchr) to skip ahead.
|
||||
// The search must be unanchored and there must be zero
|
||||
// possible matches already.
|
||||
if (!anchored && runq->size() == 0 &&
|
||||
p < etext_ && prog_->can_prefix_accel()) {
|
||||
p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p));
|
||||
if (p == NULL)
|
||||
p = etext_;
|
||||
}
|
||||
|
||||
Thread* t = AllocThread();
|
||||
CopyCapture(t->capture, match_);
|
||||
t->capture[0] = p;
|
||||
AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p,
|
||||
t);
|
||||
Decref(t);
|
||||
}
|
||||
|
||||
// If all the threads have died, stop early.
|
||||
if (runq->size() == 0) {
|
||||
if (ExtraDebug)
|
||||
absl::FPrintF(stderr, "dead\n");
|
||||
break;
|
||||
}
|
||||
|
||||
// Avoid invoking undefined behavior (arithmetic on a null pointer)
|
||||
// by simply not continuing the loop.
|
||||
// This complements the special case in NFA::Step().
|
||||
if (p == NULL) {
|
||||
(void) Step(runq, nextq, -1, context, p);
|
||||
ABSL_DCHECK_EQ(runq->size(), 0);
|
||||
using std::swap;
|
||||
swap(nextq, runq);
|
||||
nextq->clear();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
|
||||
if (i->value() != NULL)
|
||||
Decref(i->value());
|
||||
}
|
||||
|
||||
if (matched_) {
|
||||
for (int i = 0; i < nsubmatch; i++)
|
||||
submatch[i] = absl::string_view(
|
||||
match_[2 * i],
|
||||
static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
|
||||
if (ExtraDebug)
|
||||
absl::FPrintF(stderr, "match (%d,%d)\n",
|
||||
match_[0] - btext_,
|
||||
match_[1] - btext_);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Runs the NFA engine for this program over text (interpreted within
// context) and stores up to nmatch submatches in match[].
//
// A kFullMatch request is turned into an anchored search; when the caller
// asked for no submatches in that mode, a local scratch slot is used so the
// end of the overall match can still be compared against the end of text.
// Returns true iff a match satisfying the requested kind was found.
bool Prog::SearchNFA(absl::string_view text, absl::string_view context,
                     Anchor anchor, MatchKind kind, absl::string_view* match,
                     int nmatch) {
  if (ExtraDebug) {
    Dump();
  }

  NFA nfa(this);
  absl::string_view whole;  // receives match[0] when the caller passed nmatch == 0
  const bool want_full = (kind == kFullMatch);
  if (want_full) {
    anchor = kAnchored;
    if (nmatch == 0) {
      match = &whole;
      nmatch = 1;
    }
  }

  const bool anchored = (anchor == kAnchored);
  const bool longest = (kind != kFirstMatch);
  if (!nfa.Search(text, context, anchored, longest, match, nmatch)) {
    return false;
  }

  // A full match is only acceptable if it runs to the very end of text.
  return !want_full || EndPtr(match[0]) == EndPtr(text);
}
|
||||
|
||||
// For each instruction i in the program reachable from the start, compute the
|
||||
// number of instructions reachable from i by following only empty transitions
|
||||
// and record that count as fanout[i].
|
||||
//
|
||||
// fanout holds the results and is also the work queue for the outer iteration.
|
||||
// reachable holds the reached nodes for the inner iteration.
|
||||
void Prog::Fanout(SparseArray<int>* fanout) {
|
||||
ABSL_DCHECK_EQ(fanout->max_size(), size());
|
||||
SparseSet reachable(size());
|
||||
fanout->clear();
|
||||
fanout->set_new(start(), 0);
|
||||
for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
|
||||
int* count = &i->value();
|
||||
reachable.clear();
|
||||
reachable.insert(i->index());
|
||||
for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
|
||||
int id = *j;
|
||||
Prog::Inst* ip = inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
ABSL_LOG(DFATAL) << "unhandled " << ip->opcode()
|
||||
<< " in Prog::Fanout()";
|
||||
break;
|
||||
|
||||
case kInstByteRange:
|
||||
if (!ip->last())
|
||||
reachable.insert(id+1);
|
||||
|
||||
(*count)++;
|
||||
if (!fanout->has_index(ip->out())) {
|
||||
fanout->set_new(ip->out(), 0);
|
||||
}
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
ABSL_DCHECK(!ip->last());
|
||||
reachable.insert(id+1);
|
||||
break;
|
||||
|
||||
case kInstCapture:
|
||||
case kInstEmptyWidth:
|
||||
case kInstNop:
|
||||
if (!ip->last())
|
||||
reachable.insert(id+1);
|
||||
|
||||
reachable.insert(ip->out());
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
if (!ip->last())
|
||||
reachable.insert(id+1);
|
||||
break;
|
||||
|
||||
case kInstFail:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
+623
@@ -0,0 +1,623 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Tested by search_test.cc.
|
||||
//
|
||||
// Prog::SearchOnePass is an efficient implementation of
|
||||
// regular expression search with submatch tracking for
|
||||
// what I call "one-pass regular expressions". (An alternate
|
||||
// name might be "backtracking-free regular expressions".)
|
||||
//
|
||||
// One-pass regular expressions have the property that
|
||||
// at each input byte during an anchored match, there may be
|
||||
// multiple alternatives but only one can proceed for any
|
||||
// given input byte.
|
||||
//
|
||||
// For example, the regexp /x*yx*/ is one-pass: you read
|
||||
// x's until a y, then you read the y, then you keep reading x's.
|
||||
// At no point do you have to guess what to do or back up
|
||||
// and try a different guess.
|
||||
//
|
||||
// On the other hand, /x*x/ is not one-pass: when you're
|
||||
// looking at an input "x", it's not clear whether you should
|
||||
// use it to extend the x* or as the final x.
|
||||
//
|
||||
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
|
||||
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
|
||||
//
|
||||
// A simple intuition for identifying one-pass regular expressions
|
||||
// is that it's always immediately obvious when a repetition ends.
|
||||
// It must also be immediately obvious which branch of an | to take:
|
||||
//
|
||||
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
|
||||
//
|
||||
// The NFA-based search in nfa.cc does some bookkeeping to
|
||||
// avoid the need for backtracking and its associated exponential blowup.
|
||||
// But if we have a one-pass regular expression, there is no
|
||||
// possibility of backtracking, so there is no need for the
|
||||
// extra bookkeeping. Hence, this code.
|
||||
//
|
||||
// On a one-pass regular expression, the NFA code in nfa.cc
|
||||
// runs at about 1/20 of the backtracking-based PCRE speed.
|
||||
// In contrast, the code in this file runs at about the same
|
||||
// speed as PCRE.
|
||||
//
|
||||
// One-pass regular expressions get used a lot when RE is
|
||||
// used for parsing simple strings, so it pays off to
|
||||
// notice them and handle them efficiently.
|
||||
//
|
||||
// See also Anne Brüggemann-Klein and Derick Wood,
|
||||
// "One-unambiguous regular languages", Information and Computation 142(2).
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
#include "absl/container/fixed_array.h"
|
||||
#include "absl/container/inlined_vector.h"
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "re2/pod_array.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/sparse_set.h"
|
||||
#include "util/utf.h"
|
||||
|
||||
// Silence "zero-sized array in struct/union" warning for OneState::action.
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable: 4200)
|
||||
#endif
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const bool ExtraDebug = false;
|
||||
|
||||
// The key insight behind this implementation is that the
|
||||
// non-determinism in an NFA for a one-pass regular expression
|
||||
// is contained. To explain what that means, first a
|
||||
// refresher about what regular expression programs look like
|
||||
// and how the usual NFA execution runs.
|
||||
//
|
||||
// In a regular expression program, only the kInstByteRange
|
||||
// instruction processes an input byte c and moves on to the
|
||||
// next byte in the string (it does so if c is in the given range).
|
||||
// The kInstByteRange instructions correspond to literal characters
|
||||
// and character classes in the regular expression.
|
||||
//
|
||||
// The kInstAlt instructions are used as wiring to connect the
|
||||
// kInstByteRange instructions together in interesting ways when
|
||||
// implementing | + and *.
|
||||
// The kInstAlt instruction forks execution, like a goto that
|
||||
// jumps to ip->out() and ip->out1() in parallel. Each of the
|
||||
// resulting computation paths is called a thread.
|
||||
//
|
||||
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
|
||||
// are interesting in their own right but like kInstAlt they don't
|
||||
// advance the input pointer. Only kInstByteRange does.
|
||||
//
|
||||
// The automaton execution in nfa.cc runs all the possible
|
||||
// threads of execution in lock-step over the input. To process
|
||||
// a particular byte, each thread gets run until it either dies
|
||||
// or finds a kInstByteRange instruction matching the byte.
|
||||
// If the latter happens, the thread stops just past the
|
||||
// kInstByteRange instruction (at ip->out()) and waits for
|
||||
// the other threads to finish processing the input byte.
|
||||
// Then, once all the threads have processed that input byte,
|
||||
// the whole process repeats. The kInstAlt state instruction
|
||||
// might create new threads during input processing, but no
|
||||
// matter what, all the threads stop after a kInstByteRange
|
||||
// and wait for the other threads to "catch up".
|
||||
// Running in lock step like this ensures that the NFA reads
|
||||
// the input string only once.
|
||||
//
|
||||
// Each thread maintains its own set of capture registers
|
||||
// (the string positions at which it executed the kInstCapture
|
||||
// instructions corresponding to capturing parentheses in the
|
||||
// regular expression). Repeated copying of the capture registers
|
||||
// is the main performance bottleneck in the NFA implementation.
|
||||
//
|
||||
// A regular expression program is "one-pass" if, no matter what
|
||||
// the input string, there is only one thread that makes it
|
||||
// past a kInstByteRange instruction at each input byte. This means
|
||||
// that there is in some sense only one active thread throughout
|
||||
// the execution. Other threads might be created during the
|
||||
// processing of an input byte, but they are ephemeral: only one
|
||||
// thread is left to start processing the next input byte.
|
||||
// This is what I meant above when I said the non-determinism
|
||||
// was "contained".
|
||||
//
|
||||
// To execute a one-pass regular expression program, we can build
|
||||
// a DFA (no non-determinism) that has at most as many states as
|
||||
// the NFA (compare this to the possibly exponential number of states
|
||||
// in the general case). Each state records, for each possible
|
||||
// input byte, the next state along with the conditions required
|
||||
// before entering that state -- empty-width flags that must be true
|
||||
// and capture operations that must be performed. It also records
|
||||
// the set of conditions required to finish a match at that
|
||||
// point in the input rather than process the next byte.
|
||||
|
||||
// A state in the one-pass NFA - just an array of actions indexed
|
||||
// by the bytemap_[] of the next input byte. (The bytemap
|
||||
// maps next input bytes into equivalence classes, to reduce
|
||||
// the memory footprint.)
|
||||
// Variable-sized record: the code in this file sizes each state as
// sizeof(OneState) + bytemap_range()*sizeof(uint32_t) and addresses states
// through IndexToNode() rather than as an array of OneState.
struct OneState {
|
||||
uint32_t matchcond; // conditions to match right now.
|
||||
// One packed word per byte class; IsOnePass() initializes every entry to
// kImpossible before filling in the transitions it finds.
uint32_t action[];
|
||||
};
|
||||
|
||||
// The uint32_t conditions in the action are a combination of
|
||||
// condition and capture bits and the next state. The bottom 16 bits
|
||||
// are the condition and capture bits, and the top 16 are the index of
|
||||
// the next state.
|
||||
//
|
||||
// Bits 0-5 are the empty-width flags from prog.h.
|
||||
// Bit 6 is kMatchWins, which means the match takes
|
||||
// priority over moving to next in a first-match search.
|
||||
// The remaining bits mark capture registers that should
|
||||
// be set to the current input position. The capture bits
|
||||
// start at index 2, since the search loop can take care of
|
||||
// cap[0], cap[1] (the overall match position).
|
||||
// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
|
||||
// No input position can satisfy both kEmptyWordBoundary
|
||||
// and kEmptyNonWordBoundary, so we can use that as a sentinel
|
||||
// instead of needing an extra bit.
|
||||
|
||||
static const int kIndexShift = 16; // number of bits below index
|
||||
static const int kEmptyShift = 6; // number of empty flags in prog.h
|
||||
static const int kRealCapShift = kEmptyShift + 1;
|
||||
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
|
||||
|
||||
// Parameters used to skip over cap[0], cap[1].
|
||||
static const int kCapShift = kRealCapShift - 2;
|
||||
static const int kMaxCap = kRealMaxCap + 2;
|
||||
|
||||
static const uint32_t kMatchWins = 1 << kEmptyShift;
|
||||
static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
|
||||
|
||||
static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
|
||||
|
||||
// Check, at compile time, that prog.h agrees with math above.
|
||||
// This function is never called.
|
||||
void OnePass_Checks() {
|
||||
static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags,
|
||||
"kEmptyShift disagrees with kEmptyAllFlags");
|
||||
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
|
||||
static_assert(kMaxCap == Prog::kMaxOnePassCapture*2,
|
||||
"kMaxCap disagrees with kMaxOnePassCapture");
|
||||
}
|
||||
|
||||
static bool Satisfy(uint32_t cond, absl::string_view context, const char* p) {
|
||||
uint32_t satisfied = Prog::EmptyFlags(context, p);
|
||||
if (cond & kEmptyAllFlags & ~satisfied)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Apply the capture bits in cond, saving p to the appropriate
|
||||
// locations in cap[].
|
||||
static void ApplyCaptures(uint32_t cond, const char* p,
|
||||
const char** cap, int ncap) {
|
||||
for (int i = 2; i < ncap; i++)
|
||||
if (cond & (1 << kCapShift << i))
|
||||
cap[i] = p;
|
||||
}
|
||||
|
||||
// Computes the OneState* for the given nodeindex.
|
||||
static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
|
||||
int nodeindex) {
|
||||
return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);
|
||||
}
|
||||
|
||||
// Executes the one-pass automaton stored in onepass_nodes_ over text.
//
//   text     the bytes to match against.
//   context  the surrounding text used to evaluate empty-width conditions;
//            when its data pointer is NULL, text is used instead.
//   anchor   must be kAnchored unless kind is kFullMatch.
//   kind     kFirstMatch may stop at an earlier match when kMatchWins is
//            set; kFullMatch only looks for a match at the end of text.
//   match    receives nmatch submatches on success.
//   nmatch   number of entries in match[]; must not exceed kMaxCap / 2.
//
// Returns true iff a match was found. Invalid argument combinations are
// reported through ABSL_LOG(DFATAL) and yield false.
bool Prog::SearchOnePass(absl::string_view text, absl::string_view context,
                         Anchor anchor, MatchKind kind,
                         absl::string_view* match, int nmatch) {
  if (anchor != kAnchored && kind != kFullMatch) {
    ABSL_LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
    return false;
  }

  // cap[] and matchcap[] below are fixed-size arrays of kMaxCap pointers;
  // refuse requests that would index past them instead of corrupting the
  // stack. The comparison is done on nmatch so that 2*nmatch cannot overflow.
  if (nmatch > kMaxCap / 2) {
    ABSL_LOG(DFATAL) << "Cannot use SearchOnePass with nmatch=" << nmatch
                     << " (limit is " << kMaxCap / 2 << ").";
    return false;
  }

  // Make sure we have at least cap[1],
  // because we use it to tell if we matched.
  int ncap = 2*nmatch;
  if (ncap < 2)
    ncap = 2;

  // Capture registers of the thread currently being run.
  const char* cap[kMaxCap];
  for (int i = 0; i < ncap; i++)
    cap[i] = NULL;

  // Capture registers saved at the most recent match.
  const char* matchcap[kMaxCap];
  for (int i = 0; i < ncap; i++)
    matchcap[i] = NULL;

  if (context.data() == NULL)
    context = text;
  if (anchor_start() && BeginPtr(context) != BeginPtr(text))
    return false;
  if (anchor_end() && EndPtr(context) != EndPtr(text))
    return false;
  if (anchor_end())
    kind = kFullMatch;

  uint8_t* nodes = onepass_nodes_.data();
  int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
  // start() is always mapped to the zeroth OneState.
  OneState* state = IndexToNode(nodes, statesize, 0);
  uint8_t* bytemap = bytemap_;
  const char* bp = text.data();
  const char* ep = text.data() + text.size();
  const char* p;
  bool matched = false;
  matchcap[0] = bp;
  cap[0] = bp;
  uint32_t nextmatchcond = state->matchcond;
  for (p = bp; p < ep; p++) {
    int c = bytemap[*p & 0xFF];
    uint32_t matchcond = nextmatchcond;
    uint32_t cond = state->action[c];

    // Determine whether we can take the transition for this byte.
    // If so, advance state and nextmatchcond.
    if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
      uint32_t nextindex = cond >> kIndexShift;
      state = IndexToNode(nodes, statesize, nextindex);
      nextmatchcond = state->matchcond;
    } else {
      state = NULL;
      nextmatchcond = kImpossible;
    }

    // This code section is carefully tuned.
    // The goto sequence is about 10% faster than the
    // obvious rewrite as a large if statement in the
    // ASCIIMatchRE2 and DotMatchRE2 benchmarks.

    // Saving the match capture registers is expensive.
    // Is this intermediate match worth thinking about?

    // Not if we want a full match.
    if (kind == kFullMatch)
      goto skipmatch;

    // Not if it's impossible.
    if (matchcond == kImpossible)
      goto skipmatch;

    // Not if the possible match is beaten by the certain
    // match at the next byte.  When this test is useless
    // (e.g., HTTPPartialMatchRE2) it slows the loop by
    // about 10%, but when it avoids work (e.g., DotMatchRE2),
    // it cuts the loop execution by about 45%.
    if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
      goto skipmatch;

    // Finally, the match conditions must be satisfied.
    if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
      for (int i = 2; i < 2*nmatch; i++)
        matchcap[i] = cap[i];
      if (nmatch > 1 && (matchcond & kCapMask))
        ApplyCaptures(matchcond, p, matchcap, ncap);
      matchcap[1] = p;
      matched = true;

      // If we're in longest match mode, we have to keep
      // going and see if we find a longer match.
      // In first match mode, we can stop if the match
      // takes priority over the next state for this input byte.
      // That bit is per-input byte and thus in cond, not matchcond.
      if (kind == kFirstMatch && (cond & kMatchWins))
        goto done;
    }

  skipmatch:
    // No transition for this byte: the automaton is dead.
    if (state == NULL)
      goto done;
    if ((cond & kCapMask) && nmatch > 1)
      ApplyCaptures(cond, p, cap, ncap);
  }

  // Look for match at end of input.
  // (state is non-NULL here: a dead state leaves the loop via goto done.)
  {
    uint32_t matchcond = state->matchcond;
    if (matchcond != kImpossible &&
        ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
      if (nmatch > 1 && (matchcond & kCapMask))
        ApplyCaptures(matchcond, p, cap, ncap);
      for (int i = 2; i < ncap; i++)
        matchcap[i] = cap[i];
      matchcap[1] = p;
      matched = true;
    }
  }

done:
  if (!matched)
    return false;
  for (int i = 0; i < nmatch; i++)
    match[i] = absl::string_view(
        matchcap[2 * i],
        static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
  return true;
}
|
||||
|
||||
// Analysis to determine whether a given regexp program is one-pass.
|
||||
|
||||
// If ip is not on workq, adds ip to work queue and returns true.
|
||||
// If ip is already on work queue, does nothing and returns false.
|
||||
// If ip is NULL, does nothing and returns true (pretends to add it).
|
||||
typedef SparseSet Instq;
|
||||
static bool AddQ(Instq *q, int id) {
|
||||
if (id == 0)
|
||||
return true;
|
||||
if (q->contains(id))
|
||||
return false;
|
||||
q->insert(id);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Entry of the manual stack used by Prog::IsOnePass() while flooding the
// instruction graph.
struct InstCond {
|
||||
// Instruction id still to be explored.
int id;
|
||||
// Condition bits accumulated on the path that led to id.
uint32_t cond;
|
||||
};
|
||||
|
||||
// Returns whether this is a one-pass program; that is,
|
||||
// returns whether it is safe to use SearchOnePass on this program.
|
||||
// These conditions must be true for any instruction ip:
|
||||
//
|
||||
// (1) for any other Inst nip, there is at most one input-free
|
||||
// path from ip to nip.
|
||||
// (2) there is at most one kInstByte instruction reachable from
|
||||
// ip that matches any particular byte c.
|
||||
// (3) there is at most one input-free path from ip to a kInstMatch
|
||||
// instruction.
|
||||
//
|
||||
// This is actually just a conservative approximation: it might
|
||||
// return false when the answer is true, when kInstEmptyWidth
|
||||
// instructions are involved.
|
||||
// Constructs and saves corresponding one-pass NFA on success.
|
||||
bool Prog::IsOnePass() {
|
||||
if (did_onepass_)
|
||||
return onepass_nodes_.data() != NULL;
|
||||
did_onepass_ = true;
|
||||
|
||||
if (start() == 0) // no match
|
||||
return false;
|
||||
|
||||
// Steal memory for the one-pass NFA from the overall DFA budget.
|
||||
// Willing to use at most 1/4 of the DFA budget (heuristic).
|
||||
// Limit max node count to 65000 as a conservative estimate to
|
||||
// avoid overflowing 16-bit node index in encoding.
|
||||
int maxnodes = 2 + inst_count(kInstByteRange);
|
||||
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
|
||||
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
|
||||
return false;
|
||||
|
||||
// Flood the graph starting at the start state, and check
|
||||
// that in each reachable state, each possible byte leads
|
||||
// to a unique next state.
|
||||
int stacksize = inst_count(kInstCapture) +
|
||||
inst_count(kInstEmptyWidth) +
|
||||
inst_count(kInstNop) + 1; // + 1 for start inst
|
||||
absl::FixedArray<InstCond, 64> stack_storage(stacksize);
|
||||
InstCond* stack = stack_storage.data();
|
||||
|
||||
int size = this->size();
|
||||
absl::FixedArray<int, 128> nodebyid_storage(size, -1); // indexed by ip
|
||||
int* nodebyid = nodebyid_storage.data();
|
||||
|
||||
// Originally, nodes was a uint8_t[maxnodes*statesize], but that was
|
||||
// unnecessarily optimistic: why allocate a large amount of memory
|
||||
// upfront for a large program when it is unlikely to be one-pass?
|
||||
absl::InlinedVector<uint8_t, 2048> nodes;
|
||||
|
||||
Instq tovisit(size), workq(size);
|
||||
AddQ(&tovisit, start());
|
||||
nodebyid[start()] = 0;
|
||||
int nalloc = 1;
|
||||
nodes.insert(nodes.end(), statesize, 0);
|
||||
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
||||
int id = *it;
|
||||
int nodeindex = nodebyid[id];
|
||||
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
|
||||
|
||||
// Flood graph using manual stack, filling in actions as found.
|
||||
// Default is none.
|
||||
for (int b = 0; b < bytemap_range_; b++)
|
||||
node->action[b] = kImpossible;
|
||||
node->matchcond = kImpossible;
|
||||
|
||||
workq.clear();
|
||||
bool matched = false;
|
||||
int nstack = 0;
|
||||
stack[nstack].id = id;
|
||||
stack[nstack++].cond = 0;
|
||||
while (nstack > 0) {
|
||||
int id = stack[--nstack].id;
|
||||
uint32_t cond = stack[nstack].cond;
|
||||
|
||||
Loop:
|
||||
Prog::Inst* ip = inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
ABSL_LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
// TODO(rsc): Ignoring kInstAltMatch optimization.
|
||||
// Should implement it in this engine, but it's subtle.
|
||||
ABSL_DCHECK(!ip->last());
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, id+1))
|
||||
goto fail;
|
||||
id = id+1;
|
||||
goto Loop;
|
||||
|
||||
case kInstByteRange: {
|
||||
int nextindex = nodebyid[ip->out()];
|
||||
if (nextindex == -1) {
|
||||
if (nalloc >= maxnodes) {
|
||||
if (ExtraDebug)
|
||||
ABSL_LOG(ERROR) << absl::StrFormat(
|
||||
"Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
|
||||
goto fail;
|
||||
}
|
||||
nextindex = nalloc;
|
||||
AddQ(&tovisit, ip->out());
|
||||
nodebyid[ip->out()] = nalloc;
|
||||
nalloc++;
|
||||
nodes.insert(nodes.end(), statesize, 0);
|
||||
// Update node because it might have been invalidated.
|
||||
node = IndexToNode(nodes.data(), statesize, nodeindex);
|
||||
}
|
||||
for (int c = ip->lo(); c <= ip->hi(); c++) {
|
||||
int b = bytemap_[c];
|
||||
// Skip any bytes immediately after c that are also in b.
|
||||
while (c < 256-1 && bytemap_[c+1] == b)
|
||||
c++;
|
||||
uint32_t act = node->action[b];
|
||||
uint32_t newact = (nextindex << kIndexShift) | cond;
|
||||
if (matched)
|
||||
newact |= kMatchWins;
|
||||
if ((act & kImpossible) == kImpossible) {
|
||||
node->action[b] = newact;
|
||||
} else if (act != newact) {
|
||||
if (ExtraDebug)
|
||||
ABSL_LOG(ERROR) << absl::StrFormat(
|
||||
"Not OnePass: conflict on byte %#x at state %d", c, *it);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
if (ip->foldcase()) {
|
||||
Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';
|
||||
Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';
|
||||
for (int c = lo; c <= hi; c++) {
|
||||
int b = bytemap_[c];
|
||||
// Skip any bytes immediately after c that are also in b.
|
||||
while (c < 256-1 && bytemap_[c+1] == b)
|
||||
c++;
|
||||
uint32_t act = node->action[b];
|
||||
uint32_t newact = (nextindex << kIndexShift) | cond;
|
||||
if (matched)
|
||||
newact |= kMatchWins;
|
||||
if ((act & kImpossible) == kImpossible) {
|
||||
node->action[b] = newact;
|
||||
} else if (act != newact) {
|
||||
if (ExtraDebug)
|
||||
ABSL_LOG(ERROR) << absl::StrFormat(
|
||||
"Not OnePass: conflict on byte %#x at state %d", c, *it);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ip->last())
|
||||
break;
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, id+1))
|
||||
goto fail;
|
||||
id = id+1;
|
||||
goto Loop;
|
||||
}
|
||||
|
||||
case kInstCapture:
|
||||
case kInstEmptyWidth:
|
||||
case kInstNop:
|
||||
if (!ip->last()) {
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, id+1))
|
||||
goto fail;
|
||||
stack[nstack].id = id+1;
|
||||
stack[nstack++].cond = cond;
|
||||
}
|
||||
|
||||
if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
|
||||
cond |= (1 << kCapShift) << ip->cap();
|
||||
if (ip->opcode() == kInstEmptyWidth)
|
||||
cond |= ip->empty();
|
||||
|
||||
// kInstCapture and kInstNop always proceed to ip->out().
|
||||
// kInstEmptyWidth only sometimes proceeds to ip->out(),
|
||||
// but as a conservative approximation we assume it always does.
|
||||
// We could be a little more precise by looking at what c
|
||||
// is, but that seems like overkill.
|
||||
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, ip->out())) {
|
||||
if (ExtraDebug)
|
||||
ABSL_LOG(ERROR) << absl::StrFormat(
|
||||
"Not OnePass: multiple paths %d -> %d", *it, ip->out());
|
||||
goto fail;
|
||||
}
|
||||
id = ip->out();
|
||||
goto Loop;
|
||||
|
||||
case kInstMatch:
|
||||
if (matched) {
|
||||
// (3) is violated
|
||||
if (ExtraDebug)
|
||||
ABSL_LOG(ERROR) << absl::StrFormat(
|
||||
"Not OnePass: multiple matches from %d", *it);
|
||||
goto fail;
|
||||
}
|
||||
matched = true;
|
||||
node->matchcond = cond;
|
||||
|
||||
if (ip->last())
|
||||
break;
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, id+1))
|
||||
goto fail;
|
||||
id = id+1;
|
||||
goto Loop;
|
||||
|
||||
case kInstFail:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ExtraDebug) { // For debugging, dump one-pass NFA to ABSL_LOG(ERROR).
|
||||
ABSL_LOG(ERROR) << "bytemap:\n" << DumpByteMap();
|
||||
ABSL_LOG(ERROR) << "prog:\n" << Dump();
|
||||
|
||||
std::map<int, int> idmap;
|
||||
for (int i = 0; i < size; i++)
|
||||
if (nodebyid[i] != -1)
|
||||
idmap[nodebyid[i]] = i;
|
||||
|
||||
std::string dump;
|
||||
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
||||
int id = *it;
|
||||
int nodeindex = nodebyid[id];
|
||||
if (nodeindex == -1)
|
||||
continue;
|
||||
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
|
||||
dump += absl::StrFormat("node %d id=%d: matchcond=%#x\n",
|
||||
nodeindex, id, node->matchcond);
|
||||
for (int i = 0; i < bytemap_range_; i++) {
|
||||
if ((node->action[i] & kImpossible) == kImpossible)
|
||||
continue;
|
||||
dump += absl::StrFormat(" %d cond %#x -> %d id=%d\n",
|
||||
i, node->action[i] & 0xFFFF,
|
||||
node->action[i] >> kIndexShift,
|
||||
idmap[node->action[i] >> kIndexShift]);
|
||||
}
|
||||
}
|
||||
ABSL_LOG(ERROR) << "nodes:\n" << dump;
|
||||
}
|
||||
|
||||
dfa_mem_ -= nalloc*statesize;
|
||||
onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize);
|
||||
memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize);
|
||||
return true;
|
||||
|
||||
fail:
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
+2530
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,119 @@
|
||||
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
|
||||
// make_perl_groups.pl >perl_groups.cc
|
||||
|
||||
#include "re2/unicode_groups.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const URange16 code1[] = { /* \d */
|
||||
{ 0x30, 0x39 },
|
||||
};
|
||||
static const URange16 code2[] = { /* \s */
|
||||
{ 0x9, 0xa },
|
||||
{ 0xc, 0xd },
|
||||
{ 0x20, 0x20 },
|
||||
};
|
||||
static const URange16 code3[] = { /* \w */
|
||||
{ 0x30, 0x39 },
|
||||
{ 0x41, 0x5a },
|
||||
{ 0x5f, 0x5f },
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
const UGroup perl_groups[] = {
|
||||
{ "\\d", +1, code1, 1, 0, 0 },
|
||||
{ "\\D", -1, code1, 1, 0, 0 },
|
||||
{ "\\s", +1, code2, 3, 0, 0 },
|
||||
{ "\\S", -1, code2, 3, 0, 0 },
|
||||
{ "\\w", +1, code3, 4, 0, 0 },
|
||||
{ "\\W", -1, code3, 4, 0, 0 },
|
||||
};
|
||||
const int num_perl_groups = 6;
|
||||
static const URange16 code4[] = { /* [:alnum:] */
|
||||
{ 0x30, 0x39 },
|
||||
{ 0x41, 0x5a },
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
static const URange16 code5[] = { /* [:alpha:] */
|
||||
{ 0x41, 0x5a },
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
static const URange16 code6[] = { /* [:ascii:] */
|
||||
{ 0x0, 0x7f },
|
||||
};
|
||||
static const URange16 code7[] = { /* [:blank:] */
|
||||
{ 0x9, 0x9 },
|
||||
{ 0x20, 0x20 },
|
||||
};
|
||||
static const URange16 code8[] = { /* [:cntrl:] */
|
||||
{ 0x0, 0x1f },
|
||||
{ 0x7f, 0x7f },
|
||||
};
|
||||
static const URange16 code9[] = { /* [:digit:] */
|
||||
{ 0x30, 0x39 },
|
||||
};
|
||||
static const URange16 code10[] = { /* [:graph:] */
|
||||
{ 0x21, 0x7e },
|
||||
};
|
||||
static const URange16 code11[] = { /* [:lower:] */
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
static const URange16 code12[] = { /* [:print:] */
|
||||
{ 0x20, 0x7e },
|
||||
};
|
||||
static const URange16 code13[] = { /* [:punct:] */
|
||||
{ 0x21, 0x2f },
|
||||
{ 0x3a, 0x40 },
|
||||
{ 0x5b, 0x60 },
|
||||
{ 0x7b, 0x7e },
|
||||
};
|
||||
static const URange16 code14[] = { /* [:space:] */
|
||||
{ 0x9, 0xd },
|
||||
{ 0x20, 0x20 },
|
||||
};
|
||||
static const URange16 code15[] = { /* [:upper:] */
|
||||
{ 0x41, 0x5a },
|
||||
};
|
||||
static const URange16 code16[] = { /* [:word:] */
|
||||
{ 0x30, 0x39 },
|
||||
{ 0x41, 0x5a },
|
||||
{ 0x5f, 0x5f },
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
static const URange16 code17[] = { /* [:xdigit:] */
|
||||
{ 0x30, 0x39 },
|
||||
{ 0x41, 0x46 },
|
||||
{ 0x61, 0x66 },
|
||||
};
|
||||
const UGroup posix_groups[] = {
|
||||
{ "[:alnum:]", +1, code4, 3, 0, 0 },
|
||||
{ "[:^alnum:]", -1, code4, 3, 0, 0 },
|
||||
{ "[:alpha:]", +1, code5, 2, 0, 0 },
|
||||
{ "[:^alpha:]", -1, code5, 2, 0, 0 },
|
||||
{ "[:ascii:]", +1, code6, 1, 0, 0 },
|
||||
{ "[:^ascii:]", -1, code6, 1, 0, 0 },
|
||||
{ "[:blank:]", +1, code7, 2, 0, 0 },
|
||||
{ "[:^blank:]", -1, code7, 2, 0, 0 },
|
||||
{ "[:cntrl:]", +1, code8, 2, 0, 0 },
|
||||
{ "[:^cntrl:]", -1, code8, 2, 0, 0 },
|
||||
{ "[:digit:]", +1, code9, 1, 0, 0 },
|
||||
{ "[:^digit:]", -1, code9, 1, 0, 0 },
|
||||
{ "[:graph:]", +1, code10, 1, 0, 0 },
|
||||
{ "[:^graph:]", -1, code10, 1, 0, 0 },
|
||||
{ "[:lower:]", +1, code11, 1, 0, 0 },
|
||||
{ "[:^lower:]", -1, code11, 1, 0, 0 },
|
||||
{ "[:print:]", +1, code12, 1, 0, 0 },
|
||||
{ "[:^print:]", -1, code12, 1, 0, 0 },
|
||||
{ "[:punct:]", +1, code13, 4, 0, 0 },
|
||||
{ "[:^punct:]", -1, code13, 4, 0, 0 },
|
||||
{ "[:space:]", +1, code14, 2, 0, 0 },
|
||||
{ "[:^space:]", -1, code14, 2, 0, 0 },
|
||||
{ "[:upper:]", +1, code15, 1, 0, 0 },
|
||||
{ "[:^upper:]", -1, code15, 1, 0, 0 },
|
||||
{ "[:word:]", +1, code16, 4, 0, 0 },
|
||||
{ "[:^word:]", -1, code16, 4, 0, 0 },
|
||||
{ "[:xdigit:]", +1, code17, 3, 0, 0 },
|
||||
{ "[:^xdigit:]", -1, code17, 3, 0, 0 },
|
||||
};
|
||||
const int num_posix_groups = 28;
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,55 @@
|
||||
// Copyright 2018 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_POD_ARRAY_H_
|
||||
#define RE2_POD_ARRAY_H_
|
||||
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Fixed-length heap array for POD element types.
// Storage is obtained from std::allocator<T> and is released when the
// PODArray is destroyed. Allocation does not construct elements, so an
// element must be written before it is read.
template <typename T>
class PODArray {
 public:
  static_assert(std::is_trivial<T>::value && std::is_standard_layout<T>::value,
                "T must be POD");

  // Creates an empty array: data() is null and size() is 0.
  PODArray() : ptr_() {}

  // Allocates storage for len elements.
  explicit PODArray(int len)
      : ptr_(Alloc().allocate(len), Deleter(len)) {}

  // Pointer to the first element (null for a default-constructed array).
  T* data() const { return ptr_.get(); }

  // Number of elements that were requested at construction.
  int size() const { return ptr_.get_deleter().len_; }

  // Unchecked element access.
  T& operator[](int pos) const { return ptr_[pos]; }

 private:
  using Alloc = std::allocator<T>;

  // Releases the storage. The length travels with the deleter because
  // std::allocator<T>::deallocate() needs it.
  struct Deleter {
    Deleter() : len_(0) {}
    explicit Deleter(int len) : len_(len) {}

    void operator()(T* ptr) const { Alloc().deallocate(ptr, len_); }

    int len_;
  };

  std::unique_ptr<T[], Deleter> ptr_;
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_POD_ARRAY_H_
|
||||
@@ -0,0 +1,711 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re2/prefilter.h"
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/unicode_casefold.h"
|
||||
#include "re2/walker-inl.h"
|
||||
#include "util/utf.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const bool ExtraDebug = false;
|
||||
|
||||
// Initializes a Prefilter, allocating subs_ as necessary.
|
||||
Prefilter::Prefilter(Op op) {
|
||||
op_ = op;
|
||||
subs_ = NULL;
|
||||
if (op_ == AND || op_ == OR)
|
||||
subs_ = new std::vector<Prefilter*>;
|
||||
}
|
||||
|
||||
// Destroys a Prefilter.
|
||||
Prefilter::~Prefilter() {
|
||||
if (subs_) {
|
||||
for (size_t i = 0; i < subs_->size(); i++)
|
||||
delete (*subs_)[i];
|
||||
delete subs_;
|
||||
subs_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Simplify if the node is an empty Or or And.
|
||||
Prefilter* Prefilter::Simplify() {
|
||||
if (op_ != AND && op_ != OR) {
|
||||
return this;
|
||||
}
|
||||
|
||||
// Nothing left in the AND/OR.
|
||||
if (subs_->empty()) {
|
||||
if (op_ == AND)
|
||||
op_ = ALL; // AND of nothing is true
|
||||
else
|
||||
op_ = NONE; // OR of nothing is false
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
// Just one subnode: throw away wrapper.
|
||||
if (subs_->size() == 1) {
|
||||
Prefilter* a = (*subs_)[0];
|
||||
subs_->clear();
|
||||
delete this;
|
||||
return a->Simplify();
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
// Combines two Prefilters together to create an "op" (AND or OR).
|
||||
// The passed Prefilters will be part of the returned Prefilter or deleted.
|
||||
// Does lots of work to avoid creating unnecessarily complicated structures.
|
||||
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
|
||||
// If a, b can be rewritten as op, do so.
|
||||
a = a->Simplify();
|
||||
b = b->Simplify();
|
||||
|
||||
// Canonicalize: a->op <= b->op.
|
||||
if (a->op() > b->op()) {
|
||||
Prefilter* t = a;
|
||||
a = b;
|
||||
b = t;
|
||||
}
|
||||
|
||||
// Trivial cases.
|
||||
// ALL AND b = b
|
||||
// NONE OR b = b
|
||||
// ALL OR b = ALL
|
||||
// NONE AND b = NONE
|
||||
// Don't need to look at b, because of canonicalization above.
|
||||
// ALL and NONE are smallest opcodes.
|
||||
if (a->op() == ALL || a->op() == NONE) {
|
||||
if ((a->op() == ALL && op == AND) ||
|
||||
(a->op() == NONE && op == OR)) {
|
||||
delete a;
|
||||
return b;
|
||||
} else {
|
||||
delete b;
|
||||
return a;
|
||||
}
|
||||
}
|
||||
|
||||
// If a and b match op, merge their contents.
|
||||
if (a->op() == op && b->op() == op) {
|
||||
for (size_t i = 0; i < b->subs()->size(); i++) {
|
||||
Prefilter* bb = (*b->subs())[i];
|
||||
a->subs()->push_back(bb);
|
||||
}
|
||||
b->subs()->clear();
|
||||
delete b;
|
||||
return a;
|
||||
}
|
||||
|
||||
// If a already has the same op as the op that is under construction
|
||||
// add in b (similarly if b already has the same op, add in a).
|
||||
if (b->op() == op) {
|
||||
Prefilter* t = a;
|
||||
a = b;
|
||||
b = t;
|
||||
}
|
||||
if (a->op() == op) {
|
||||
a->subs()->push_back(b);
|
||||
return a;
|
||||
}
|
||||
|
||||
// Otherwise just return the op.
|
||||
Prefilter* c = new Prefilter(op);
|
||||
c->subs()->push_back(a);
|
||||
c->subs()->push_back(b);
|
||||
return c;
|
||||
}
|
||||
|
||||
// Conjunction of a and b; see AndOr() for what happens to the arguments.
Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
  return Prefilter::AndOr(Prefilter::AND, a, b);
}
|
||||
|
||||
// Disjunction of a and b; see AndOr() for what happens to the arguments.
Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
  return Prefilter::AndOr(Prefilter::OR, a, b);
}
|
||||
|
||||
// Removes redundant strings from *ss.
//
// If "ab" is a required string, also knowing that "abc" is required adds
// nothing: a string search that hits "ab" already makes the regexp a
// candidate, so "abc" is erased. The empty string is skipped because
// find() would locate it inside every other string and everything would
// be erased.
//
// The SSet orders strings by length and then lexicographically, and its
// elements are unique, so only later (longer) elements can contain the
// current one and the size test lets us skip most find() calls.
void Prefilter::SimplifyStringSet(SSet* ss) {
  SSIter shorter = ss->begin();
  if (shorter != ss->end() && shorter->empty())
    ++shorter;

  while (shorter != ss->end()) {
    SSIter longer = shorter;
    for (++longer; longer != ss->end();) {
      const bool redundant = longer->size() > shorter->size() &&
                             longer->find(*shorter) != std::string::npos;
      if (redundant)
        longer = ss->erase(longer);
      else
        ++longer;
    }
    ++shorter;
  }
}
|
||||
|
||||
// Builds an OR over the strings in *ss after removing redundant entries
// from it (the set is modified in place). An empty set yields NONE.
Prefilter* Prefilter::OrStrings(SSet* ss) {
  SimplifyStringSet(ss);

  Prefilter* result = new Prefilter(NONE);
  for (const std::string& s : *ss)
    result = Or(result, FromString(s));
  return result;
}
|
||||
|
||||
// Lower-cases a rune. Runes below Runeself take a fast path that only
// maps 'A'..'Z'; everything else goes through the unicode_tolower table.
static Rune ToLowerRune(Rune r) {
  if (r >= Runeself) {
    const CaseFold* fold =
        LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
    // No table entry applies to r: leave it unchanged.
    if (fold == nullptr || r < fold->lo)
      return r;
    return ApplyFold(fold, r);
  }

  const bool is_upper = ('A' <= r && r <= 'Z');
  return is_upper ? r + ('a' - 'A') : r;
}
|
||||
|
||||
// Latin-1 variant of ToLowerRune: only 'A'..'Z' are mapped to lower case;
// every other value is returned unchanged.
static Rune ToLowerRuneLatin1(Rune r) {
  const bool is_upper = (r >= 'A' && r <= 'Z');
  return is_upper ? r + ('a' - 'A') : r;
}
|
||||
|
||||
// Returns a newly allocated ATOM node whose atom is a copy of str.
Prefilter* Prefilter::FromString(const std::string& str) {
  Prefilter* atom = new Prefilter(ATOM);
  atom->atom_ = str;
  return atom;
}
|
||||
|
||||
// Information about a regexp used during computation of Prefilter.
|
||||
// Can be thought of as information about the set of strings matching
|
||||
// the given regular expression.
|
||||
class Prefilter::Info {
|
||||
public:
|
||||
Info();
|
||||
~Info();
|
||||
|
||||
// More constructors. They delete their Info* arguments.
|
||||
static Info* Alt(Info* a, Info* b);
|
||||
static Info* Concat(Info* a, Info* b);
|
||||
static Info* And(Info* a, Info* b);
|
||||
static Info* Star(Info* a);
|
||||
static Info* Plus(Info* a);
|
||||
static Info* Quest(Info* a);
|
||||
static Info* EmptyString();
|
||||
static Info* NoMatch();
|
||||
static Info* AnyCharOrAnyByte();
|
||||
static Info* CClass(CharClass* cc, bool latin1);
|
||||
static Info* Literal(Rune r);
|
||||
static Info* LiteralLatin1(Rune r);
|
||||
static Info* AnyMatch();
|
||||
|
||||
// Format Info as a string.
|
||||
std::string ToString();
|
||||
|
||||
// Caller takes ownership of the Prefilter.
|
||||
Prefilter* TakeMatch();
|
||||
|
||||
SSet& exact() { return exact_; }
|
||||
|
||||
bool is_exact() const { return is_exact_; }
|
||||
|
||||
class Walker;
|
||||
|
||||
private:
|
||||
SSet exact_;
|
||||
|
||||
// When is_exact_ is true, the strings that match
|
||||
// are placed in exact_. When it is no longer an exact
|
||||
// set of strings that match this RE, then is_exact_
|
||||
// is false and the match_ contains the required match
|
||||
// criteria.
|
||||
bool is_exact_;
|
||||
|
||||
// Accumulated Prefilter query that any
|
||||
// match for this regexp is guaranteed to match.
|
||||
Prefilter* match_;
|
||||
};
|
||||
|
||||
|
||||
// A fresh Info is inexact and carries no match query.
Prefilter::Info::Info() : is_exact_(false), match_(nullptr) {}
|
||||
|
||||
// Releases the match query if TakeMatch() has not already handed it out.
Prefilter::Info::~Info() {
  delete this->match_;
}
|
||||
|
||||
// Transfers the match query to the caller. An exact Info is first turned
// into an OR over its strings and stops being exact. Afterwards this
// Info no longer holds a query.
Prefilter* Prefilter::Info::TakeMatch() {
  if (is_exact_) {
    is_exact_ = false;
    match_ = Prefilter::OrStrings(&exact_);
  }

  Prefilter* taken = match_;
  match_ = nullptr;
  return taken;
}
|
||||
|
||||
// Format a Info in string form.
|
||||
std::string Prefilter::Info::ToString() {
|
||||
if (is_exact_) {
|
||||
int n = 0;
|
||||
std::string s;
|
||||
for (SSIter i = exact_.begin(); i != exact_.end(); ++i) {
|
||||
if (n++ > 0)
|
||||
s += ",";
|
||||
s += *i;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
if (match_)
|
||||
return match_->DebugString();
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
void Prefilter::CrossProduct(const SSet& a, const SSet& b, SSet* dst) {
|
||||
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
|
||||
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
|
||||
dst->insert(*i + *j);
|
||||
}
|
||||
|
||||
// Concats a and b. Requires that both are exact sets.
|
||||
// Forms an exact set that is a crossproduct of a and b.
|
||||
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
|
||||
if (a == NULL)
|
||||
return b;
|
||||
ABSL_DCHECK(a->is_exact_);
|
||||
ABSL_DCHECK(b && b->is_exact_);
|
||||
Info *ab = new Info();
|
||||
|
||||
CrossProduct(a->exact_, b->exact_, &ab->exact_);
|
||||
ab->is_exact_ = true;
|
||||
|
||||
delete a;
|
||||
delete b;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Constructs an inexact Info for ab given a and b.
|
||||
// Used only when a or b is not exact or when the
|
||||
// exact cross product is likely to be too big.
|
||||
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
|
||||
if (a == NULL)
|
||||
return b;
|
||||
if (b == NULL)
|
||||
return a;
|
||||
|
||||
Info *ab = new Info();
|
||||
|
||||
ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
|
||||
ab->is_exact_ = false;
|
||||
delete a;
|
||||
delete b;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Constructs Info for a|b given a and b.
|
||||
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
|
||||
Info *ab = new Info();
|
||||
|
||||
if (a->is_exact_ && b->is_exact_) {
|
||||
// Avoid string copies by moving the larger exact_ set into
|
||||
// ab directly, then merge in the smaller set.
|
||||
if (a->exact_.size() < b->exact_.size()) {
|
||||
using std::swap;
|
||||
swap(a, b);
|
||||
}
|
||||
ab->exact_ = std::move(a->exact_);
|
||||
ab->exact_.insert(b->exact_.begin(), b->exact_.end());
|
||||
ab->is_exact_ = true;
|
||||
} else {
|
||||
// Either a or b has is_exact_ = false. If the other
|
||||
// one has is_exact_ = true, we move it to match_ and
|
||||
// then create a OR of a,b. The resulting Info has
|
||||
// is_exact_ = false.
|
||||
ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
|
||||
ab->is_exact_ = false;
|
||||
}
|
||||
|
||||
delete a;
|
||||
delete b;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Constructs Info for a? given a.
|
||||
Prefilter::Info* Prefilter::Info::Quest(Info *a) {
|
||||
Info *ab = new Info();
|
||||
|
||||
ab->is_exact_ = false;
|
||||
ab->match_ = new Prefilter(ALL);
|
||||
delete a;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Constructs Info for a* given a.
|
||||
// Same as a? -- not much to do.
|
||||
Prefilter::Info* Prefilter::Info::Star(Info *a) {
|
||||
return Quest(a);
|
||||
}
|
||||
|
||||
// Constructs Info for a+ given a. If a was exact set, it isn't
|
||||
// anymore.
|
||||
Prefilter::Info* Prefilter::Info::Plus(Info *a) {
|
||||
Info *ab = new Info();
|
||||
|
||||
ab->match_ = a->TakeMatch();
|
||||
ab->is_exact_ = false;
|
||||
|
||||
delete a;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Returns the bytes that runetochar() produces for r.
static std::string RuneToString(Rune r) {
  char encoded[UTFmax];
  const int len = runetochar(encoded, &r);
  return std::string(encoded, encoded + len);
}
|
||||
|
||||
// Returns a one-byte string holding the low 8 bits of r.
static std::string RuneToStringLatin1(Rune r) {
  const char byte = static_cast<char>(r & 0xff);
  return std::string(1, byte);
}
|
||||
|
||||
// Constructs Info for literal rune.
|
||||
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
|
||||
Info* info = new Info();
|
||||
info->exact_.insert(RuneToString(ToLowerRune(r)));
|
||||
info->is_exact_ = true;
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Info for literal rune for Latin1 encoded string.
|
||||
Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
|
||||
Info* info = new Info();
|
||||
info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
|
||||
info->is_exact_ = true;
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Info for dot (any character) or \C (any byte).
|
||||
Prefilter::Info* Prefilter::Info::AnyCharOrAnyByte() {
|
||||
Prefilter::Info* info = new Prefilter::Info();
|
||||
info->match_ = new Prefilter(ALL);
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Prefilter::Info for no possible match.
|
||||
Prefilter::Info* Prefilter::Info::NoMatch() {
|
||||
Prefilter::Info* info = new Prefilter::Info();
|
||||
info->match_ = new Prefilter(NONE);
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Prefilter::Info for any possible match.
|
||||
// This Prefilter::Info is valid for any regular expression,
|
||||
// since it makes no assertions whatsoever about the
|
||||
// strings being matched.
|
||||
Prefilter::Info* Prefilter::Info::AnyMatch() {
|
||||
Prefilter::Info *info = new Prefilter::Info();
|
||||
info->match_ = new Prefilter(ALL);
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Prefilter::Info for just the empty string.
|
||||
Prefilter::Info* Prefilter::Info::EmptyString() {
|
||||
Prefilter::Info* info = new Prefilter::Info();
|
||||
info->is_exact_ = true;
|
||||
info->exact_.insert("");
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Prefilter::Info for a character class.
|
||||
typedef CharClass::iterator CCIter;
|
||||
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
|
||||
bool latin1) {
|
||||
if (ExtraDebug) {
|
||||
ABSL_LOG(ERROR) << "CharClassInfo:";
|
||||
for (CCIter i = cc->begin(); i != cc->end(); ++i)
|
||||
ABSL_LOG(ERROR) << " " << i->lo << "-" << i->hi;
|
||||
}
|
||||
|
||||
// If the class is too large, it's okay to overestimate.
|
||||
if (cc->size() > 10)
|
||||
return AnyCharOrAnyByte();
|
||||
|
||||
Prefilter::Info *a = new Prefilter::Info();
|
||||
for (CCIter i = cc->begin(); i != cc->end(); ++i)
|
||||
for (Rune r = i->lo; r <= i->hi; r++) {
|
||||
if (latin1) {
|
||||
a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
|
||||
} else {
|
||||
a->exact_.insert(RuneToString(ToLowerRune(r)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
a->is_exact_ = true;
|
||||
|
||||
if (ExtraDebug)
|
||||
ABSL_LOG(ERROR) << " = " << a->ToString();
|
||||
|
||||
return a;
|
||||
}
|
||||
|
||||
// Regexp walker that computes a Prefilter::Info for each node.
// The latin1 flag chosen at construction selects how literals and
// character classes are converted to strings.
class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
 public:
  Walker(bool latin1) : latin1_(latin1) {}

  // Combines the children's Infos into the Info for re.
  virtual Info* PostVisit(Regexp* re,
                          Info* parent_arg,
                          Info* pre_arg,
                          Info** child_args,
                          int nchild_args);

  // Produces the Info for a node that is not walked in full.
  virtual Info* ShortVisit(Regexp* re, Info* parent_arg);

  bool latin1() { return latin1_; }

 private:
  // Not copyable.
  Walker(const Walker&) = delete;
  Walker& operator=(const Walker&) = delete;

  bool latin1_;
};
|
||||
|
||||
// Walks re and returns the Info computed for it, which the caller owns.
// Returns null if the walk stopped early (the visit budget passed to
// WalkExponential is 100000).
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
  if (ExtraDebug)
    ABSL_LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();

  const bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
  Prefilter::Info::Walker walker(latin1);
  Prefilter::Info* info = walker.WalkExponential(re, nullptr, 100000);

  if (!walker.stopped_early())
    return info;

  // The walk was cut short: discard whatever it produced.
  delete info;
  return nullptr;
}
|
||||
|
||||
// Returns AnyMatch(), the Info that is valid for every regexp. Neither
// argument is used.
Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
    Regexp* re, Prefilter::Info* parent_arg) {
  return Prefilter::Info::AnyMatch();
}
|
||||
|
||||
// Constructs the Prefilter::Info for the given regular expression.
|
||||
// Assumes re is simplified.
|
||||
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
|
||||
Regexp* re, Prefilter::Info* parent_arg,
|
||||
Prefilter::Info* pre_arg, Prefilter::Info** child_args,
|
||||
int nchild_args) {
|
||||
Prefilter::Info *info;
|
||||
switch (re->op()) {
|
||||
default:
|
||||
case kRegexpRepeat:
|
||||
info = EmptyString();
|
||||
ABSL_LOG(DFATAL) << "Bad regexp op " << re->op();
|
||||
break;
|
||||
|
||||
case kRegexpNoMatch:
|
||||
info = NoMatch();
|
||||
break;
|
||||
|
||||
// These ops match the empty string:
|
||||
case kRegexpEmptyMatch: // anywhere
|
||||
case kRegexpBeginLine: // at beginning of line
|
||||
case kRegexpEndLine: // at end of line
|
||||
case kRegexpBeginText: // at beginning of text
|
||||
case kRegexpEndText: // at end of text
|
||||
case kRegexpWordBoundary: // at word boundary
|
||||
case kRegexpNoWordBoundary: // not at word boundary
|
||||
info = EmptyString();
|
||||
break;
|
||||
|
||||
case kRegexpLiteral:
|
||||
if (latin1()) {
|
||||
info = LiteralLatin1(re->rune());
|
||||
}
|
||||
else {
|
||||
info = Literal(re->rune());
|
||||
}
|
||||
break;
|
||||
|
||||
case kRegexpLiteralString:
|
||||
if (re->nrunes() == 0) {
|
||||
info = NoMatch();
|
||||
break;
|
||||
}
|
||||
if (latin1()) {
|
||||
info = LiteralLatin1(re->runes()[0]);
|
||||
for (int i = 1; i < re->nrunes(); i++) {
|
||||
info = Concat(info, LiteralLatin1(re->runes()[i]));
|
||||
}
|
||||
} else {
|
||||
info = Literal(re->runes()[0]);
|
||||
for (int i = 1; i < re->nrunes(); i++) {
|
||||
info = Concat(info, Literal(re->runes()[i]));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case kRegexpConcat: {
|
||||
// Accumulate in info.
|
||||
// Exact is concat of recent contiguous exact nodes.
|
||||
info = NULL;
|
||||
Info* exact = NULL;
|
||||
for (int i = 0; i < nchild_args; i++) {
|
||||
Info* ci = child_args[i]; // child info
|
||||
if (!ci->is_exact() ||
|
||||
(exact && ci->exact().size() * exact->exact().size() > 16)) {
|
||||
// Exact run is over.
|
||||
info = And(info, exact);
|
||||
exact = NULL;
|
||||
// Add this child's info.
|
||||
info = And(info, ci);
|
||||
} else {
|
||||
// Append to exact run.
|
||||
exact = Concat(exact, ci);
|
||||
}
|
||||
}
|
||||
info = And(info, exact);
|
||||
}
|
||||
break;
|
||||
|
||||
case kRegexpAlternate:
|
||||
info = child_args[0];
|
||||
for (int i = 1; i < nchild_args; i++)
|
||||
info = Alt(info, child_args[i]);
|
||||
break;
|
||||
|
||||
case kRegexpStar:
|
||||
info = Star(child_args[0]);
|
||||
break;
|
||||
|
||||
case kRegexpQuest:
|
||||
info = Quest(child_args[0]);
|
||||
break;
|
||||
|
||||
case kRegexpPlus:
|
||||
info = Plus(child_args[0]);
|
||||
break;
|
||||
|
||||
case kRegexpAnyChar:
|
||||
case kRegexpAnyByte:
|
||||
// Claim nothing, except that it's not empty.
|
||||
info = AnyCharOrAnyByte();
|
||||
break;
|
||||
|
||||
case kRegexpCharClass:
|
||||
info = CClass(re->cc(), latin1());
|
||||
break;
|
||||
|
||||
case kRegexpCapture:
|
||||
// These don't affect the set of matching strings.
|
||||
info = child_args[0];
|
||||
break;
|
||||
}
|
||||
|
||||
if (ExtraDebug)
|
||||
ABSL_LOG(ERROR) << "BuildInfo " << re->ToString()
|
||||
<< ": " << (info ? info->ToString() : "");
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
|
||||
// Builds a Prefilter for re, which the caller owns. Returns null when re
// is null, when it cannot be simplified, or when no Info could be built.
Prefilter* Prefilter::FromRegexp(Regexp* re) {
  if (re == nullptr)
    return nullptr;

  Regexp* simplified = re->Simplify();
  if (simplified == nullptr)
    return nullptr;

  // The simplified regexp is only needed while the Info is being built.
  Prefilter::Info* info = BuildInfo(simplified);
  simplified->Decref();
  if (info == nullptr)
    return nullptr;

  Prefilter* match = info->TakeMatch();
  delete info;
  return match;
}
|
||||
|
||||
std::string Prefilter::DebugString() const {
|
||||
switch (op_) {
|
||||
default:
|
||||
ABSL_LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
|
||||
return absl::StrFormat("op%d", op_);
|
||||
case NONE:
|
||||
return "*no-matches*";
|
||||
case ATOM:
|
||||
return atom_;
|
||||
case ALL:
|
||||
return "";
|
||||
case AND: {
|
||||
std::string s = "";
|
||||
for (size_t i = 0; i < subs_->size(); i++) {
|
||||
if (i > 0)
|
||||
s += " ";
|
||||
Prefilter* sub = (*subs_)[i];
|
||||
s += sub ? sub->DebugString() : "<nil>";
|
||||
}
|
||||
return s;
|
||||
}
|
||||
case OR: {
|
||||
std::string s = "(";
|
||||
for (size_t i = 0; i < subs_->size(); i++) {
|
||||
if (i > 0)
|
||||
s += "|";
|
||||
Prefilter* sub = (*subs_)[i];
|
||||
s += sub ? sub->DebugString() : "<nil>";
|
||||
}
|
||||
s += ")";
|
||||
return s;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Builds a Prefilter for a compiled RE2, which the caller owns. Returns
// null when re2 is null, when it exposes no Regexp, or when FromRegexp()
// cannot build a Prefilter.
Prefilter* Prefilter::FromRE2(const RE2* re2) {
  if (re2 == nullptr)
    return nullptr;

  Regexp* regexp = re2->Regexp();
  if (regexp != nullptr)
    return FromRegexp(regexp);
  return nullptr;
}
|
||||
|
||||
|
||||
} // namespace re2
|
||||
+168
@@ -0,0 +1,168 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_PREFILTER_H_
|
||||
#define RE2_PREFILTER_H_
|
||||
|
||||
// Prefilter is the class used to extract string guards from regexps.
|
||||
// Rather than using Prefilter class directly, use FilteredRE2.
|
||||
// See filtered_re2.h
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class RE2;
|
||||
|
||||
class Regexp;
|
||||
|
||||
class Prefilter {
|
||||
// Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
|
||||
public:
|
||||
enum Op {
|
||||
ALL = 0, // Everything matches
|
||||
NONE, // Nothing matches
|
||||
ATOM, // The string atom() must match
|
||||
AND, // All in subs() must match
|
||||
OR, // One of subs() must match
|
||||
};
|
||||
|
||||
explicit Prefilter(Op op);
|
||||
~Prefilter();
|
||||
|
||||
Op op() { return op_; }
|
||||
const std::string& atom() const { return atom_; }
|
||||
void set_unique_id(int id) { unique_id_ = id; }
|
||||
int unique_id() const { return unique_id_; }
|
||||
|
||||
// The children of the Prefilter node.
|
||||
std::vector<Prefilter*>* subs() {
|
||||
ABSL_DCHECK(op_ == AND || op_ == OR);
|
||||
return subs_;
|
||||
}
|
||||
|
||||
// Set the children vector. Prefilter takes ownership of subs and
|
||||
// subs_ will be deleted when Prefilter is deleted.
|
||||
void set_subs(std::vector<Prefilter*>* subs) { subs_ = subs; }
|
||||
|
||||
// Given a RE2, return a Prefilter. The caller takes ownership of
|
||||
// the Prefilter and should deallocate it. Returns NULL if Prefilter
|
||||
// cannot be formed.
|
||||
static Prefilter* FromRE2(const RE2* re2);
|
||||
|
||||
// Returns a readable debug string of the prefilter.
|
||||
std::string DebugString() const;
|
||||
|
||||
private:
|
||||
template <typename H>
|
||||
friend H AbslHashValue(H h, const Prefilter& a) {
|
||||
h = H::combine(std::move(h), a.op_);
|
||||
if (a.op_ == ATOM) {
|
||||
h = H::combine(std::move(h), a.atom_);
|
||||
} else if (a.op_ == AND || a.op_ == OR) {
|
||||
h = H::combine(std::move(h), a.subs_->size());
|
||||
for (size_t i = 0; i < a.subs_->size(); ++i) {
|
||||
h = H::combine(std::move(h), (*a.subs_)[i]->unique_id_);
|
||||
}
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
friend bool operator==(const Prefilter& a, const Prefilter& b) {
|
||||
if (&a == &b) {
|
||||
return true;
|
||||
}
|
||||
if (a.op_ != b.op_) {
|
||||
return false;
|
||||
}
|
||||
if (a.op_ == ATOM) {
|
||||
if (a.atom_ != b.atom_) {
|
||||
return false;
|
||||
}
|
||||
} else if (a.op_ == AND || a.op_ == OR) {
|
||||
if (a.subs_->size() != b.subs_->size()) {
|
||||
return false;
|
||||
}
|
||||
for (size_t i = 0; i < a.subs_->size(); ++i) {
|
||||
if ((*a.subs_)[i]->unique_id_ != (*b.subs_)[i]->unique_id_) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// A comparator used to store exact strings. We compare by length,
|
||||
// then lexicographically. This ordering makes it easier to reduce the
|
||||
// set of strings in SimplifyStringSet.
|
||||
struct LengthThenLex {
|
||||
bool operator()(const std::string& a, const std::string& b) const {
|
||||
return (a.size() < b.size()) || (a.size() == b.size() && a < b);
|
||||
}
|
||||
};
|
||||
|
||||
class Info;
|
||||
|
||||
using SSet = std::set<std::string, LengthThenLex>;
|
||||
using SSIter = SSet::iterator;
|
||||
using ConstSSIter = SSet::const_iterator;
|
||||
|
||||
// Combines two prefilters together to create an AND. The passed
|
||||
// Prefilters will be part of the returned Prefilter or deleted.
|
||||
static Prefilter* And(Prefilter* a, Prefilter* b);
|
||||
|
||||
// Combines two prefilters together to create an OR. The passed
|
||||
// Prefilters will be part of the returned Prefilter or deleted.
|
||||
static Prefilter* Or(Prefilter* a, Prefilter* b);
|
||||
|
||||
// Generalized And/Or
|
||||
static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
|
||||
|
||||
static Prefilter* FromRegexp(Regexp* a);
|
||||
|
||||
static Prefilter* FromString(const std::string& str);
|
||||
|
||||
static Prefilter* OrStrings(SSet* ss);
|
||||
|
||||
static Info* BuildInfo(Regexp* re);
|
||||
|
||||
Prefilter* Simplify();
|
||||
|
||||
// Removes redundant strings from the set. A string is redundant if
|
||||
// any of the other strings appear as a substring. The empty string
|
||||
// is a special case, which is ignored.
|
||||
static void SimplifyStringSet(SSet* ss);
|
||||
|
||||
// Adds the cross-product of a and b to dst.
|
||||
// (For each string i in a and j in b, add i+j.)
|
||||
static void CrossProduct(const SSet& a, const SSet& b, SSet* dst);
|
||||
|
||||
// Kind of Prefilter.
|
||||
Op op_;
|
||||
|
||||
// Sub-matches for AND or OR Prefilter.
|
||||
std::vector<Prefilter*>* subs_;
|
||||
|
||||
// Actual string to match in leaf node.
|
||||
std::string atom_;
|
||||
|
||||
// If different prefilters have the same string atom, or if they are
|
||||
// structurally the same (e.g., OR of same atom strings) they are
|
||||
// considered the same unique nodes. This is the id for each unique
|
||||
// node. This field is populated with a unique id for every node,
|
||||
// and -1 for duplicate nodes.
|
||||
int unique_id_;
|
||||
|
||||
Prefilter(const Prefilter&) = delete;
|
||||
Prefilter& operator=(const Prefilter&) = delete;
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_PREFILTER_H_
|
||||
@@ -0,0 +1,376 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re2/prefilter_tree.h"
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "re2/prefilter.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const bool ExtraDebug = false;
|
||||
|
||||
// Creates an uncompiled tree with the default minimum atom length of 3.
PrefilterTree::PrefilterTree() : PrefilterTree(3) {}

// Creates an uncompiled tree with a caller-chosen minimum atom length.
PrefilterTree::PrefilterTree(int min_atom_len)
    : compiled_(false), min_atom_len_(min_atom_len) {}
|
||||
|
||||
// Deletes every prefilter stored by Add() (null entries are harmless).
PrefilterTree::~PrefilterTree() {
  for (Prefilter* prefilter : prefilter_vec_)
    delete prefilter;
}
|
||||
|
||||
// Registers the prefilter for the next regexp; the tree takes ownership.
// A prefilter that KeepNode() rejects is deleted and recorded as null, so
// every call still appends exactly one entry. Calling this after
// Compile() logs DFATAL and does nothing.
void PrefilterTree::Add(Prefilter* prefilter) {
  if (compiled_) {
    ABSL_LOG(DFATAL) << "Add called after Compile.";
    return;
  }

  const bool discard = (prefilter != nullptr) && !KeepNode(prefilter);
  if (discard) {
    delete prefilter;
    prefilter = nullptr;
  }

  prefilter_vec_.push_back(prefilter);
}
|
||||
|
||||
void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
|
||||
if (compiled_) {
|
||||
ABSL_LOG(DFATAL) << "Compile called already.";
|
||||
return;
|
||||
}
|
||||
|
||||
// Some legacy users of PrefilterTree call Compile() before
|
||||
// adding any regexps and expect Compile() to have no effect.
|
||||
if (prefilter_vec_.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
compiled_ = true;
|
||||
|
||||
NodeSet nodes;
|
||||
AssignUniqueIds(&nodes, atom_vec);
|
||||
if (ExtraDebug)
|
||||
PrintDebugInfo(&nodes);
|
||||
}
|
||||
|
||||
// Returns the element of *nodes that node's lookup finds, or null when
// there is none.
Prefilter* PrefilterTree::CanonicalNode(NodeSet* nodes, Prefilter* node) {
  NodeSet::const_iterator found = nodes->find(node);
  if (found == nodes->end())
    return nullptr;
  return *found;
}
|
||||
|
||||
// Decides whether `node` is useful as a prefilter, pruning it in place:
//   - NULL, ALL and NONE nodes are never kept;
//   - an ATOM is kept iff it has at least min_atom_len_ characters;
//   - an AND node drops (and deletes) the children that are not kept and
//     is itself kept iff at least one child survives;
//   - an OR node is kept only if every child is kept; on the first
//     rejected child it returns false without deleting anything, leaving
//     disposal of the whole node to the caller.
bool PrefilterTree::KeepNode(Prefilter* node) const {
  if (node == nullptr)
    return false;

  const auto op = node->op();

  if (op == Prefilter::ALL || op == Prefilter::NONE)
    return false;

  if (op == Prefilter::ATOM)
    return node->atom().size() >= static_cast<size_t>(min_atom_len_);

  if (op == Prefilter::AND) {
    std::vector<Prefilter*>* children = node->subs();
    // Compact the surviving children towards the front of the vector.
    size_t kept = 0;
    for (Prefilter* child : *children) {
      if (KeepNode(child))
        (*children)[kept++] = child;
      else
        delete child;
    }
    children->resize(kept);
    return kept > 0;
  }

  if (op == Prefilter::OR) {
    for (Prefilter* child : *node->subs()) {
      if (!KeepNode(child))
        return false;
    }
    return true;
  }

  ABSL_LOG(DFATAL) << "Unexpected op in KeepNode: " << op;
  return false;
}
|
||||
|
||||
// Core of Compile(). Works in five passes:
//   1. list every node, top-level prefilters first, then descendants;
//   2. walk that list backwards (children before parents) and give each
//      distinct node a unique id, collecting atoms into *atom_vec;
//   3. for each canonical node, fill its Entry (parent links and the
//      number of child triggers needed before it fires);
//   4. attach regexp indices to the entries of the top-level nodes;
//   5. heuristically prune edges into AND nodes from children that
//      already trigger many parents.
// *nodes receives the canonical node set; *atom_vec is cleared first.
void PrefilterTree::AssignUniqueIds(NodeSet* nodes,
                                    std::vector<std::string>* atom_vec) {
  atom_vec->clear();

  // All filter nodes in top-down order: parents always precede children.
  std::vector<Prefilter*> ordered;

  // Pass 1a: the top-level node of each regexp. NULL entries are pushed
  // too so that, for these first elements, index == regexp id.
  for (size_t regexp = 0; regexp < prefilter_vec_.size(); regexp++) {
    Prefilter* top = prefilter_vec_[regexp];
    if (top == nullptr)
      unfiltered_.push_back(static_cast<int>(regexp));
    ordered.push_back(top);
  }

  // Pass 1b: append descendants. `ordered` grows while it is scanned, so
  // the loop must re-read size() and index rather than hold iterators.
  for (size_t pos = 0; pos < ordered.size(); pos++) {
    Prefilter* current = ordered[pos];
    if (current == nullptr)
      continue;
    if (current->op() != Prefilter::AND && current->op() != Prefilter::OR)
      continue;
    for (Prefilter* sub : *current->subs())
      ordered.push_back(sub);
  }

  // Pass 2: identify unique nodes, bottom-up.
  int next_id = 0;
  for (auto rit = ordered.rbegin(); rit != ordered.rend(); ++rit) {
    Prefilter* node = *rit;
    if (node == nullptr)
      continue;
    node->set_unique_id(-1);
    Prefilter* canonical = CanonicalNode(nodes, node);
    if (canonical != nullptr) {
      // A node with the same atom/subs was seen already: share its id.
      node->set_unique_id(canonical->unique_id());
      continue;
    }
    // First of its kind: later equivalent nodes will find this one as
    // their canonical node.
    nodes->emplace(node);
    if (node->op() == Prefilter::ATOM) {
      atom_vec->push_back(node->atom());
      atom_index_to_id_.push_back(next_id);
    }
    node->set_unique_id(next_id++);
  }
  entries_.resize(next_id);

  // Pass 3: fill the entries, visiting canonical nodes only.
  for (auto rit = ordered.rbegin(); rit != ordered.rend(); ++rit) {
    Prefilter* prefilter = *rit;
    if (prefilter == nullptr)
      continue;
    if (CanonicalNode(nodes, prefilter) != prefilter)
      continue;
    const int id = prefilter->unique_id();
    switch (prefilter->op()) {
      default:
        ABSL_LOG(DFATAL) << "Unexpected op: " << prefilter->op();
        return;

      case Prefilter::ATOM:
        entries_[id].propagate_up_at_count = 1;
        break;

      case Prefilter::OR:
      case Prefilter::AND: {
        // Register this node as a parent of each child, unless the
        // child's most recently added parent is already this node. The
        // number of registrations is the number of unique children,
        // which is what an AND node must see before it propagates.
        int unique_children = 0;
        for (Prefilter* child : *prefilter->subs()) {
          std::vector<int>& child_parents =
              entries_[child->unique_id()].parents;
          if (child_parents.empty() || child_parents.back() != id) {
            child_parents.push_back(id);
            unique_children++;
          }
        }
        entries_[id].propagate_up_at_count =
            prefilter->op() == Prefilter::AND ? unique_children : 1;
        break;
      }
    }
  }

  // Pass 4: record, on each top-level node's entry, which regexp it is.
  for (size_t regexp = 0; regexp < prefilter_vec_.size(); regexp++) {
    Prefilter* top = prefilter_vec_[regexp];
    if (top == nullptr)
      continue;
    const int id = CanonicalNode(nodes, top)->unique_id();
    ABSL_DCHECK_LE(0, id);
    entries_[id].regexps.push_back(static_cast<int>(regexp));
  }

  // Pass 5: probability-based pruning of nodes that trigger too many
  // parents. Logarithms are used to avoid the likelihood of underflow.
  const double log_num_regexps =
      std::log(prefilter_vec_.size() - unfiltered_.size());
  // Declared outside the loop and cleared per node to avoid reallocating.
  std::vector<std::pair<size_t, int>> children_by_fanout;
  for (auto rit = ordered.rbegin(); rit != ordered.rend(); ++rit) {
    Prefilter* prefilter = *rit;
    // Only AND nodes are pruned: there it merely costs precision, whereas
    // pruning an OR node would break correctness.
    if (prefilter == nullptr || prefilter->op() != Prefilter::AND)
      continue;
    if (CanonicalNode(nodes, prefilter) != prefilter)
      continue;
    const int id = prefilter->unique_id();

    // Order this node's children by how many parents each one has.
    children_by_fanout.clear();
    for (Prefilter* child : *prefilter->subs()) {
      const int child_id = child->unique_id();
      children_by_fanout.emplace_back(entries_[child_id].parents.size(),
                                      child_id);
    }
    std::stable_sort(children_by_fanout.begin(), children_by_fanout.end());

    // Running estimate of how many regexps would be triggered if the
    // remaining children's edges to this node were pruned. The nominal
    // target is one regexp, hence the threshold log(1) == 0; once below
    // it, a child's edge is pruned iff the child has more than nine
    // edges left.
    double log_num_triggered = log_num_regexps;
    for (const auto& ranked : children_by_fanout) {
      std::vector<int>& child_parents = entries_[ranked.second].parents;
      if (log_num_triggered > 0.) {
        log_num_triggered += std::log(child_parents.size());
        log_num_triggered -= log_num_regexps;
        continue;
      }
      if (child_parents.size() <= 9)
        continue;
      auto edge = std::find(child_parents.begin(), child_parents.end(), id);
      if (edge == child_parents.end())
        continue;
      child_parents.erase(edge);
      entries_[id].propagate_up_at_count--;
    }
  }
}
|
||||
|
||||
// Functions for triggering during search.
|
||||
void PrefilterTree::RegexpsGivenStrings(
|
||||
const std::vector<int>& matched_atoms,
|
||||
std::vector<int>* regexps) const {
|
||||
regexps->clear();
|
||||
if (!compiled_) {
|
||||
// Some legacy users of PrefilterTree call Compile() before
|
||||
// adding any regexps and expect Compile() to have no effect.
|
||||
// This kludge is a counterpart to that kludge.
|
||||
if (prefilter_vec_.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
ABSL_LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
|
||||
for (size_t i = 0; i < prefilter_vec_.size(); i++)
|
||||
regexps->push_back(static_cast<int>(i));
|
||||
} else {
|
||||
IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
|
||||
std::vector<int> matched_atom_ids;
|
||||
for (size_t j = 0; j < matched_atoms.size(); j++)
|
||||
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
|
||||
PropagateMatch(matched_atom_ids, ®exps_map);
|
||||
for (IntMap::const_iterator it = regexps_map.begin();
|
||||
it != regexps_map.end();
|
||||
++it)
|
||||
regexps->push_back(it->index());
|
||||
|
||||
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
|
||||
}
|
||||
std::sort(regexps->begin(), regexps->end());
|
||||
}
|
||||
|
||||
void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
|
||||
IntMap* regexps) const {
|
||||
IntMap count(static_cast<int>(entries_.size()));
|
||||
IntMap work(static_cast<int>(entries_.size()));
|
||||
for (size_t i = 0; i < atom_ids.size(); i++)
|
||||
work.set(atom_ids[i], 1);
|
||||
for (IntMap::const_iterator it = work.begin(); it != work.end(); ++it) {
|
||||
const Entry& entry = entries_[it->index()];
|
||||
// Record regexps triggered.
|
||||
for (size_t i = 0; i < entry.regexps.size(); i++)
|
||||
regexps->set(entry.regexps[i], 1);
|
||||
int c;
|
||||
// Pass trigger up to parents.
|
||||
for (int j : entry.parents) {
|
||||
const Entry& parent = entries_[j];
|
||||
// Delay until all the children have succeeded.
|
||||
if (parent.propagate_up_at_count > 1) {
|
||||
if (count.has_index(j)) {
|
||||
c = count.get_existing(j) + 1;
|
||||
count.set_existing(j, c);
|
||||
} else {
|
||||
c = 1;
|
||||
count.set_new(j, c);
|
||||
}
|
||||
if (c < parent.propagate_up_at_count)
|
||||
continue;
|
||||
}
|
||||
// Trigger the parent.
|
||||
work.set(j, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Debugging help.
|
||||
void PrefilterTree::PrintPrefilter(int regexpid) {
|
||||
ABSL_LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
|
||||
}
|
||||
|
||||
// Debugging aid: logs the number of unique atoms and nodes, then for each
// entry its id, parent count (N), regexp count (R) and parent ids, and
// finally the unique id of every node in *nodes.
void PrefilterTree::PrintDebugInfo(NodeSet* nodes) {
  ABSL_LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
  ABSL_LOG(ERROR) << "#Unique Nodes: " << entries_.size();

  size_t entry_id = 0;
  for (const Entry& entry : entries_) {
    ABSL_LOG(ERROR) << "EntryId: " << entry_id
                    << " N: " << entry.parents.size()
                    << " R: " << entry.regexps.size();
    for (int parent : entry.parents)
      ABSL_LOG(ERROR) << parent;
    entry_id++;
  }

  ABSL_LOG(ERROR) << "Set:";
  for (Prefilter* node : *nodes)
    ABSL_LOG(ERROR) << "NodeId: " << node->unique_id();
}
|
||||
|
||||
// Recursively renders `node` for debugging. An ATOM is rendered as its
// text; any other node as "AND(...)" or "OR(...)" (every non-AND operator
// is labelled "OR") containing "<unique id>:<child rendering>" items
// separated by commas. `node` must not be NULL.
std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
  if (node->op() == Prefilter::ATOM) {
    ABSL_DCHECK(!node->atom().empty());
    return node->atom();
  }

  // Prefixing the operator name disambiguates AND and OR nodes.
  std::string text = node->op() == Prefilter::AND ? "AND" : "OR";
  text += "(";
  bool first = true;
  for (Prefilter* child : *node->subs()) {
    if (!first)
      text += ',';
    first = false;
    text += absl::StrFormat("%d", child->unique_id());
    text += ":";
    text += DebugNodeString(child);
  }
  text += ")";
  return text;
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,153 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_PREFILTER_TREE_H_
|
||||
#define RE2_PREFILTER_TREE_H_
|
||||
|
||||
// The PrefilterTree class is used to form an AND-OR tree of strings
|
||||
// that would trigger each regexp. The 'prefilter' of each regexp is
|
||||
// added to PrefilterTree, and then PrefilterTree is used to find all
|
||||
// the unique strings across the prefilters. During search, by using
|
||||
// matches from a string matching engine, PrefilterTree deduces the
|
||||
// set of regexps that are to be triggered. The 'string matching
|
||||
// engine' itself is outside of this class, and the caller can use any
|
||||
// favorite engine. PrefilterTree provides a set of strings (called
|
||||
// atoms) that the user of this class should use to do the string
|
||||
// matching.
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "re2/prefilter.h"
|
||||
#include "re2/sparse_array.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class PrefilterTree {
|
||||
public:
|
||||
PrefilterTree();
|
||||
explicit PrefilterTree(int min_atom_len);
|
||||
~PrefilterTree();
|
||||
|
||||
// Adds the prefilter for the next regexp. Note that we assume that
|
||||
// Add called sequentially for all regexps. All Add calls
|
||||
// must precede Compile.
|
||||
void Add(Prefilter* prefilter);
|
||||
|
||||
// The Compile returns a vector of string in atom_vec.
|
||||
// Call this after all the prefilters are added through Add.
|
||||
// No calls to Add after Compile are allowed.
|
||||
// The caller should use the returned set of strings to do string matching.
|
||||
// Each time a string matches, the corresponding index then has to be
|
||||
// and passed to RegexpsGivenStrings below.
|
||||
void Compile(std::vector<std::string>* atom_vec);
|
||||
|
||||
// Given the indices of the atoms that matched, returns the indexes
|
||||
// of regexps that should be searched. The matched_atoms should
|
||||
// contain all the ids of string atoms that were found to match the
|
||||
// content. The caller can use any string match engine to perform
|
||||
// this function. This function is thread safe.
|
||||
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
|
||||
std::vector<int>* regexps) const;
|
||||
|
||||
// Print debug prefilter. Also prints unique ids associated with
|
||||
// nodes of the prefilter of the regexp.
|
||||
void PrintPrefilter(int regexpid);
|
||||
|
||||
private:
|
||||
using IntMap = SparseArray<int>;
|
||||
|
||||
struct PrefilterHash {
|
||||
size_t operator()(const Prefilter* a) const {
|
||||
ABSL_DCHECK(a != NULL);
|
||||
return absl::Hash<Prefilter>()(*a);
|
||||
}
|
||||
};
|
||||
|
||||
struct PrefilterEqual {
|
||||
bool operator()(const Prefilter* a, const Prefilter* b) const {
|
||||
ABSL_DCHECK(a != NULL);
|
||||
ABSL_DCHECK(b != NULL);
|
||||
return *a == *b;
|
||||
}
|
||||
};
|
||||
|
||||
using NodeSet =
|
||||
absl::flat_hash_set<Prefilter*, PrefilterHash, PrefilterEqual>;
|
||||
|
||||
// Each unique node has a corresponding Entry that helps in
|
||||
// passing the matching trigger information along the tree.
|
||||
struct Entry {
|
||||
public:
|
||||
// How many children should match before this node triggers the
|
||||
// parent. For an atom and an OR node, this is 1 and for an AND
|
||||
// node, it is the number of unique children.
|
||||
int propagate_up_at_count;
|
||||
|
||||
// When this node is ready to trigger the parent, what are the indices
|
||||
// of the parent nodes to trigger. The reason there may be more than
|
||||
// one is because of sharing. For example (abc | def) and (xyz | def)
|
||||
// are two different nodes, but they share the atom 'def'. So when
|
||||
// 'def' matches, it triggers two parents, corresponding to the two
|
||||
// different OR nodes.
|
||||
std::vector<int> parents;
|
||||
|
||||
// When this node is ready to trigger the parent, what are the
|
||||
// regexps that are triggered.
|
||||
std::vector<int> regexps;
|
||||
};
|
||||
|
||||
// Returns true if the prefilter node should be kept.
|
||||
bool KeepNode(Prefilter* node) const;
|
||||
|
||||
// This function assigns unique ids to various parts of the
|
||||
// prefilter, by looking at if these nodes are already in the
|
||||
// PrefilterTree.
|
||||
void AssignUniqueIds(NodeSet* nodes, std::vector<std::string>* atom_vec);
|
||||
|
||||
// Given the matching atoms, find the regexps to be triggered.
|
||||
void PropagateMatch(const std::vector<int>& atom_ids,
|
||||
IntMap* regexps) const;
|
||||
|
||||
// Returns the prefilter node that has the same atom/subs as this
|
||||
// node. For the canonical node, returns node. Assumes that the
|
||||
// children of node have already been assigned unique ids.
|
||||
Prefilter* CanonicalNode(NodeSet* nodes, Prefilter* node);
|
||||
|
||||
// Recursively constructs a readable prefilter string.
|
||||
std::string DebugNodeString(Prefilter* node) const;
|
||||
|
||||
// Used for debugging.
|
||||
void PrintDebugInfo(NodeSet* nodes);
|
||||
|
||||
// These are all the nodes formed by Compile. Essentially, there is
|
||||
// one node for each unique atom and each unique AND/OR node.
|
||||
std::vector<Entry> entries_;
|
||||
|
||||
// indices of regexps that always pass through the filter (since we
|
||||
// found no required literals in these regexps).
|
||||
std::vector<int> unfiltered_;
|
||||
|
||||
// vector of Prefilter for all regexps.
|
||||
std::vector<Prefilter*> prefilter_vec_;
|
||||
|
||||
// Atom index in returned strings to entry id mapping.
|
||||
std::vector<int> atom_index_to_id_;
|
||||
|
||||
// Has the prefilter tree been compiled.
|
||||
bool compiled_;
|
||||
|
||||
// Strings less than this length are not stored as atoms.
|
||||
const int min_atom_len_;
|
||||
|
||||
PrefilterTree(const PrefilterTree&) = delete;
|
||||
PrefilterTree& operator=(const PrefilterTree&) = delete;
|
||||
};
|
||||
|
||||
}  // namespace re2
|
||||
|
||||
#endif // RE2_PREFILTER_TREE_H_
|
||||
+1182
File diff suppressed because it is too large
Load Diff
+492
@@ -0,0 +1,492 @@
|
||||
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_PROG_H_
|
||||
#define RE2_PROG_H_
|
||||
|
||||
// Compiled representation of regular expressions.
|
||||
// See regexp.h for the Regexp class, which represents a regular
|
||||
// expression symbolically.
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/base/call_once.h"
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "re2/pod_array.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/sparse_array.h"
|
||||
#include "re2/sparse_set.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Opcodes for Inst
|
||||
// Opcode of a single program instruction. The values are stored in the
// low three bits of Inst::out_opcode_, so they must stay in [0, 7];
// kNumInst is the number of opcodes, not an opcode itself.
enum InstOp {
  kInstAlt = 0,     // choose between out_ and out1_
  kInstAltMatch,    // Alt: out_ is [00-FF] and back, out1_ is match;
                    // or vice versa.
  kInstByteRange,   // next (possibly case-folded) byte must be in [lo_, hi_]
  kInstCapture,     // capturing parenthesis number cap_
  kInstEmptyWidth,  // empty-width special (^ $ ...); bit(s) set in empty_
  kInstMatch,       // found a match!
  kInstNop,         // no-op; occasionally unavoidable
  kInstFail,        // never match; occasionally unavoidable
  kNumInst,
};
|
||||
|
||||
// Bit flags for empty-width specials
|
||||
// Bit flags describing empty-width assertions; values may be OR-ed
// together. kEmptyAllFlags is the mask covering all six flags.
enum EmptyOp {
  kEmptyBeginLine       = 1 << 0,  // ^  - beginning of line
  kEmptyEndLine         = 1 << 1,  // $  - end of line
  kEmptyBeginText       = 1 << 2,  // \A - beginning of text
  kEmptyEndText         = 1 << 3,  // \z - end of text
  kEmptyWordBoundary    = 1 << 4,  // \b - word boundary
  kEmptyNonWordBoundary = 1 << 5,  // \B - not \b
  kEmptyAllFlags        = (1 << 6) - 1,
};
|
||||
|
||||
class DFA;
|
||||
class Regexp;
|
||||
|
||||
// Compiled form of regexp program.
|
||||
class Prog {
|
||||
public:
|
||||
Prog();
|
||||
~Prog();
|
||||
|
||||
// Single instruction in regexp program.
|
||||
class Inst {
|
||||
public:
|
||||
// See the assertion below for why this is so.
|
||||
Inst() = default;
|
||||
|
||||
// Copyable.
|
||||
Inst(const Inst&) = default;
|
||||
Inst& operator=(const Inst&) = default;
|
||||
|
||||
// Constructors per opcode
|
||||
void InitAlt(uint32_t out, uint32_t out1);
|
||||
void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
|
||||
void InitCapture(int cap, uint32_t out);
|
||||
void InitEmptyWidth(EmptyOp empty, uint32_t out);
|
||||
void InitMatch(int id);
|
||||
void InitNop(uint32_t out);
|
||||
void InitFail();
|
||||
|
||||
// Getters
|
||||
int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
|
||||
InstOp opcode() { return static_cast<InstOp>(out_opcode_ & 7); }
|
||||
int last() { return (out_opcode_ >> 3) & 1; }
|
||||
int out() { return out_opcode_ >> 4; }
|
||||
int out1() {
|
||||
ABSL_DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch);
|
||||
return out1_;
|
||||
}
|
||||
int cap() {
|
||||
ABSL_DCHECK_EQ(opcode(), kInstCapture);
|
||||
return cap_;
|
||||
}
|
||||
int lo() {
|
||||
ABSL_DCHECK_EQ(opcode(), kInstByteRange);
|
||||
return lo_;
|
||||
}
|
||||
int hi() {
|
||||
ABSL_DCHECK_EQ(opcode(), kInstByteRange);
|
||||
return hi_;
|
||||
}
|
||||
int foldcase() {
|
||||
ABSL_DCHECK_EQ(opcode(), kInstByteRange);
|
||||
return hint_foldcase_ & 1;
|
||||
}
|
||||
int hint() {
|
||||
ABSL_DCHECK_EQ(opcode(), kInstByteRange);
|
||||
return hint_foldcase_ >> 1;
|
||||
}
|
||||
int match_id() {
|
||||
ABSL_DCHECK_EQ(opcode(), kInstMatch);
|
||||
return match_id_;
|
||||
}
|
||||
EmptyOp empty() {
|
||||
ABSL_DCHECK_EQ(opcode(), kInstEmptyWidth);
|
||||
return empty_;
|
||||
}
|
||||
|
||||
bool greedy(Prog* p) {
|
||||
ABSL_DCHECK_EQ(opcode(), kInstAltMatch);
|
||||
return p->inst(out())->opcode() == kInstByteRange ||
|
||||
(p->inst(out())->opcode() == kInstNop &&
|
||||
p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
|
||||
}
|
||||
|
||||
// Does this inst (an kInstByteRange) match c?
|
||||
inline bool Matches(int c) {
|
||||
ABSL_DCHECK_EQ(opcode(), kInstByteRange);
|
||||
if (foldcase() && 'A' <= c && c <= 'Z')
|
||||
c += 'a' - 'A';
|
||||
return lo_ <= c && c <= hi_;
|
||||
}
|
||||
|
||||
// Returns string representation for debugging.
|
||||
std::string Dump();
|
||||
|
||||
// Maximum instruction id.
|
||||
// (Must fit in out_opcode_. PatchList/last steal another bit.)
|
||||
static const int kMaxInst = (1<<28) - 1;
|
||||
|
||||
private:
|
||||
void set_opcode(InstOp opcode) {
|
||||
out_opcode_ = (out()<<4) | (last()<<3) | opcode;
|
||||
}
|
||||
|
||||
void set_last() {
|
||||
out_opcode_ = (out()<<4) | (1<<3) | opcode();
|
||||
}
|
||||
|
||||
void set_out(int out) {
|
||||
out_opcode_ = (out<<4) | (last()<<3) | opcode();
|
||||
}
|
||||
|
||||
void set_out_opcode(int out, InstOp opcode) {
|
||||
out_opcode_ = (out<<4) | (last()<<3) | opcode;
|
||||
}
|
||||
|
||||
uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
|
||||
union { // additional instruction arguments:
|
||||
uint32_t out1_; // opcode == kInstAlt
|
||||
// alternate next instruction
|
||||
|
||||
int32_t cap_; // opcode == kInstCapture
|
||||
// Index of capture register (holds text
|
||||
// position recorded by capturing parentheses).
|
||||
// For \n (the submatch for the nth parentheses),
|
||||
// the left parenthesis captures into register 2*n
|
||||
// and the right one captures into register 2*n+1.
|
||||
|
||||
int32_t match_id_; // opcode == kInstMatch
|
||||
// Match ID to identify this match (for re2::Set).
|
||||
|
||||
struct { // opcode == kInstByteRange
|
||||
uint8_t lo_; // byte range is lo_-hi_ inclusive
|
||||
uint8_t hi_; //
|
||||
uint16_t hint_foldcase_; // 15 bits: hint, 1 (low) bit: foldcase
|
||||
// hint to execution engines: the delta to the
|
||||
// next instruction (in the current list) worth
|
||||
// exploring iff this instruction matched; 0
|
||||
// means there are no remaining possibilities,
|
||||
// which is most likely for character classes.
|
||||
// foldcase: A-Z -> a-z before checking range.
|
||||
};
|
||||
|
||||
EmptyOp empty_; // opcode == kInstEmptyWidth
|
||||
// empty_ is bitwise OR of kEmpty* flags above.
|
||||
};
|
||||
|
||||
friend class Compiler;
|
||||
friend struct PatchList;
|
||||
friend class Prog;
|
||||
};
|
||||
|
||||
// Inst must be trivial so that we can freely clear it with memset(3).
|
||||
// Arrays of Inst are initialised by copying the initial elements with
|
||||
// memmove(3) and then clearing any remaining elements with memset(3).
|
||||
static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
|
||||
|
||||
// Whether to anchor the search.
|
||||
// Where a search is allowed to start matching.
enum Anchor {
  kUnanchored,  // match anywhere
  kAnchored,    // match only starting at beginning of text
};
|
||||
|
||||
// Kind of match to look for (for anchor != kFullMatch)
|
||||
//
|
||||
// kLongestMatch mode finds the overall longest
|
||||
// match but still makes its submatch choices the way
|
||||
// Perl would, not in the way prescribed by POSIX.
|
||||
// The POSIX rules are much more expensive to implement,
|
||||
// and no one has needed them.
|
||||
//
|
||||
// kFullMatch is not strictly necessary -- we could use
|
||||
// kLongestMatch and then check the length of the match -- but
|
||||
// the matching code can run faster if it knows to consider only
|
||||
// full matches.
|
||||
enum MatchKind {
  kFirstMatch,    // like Perl, PCRE
  kLongestMatch,  // like egrep or POSIX
  kFullMatch,     // match only entire text; implies anchor==kAnchored
  kManyMatch      // for SearchDFA, records set of matches
};
|
||||
|
||||
Inst *inst(int id) { return &inst_[id]; }
|
||||
int start() { return start_; }
|
||||
void set_start(int start) { start_ = start; }
|
||||
int start_unanchored() { return start_unanchored_; }
|
||||
void set_start_unanchored(int start) { start_unanchored_ = start; }
|
||||
int size() { return size_; }
|
||||
bool reversed() { return reversed_; }
|
||||
void set_reversed(bool reversed) { reversed_ = reversed; }
|
||||
int list_count() { return list_count_; }
|
||||
int inst_count(InstOp op) { return inst_count_[op]; }
|
||||
uint16_t* list_heads() { return list_heads_.data(); }
|
||||
size_t bit_state_text_max_size() { return bit_state_text_max_size_; }
|
||||
int64_t dfa_mem() { return dfa_mem_; }
|
||||
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
|
||||
bool anchor_start() { return anchor_start_; }
|
||||
void set_anchor_start(bool b) { anchor_start_ = b; }
|
||||
bool anchor_end() { return anchor_end_; }
|
||||
void set_anchor_end(bool b) { anchor_end_ = b; }
|
||||
int bytemap_range() { return bytemap_range_; }
|
||||
const uint8_t* bytemap() { return bytemap_; }
|
||||
bool can_prefix_accel() { return prefix_size_ != 0; }
|
||||
|
||||
// Accelerates to the first likely occurrence of the prefix.
|
||||
// Returns a pointer to the first byte or NULL if not found.
|
||||
const void* PrefixAccel(const void* data, size_t size) {
|
||||
ABSL_DCHECK(can_prefix_accel());
|
||||
if (prefix_foldcase_) {
|
||||
return PrefixAccel_ShiftDFA(data, size);
|
||||
} else if (prefix_size_ != 1) {
|
||||
return PrefixAccel_FrontAndBack(data, size);
|
||||
} else {
|
||||
return memchr(data, prefix_front_, size);
|
||||
}
|
||||
}
|
||||
|
||||
// Configures prefix accel using the analysis performed during compilation.
|
||||
void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase);
|
||||
|
||||
// An implementation of prefix accel that uses prefix_dfa_ to perform
|
||||
// case-insensitive search.
|
||||
const void* PrefixAccel_ShiftDFA(const void* data, size_t size);
|
||||
|
||||
// An implementation of prefix accel that looks for prefix_front_ and
|
||||
// prefix_back_ to return fewer false positives than memchr(3) alone.
|
||||
const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
|
||||
|
||||
// Returns string representation of program for debugging.
|
||||
std::string Dump();
|
||||
std::string DumpUnanchored();
|
||||
std::string DumpByteMap();
|
||||
|
||||
// Returns the set of kEmpty flags that are in effect at
|
||||
// position p within context.
|
||||
static uint32_t EmptyFlags(absl::string_view context, const char* p);
|
||||
|
||||
// Returns whether byte c is a word character: ASCII only.
|
||||
// Used by the implementation of \b and \B.
|
||||
// This is not right for Unicode, but:
|
||||
// - it's hard to get right in a byte-at-a-time matching world
|
||||
// (the DFA has only one-byte lookahead).
|
||||
// - even if the lookahead were possible, the Progs would be huge.
|
||||
// This crude approximation is the same one PCRE uses.
|
||||
// True iff c is an ASCII letter, an ASCII digit or an underscore.
static bool IsWordChar(uint8_t c) {
  if (c == '_')
    return true;
  const bool is_upper = c >= 'A' && c <= 'Z';
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_digit = c >= '0' && c <= '9';
  return is_upper || is_lower || is_digit;
}
|
||||
|
||||
// Execution engines. They all search for the regexp (run the prog)
|
||||
// in text, which is in the larger context (used for ^ $ \b etc).
|
||||
// Anchor and kind control the kind of search.
|
||||
// Returns true if match found, false if not.
|
||||
// If match found, fills match[0..nmatch-1] with submatch info.
|
||||
// match[0] is overall match, match[1] is first set of parens, etc.
|
||||
// If a particular submatch is not matched during the regexp match,
|
||||
// it is set to NULL.
|
||||
//
|
||||
// Matching text == absl::string_view() is treated as any other empty
|
||||
// string, but note that on return, it will not be possible to distinguish
|
||||
// submatches that matched that empty string from submatches that didn't
|
||||
// match anything. Either way, match[i] == NULL.
|
||||
|
||||
// Search using NFA: can find submatches but kind of slow.
|
||||
bool SearchNFA(absl::string_view text, absl::string_view context,
|
||||
Anchor anchor, MatchKind kind, absl::string_view* match,
|
||||
int nmatch);
|
||||
|
||||
// Search using DFA: much faster than NFA but only finds
|
||||
// end of match and can use a lot more memory.
|
||||
// Returns whether a match was found.
|
||||
// If the DFA runs out of memory, sets *failed to true and returns false.
|
||||
// If matches != NULL and kind == kManyMatch and there is a match,
|
||||
// SearchDFA fills matches with the match IDs of the final matching state.
|
||||
bool SearchDFA(absl::string_view text, absl::string_view context,
|
||||
Anchor anchor, MatchKind kind, absl::string_view* match0,
|
||||
bool* failed, SparseSet* matches);
|
||||
|
||||
// The callback issued after building each DFA state with BuildEntireDFA().
|
||||
// If next is null, then the memory budget has been exhausted and building
|
||||
// will halt. Otherwise, the state has been built and next points to an array
|
||||
// of bytemap_range()+1 slots holding the next states as per the bytemap and
|
||||
// kByteEndText. The number of the state is implied by the callback sequence:
|
||||
// the first callback is for state 0, the second callback is for state 1, ...
|
||||
// match indicates whether the state is a matching state.
|
||||
using DFAStateCallback = std::function<void(const int* next, bool match)>;
|
||||
|
||||
// Build the entire DFA for the given match kind.
|
||||
// Usually the DFA is built out incrementally, as needed, which
|
||||
// avoids lots of unnecessary work.
|
||||
// If cb is not empty, it receives one callback per state built.
|
||||
// Returns the number of states built.
|
||||
// FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
|
||||
int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
|
||||
|
||||
// Compute bytemap.
|
||||
void ComputeByteMap();
|
||||
|
||||
// Run peep-hole optimizer on program.
|
||||
void Optimize();
|
||||
|
||||
// One-pass NFA: only correct if IsOnePass() is true,
|
||||
// but much faster than NFA (competitive with PCRE)
|
||||
// for those expressions.
|
||||
bool IsOnePass();
|
||||
bool SearchOnePass(absl::string_view text, absl::string_view context,
|
||||
Anchor anchor, MatchKind kind, absl::string_view* match,
|
||||
int nmatch);
|
||||
|
||||
// Bit-state backtracking. Fast on small cases but uses memory
|
||||
// proportional to the product of the list count and the text size.
|
||||
// Bit-state search is possible only when list heads are available.
bool CanBitState() { return list_heads_.data() != nullptr; }
|
||||
bool SearchBitState(absl::string_view text, absl::string_view context,
|
||||
Anchor anchor, MatchKind kind, absl::string_view* match,
|
||||
int nmatch);
|
||||
|
||||
static const int kMaxOnePassCapture = 5; // $0 through $4
|
||||
|
||||
// Backtracking search: the gold standard against which the other
|
||||
// implementations are checked. FOR TESTING ONLY.
|
||||
// It allocates a ton of memory to avoid running forever.
|
||||
// It is also recursive, so can't use in production (will overflow stacks).
|
||||
// The name "Unsafe" here is supposed to be a flag that
|
||||
// you should not be using this function.
|
||||
bool UnsafeSearchBacktrack(absl::string_view text, absl::string_view context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
absl::string_view* match, int nmatch);
|
||||
|
||||
// Computes range for any strings matching regexp. The min and max can in
|
||||
// some cases be arbitrarily precise, so the caller gets to specify the
|
||||
// maximum desired length of string returned.
|
||||
//
|
||||
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
|
||||
// string s that is an anchored match for this regexp satisfies
|
||||
// min <= s && s <= max.
|
||||
//
|
||||
// Note that PossibleMatchRange() will only consider the first copy of an
|
||||
// infinitely repeated element (i.e., any regexp element followed by a '*' or
|
||||
// '+' operator). Regexps with "{N}" constructions are not affected, as those
|
||||
// do not compile down to infinite repetitions.
|
||||
//
|
||||
// Returns true on success, false on error.
|
||||
bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
|
||||
|
||||
// Outputs the program fanout into the given sparse array.
|
||||
void Fanout(SparseArray<int>* fanout);
|
||||
|
||||
// Compiles a collection of regexps to Prog. Each regexp will have
|
||||
// its own Match instruction recording the index in the output vector.
|
||||
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
|
||||
|
||||
// Flattens the Prog from "tree" form to "list" form. This is an in-place
|
||||
// operation in the sense that the old instructions are lost.
|
||||
void Flatten();
|
||||
|
||||
// Walks the Prog; the "successor roots" or predecessors of the reachable
|
||||
// instructions are marked in rootmap or predmap/predvec, respectively.
|
||||
// reachable and stk are preallocated scratch structures.
|
||||
void MarkSuccessors(SparseArray<int>* rootmap,
|
||||
SparseArray<int>* predmap,
|
||||
std::vector<std::vector<int>>* predvec,
|
||||
SparseSet* reachable, std::vector<int>* stk);
|
||||
|
||||
// Walks the Prog from the given "root" instruction; the "dominator root"
|
||||
// of the reachable instructions (if such exists) is marked in rootmap.
|
||||
// reachable and stk are preallocated scratch structures.
|
||||
void MarkDominator(int root, SparseArray<int>* rootmap,
|
||||
SparseArray<int>* predmap,
|
||||
std::vector<std::vector<int>>* predvec,
|
||||
SparseSet* reachable, std::vector<int>* stk);
|
||||
|
||||
// Walks the Prog from the given "root" instruction; the reachable
|
||||
// instructions are emitted in "list" form and appended to flat.
|
||||
// reachable and stk are preallocated scratch structures.
|
||||
void EmitList(int root, SparseArray<int>* rootmap,
|
||||
std::vector<Inst>* flat,
|
||||
SparseSet* reachable, std::vector<int>* stk);
|
||||
|
||||
// Computes hints for ByteRange instructions in [begin, end).
|
||||
void ComputeHints(std::vector<Inst>* flat, int begin, int end);
|
||||
|
||||
// Controls whether the DFA should bail out early if the NFA would be faster.
|
||||
// FOR TESTING ONLY.
|
||||
static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b);
|
||||
|
||||
private:
|
||||
friend class Compiler;
|
||||
|
||||
DFA* GetDFA(MatchKind kind);
|
||||
void DeleteDFA(DFA* dfa);
|
||||
|
||||
bool anchor_start_; // regexp has explicit start anchor
|
||||
bool anchor_end_; // regexp has explicit end anchor
|
||||
bool reversed_; // whether program runs backward over input
|
||||
bool did_flatten_; // has Flatten been called?
|
||||
bool did_onepass_; // has IsOnePass been called?
|
||||
|
||||
int start_; // entry point for program
|
||||
int start_unanchored_; // unanchored entry point for program
|
||||
int size_; // number of instructions
|
||||
int bytemap_range_; // bytemap_[x] < bytemap_range_
|
||||
|
||||
bool prefix_foldcase_; // whether prefix is case-insensitive
|
||||
size_t prefix_size_; // size of prefix (0 if no prefix)
|
||||
union {
|
||||
uint64_t* prefix_dfa_; // "Shift DFA" for prefix
|
||||
struct {
|
||||
int prefix_front_; // first byte of prefix
|
||||
int prefix_back_; // last byte of prefix
|
||||
};
|
||||
};
|
||||
|
||||
int list_count_; // count of lists (see above)
|
||||
int inst_count_[kNumInst]; // count of instructions by opcode
|
||||
PODArray<uint16_t> list_heads_; // sparse array enumerating list heads
|
||||
// not populated if size_ is overly large
|
||||
size_t bit_state_text_max_size_; // upper bound (inclusive) on text.size()
|
||||
|
||||
PODArray<Inst> inst_; // pointer to instruction array
|
||||
PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes
|
||||
|
||||
int64_t dfa_mem_; // Maximum memory for DFAs.
|
||||
DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
|
||||
DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
|
||||
|
||||
uint8_t bytemap_[256]; // map from input bytes to byte classes
|
||||
|
||||
absl::once_flag dfa_first_once_;
|
||||
absl::once_flag dfa_longest_once_;
|
||||
|
||||
Prog(const Prog&) = delete;
|
||||
Prog& operator=(const Prog&) = delete;
|
||||
};
|
||||
|
||||
// std::string_view in MSVC has iterators that aren't just pointers and
|
||||
// that don't allow comparisons between different objects - not even if
|
||||
// those objects are views into the same string! Thus, we provide these
|
||||
// conversion functions for convenience.
|
||||
// Returns the raw pointer to the first character viewed by s (s.data()).
static inline const char* BeginPtr(absl::string_view s) {
  const char* const first = s.data();
  return first;
}
|
||||
// Returns the raw pointer one past the last character viewed by s,
// computed as s.data() advanced by s.size().
static inline const char* EndPtr(absl::string_view s) {
  const char* const first = s.data();
  return first + s.size();
}
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_PROG_H_
|
||||
+1350
File diff suppressed because it is too large
Load Diff
+1004
File diff suppressed because it is too large
Load Diff
+693
@@ -0,0 +1,693 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_REGEXP_H_
|
||||
#define RE2_REGEXP_H_
|
||||
|
||||
// --- SPONSORED LINK --------------------------------------------------
|
||||
// If you want to use this library for regular expression matching,
|
||||
// you should use re2/re2.h, which provides a class RE2 that
|
||||
// mimics the PCRE interface provided by PCRE's C++ wrappers.
|
||||
// This header describes the low-level interface used to implement RE2
|
||||
// and may change in backwards-incompatible ways from time to time.
|
||||
// In contrast, RE2's interface will not.
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
// Regular expression library: parsing, execution, and manipulation
|
||||
// of regular expressions.
|
||||
//
|
||||
// Any operation that traverses the Regexp structures should be written
|
||||
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
|
||||
// regular expressions such as x++++++++++++++++++++... might cause recursive
|
||||
// traversals to overflow the stack.
|
||||
//
|
||||
// It is the caller's responsibility to provide appropriate mutual exclusion
|
||||
// around manipulation of the regexps. RE2 does this.
|
||||
//
|
||||
// PARSING
|
||||
//
|
||||
// Regexp::Parse parses regular expressions encoded in UTF-8.
|
||||
// The default syntax is POSIX extended regular expressions,
|
||||
// with the following changes:
|
||||
//
|
||||
// 1. Backreferences (optional in POSIX EREs) are not supported.
|
||||
// (Supporting them precludes the use of DFA-based
|
||||
// matching engines.)
|
||||
//
|
||||
// 2. Collating elements and collation classes are not supported.
|
||||
// (No one has needed or wanted them.)
|
||||
//
|
||||
// The exact syntax accepted can be modified by passing flags to
|
||||
// Regexp::Parse. In particular, many of the basic Perl additions
|
||||
// are available. The flags are documented below (search for LikePerl).
|
||||
//
|
||||
// If parsed with the flag Regexp::Latin1, both the regular expression
|
||||
// and the input to the matching routines are assumed to be encoded in
|
||||
// Latin-1, not UTF-8.
|
||||
//
|
||||
// EXECUTION
|
||||
//
|
||||
// Once Regexp has parsed a regular expression, it provides methods
|
||||
// to search text using that regular expression. These methods are
|
||||
// implemented via calling out to other regular expression libraries.
|
||||
// (Let's call them the sublibraries.)
|
||||
//
|
||||
// To call a sublibrary, Regexp does not simply prepare a
|
||||
// string version of the regular expression and hand it to the
|
||||
// sublibrary. Instead, Regexp prepares, from its own parsed form, the
|
||||
// corresponding internal representation used by the sublibrary.
|
||||
// This has the drawback of needing to know the internal representation
|
||||
// used by the sublibrary, but it has two important benefits:
|
||||
//
|
||||
// 1. The syntax and meaning of regular expressions is guaranteed
|
||||
// to be that used by Regexp's parser, not the syntax expected
|
||||
// by the sublibrary. Regexp might accept a restricted or
|
||||
// expanded syntax for regular expressions as compared with
|
||||
// the sublibrary. As long as Regexp can translate from its
|
||||
// internal form into the sublibrary's, clients need not know
|
||||
// exactly which sublibrary they are using.
|
||||
//
|
||||
// 2. The sublibrary parsers are bypassed. For whatever reason,
|
||||
// sublibrary regular expression parsers often have security
|
||||
// problems. For example, plan9grep's regular expression parser
|
||||
// has a buffer overflow in its handling of large character
|
||||
// classes, and PCRE's parser has had buffer overflow problems
|
||||
// in the past. Security-team requires sandboxing of sublibrary
|
||||
// regular expression parsers. Avoiding the sublibrary parsers
|
||||
// avoids the sandbox.
|
||||
//
|
||||
// The execution methods we use now are provided by the compiled form,
|
||||
// Prog, described in prog.h.
|
||||
//
|
||||
// MANIPULATION
|
||||
//
|
||||
// Unlike other regular expression libraries, Regexp makes its parsed
|
||||
// form accessible to clients, so that client code can analyze the
|
||||
// parsed regular expressions.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "util/utf.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Operators that can appear in a parsed regular expression.
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
enum RegexpOp {
  kRegexpNoMatch = 1,     // Matches no strings.
  kRegexpEmptyMatch,      // Matches empty string.
  kRegexpLiteral,         // Matches rune_.
  kRegexpLiteralString,   // Matches runes_.

  kRegexpConcat,          // Matches concatenation of sub_[0..nsub-1].
  kRegexpAlternate,       // Matches union of sub_[0..nsub-1].

  kRegexpStar,            // Matches sub_[0] zero or more times.
  kRegexpPlus,            // Matches sub_[0] one or more times.
  kRegexpQuest,           // Matches sub_[0] zero or one times.

  // Matches sub_[0] at least min_ times, at most max_ times.
  // max_ == -1 means no upper limit.
  kRegexpRepeat,

  // Parenthesized (capturing) subexpression.  Index is cap_.
  // Optionally, capturing name is name_.
  kRegexpCapture,

  kRegexpAnyChar,         // Matches any character.
  kRegexpAnyByte,         // Matches any byte [sic].

  kRegexpBeginLine,       // Matches empty string at beginning of line.
  kRegexpEndLine,         // Matches empty string at end of line.

  kRegexpWordBoundary,    // Matches word boundary "\b".
  kRegexpNoWordBoundary,  // Matches not-a-word boundary "\B".

  kRegexpBeginText,       // Matches empty string at beginning of text.
  kRegexpEndText,         // Matches empty string at end of text.

  kRegexpCharClass,       // Matches character class given by cc_.

  // Forces match of entire expression right now,
  // with match ID match_id_ (used by RE2::Set).
  kRegexpHaveMatch,

  // Alias for the last (largest-valued) operator above.
  kMaxRegexpOp = kRegexpHaveMatch,
};
|
||||
|
||||
// Status codes reported through RegexpStatus.
// Keep in sync with string list in regexp.cc
enum RegexpStatusCode {
  // No error
  kRegexpSuccess = 0,

  // Unexpected error
  kRegexpInternalError,

  // Parse errors follow.

  // bad escape sequence
  kRegexpBadEscape,
  // bad character class
  kRegexpBadCharClass,
  // bad character class range
  kRegexpBadCharRange,
  // missing closing ]
  kRegexpMissingBracket,
  // missing closing )
  kRegexpMissingParen,
  // unexpected closing )
  kRegexpUnexpectedParen,
  // at end of regexp
  kRegexpTrailingBackslash,
  // repeat argument missing, e.g. "*"
  kRegexpRepeatArgument,
  // bad repetition argument
  kRegexpRepeatSize,
  // bad repetition operator
  kRegexpRepeatOp,
  // bad perl operator
  kRegexpBadPerlOp,
  // invalid UTF-8 in regexp
  kRegexpBadUTF8,
  // bad named capture
  kRegexpBadNamedCapture,
};
|
||||
|
||||
// Error status for certain operations.
|
||||
class RegexpStatus {
|
||||
public:
|
||||
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
|
||||
~RegexpStatus() { delete tmp_; }
|
||||
|
||||
void set_code(RegexpStatusCode code) { code_ = code; }
|
||||
void set_error_arg(absl::string_view error_arg) { error_arg_ = error_arg; }
|
||||
void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; }
|
||||
RegexpStatusCode code() const { return code_; }
|
||||
absl::string_view error_arg() const { return error_arg_; }
|
||||
bool ok() const { return code() == kRegexpSuccess; }
|
||||
|
||||
// Copies state from status.
|
||||
void Copy(const RegexpStatus& status);
|
||||
|
||||
// Returns text equivalent of code, e.g.:
|
||||
// "Bad character class"
|
||||
static std::string CodeText(RegexpStatusCode code);
|
||||
|
||||
// Returns text describing error, e.g.:
|
||||
// "Bad character class: [z-a]"
|
||||
std::string Text() const;
|
||||
|
||||
private:
|
||||
RegexpStatusCode code_; // Kind of error.
|
||||
absl::string_view error_arg_; // Piece of regexp containing syntax error.
|
||||
std::string* tmp_; // Temporary storage, possibly for error_arg_.
|
||||
|
||||
RegexpStatus(const RegexpStatus&) = delete;
|
||||
RegexpStatus& operator=(const RegexpStatus&) = delete;
|
||||
};
|
||||
|
||||
// Compiled form; see prog.h
|
||||
class Prog;
|
||||
|
||||
struct RuneRange {
|
||||
RuneRange() : lo(0), hi(0) { }
|
||||
RuneRange(int l, int h) : lo(l), hi(h) { }
|
||||
Rune lo;
|
||||
Rune hi;
|
||||
};
|
||||
|
||||
// Less-than on RuneRanges treats a == b if they overlap at all.
|
||||
// This lets us look in a set to find the range covering a particular Rune.
|
||||
struct RuneRangeLess {
|
||||
bool operator()(const RuneRange& a, const RuneRange& b) const {
|
||||
return a.hi < b.lo;
|
||||
}
|
||||
};
|
||||
|
||||
class CharClassBuilder;
|
||||
|
||||
class CharClass {
|
||||
public:
|
||||
void Delete();
|
||||
|
||||
typedef RuneRange* iterator;
|
||||
iterator begin() { return ranges_; }
|
||||
iterator end() { return ranges_ + nranges_; }
|
||||
|
||||
int size() { return nrunes_; }
|
||||
bool empty() { return nrunes_ == 0; }
|
||||
bool full() { return nrunes_ == Runemax+1; }
|
||||
bool FoldsASCII() { return folds_ascii_; }
|
||||
|
||||
bool Contains(Rune r) const;
|
||||
CharClass* Negate();
|
||||
|
||||
private:
|
||||
CharClass(); // not implemented
|
||||
~CharClass(); // not implemented
|
||||
static CharClass* New(size_t maxranges);
|
||||
|
||||
friend class CharClassBuilder;
|
||||
|
||||
bool folds_ascii_;
|
||||
int nrunes_;
|
||||
RuneRange *ranges_;
|
||||
int nranges_;
|
||||
|
||||
CharClass(const CharClass&) = delete;
|
||||
CharClass& operator=(const CharClass&) = delete;
|
||||
};
|
||||
|
||||
class Regexp {
|
||||
public:
|
||||
|
||||
// Flags for parsing.  Each flag is a distinct bit, so flags can be ORed
// together.
enum ParseFlags {
  NoParseFlags  = 0,
  FoldCase      = 1 << 0,   // Fold case during matching (case-insensitive).
  Literal       = 1 << 1,   // Treat s as literal string instead of a regexp.
  ClassNL       = 1 << 2,   // Allow char classes like [^a-z] and \D and \s
                            // and [[:space:]] to match newline.
  DotNL         = 1 << 3,   // Allow . to match newline.
  MatchNL       = ClassNL | DotNL,
  OneLine       = 1 << 4,   // Treat ^ and $ as only matching at beginning and
                            // end of text, not around embedded newlines.
                            // (Perl's default)
  Latin1        = 1 << 5,   // Regexp and text are in Latin1, not UTF-8.
  NonGreedy     = 1 << 6,   // Repetition operators are non-greedy by default.
  PerlClasses   = 1 << 7,   // Allow Perl character classes like \d.
  PerlB         = 1 << 8,   // Allow Perl's \b and \B.
  PerlX         = 1 << 9,   // Perl extensions:
                            //   non-capturing parens - (?: )
                            //   non-greedy operators - *? +? ?? {}?
                            //   flag edits - (?i) (?-i) (?i: )
                            //     i - FoldCase
                            //     m - !OneLine
                            //     s - DotNL
                            //     U - NonGreedy
                            //   line ends: \A \z
                            //   \Q and \E to disable/enable metacharacters
                            //   (?P<name>expr) for named captures
                            //   \C to match any single byte
  UnicodeGroups = 1 << 10,  // Allow \p{Han} for Unicode Han group
                            // and \P{Han} for its negation.
  NeverNL       = 1 << 11,  // Never match NL, even if the regexp mentions
                            // it explicitly.
  NeverCapture  = 1 << 12,  // Parse all parens as non-capturing.

  // As close to Perl as we can get.
  LikePerl      = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
                  UnicodeGroups,

  // Internal use only.
  WasDollar     = 1 << 13,  // on kRegexpEndText: was $ in regexp text
  AllParseFlags = (1 << 14) - 1,  // every bit from 1<<0 through 1<<13
};
|
||||
|
||||
// Get. No set, Regexps are logically immutable once created.
|
||||
RegexpOp op() { return static_cast<RegexpOp>(op_); }
|
||||
int nsub() { return nsub_; }
|
||||
bool simple() { return simple_ != 0; }
|
||||
ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
|
||||
int Ref(); // For testing.
|
||||
|
||||
// Returns the array of sub-expressions.  With at most one sub-expression
// the single inline slot subone_ is used; otherwise the submany_ array.
Regexp** sub() {
  return nsub_ <= 1 ? &subone_ : submany_;
}
|
||||
|
||||
int min() {
|
||||
ABSL_DCHECK_EQ(op_, kRegexpRepeat);
|
||||
return min_;
|
||||
}
|
||||
int max() {
|
||||
ABSL_DCHECK_EQ(op_, kRegexpRepeat);
|
||||
return max_;
|
||||
}
|
||||
Rune rune() {
|
||||
ABSL_DCHECK_EQ(op_, kRegexpLiteral);
|
||||
return rune_;
|
||||
}
|
||||
CharClass* cc() {
|
||||
ABSL_DCHECK_EQ(op_, kRegexpCharClass);
|
||||
return cc_;
|
||||
}
|
||||
int cap() {
|
||||
ABSL_DCHECK_EQ(op_, kRegexpCapture);
|
||||
return cap_;
|
||||
}
|
||||
const std::string* name() {
|
||||
ABSL_DCHECK_EQ(op_, kRegexpCapture);
|
||||
return name_;
|
||||
}
|
||||
Rune* runes() {
|
||||
ABSL_DCHECK_EQ(op_, kRegexpLiteralString);
|
||||
return runes_;
|
||||
}
|
||||
int nrunes() {
|
||||
ABSL_DCHECK_EQ(op_, kRegexpLiteralString);
|
||||
return nrunes_;
|
||||
}
|
||||
int match_id() {
|
||||
ABSL_DCHECK_EQ(op_, kRegexpHaveMatch);
|
||||
return match_id_;
|
||||
}
|
||||
|
||||
// Increments reference count, returns object as convenience.
|
||||
Regexp* Incref();
|
||||
|
||||
// Decrements reference count and deletes this object if count reaches 0.
|
||||
void Decref();
|
||||
|
||||
// Parses string s to produce regular expression, returned.
|
||||
// Caller must release return value with re->Decref().
|
||||
// On failure, sets *status (if status != NULL) and returns NULL.
|
||||
static Regexp* Parse(absl::string_view s, ParseFlags flags,
|
||||
RegexpStatus* status);
|
||||
|
||||
// Returns a _new_ simplified version of the current regexp.
|
||||
// Does not edit the current regexp.
|
||||
// Caller must release return value with re->Decref().
|
||||
// Simplified means that counted repetition has been rewritten
|
||||
// into simpler terms and all Perl/POSIX features have been
|
||||
// removed. The result will capture exactly the same
|
||||
// subexpressions the original did, unless formatted with ToString.
|
||||
Regexp* Simplify();
|
||||
friend class CoalesceWalker;
|
||||
friend class SimplifyWalker;
|
||||
|
||||
// Parses the regexp src and then simplifies it and sets *dst to the
|
||||
// string representation of the simplified form. Returns true on success.
|
||||
// Returns false and sets *status (if status != NULL) on parse error.
|
||||
static bool SimplifyRegexp(absl::string_view src, ParseFlags flags,
|
||||
std::string* dst, RegexpStatus* status);
|
||||
|
||||
// Returns the number of capturing groups in the regexp.
|
||||
int NumCaptures();
|
||||
friend class NumCapturesWalker;
|
||||
|
||||
// Returns a map from names to capturing group indices,
|
||||
// or NULL if the regexp contains no named capture groups.
|
||||
// The caller is responsible for deleting the map.
|
||||
std::map<std::string, int>* NamedCaptures();
|
||||
|
||||
// Returns a map from capturing group indices to capturing group
|
||||
// names or NULL if the regexp contains no named capture groups. The
|
||||
// caller is responsible for deleting the map.
|
||||
std::map<int, std::string>* CaptureNames();
|
||||
|
||||
// Returns a string representation of the current regexp,
|
||||
// using as few parentheses as possible.
|
||||
std::string ToString();
|
||||
|
||||
// Convenience functions. They consume the passed reference,
|
||||
// so in many cases you should use, e.g., Plus(re->Incref(), flags).
|
||||
// They do not consume allocated arrays like subs or runes.
|
||||
static Regexp* Plus(Regexp* sub, ParseFlags flags);
|
||||
static Regexp* Star(Regexp* sub, ParseFlags flags);
|
||||
static Regexp* Quest(Regexp* sub, ParseFlags flags);
|
||||
static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
|
||||
static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
|
||||
static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
|
||||
static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
|
||||
static Regexp* NewLiteral(Rune rune, ParseFlags flags);
|
||||
static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
|
||||
static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
|
||||
static Regexp* HaveMatch(int match_id, ParseFlags flags);
|
||||
|
||||
// Like Alternate but does not factor out common prefixes.
|
||||
static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
|
||||
|
||||
// Debugging function. Returns string format for regexp
|
||||
// that makes structure clear. Does NOT use regexp syntax.
|
||||
std::string Dump();
|
||||
|
||||
// Helper traversal class, defined fully in walker-inl.h.
|
||||
template<typename T> class Walker;
|
||||
|
||||
// Compile to Prog. See prog.h
|
||||
// Reverse prog expects to be run over text backward.
|
||||
// Construction and execution of prog will
|
||||
// stay within approximately max_mem bytes of memory.
|
||||
// If max_mem <= 0, a reasonable default is used.
|
||||
Prog* CompileToProg(int64_t max_mem);
|
||||
Prog* CompileToReverseProg(int64_t max_mem);
|
||||
|
||||
// Whether to expect this library to find exactly the same answer as PCRE
|
||||
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
|
||||
// obscure cases behave differently. Technically this is more a property
|
||||
// of the Prog than the Regexp, but the computation is much easier to do
|
||||
// on the Regexp. See mimics_pcre.cc for the exact conditions.
|
||||
bool MimicsPCRE();
|
||||
|
||||
// Benchmarking function.
|
||||
void NullWalk();
|
||||
|
||||
// Whether every match of this regexp must be anchored and
|
||||
// begin with a non-empty fixed string (perhaps after ASCII
|
||||
// case-folding). If so, returns the prefix and the sub-regexp that
|
||||
// follows it.
|
||||
// Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
|
||||
// regardless of the return value.
|
||||
bool RequiredPrefix(std::string* prefix, bool* foldcase,
|
||||
Regexp** suffix);
|
||||
|
||||
// Whether every match of this regexp must be unanchored and
|
||||
// begin with a non-empty fixed string (perhaps after ASCII
|
||||
// case-folding). If so, returns the prefix.
|
||||
// Callers should expect *prefix and *foldcase to be "zeroed"
|
||||
// regardless of the return value.
|
||||
bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase);
|
||||
|
||||
// Controls the maximum repeat count permitted by the parser.
|
||||
// FOR FUZZING ONLY.
|
||||
static void FUZZING_ONLY_set_maximum_repeat_count(int i);
|
||||
|
||||
private:
|
||||
// Constructor allocates vectors as appropriate for operator.
|
||||
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
|
||||
|
||||
// Use Decref() instead of delete to release Regexps.
|
||||
// This is private to catch deletes at compile time.
|
||||
~Regexp();
|
||||
void Destroy();
|
||||
bool QuickDestroy();
|
||||
|
||||
// Helpers for Parse. Listed here so they can edit Regexps.
|
||||
class ParseState;
|
||||
|
||||
friend class ParseState;
|
||||
friend bool ParseCharClass(absl::string_view* s, Regexp** out_re,
|
||||
RegexpStatus* status);
|
||||
|
||||
// Helper for testing [sic].
|
||||
friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
|
||||
|
||||
// Computes whether Regexp is already simple.
|
||||
bool ComputeSimple();
|
||||
|
||||
// Constructor that generates a Star, Plus or Quest,
|
||||
// squashing the pair if sub is also a Star, Plus or Quest.
|
||||
static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
|
||||
|
||||
// Constructor that generates a concatenation or alternation,
|
||||
// enforcing the limit on the number of subexpressions for
|
||||
// a particular Regexp.
|
||||
static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
|
||||
ParseFlags flags, bool can_factor);
|
||||
|
||||
// Returns the leading string that re starts with.
|
||||
// The returned Rune* points into a piece of re,
|
||||
// so it must not be used after the caller calls re->Decref().
|
||||
static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
|
||||
|
||||
// Removes the first n leading runes from the beginning of re.
|
||||
// Edits re in place.
|
||||
static void RemoveLeadingString(Regexp* re, int n);
|
||||
|
||||
// Returns the leading regexp in re's top-level concatenation.
|
||||
// The returned Regexp* points at re or a sub-expression of re,
|
||||
// so it must not be used after the caller calls re->Decref().
|
||||
static Regexp* LeadingRegexp(Regexp* re);
|
||||
|
||||
// Removes LeadingRegexp(re) from re and returns the remainder.
|
||||
// Might edit re in place.
|
||||
static Regexp* RemoveLeadingRegexp(Regexp* re);
|
||||
|
||||
// Simplifies an alternation of literal strings by factoring out
|
||||
// common prefixes.
|
||||
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
|
||||
friend class FactorAlternationImpl;
|
||||
|
||||
// Is a == b? Only efficient on regexps that have not been through
|
||||
// Simplify yet - the expansion of a kRegexpRepeat will make this
|
||||
// take a long time. Do not call on such regexps, hence private.
|
||||
static bool Equal(Regexp* a, Regexp* b);
|
||||
|
||||
// Allocate space for n sub-regexps.
|
||||
void AllocSub(int n) {
|
||||
ABSL_DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
|
||||
if (n > 1)
|
||||
submany_ = new Regexp*[n];
|
||||
nsub_ = static_cast<uint16_t>(n);
|
||||
}
|
||||
|
||||
// Add Rune to LiteralString
|
||||
void AddRuneToString(Rune r);
|
||||
|
||||
// Swaps this with that, in place.
|
||||
void Swap(Regexp *that);
|
||||
|
||||
// Operator. See description of operators above.
|
||||
// uint8_t instead of RegexpOp to control space usage.
|
||||
uint8_t op_;
|
||||
|
||||
// Is this regexp structure already simple
|
||||
// (has it been returned by Simplify)?
|
||||
// uint8_t instead of bool to control space usage.
|
||||
uint8_t simple_;
|
||||
|
||||
// Flags saved from parsing and used during execution.
|
||||
// (Only FoldCase is used.)
|
||||
// uint16_t instead of ParseFlags to control space usage.
|
||||
uint16_t parse_flags_;
|
||||
|
||||
// Reference count. Exists so that SimplifyRegexp can build
|
||||
// regexp structures that are dags rather than trees to avoid
|
||||
// exponential blowup in space requirements.
|
||||
// uint16_t to control space usage.
|
||||
// The standard regexp routines will never generate a
|
||||
// ref greater than the maximum repeat count (kMaxRepeat),
|
||||
// but even so, Incref and Decref consult an overflow map
|
||||
// when ref_ reaches kMaxRef.
|
||||
uint16_t ref_;
|
||||
static const uint16_t kMaxRef = 0xffff;
|
||||
|
||||
// Subexpressions.
|
||||
// uint16_t to control space usage.
|
||||
// Concat and Alternate handle larger numbers of subexpressions
|
||||
// by building concatenation or alternation trees.
|
||||
// Other routines should call Concat or Alternate instead of
|
||||
// filling in sub() by hand.
|
||||
uint16_t nsub_;
|
||||
static const uint16_t kMaxNsub = 0xffff;
|
||||
union {
|
||||
Regexp** submany_; // if nsub_ > 1
|
||||
Regexp* subone_; // if nsub_ == 1
|
||||
};
|
||||
|
||||
// Extra space for parse and teardown stacks.
|
||||
Regexp* down_;
|
||||
|
||||
// Arguments to operator. See description of operators above.
|
||||
union {
|
||||
struct { // Repeat
|
||||
int max_;
|
||||
int min_;
|
||||
};
|
||||
struct { // Capture
|
||||
int cap_;
|
||||
std::string* name_;
|
||||
};
|
||||
struct { // LiteralString
|
||||
int nrunes_;
|
||||
Rune* runes_;
|
||||
};
|
||||
struct { // CharClass
|
||||
// These two could be in separate union members,
|
||||
// but it wouldn't save any space (there are other two-word structs)
|
||||
// and keeping them separate avoids confusion during parsing.
|
||||
CharClass* cc_;
|
||||
CharClassBuilder* ccb_;
|
||||
};
|
||||
Rune rune_; // Literal
|
||||
int match_id_; // HaveMatch
|
||||
void *the_union_[2]; // as big as any other element, for memset
|
||||
};
|
||||
|
||||
Regexp(const Regexp&) = delete;
|
||||
Regexp& operator=(const Regexp&) = delete;
|
||||
};
|
||||
|
||||
// Character class set: contains non-overlapping, non-abutting RuneRanges.
|
||||
typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
|
||||
|
||||
class CharClassBuilder {
|
||||
public:
|
||||
CharClassBuilder();
|
||||
|
||||
typedef RuneRangeSet::iterator iterator;
|
||||
iterator begin() { return ranges_.begin(); }
|
||||
iterator end() { return ranges_.end(); }
|
||||
|
||||
int size() { return nrunes_; }
|
||||
bool empty() { return nrunes_ == 0; }
|
||||
bool full() { return nrunes_ == Runemax+1; }
|
||||
|
||||
bool Contains(Rune r);
|
||||
bool FoldsASCII();
|
||||
bool AddRange(Rune lo, Rune hi); // returns whether class changed
|
||||
CharClassBuilder* Copy();
|
||||
void AddCharClass(CharClassBuilder* cc);
|
||||
void Negate();
|
||||
void RemoveAbove(Rune r);
|
||||
CharClass* GetCharClass();
|
||||
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
|
||||
|
||||
private:
|
||||
static const uint32_t AlphaMask = (1<<26) - 1;
|
||||
uint32_t upper_; // bitmap of A-Z
|
||||
uint32_t lower_; // bitmap of a-z
|
||||
int nrunes_;
|
||||
RuneRangeSet ranges_;
|
||||
|
||||
CharClassBuilder(const CharClassBuilder&) = delete;
|
||||
CharClassBuilder& operator=(const CharClassBuilder&) = delete;
|
||||
};
|
||||
|
||||
// Bitwise ops on ParseFlags produce ParseFlags.
|
||||
// Bitwise OR of two ParseFlags values, yielding ParseFlags.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int bits = static_cast<int>(a) | static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
|
||||
|
||||
// Bitwise XOR of two ParseFlags values, yielding ParseFlags.
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int bits = static_cast<int>(a) ^ static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
|
||||
|
||||
// Bitwise AND of two ParseFlags values, yielding ParseFlags.
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int bits = static_cast<int>(a) & static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
|
||||
|
||||
// Bitwise complement of a ParseFlags value, restricted to AllParseFlags.
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
  // Mask the inverted bits: producing a value outside the enum's range
  // would be undefined behaviour.
  const int inverted = ~static_cast<int>(a);
  const int mask = static_cast<int>(Regexp::AllParseFlags);
  return static_cast<Regexp::ParseFlags>(inverted & mask);
}
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_REGEXP_H_
|
||||
+179
@@ -0,0 +1,179 @@
|
||||
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re2/set.h"
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "re2/pod_array.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/sparse_set.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Creates an empty, uncompiled set that will use a copy of the given options
// and the given anchor.
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
    : options_(options), anchor_(anchor), compiled_(false), size_(0) {
  // Forcing never_capture on the private copy might unblock some
  // optimisations.
  options_.set_never_capture(true);
}
|
||||
|
||||
// Drops the reference held on every Regexp still stored in elem_.
RE2::Set::~Set() {
  for (const Elem& entry : elem_)
    entry.second->Decref();
}
|
||||
|
||||
// Move constructor. Options and anchor are copied; the pending elements and
// the compiled program are taken from `other`, which is then reset to an
// empty, uncompiled state.
RE2::Set::Set(Set&& other)
    : options_(other.options_),
      anchor_(other.anchor_),
      elem_(std::move(other.elem_)),
      compiled_(other.compiled_),
      size_(other.size_),
      prog_(std::move(other.prog_)) {
  // Explicitly put the source into a well-defined empty state rather than
  // relying on the moved-from state of its members.
  other.elem_.clear();
  other.elem_.shrink_to_fit();
  other.compiled_ = false;
  other.size_ = 0;
  other.prog_.reset();
}
|
||||
|
||||
// Move assignment, implemented as destroy-then-move-construct in place.
// Self-assignment is a no-op: without the guard, *this would be destroyed
// and then move-constructed from its own already-destroyed state.
RE2::Set& RE2::Set::operator=(Set&& other) {
  if (this != &other) {
    this->~Set();
    (void) new (this) Set(std::move(other));
  }
  return *this;
}
|
||||
|
||||
// Parses `pattern` and queues it for compilation.
// Returns the index assigned to the pattern (the number of elements queued
// before this call), or -1 if the set was already compiled or the pattern
// failed to parse. On a parse failure, *error (if not NULL) receives the
// parser's message.
int RE2::Set::Add(absl::string_view pattern, std::string* error) {
  if (compiled_) {
    ABSL_LOG(DFATAL) << "RE2::Set::Add() called after compiling";
    return -1;
  }

  const Regexp::ParseFlags flags =
      static_cast<Regexp::ParseFlags>(options_.ParseFlags());
  RegexpStatus status;
  re2::Regexp* parsed = Regexp::Parse(pattern, flags, &status);
  if (parsed == NULL) {
    if (error != NULL)
      *error = status.Text();
    if (options_.log_errors())
      ABSL_LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
    return -1;
  }

  // Append a HaveMatch marker carrying this pattern's index.
  const int index = static_cast<int>(elem_.size());
  re2::Regexp* marker = re2::Regexp::HaveMatch(index, flags);
  re2::Regexp* tagged;
  if (parsed->op() != kRegexpConcat) {
    re2::Regexp* pair[2] = {parsed, marker};
    tagged = re2::Regexp::Concat(pair, 2, flags);
  } else {
    // Already a concatenation: build a new one from its pieces plus the
    // marker, then release the original node.
    const int nsub = parsed->nsub();
    PODArray<re2::Regexp*> pieces(nsub + 1);
    for (int i = 0; i < nsub; i++)
      pieces[i] = parsed->sub()[i]->Incref();
    pieces[nsub] = marker;
    parsed->Decref();
    tagged = re2::Regexp::Concat(pieces.data(), nsub + 1, flags);
  }

  elem_.emplace_back(std::string(pattern), tagged);
  return index;
}
|
||||
|
||||
bool RE2::Set::Compile() {
|
||||
if (compiled_) {
|
||||
ABSL_LOG(DFATAL) << "RE2::Set::Compile() called more than once";
|
||||
return false;
|
||||
}
|
||||
compiled_ = true;
|
||||
size_ = static_cast<int>(elem_.size());
|
||||
|
||||
// Sort the elements by their patterns. This is good enough for now
|
||||
// until we have a Regexp comparison function. (Maybe someday...)
|
||||
std::sort(elem_.begin(), elem_.end(),
|
||||
[](const Elem& a, const Elem& b) -> bool {
|
||||
return a.first < b.first;
|
||||
});
|
||||
|
||||
PODArray<re2::Regexp*> sub(size_);
|
||||
for (int i = 0; i < size_; i++)
|
||||
sub[i] = elem_[i].second;
|
||||
elem_.clear();
|
||||
elem_.shrink_to_fit();
|
||||
|
||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
||||
options_.ParseFlags());
|
||||
re2::Regexp* re = re2::Regexp::Alternate(sub.data(), size_, pf);
|
||||
|
||||
prog_.reset(Prog::CompileSet(re, anchor_, options_.max_mem()));
|
||||
re->Decref();
|
||||
return prog_ != nullptr;
|
||||
}
|
||||
|
||||
// Convenience overload: matches without reporting error details.
bool RE2::Set::Match(absl::string_view text, std::vector<int>* v) const {
  ErrorInfo* const no_error_info = NULL;
  return Match(text, v, no_error_info);
}
|
||||
|
||||
// Runs the compiled set against `text`.
// Returns true if at least one pattern matched; if v is not NULL it is
// cleared and, on success, filled with the indices of the matching patterns.
// If error_info is not NULL, its kind is set on every return path:
//   kNotCompiled  - Compile() was never called, or it failed to produce a
//                   program;
//   kOutOfMemory  - the DFA search reported failure;
//   kInconsistent - the search matched but returned no indices;
//   kNoError      - otherwise (including a plain non-match).
bool RE2::Set::Match(absl::string_view text, std::vector<int>* v,
                     ErrorInfo* error_info) const {
  if (!compiled_) {
    if (error_info != NULL)
      error_info->kind = kNotCompiled;
    ABSL_LOG(DFATAL) << "RE2::Set::Match() called before compiling";
    return false;
  }
  // Compile() sets compiled_ even when it fails, in which case there is no
  // program to run. Report that instead of dereferencing a null prog_.
  if (prog_ == nullptr) {
    if (error_info != NULL)
      error_info->kind = kNotCompiled;
    ABSL_LOG(DFATAL) << "RE2::Set::Match() called after Compile() failed";
    return false;
  }
#ifdef RE2_HAVE_THREAD_LOCAL
  hooks::context = NULL;
#endif
  bool dfa_failed = false;
  std::unique_ptr<SparseSet> matches;
  if (v != NULL) {
    // Only collect match indices when the caller asked for them.
    matches.reset(new SparseSet(size_));
    v->clear();
  }
  bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch,
                              NULL, &dfa_failed, matches.get());
  if (dfa_failed) {
    if (options_.log_errors())
      ABSL_LOG(ERROR) << "DFA out of memory: "
                      << "program size " << prog_->size() << ", "
                      << "list count " << prog_->list_count() << ", "
                      << "bytemap range " << prog_->bytemap_range();
    if (error_info != NULL)
      error_info->kind = kOutOfMemory;
    return false;
  }
  if (ret == false) {
    if (error_info != NULL)
      error_info->kind = kNoError;
    return false;
  }
  if (v != NULL) {
    if (matches->empty()) {
      if (error_info != NULL)
        error_info->kind = kInconsistent;
      ABSL_LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned";
      return false;
    }
    v->assign(matches->begin(), matches->end());
  }
  if (error_info != NULL)
    error_info->kind = kNoError;
  return true;
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,86 @@
|
||||
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_SET_H_
|
||||
#define RE2_SET_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
class Prog;
|
||||
class Regexp;
|
||||
} // namespace re2
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// An RE2::Set represents a collection of regexps that can
|
||||
// be searched for simultaneously.
|
||||
class RE2::Set {
|
||||
public:
|
||||
enum ErrorKind {
|
||||
kNoError = 0,
|
||||
kNotCompiled, // The set is not compiled.
|
||||
kOutOfMemory, // The DFA ran out of memory.
|
||||
kInconsistent, // The result is inconsistent. This should never happen.
|
||||
};
|
||||
|
||||
struct ErrorInfo {
|
||||
ErrorKind kind;
|
||||
};
|
||||
|
||||
Set(const RE2::Options& options, RE2::Anchor anchor);
|
||||
~Set();
|
||||
|
||||
// Not copyable.
|
||||
Set(const Set&) = delete;
|
||||
Set& operator=(const Set&) = delete;
|
||||
// Movable.
|
||||
Set(Set&& other);
|
||||
Set& operator=(Set&& other);
|
||||
|
||||
// Adds pattern to the set using the options passed to the constructor.
|
||||
// Returns the index that will identify the regexp in the output of Match(),
|
||||
// or -1 if the regexp cannot be parsed.
|
||||
// Indices are assigned in sequential order starting from 0.
|
||||
// Errors do not increment the index; if error is not NULL, *error will hold
|
||||
// the error message from the parser.
|
||||
int Add(absl::string_view pattern, std::string* error);
|
||||
|
||||
// Compiles the set in preparation for matching.
|
||||
// Returns false if the compiler runs out of memory.
|
||||
// Add() must not be called again after Compile().
|
||||
// Compile() must be called before Match().
|
||||
bool Compile();
|
||||
|
||||
// Returns true if text matches at least one of the regexps in the set.
|
||||
// Fills v (if not NULL) with the indices of the matching regexps.
|
||||
// Callers must not expect v to be sorted.
|
||||
bool Match(absl::string_view text, std::vector<int>* v) const;
|
||||
|
||||
// As above, but populates error_info (if not NULL) when none of the regexps
|
||||
// in the set matched. This can inform callers when DFA execution fails, for
|
||||
// example, because they might wish to handle that case differently.
|
||||
bool Match(absl::string_view text, std::vector<int>* v,
|
||||
ErrorInfo* error_info) const;
|
||||
|
||||
private:
|
||||
typedef std::pair<std::string, re2::Regexp*> Elem;
|
||||
|
||||
RE2::Options options_;
|
||||
RE2::Anchor anchor_;
|
||||
std::vector<Elem> elem_;
|
||||
bool compiled_;
|
||||
int size_;
|
||||
std::unique_ptr<re2::Prog> prog_;
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_SET_H_
|
||||
+689
@@ -0,0 +1,689 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Rewrite POSIX and other features in re
|
||||
// to use simple extended regular expression features.
|
||||
// Also sort and simplify character classes.
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "re2/pod_array.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/walker-inl.h"
|
||||
#include "util/utf.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Parses the regexp src and then simplifies it and sets *dst to the
|
||||
// string representation of the simplified form. Returns true on success.
|
||||
// Returns false and sets *error (if error != NULL) on error.
|
||||
bool Regexp::SimplifyRegexp(absl::string_view src, ParseFlags flags,
|
||||
std::string* dst, RegexpStatus* status) {
|
||||
Regexp* re = Parse(src, flags, status);
|
||||
if (re == NULL)
|
||||
return false;
|
||||
Regexp* sre = re->Simplify();
|
||||
re->Decref();
|
||||
if (sre == NULL) {
|
||||
if (status) {
|
||||
status->set_code(kRegexpInternalError);
|
||||
status->set_error_arg(src);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
*dst = sre->ToString();
|
||||
sre->Decref();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Assuming the simple_ flags on the children are accurate,
|
||||
// is this Regexp* simple?
|
||||
bool Regexp::ComputeSimple() {
|
||||
Regexp** subs;
|
||||
switch (op_) {
|
||||
case kRegexpNoMatch:
|
||||
case kRegexpEmptyMatch:
|
||||
case kRegexpLiteral:
|
||||
case kRegexpLiteralString:
|
||||
case kRegexpBeginLine:
|
||||
case kRegexpEndLine:
|
||||
case kRegexpBeginText:
|
||||
case kRegexpWordBoundary:
|
||||
case kRegexpNoWordBoundary:
|
||||
case kRegexpEndText:
|
||||
case kRegexpAnyChar:
|
||||
case kRegexpAnyByte:
|
||||
case kRegexpHaveMatch:
|
||||
return true;
|
||||
case kRegexpConcat:
|
||||
case kRegexpAlternate:
|
||||
// These are simple as long as the subpieces are simple.
|
||||
subs = sub();
|
||||
for (int i = 0; i < nsub_; i++)
|
||||
if (!subs[i]->simple())
|
||||
return false;
|
||||
return true;
|
||||
case kRegexpCharClass:
|
||||
// Simple as long as the char class is not empty, not full.
|
||||
if (ccb_ != NULL)
|
||||
return !ccb_->empty() && !ccb_->full();
|
||||
return !cc_->empty() && !cc_->full();
|
||||
case kRegexpCapture:
|
||||
subs = sub();
|
||||
return subs[0]->simple();
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
subs = sub();
|
||||
if (!subs[0]->simple())
|
||||
return false;
|
||||
switch (subs[0]->op_) {
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
case kRegexpEmptyMatch:
|
||||
case kRegexpNoMatch:
|
||||
return false;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
case kRegexpRepeat:
|
||||
return false;
|
||||
}
|
||||
ABSL_LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Walker subclass used by Simplify.
|
||||
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
|
||||
// occurrences of that literal into repeats of that literal. It also works for
|
||||
// char classes, any char and any byte.
|
||||
// PostVisit creates the coalesced result, which should then be simplified.
|
||||
class CoalesceWalker : public Regexp::Walker<Regexp*> {
|
||||
public:
|
||||
CoalesceWalker() {}
|
||||
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
|
||||
Regexp** child_args, int nchild_args);
|
||||
virtual Regexp* Copy(Regexp* re);
|
||||
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
||||
|
||||
private:
|
||||
// These functions are declared inside CoalesceWalker so that
|
||||
// they can edit the private fields of the Regexps they construct.
|
||||
|
||||
// Returns true if r1 and r2 can be coalesced. In particular, ensures that
|
||||
// the parse flags are consistent. (They will not be checked again later.)
|
||||
static bool CanCoalesce(Regexp* r1, Regexp* r2);
|
||||
|
||||
// Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
|
||||
// will be empty match and the coalesced op. In other cases, where part of a
|
||||
// literal string was removed to be coalesced, the array elements afterwards
|
||||
// will be the coalesced op and the remainder of the literal string.
|
||||
static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
|
||||
|
||||
CoalesceWalker(const CoalesceWalker&) = delete;
|
||||
CoalesceWalker& operator=(const CoalesceWalker&) = delete;
|
||||
};
|
||||
|
||||
// Walker subclass used by Simplify.
|
||||
// The simplify walk is purely post-recursive: given the simplified children,
|
||||
// PostVisit creates the simplified result.
|
||||
// The child_args are simplified Regexp*s.
|
||||
class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
||||
public:
|
||||
SimplifyWalker() {}
|
||||
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
|
||||
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
|
||||
Regexp** child_args, int nchild_args);
|
||||
virtual Regexp* Copy(Regexp* re);
|
||||
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
||||
|
||||
private:
|
||||
// These functions are declared inside SimplifyWalker so that
|
||||
// they can edit the private fields of the Regexps they construct.
|
||||
|
||||
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
||||
// Caller must Decref return value when done with it.
|
||||
static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
|
||||
|
||||
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
||||
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
||||
// Caller must Decref return value when done with it.
|
||||
static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
|
||||
Regexp::ParseFlags parse_flags);
|
||||
|
||||
// Simplifies a character class by expanding any named classes
|
||||
// into rune ranges. Does not edit re. Does not consume ref to re.
|
||||
// Caller must Decref return value when done with it.
|
||||
static Regexp* SimplifyCharClass(Regexp* re);
|
||||
|
||||
SimplifyWalker(const SimplifyWalker&) = delete;
|
||||
SimplifyWalker& operator=(const SimplifyWalker&) = delete;
|
||||
};
|
||||
|
||||
// Simplifies a regular expression, returning a new regexp.
|
||||
// The new regexp uses traditional Unix egrep features only,
|
||||
// plus the Perl (?:) non-capturing parentheses.
|
||||
// Otherwise, no POSIX or Perl additions. The new regexp
|
||||
// captures exactly the same subexpressions (with the same indices)
|
||||
// as the original.
|
||||
// Does not edit current object.
|
||||
// Caller must Decref() return value when done with it.
|
||||
|
||||
Regexp* Regexp::Simplify() {
|
||||
CoalesceWalker cw;
|
||||
Regexp* cre = cw.Walk(this, NULL);
|
||||
if (cre == NULL)
|
||||
return NULL;
|
||||
if (cw.stopped_early()) {
|
||||
cre->Decref();
|
||||
return NULL;
|
||||
}
|
||||
SimplifyWalker sw;
|
||||
Regexp* sre = sw.Walk(cre, NULL);
|
||||
cre->Decref();
|
||||
if (sre == NULL)
|
||||
return NULL;
|
||||
if (sw.stopped_early()) {
|
||||
sre->Decref();
|
||||
return NULL;
|
||||
}
|
||||
return sre;
|
||||
}
|
||||
|
||||
#define Simplify DontCallSimplify // Avoid accidental recursion
|
||||
|
||||
// Utility function for PostVisit implementations that compares re->sub() with
|
||||
// child_args to determine whether any child_args changed. In the common case,
|
||||
// where nothing changed, calls Decref() for all child_args and returns false,
|
||||
// so PostVisit must return re->Incref(). Otherwise, returns true.
|
||||
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
|
||||
for (int i = 0; i < re->nsub(); i++) {
|
||||
Regexp* sub = re->sub()[i];
|
||||
Regexp* newsub = child_args[i];
|
||||
if (newsub != sub)
|
||||
return true;
|
||||
}
|
||||
for (int i = 0; i < re->nsub(); i++) {
|
||||
Regexp* newsub = child_args[i];
|
||||
newsub->Decref();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// A "copy" is simply another reference to the same Regexp.
Regexp* CoalesceWalker::Copy(Regexp* re) {
  Regexp* shared = re->Incref();
  return shared;
}
|
||||
|
||||
// Not expected to run, since Simplify uses Walk() rather than
// WalkExponential(). Logs (outside fuzzing builds) and hands back a new
// reference to re.
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
  ABSL_LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
#endif
  Regexp* shared = re->Incref();
  return shared;
}
|
||||
|
||||
// Builds the coalesced form of re from its already-visited children.
// Only concatenations are coalesced; every other op is returned as-is
// (with a new reference) when no child changed, or rebuilt around the new
// children when one did.
Regexp* CoalesceWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  const int nsub = re->nsub();
  if (nsub == 0)
    return re->Incref();

  // Only a concatenation with at least one coalescible adjacent pair needs
  // the coalescing path below.
  bool coalescible = false;
  if (re->op() == kRegexpConcat) {
    for (int i = 0; i + 1 < nsub && !coalescible; i++)
      coalescible = CanCoalesce(child_args[i], child_args[i + 1]);
  }

  if (!coalescible) {
    if (!ChildArgsChanged(re, child_args))
      return re->Incref();

    // Something changed. Build a new op around the new children.
    Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
    rebuilt->AllocSub(nsub);
    Regexp** out = rebuilt->sub();
    for (int i = 0; i < nsub; i++)
      out[i] = child_args[i];
    // Repeats and Captures have additional data that must be copied.
    if (re->op() == kRegexpRepeat) {
      rebuilt->min_ = re->min();
      rebuilt->max_ = re->max();
    } else if (re->op() == kRegexpCapture) {
      rebuilt->cap_ = re->cap();
    }
    return rebuilt;
  }

  // Coalesce left to right; each step may rewrite both slots of the pair,
  // so the next pair is tested against the updated array.
  for (int i = 0; i + 1 < nsub; i++) {
    if (CanCoalesce(child_args[i], child_args[i + 1]))
      DoCoalesce(&child_args[i], &child_args[i + 1]);
  }

  // Count the empty matches now present among the children; they are
  // dropped from the new concatenation.
  int empties = 0;
  for (int i = 0; i < nsub; i++) {
    if (child_args[i]->op() == kRegexpEmptyMatch)
      empties++;
  }

  // Build the new op from the surviving children.
  Regexp* merged = new Regexp(re->op(), re->parse_flags());
  merged->AllocSub(nsub - empties);
  Regexp** out = merged->sub();
  int kept = 0;
  for (int i = 0; i < nsub; i++) {
    Regexp* child = child_args[i];
    if (child->op() == kRegexpEmptyMatch)
      child->Decref();
    else
      out[kept++] = child;
  }
  return merged;
}
|
||||
|
||||
// Returns true if r1 followed by r2 can be merged into a single repeat.
// r1 must be a star/plus/quest/repeat of a literal, char class, any char or
// any byte. r2 must then be one of:
//   - a star/plus/quest/repeat of an equal operand with the same NonGreedy
//     setting as r1;
//   - a regexp equal to r1's operand;
//   - a literal string beginning with r1's literal, with the same FoldCase
//     setting as that literal.
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
  const auto is_repetition = [](Regexp* r) {
    return r->op() == kRegexpStar ||
           r->op() == kRegexpPlus ||
           r->op() == kRegexpQuest ||
           r->op() == kRegexpRepeat;
  };

  if (!is_repetition(r1))
    return false;

  Regexp* operand = r1->sub()[0];
  switch (operand->op()) {
    case kRegexpLiteral:
    case kRegexpCharClass:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
      break;
    default:
      return false;
  }

  // Another repetition of the same operand; the parse flags must be
  // consistent.
  if (is_repetition(r2) &&
      Regexp::Equal(operand, r2->sub()[0]) &&
      ((r1->parse_flags() & Regexp::NonGreedy) ==
       (r2->parse_flags() & Regexp::NonGreedy))) {
    return true;
  }

  // A plain occurrence of the operand itself.
  if (Regexp::Equal(operand, r2))
    return true;

  // A literal string that begins with the literal; the parse flags must be
  // consistent.
  return operand->op() == kRegexpLiteral &&
         r2->op() == kRegexpLiteralString &&
         r2->runes()[0] == operand->rune() &&
         ((operand->parse_flags() & Regexp::FoldCase) ==
          (r2->parse_flags() & Regexp::FoldCase));
}
|
||||
|
||||
// Merges *r1ptr and *r2ptr into one Repeat of r1's operand.
// Usually leaves an empty match in *r1ptr and the merged repeat in *r2ptr.
// When r2 is a literal string of which only a leading run was absorbed,
// leaves the merged repeat in *r1ptr and the rest of the string in *r2ptr.
// On success the references to the original r1 and r2 are released. If
// either op is unexpected, logs and leaves both slots untouched.
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
  Regexp* lhs = *r1ptr;
  Regexp* rhs = *r2ptr;

  Regexp* merged = Regexp::Repeat(
      lhs->sub()[0]->Incref(), lhs->parse_flags(), 0, 0);

  // Seed the bounds from r1. A max of -1 means unbounded.
  switch (lhs->op()) {
    case kRegexpStar:
      merged->min_ = 0;
      merged->max_ = -1;
      break;

    case kRegexpPlus:
      merged->min_ = 1;
      merged->max_ = -1;
      break;

    case kRegexpQuest:
      merged->min_ = 0;
      merged->max_ = 1;
      break;

    case kRegexpRepeat:
      merged->min_ = lhs->min();
      merged->max_ = lhs->max();
      break;

    default:
      merged->Decref();
      ABSL_LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << lhs->op();
      return;
  }

  // Fold r2 into the bounds. `absorbed` stays true unless r2 is a literal
  // string with runes left over; `taken` is the number of runes removed
  // from such a string.
  bool absorbed = true;
  int taken = 0;
  switch (rhs->op()) {
    case kRegexpStar:
      merged->max_ = -1;
      break;

    case kRegexpPlus:
      merged->min_++;
      merged->max_ = -1;
      break;

    case kRegexpQuest:
      if (merged->max() != -1)
        merged->max_++;
      break;

    case kRegexpRepeat:
      merged->min_ += rhs->min();
      if (rhs->max() == -1)
        merged->max_ = -1;
      else if (merged->max() != -1)
        merged->max_ += rhs->max();
      break;

    case kRegexpLiteral:
    case kRegexpCharClass:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
      merged->min_++;
      if (merged->max() != -1)
        merged->max_++;
      break;

    case kRegexpLiteralString: {
      const Rune r = lhs->sub()[0]->rune();
      // Determine how much of the literal string is removed.
      // At least one rune is known to match.
      taken = 1;
      while (taken < rhs->nrunes() && rhs->runes()[taken] == r)
        taken++;
      merged->min_ += taken;
      if (merged->max() != -1)
        merged->max_ += taken;
      absorbed = (taken == rhs->nrunes());
      break;
    }

    default:
      merged->Decref();
      ABSL_LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << rhs->op();
      return;
  }

  if (absorbed) {
    *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
    *r2ptr = merged;
  } else {
    // Keep the unconsumed tail of the literal string after the repeat.
    *r1ptr = merged;
    *r2ptr = Regexp::LiteralString(
        &rhs->runes()[taken], rhs->nrunes() - taken, rhs->parse_flags());
  }

  lhs->Decref();
  rhs->Decref();
}
|
||||
|
||||
// A "copy" is simply another reference to the same Regexp.
Regexp* SimplifyWalker::Copy(Regexp* re) {
  Regexp* shared = re->Incref();
  return shared;
}
|
||||
|
||||
// Not expected to run, since Simplify uses Walk() rather than
// WalkExponential(). Logs (outside fuzzing builds) and hands back a new
// reference to re.
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
  ABSL_LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
#endif
  Regexp* shared = re->Incref();
  return shared;
}
|
||||
|
||||
// If re is already marked simple, sets *stop and returns a new reference to
// re. Otherwise returns NULL and leaves *stop untouched.
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
  if (!re->simple())
    return NULL;
  *stop = true;
  return re->Incref();
}
|
||||
|
||||
// Builds the simplified form of re from its already-simplified children in
// child_args. Where re can be reused unchanged it is marked simple and
// returned with a new reference; otherwise a new node is built that takes
// over the references held in child_args.
Regexp* SimplifyWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  switch (re->op()) {
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpLiteral:
    case kRegexpLiteralString:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpBeginText:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpEndText:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpHaveMatch:
      // All these are always simple.
      re->simple_ = true;
      return re->Incref();

    case kRegexpConcat:
    case kRegexpAlternate: {
      // These are simple as long as the subpieces are simple.
      if (ChildArgsChanged(re, child_args)) {
        const int nsub = re->nsub();
        Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
        rebuilt->AllocSub(nsub);
        Regexp** out = rebuilt->sub();
        for (int i = 0; i < nsub; i++)
          out[i] = child_args[i];
        rebuilt->simple_ = true;
        return rebuilt;
      }
      re->simple_ = true;
      return re->Incref();
    }

    case kRegexpCapture: {
      Regexp* child = child_args[0];
      if (child != re->sub()[0]) {
        Regexp* rebuilt = new Regexp(kRegexpCapture, re->parse_flags());
        rebuilt->AllocSub(1);
        rebuilt->sub()[0] = child;
        rebuilt->cap_ = re->cap();
        rebuilt->simple_ = true;
        return rebuilt;
      }
      child->Decref();
      re->simple_ = true;
      return re->Incref();
    }

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest: {
      Regexp* child = child_args[0];
      // Special case: repeating the empty string any number of times still
      // gives the empty string.
      if (child->op() == kRegexpEmptyMatch)
        return child;

      // These are simple as long as the subpiece is simple.
      if (child == re->sub()[0]) {
        child->Decref();
        re->simple_ = true;
        return re->Incref();
      }

      // These are also idempotent if flags are constant.
      const bool same_op_and_flags =
          re->op() == child->op() &&
          re->parse_flags() == child->parse_flags();
      if (same_op_and_flags)
        return child;

      Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
      rebuilt->AllocSub(1);
      rebuilt->sub()[0] = child;
      rebuilt->simple_ = true;
      return rebuilt;
    }

    case kRegexpRepeat: {
      Regexp* child = child_args[0];
      // Special case: repeating the empty string any number of times still
      // gives the empty string.
      if (child->op() == kRegexpEmptyMatch)
        return child;

      // SimplifyRepeat does not consume the child reference, so release it
      // here once the expansion has been built.
      Regexp* expanded = SimplifyRepeat(child, re->min_, re->max_,
                                        re->parse_flags());
      child->Decref();
      expanded->simple_ = true;
      return expanded;
    }

    case kRegexpCharClass: {
      Regexp* simplified = SimplifyCharClass(re);
      simplified->simple_ = true;
      return simplified;
    }
  }

  ABSL_LOG(ERROR) << "Simplify case not handled: " << re->op();
  return re->Incref();
}
|
||||
|
||||
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
||||
// Returns a new Regexp, handing the ref to the caller.
|
||||
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
|
||||
Regexp::ParseFlags parse_flags) {
|
||||
Regexp* re = new Regexp(kRegexpConcat, parse_flags);
|
||||
re->AllocSub(2);
|
||||
Regexp** subs = re->sub();
|
||||
subs[0] = re1;
|
||||
subs[1] = re2;
|
||||
return re;
|
||||
}
|
||||
|
||||
// Returns true if re is an empty-width op.
|
||||
static bool IsEmptyOp(Regexp* re) {
|
||||
return (re->op() == kRegexpBeginLine ||
|
||||
re->op() == kRegexpEndLine ||
|
||||
re->op() == kRegexpWordBoundary ||
|
||||
re->op() == kRegexpNoWordBoundary ||
|
||||
re->op() == kRegexpBeginText ||
|
||||
re->op() == kRegexpEndText);
|
||||
}
|
||||
|
||||
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
||||
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
||||
// Caller must Decref return value when done with it.
|
||||
// The result will *not* necessarily have the right capturing parens
|
||||
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
|
||||
// but in the Regexp* representation, both (x) are marked as $1.
|
||||
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
                                       Regexp::ParseFlags f) {
  // Expands re{min,max} into an equivalent expression built only from
  // concatenation, ?, * and +.  max == -1 means "no upper bound".
  // Every use of re in the result takes its own reference via Incref().

  // An empty-width op, or a concatenation/alternation made up solely of
  // empty-width ops, gains nothing from repetition: clamp both counts to 1.
  bool only_empty_width = IsEmptyOp(re);
  if (!only_empty_width &&
      (re->op() == kRegexpConcat || re->op() == kRegexpAlternate)) {
    only_empty_width =
        std::all_of(re->sub(), re->sub() + re->nsub(), IsEmptyOp);
  }
  if (only_empty_width) {
    if (min > 1)
      min = 1;
    if (max > 1)
      max = 1;
  }

  // Unbounded repetition: x{n,}.
  if (max == -1) {
    switch (min) {
      case 0:  // x{0,} is x*
        return Regexp::Star(re->Incref(), f);
      case 1:  // x{1,} is x+
        return Regexp::Plus(re->Incref(), f);
      default:
        break;
    }

    // x{4,} is xxxx+ : min-1 plain copies followed by one x+.
    PODArray<Regexp*> pieces(min);
    const int last = min - 1;
    for (int k = 0; k < last; ++k)
      pieces[k] = re->Incref();
    pieces[last] = Regexp::Plus(re->Incref(), f);
    return Regexp::Concat(pieces.data(), min, f);
  }

  // (x){0} matches only the empty string.
  if (min == 0 && max == 0)
    return new Regexp(kRegexpEmptyMatch, f);

  // x{1} is just x.
  if (min == 1 && max == 1)
    return re->Incref();

  // General bounded case: x{n,m} is n copies of x followed by m-n nested
  // optional copies, e.g. x{2,5} = xx(x(x(x)?)?)?.  Nesting the optional
  // part lets the machine do less work than a flat x?x?x? would.

  // Mandatory prefix: min copies of x.
  Regexp* result = nullptr;
  if (min > 0) {
    PODArray<Regexp*> copies(min);
    for (int k = 0; k < min; ++k)
      copies[k] = re->Incref();
    result = Regexp::Concat(copies.data(), min, f);
  }

  // Optional suffix, built from the innermost (x)? outwards.
  if (max > min) {
    Regexp* tail = Regexp::Quest(re->Incref(), f);
    for (int k = min + 1; k < max; ++k)
      tail = Regexp::Quest(Concat2(re->Incref(), tail, f), f);
    result = (result == nullptr) ? tail : Concat2(result, tail, f);
  }

  if (result != nullptr)
    return result;

  // Nothing was built: some degenerate case, like min > max, or
  // min < max < 0.  This shouldn't happen, because the parser rejects
  // such regexps.
  ABSL_LOG(DFATAL) << "Malformed repeat of " << re->ToString()
                   << " min " << min << " max " << max;
  return new Regexp(kRegexpNoMatch, f);
}
|
||||
|
||||
// Simplifies a character class.
|
||||
// Caller must Decref return value when done with it.
|
||||
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
|
||||
CharClass* cc = re->cc();
|
||||
|
||||
// Special cases
|
||||
if (cc->empty())
|
||||
return new Regexp(kRegexpNoMatch, re->parse_flags());
|
||||
if (cc->full())
|
||||
return new Regexp(kRegexpAnyChar, re->parse_flags());
|
||||
|
||||
return re->Incref();
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,394 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_SPARSE_ARRAY_H_
|
||||
#define RE2_SPARSE_ARRAY_H_
|
||||
|
||||
// DESCRIPTION
|
||||
//
|
||||
// SparseArray<T>(m) is a map from integers in [0, m) to T values.
|
||||
// It requires (sizeof(T)+sizeof(int))*m memory, but it provides
|
||||
// fast iteration through the elements in the array and fast clearing
|
||||
// of the array. The array has a concept of certain elements being
|
||||
// uninitialized (having no value).
|
||||
//
|
||||
// Insertion and deletion are constant time operations.
|
||||
//
|
||||
// Allocating the array is a constant time operation
|
||||
// when memory allocation is a constant time operation.
|
||||
//
|
||||
// Clearing the array is a constant time operation (unusual!).
|
||||
//
|
||||
// Iterating through the array is an O(n) operation, where n
|
||||
// is the number of items in the array (not O(m)).
|
||||
//
|
||||
// The array iterator visits entries in the order they were first
|
||||
// inserted into the array. It is safe to add items to the array while
|
||||
// using an iterator: the iterator will visit indices added to the array
|
||||
// during the iteration, but will not re-visit indices whose values
|
||||
// change after visiting. Thus SparseArray can be a convenient
|
||||
// implementation of a work queue.
|
||||
//
|
||||
// The SparseArray implementation is NOT thread-safe. It is up to the
|
||||
// caller to make sure only one thread is accessing the array. (Typically
|
||||
// these arrays are temporary values and used in situations where speed is
|
||||
// important.)
|
||||
//
|
||||
// The SparseArray interface does not present all the usual STL bells and
|
||||
// whistles.
|
||||
//
|
||||
// Implemented with reference to Briggs & Torczon, An Efficient
|
||||
// Representation for Sparse Sets, ACM Letters on Programming Languages
|
||||
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
|
||||
//
|
||||
// Briggs & Torczon popularized this technique, but it had been known
|
||||
// long before their paper. They point out that Aho, Hopcroft, and
|
||||
// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
|
||||
// 1986 Programming Pearls both hint at the technique in exercises to the
|
||||
// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
|
||||
// exercise 8).
|
||||
//
|
||||
// Briggs & Torczon describe a sparse set implementation. I have
|
||||
// trivially generalized it to create a sparse array (actually the original
|
||||
// target of the AHU and Bentley exercises).
|
||||
|
||||
// IMPLEMENTATION
|
||||
//
|
||||
// SparseArray is an array dense_ and an array sparse_ of identical size.
|
||||
// At any point, the number of elements in the sparse array is size_.
|
||||
//
|
||||
// The array dense_ contains the size_ elements in the sparse array (with
|
||||
// their indices),
|
||||
// in the order that the elements were first inserted. This array is dense:
|
||||
// the size_ pairs are dense_[0] through dense_[size_-1].
|
||||
//
|
||||
// The array sparse_ maps from indices in [0,m) to indices in [0,size_).
|
||||
// For indices present in the array, dense_[sparse_[i]].index_ == i.
|
||||
// For indices not present in the array, sparse_ can contain any value at all,
|
||||
// perhaps outside the range [0, size_) but perhaps not.
|
||||
//
|
||||
// The lax requirement on sparse_ values makes clearing the array very easy:
|
||||
// set size_ to 0. Lookups are slightly more complicated.
|
||||
// An index i has a value in the array if and only if:
|
||||
// sparse_[i] is in [0, size_) AND
|
||||
// dense_[sparse_[i]].index_ == i.
|
||||
// If both these properties hold, only then it is safe to refer to
|
||||
// dense_[sparse_[i]].value_
|
||||
// as the value associated with index i.
|
||||
//
|
||||
// To insert a new entry, set sparse_[i] to size_,
|
||||
// initialize dense_[size_], and then increment size_.
|
||||
//
|
||||
// To make the sparse array as efficient as possible for non-primitive types,
|
||||
// elements may or may not be destroyed when they are deleted from the sparse
|
||||
// array through a call to resize(). They immediately become inaccessible, but
|
||||
// they are only guaranteed to be destroyed when the SparseArray destructor is
|
||||
// called.
|
||||
//
|
||||
// A moved-from SparseArray will be empty.
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "re2/pod_array.h"
|
||||
|
||||
// Doing this simplifies the logic below.
|
||||
#ifndef __has_feature
|
||||
#define __has_feature(x) 0
|
||||
#endif
|
||||
|
||||
#if __has_feature(memory_sanitizer)
|
||||
#include <sanitizer/msan_interface.h>
|
||||
#endif
|
||||
|
||||
namespace re2 {
|
||||
|
||||
template<typename Value>
|
||||
class SparseArray {
|
||||
public:
|
||||
SparseArray();
|
||||
explicit SparseArray(int max_size);
|
||||
~SparseArray();
|
||||
|
||||
// IndexValue pairs: exposed in SparseArray::iterator.
|
||||
class IndexValue;
|
||||
|
||||
typedef IndexValue* iterator;
|
||||
typedef const IndexValue* const_iterator;
|
||||
|
||||
SparseArray(const SparseArray& src);
|
||||
SparseArray(SparseArray&& src);
|
||||
|
||||
SparseArray& operator=(const SparseArray& src);
|
||||
SparseArray& operator=(SparseArray&& src);
|
||||
|
||||
// Return the number of entries in the array.
|
||||
int size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
// Indicate whether the array is empty.
|
||||
int empty() const {
|
||||
return size_ == 0;
|
||||
}
|
||||
|
||||
// Iterate over the array.
|
||||
iterator begin() {
|
||||
return dense_.data();
|
||||
}
|
||||
iterator end() {
|
||||
return dense_.data() + size_;
|
||||
}
|
||||
|
||||
const_iterator begin() const {
|
||||
return dense_.data();
|
||||
}
|
||||
const_iterator end() const {
|
||||
return dense_.data() + size_;
|
||||
}
|
||||
|
||||
// Change the maximum size of the array.
|
||||
// Invalidates all iterators.
|
||||
void resize(int new_max_size);
|
||||
|
||||
// Return the maximum size of the array.
|
||||
// Indices can be in the range [0, max_size).
|
||||
int max_size() const {
|
||||
if (dense_.data() != NULL)
|
||||
return dense_.size();
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Clear the array.
|
||||
void clear() {
|
||||
size_ = 0;
|
||||
}
|
||||
|
||||
// Check whether index i is in the array.
|
||||
bool has_index(int i) const;
|
||||
|
||||
// Comparison function for sorting.
|
||||
// Can sort the sparse array so that future iterations
|
||||
// will visit indices in increasing order using
|
||||
// std::sort(arr.begin(), arr.end(), arr.less);
|
||||
static bool less(const IndexValue& a, const IndexValue& b);
|
||||
|
||||
public:
|
||||
// Set the value at index i to v.
|
||||
iterator set(int i, const Value& v) {
|
||||
return SetInternal(true, i, v);
|
||||
}
|
||||
|
||||
// Set the value at new index i to v.
|
||||
// Fast but unsafe: only use if has_index(i) is false.
|
||||
iterator set_new(int i, const Value& v) {
|
||||
return SetInternal(false, i, v);
|
||||
}
|
||||
|
||||
// Set the value at index i to v.
|
||||
// Fast but unsafe: only use if has_index(i) is true.
|
||||
iterator set_existing(int i, const Value& v) {
|
||||
return SetExistingInternal(i, v);
|
||||
}
|
||||
|
||||
// Get the value at index i.
|
||||
// Fast but unsafe: only use if has_index(i) is true.
|
||||
Value& get_existing(int i) {
|
||||
assert(has_index(i));
|
||||
return dense_[sparse_[i]].value_;
|
||||
}
|
||||
const Value& get_existing(int i) const {
|
||||
assert(has_index(i));
|
||||
return dense_[sparse_[i]].value_;
|
||||
}
|
||||
|
||||
private:
|
||||
iterator SetInternal(bool allow_existing, int i, const Value& v) {
|
||||
DebugCheckInvariants();
|
||||
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) {
|
||||
assert(false && "illegal index");
|
||||
// Semantically, end() would be better here, but we already know
|
||||
// the user did something stupid, so begin() insulates them from
|
||||
// dereferencing an invalid pointer.
|
||||
return begin();
|
||||
}
|
||||
if (!allow_existing) {
|
||||
assert(!has_index(i));
|
||||
create_index(i);
|
||||
} else {
|
||||
if (!has_index(i))
|
||||
create_index(i);
|
||||
}
|
||||
return SetExistingInternal(i, v);
|
||||
}
|
||||
|
||||
iterator SetExistingInternal(int i, const Value& v) {
|
||||
DebugCheckInvariants();
|
||||
assert(has_index(i));
|
||||
dense_[sparse_[i]].value_ = v;
|
||||
DebugCheckInvariants();
|
||||
return dense_.data() + sparse_[i];
|
||||
}
|
||||
|
||||
// Add the index i to the array.
|
||||
// Only use if has_index(i) is known to be false.
|
||||
// Since it doesn't set the value associated with i,
|
||||
// this function is private, only intended as a helper
|
||||
// for other methods.
|
||||
void create_index(int i);
|
||||
|
||||
// In debug mode, verify that some invariant properties of the class
|
||||
// are being maintained. This is called at the end of the constructor
|
||||
// and at the beginning and end of all public non-const member functions.
|
||||
void DebugCheckInvariants() const;
|
||||
|
||||
// Initializes memory for elements [min, max).
|
||||
void MaybeInitializeMemory(int min, int max) {
|
||||
#if __has_feature(memory_sanitizer)
|
||||
__msan_unpoison(sparse_.data() + min, (max - min) * sizeof sparse_[0]);
|
||||
#elif defined(RE2_ON_VALGRIND)
|
||||
for (int i = min; i < max; i++) {
|
||||
sparse_[i] = 0xababababU;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
int size_ = 0;
|
||||
PODArray<int> sparse_;
|
||||
PODArray<IndexValue> dense_;
|
||||
};
|
||||
|
||||
template<typename Value>
|
||||
SparseArray<Value>::SparseArray() = default;
|
||||
|
||||
template<typename Value>
|
||||
SparseArray<Value>::SparseArray(const SparseArray& src)
|
||||
: size_(src.size_),
|
||||
sparse_(src.max_size()),
|
||||
dense_(src.max_size()) {
|
||||
std::copy_n(src.sparse_.data(), src.max_size(), sparse_.data());
|
||||
std::copy_n(src.dense_.data(), src.max_size(), dense_.data());
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
SparseArray<Value>::SparseArray(SparseArray&& src)
|
||||
: size_(src.size_),
|
||||
sparse_(std::move(src.sparse_)),
|
||||
dense_(std::move(src.dense_)) {
|
||||
src.size_ = 0;
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
SparseArray<Value>& SparseArray<Value>::operator=(const SparseArray& src) {
|
||||
// Construct these first for exception safety.
|
||||
PODArray<int> a(src.max_size());
|
||||
PODArray<IndexValue> b(src.max_size());
|
||||
|
||||
size_ = src.size_;
|
||||
sparse_ = std::move(a);
|
||||
dense_ = std::move(b);
|
||||
std::copy_n(src.sparse_.data(), src.max_size(), sparse_.data());
|
||||
std::copy_n(src.dense_.data(), src.max_size(), dense_.data());
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
SparseArray<Value>& SparseArray<Value>::operator=(SparseArray&& src) {
|
||||
size_ = src.size_;
|
||||
sparse_ = std::move(src.sparse_);
|
||||
dense_ = std::move(src.dense_);
|
||||
src.size_ = 0;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// IndexValue pairs: exposed in SparseArray::iterator.
|
||||
template<typename Value>
|
||||
class SparseArray<Value>::IndexValue {
|
||||
public:
|
||||
int index() const { return index_; }
|
||||
Value& value() { return value_; }
|
||||
const Value& value() const { return value_; }
|
||||
|
||||
private:
|
||||
friend class SparseArray;
|
||||
int index_;
|
||||
Value value_;
|
||||
};
|
||||
|
||||
// Change the maximum size of the array.
|
||||
// Invalidates all iterators.
|
||||
template<typename Value>
|
||||
void SparseArray<Value>::resize(int new_max_size) {
|
||||
DebugCheckInvariants();
|
||||
if (new_max_size > max_size()) {
|
||||
const int old_max_size = max_size();
|
||||
|
||||
// Construct these first for exception safety.
|
||||
PODArray<int> a(new_max_size);
|
||||
PODArray<IndexValue> b(new_max_size);
|
||||
|
||||
std::copy_n(sparse_.data(), old_max_size, a.data());
|
||||
std::copy_n(dense_.data(), old_max_size, b.data());
|
||||
|
||||
sparse_ = std::move(a);
|
||||
dense_ = std::move(b);
|
||||
|
||||
MaybeInitializeMemory(old_max_size, new_max_size);
|
||||
}
|
||||
if (size_ > new_max_size)
|
||||
size_ = new_max_size;
|
||||
DebugCheckInvariants();
|
||||
}
|
||||
|
||||
// Check whether index i is in the array.
|
||||
template<typename Value>
|
||||
bool SparseArray<Value>::has_index(int i) const {
|
||||
assert(i >= 0);
|
||||
assert(i < max_size());
|
||||
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) {
|
||||
return false;
|
||||
}
|
||||
// Unsigned comparison avoids checking sparse_[i] < 0.
|
||||
return (uint32_t)sparse_[i] < (uint32_t)size_ &&
|
||||
dense_[sparse_[i]].index_ == i;
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
void SparseArray<Value>::create_index(int i) {
|
||||
assert(!has_index(i));
|
||||
assert(size_ < max_size());
|
||||
sparse_[i] = size_;
|
||||
dense_[size_].index_ = i;
|
||||
size_++;
|
||||
}
|
||||
|
||||
// Constructs an empty array able to hold indices in [0, max_size).
template<typename Value>
SparseArray<Value>::SparseArray(int max_size)
    : sparse_(max_size),
      dense_(max_size) {
  MaybeInitializeMemory(size_, max_size);
  DebugCheckInvariants();
}
|
||||
|
||||
// The destructor only re-checks the invariants; the members release their
// own storage.
template<typename Value>
SparseArray<Value>::~SparseArray() {
  DebugCheckInvariants();
}
|
||||
|
||||
// Asserts that 0 <= size_ <= max_size().
template<typename Value>
void SparseArray<Value>::DebugCheckInvariants() const {
  assert(size_ >= 0);
  assert(max_size() >= size_);
}
|
||||
|
||||
// Comparison function for sorting.
|
||||
template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
|
||||
const IndexValue& b) {
|
||||
return a.index_ < b.index_;
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_SPARSE_ARRAY_H_
|
||||
@@ -0,0 +1,266 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_SPARSE_SET_H_
|
||||
#define RE2_SPARSE_SET_H_
|
||||
|
||||
// DESCRIPTION
|
||||
//
|
||||
// SparseSet(m) is a set of integers in [0, m).
|
||||
// It requires sizeof(int)*m memory, but it provides
|
||||
// fast iteration through the elements in the set and fast clearing
|
||||
// of the set.
|
||||
//
|
||||
// Insertion and deletion are constant time operations.
|
||||
//
|
||||
// Allocating the set is a constant time operation
|
||||
// when memory allocation is a constant time operation.
|
||||
//
|
||||
// Clearing the set is a constant time operation (unusual!).
|
||||
//
|
||||
// Iterating through the set is an O(n) operation, where n
|
||||
// is the number of items in the set (not O(m)).
|
||||
//
|
||||
// The set iterator visits entries in the order they were first
|
||||
// inserted into the set. It is safe to add items to the set while
|
||||
// using an iterator: the iterator will visit indices added to the set
|
||||
// during the iteration, but will not re-visit indices whose values
|
||||
// change after visiting. Thus SparseSet can be a convenient
|
||||
// implementation of a work queue.
|
||||
//
|
||||
// The SparseSet implementation is NOT thread-safe. It is up to the
|
||||
// caller to make sure only one thread is accessing the set. (Typically
|
||||
// these sets are temporary values and used in situations where speed is
|
||||
// important.)
|
||||
//
|
||||
// The SparseSet interface does not present all the usual STL bells and
|
||||
// whistles.
|
||||
//
|
||||
// Implemented with reference to Briggs & Torczon, An Efficient
|
||||
// Representation for Sparse Sets, ACM Letters on Programming Languages
|
||||
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
|
||||
//
|
||||
// This is a specialization of sparse array; see sparse_array.h.
|
||||
|
||||
// IMPLEMENTATION
|
||||
//
|
||||
// See sparse_array.h for implementation details.
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "re2/pod_array.h"
|
||||
|
||||
// Doing this simplifies the logic below.
|
||||
#ifndef __has_feature
|
||||
#define __has_feature(x) 0
|
||||
#endif
|
||||
|
||||
#if __has_feature(memory_sanitizer)
|
||||
#include <sanitizer/msan_interface.h>
|
||||
#endif
|
||||
|
||||
namespace re2 {
|
||||
|
||||
template<typename Value>
|
||||
class SparseSetT {
|
||||
public:
|
||||
SparseSetT();
|
||||
explicit SparseSetT(int max_size);
|
||||
~SparseSetT();
|
||||
|
||||
typedef int* iterator;
|
||||
typedef const int* const_iterator;
|
||||
|
||||
// Return the number of entries in the set.
|
||||
int size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
// Indicate whether the set is empty.
|
||||
int empty() const {
|
||||
return size_ == 0;
|
||||
}
|
||||
|
||||
// Iterate over the set.
|
||||
iterator begin() {
|
||||
return dense_.data();
|
||||
}
|
||||
iterator end() {
|
||||
return dense_.data() + size_;
|
||||
}
|
||||
|
||||
const_iterator begin() const {
|
||||
return dense_.data();
|
||||
}
|
||||
const_iterator end() const {
|
||||
return dense_.data() + size_;
|
||||
}
|
||||
|
||||
// Change the maximum size of the set.
|
||||
// Invalidates all iterators.
|
||||
void resize(int new_max_size);
|
||||
|
||||
// Return the maximum size of the set.
|
||||
// Indices can be in the range [0, max_size).
|
||||
int max_size() const {
|
||||
if (dense_.data() != NULL)
|
||||
return dense_.size();
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Clear the set.
|
||||
void clear() {
|
||||
size_ = 0;
|
||||
}
|
||||
|
||||
// Check whether index i is in the set.
|
||||
bool contains(int i) const;
|
||||
|
||||
// Comparison function for sorting.
|
||||
// Can sort the sparse set so that future iterations
|
||||
// will visit indices in increasing order using
|
||||
// std::sort(arr.begin(), arr.end(), arr.less);
|
||||
static bool less(int a, int b);
|
||||
|
||||
public:
|
||||
// Insert index i into the set.
|
||||
iterator insert(int i) {
|
||||
return InsertInternal(true, i);
|
||||
}
|
||||
|
||||
// Insert index i into the set.
|
||||
// Fast but unsafe: only use if contains(i) is false.
|
||||
iterator insert_new(int i) {
|
||||
return InsertInternal(false, i);
|
||||
}
|
||||
|
||||
private:
|
||||
iterator InsertInternal(bool allow_existing, int i) {
|
||||
DebugCheckInvariants();
|
||||
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) {
|
||||
assert(false && "illegal index");
|
||||
// Semantically, end() would be better here, but we already know
|
||||
// the user did something stupid, so begin() insulates them from
|
||||
// dereferencing an invalid pointer.
|
||||
return begin();
|
||||
}
|
||||
if (!allow_existing) {
|
||||
assert(!contains(i));
|
||||
create_index(i);
|
||||
} else {
|
||||
if (!contains(i))
|
||||
create_index(i);
|
||||
}
|
||||
DebugCheckInvariants();
|
||||
return dense_.data() + sparse_[i];
|
||||
}
|
||||
|
||||
// Add the index i to the set.
|
||||
// Only use if contains(i) is known to be false.
|
||||
// This function is private, only intended as a helper
|
||||
// for other methods.
|
||||
void create_index(int i);
|
||||
|
||||
// In debug mode, verify that some invariant properties of the class
|
||||
// are being maintained. This is called at the end of the constructor
|
||||
// and at the beginning and end of all public non-const member functions.
|
||||
void DebugCheckInvariants() const;
|
||||
|
||||
// Initializes memory for elements [min, max).
|
||||
void MaybeInitializeMemory(int min, int max) {
|
||||
#if __has_feature(memory_sanitizer)
|
||||
__msan_unpoison(sparse_.data() + min, (max - min) * sizeof sparse_[0]);
|
||||
#elif defined(RE2_ON_VALGRIND)
|
||||
for (int i = min; i < max; i++) {
|
||||
sparse_[i] = 0xababababU;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
int size_ = 0;
|
||||
PODArray<int> sparse_;
|
||||
PODArray<int> dense_;
|
||||
};
|
||||
|
||||
template<typename Value>
|
||||
SparseSetT<Value>::SparseSetT() = default;
|
||||
|
||||
// Change the maximum size of the set.
|
||||
// Invalidates all iterators.
|
||||
template<typename Value>
|
||||
void SparseSetT<Value>::resize(int new_max_size) {
|
||||
DebugCheckInvariants();
|
||||
if (new_max_size > max_size()) {
|
||||
const int old_max_size = max_size();
|
||||
|
||||
// Construct these first for exception safety.
|
||||
PODArray<int> a(new_max_size);
|
||||
PODArray<int> b(new_max_size);
|
||||
|
||||
std::copy_n(sparse_.data(), old_max_size, a.data());
|
||||
std::copy_n(dense_.data(), old_max_size, b.data());
|
||||
|
||||
sparse_ = std::move(a);
|
||||
dense_ = std::move(b);
|
||||
|
||||
MaybeInitializeMemory(old_max_size, new_max_size);
|
||||
}
|
||||
if (size_ > new_max_size)
|
||||
size_ = new_max_size;
|
||||
DebugCheckInvariants();
|
||||
}
|
||||
|
||||
// Check whether index i is in the set.
|
||||
template<typename Value>
|
||||
bool SparseSetT<Value>::contains(int i) const {
|
||||
assert(i >= 0);
|
||||
assert(i < max_size());
|
||||
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size())) {
|
||||
return false;
|
||||
}
|
||||
// Unsigned comparison avoids checking sparse_[i] < 0.
|
||||
return (uint32_t)sparse_[i] < (uint32_t)size_ &&
|
||||
dense_[sparse_[i]] == i;
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
void SparseSetT<Value>::create_index(int i) {
|
||||
assert(!contains(i));
|
||||
assert(size_ < max_size());
|
||||
sparse_[i] = size_;
|
||||
dense_[size_] = i;
|
||||
size_++;
|
||||
}
|
||||
|
||||
// Constructs an empty set able to hold indices in [0, max_size).
template<typename Value>
SparseSetT<Value>::SparseSetT(int max_size)
    : sparse_(max_size),
      dense_(max_size) {
  MaybeInitializeMemory(size_, max_size);
  DebugCheckInvariants();
}
|
||||
|
||||
// The destructor only re-checks the invariants; the members release their
// own storage.
template<typename Value>
SparseSetT<Value>::~SparseSetT() {
  DebugCheckInvariants();
}
|
||||
|
||||
// Asserts that 0 <= size_ <= max_size().
template<typename Value>
void SparseSetT<Value>::DebugCheckInvariants() const {
  assert(size_ >= 0);
  assert(max_size() >= size_);
}
|
||||
|
||||
// Comparison function for sorting.
|
||||
template<typename Value> bool SparseSetT<Value>::less(int a, int b) {
|
||||
return a < b;
|
||||
}
|
||||
|
||||
// The set type used by callers: SparseSetT instantiated with void.
using SparseSet = SparseSetT<void>;
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_SPARSE_SET_H_
|
||||
@@ -0,0 +1,18 @@
|
||||
// Copyright 2022 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_STRINGPIECE_H_
|
||||
#define RE2_STRINGPIECE_H_
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Compatibility alias. Until RE2 requires C++17 and uses std::string_view,
|
||||
// allow users to continue to #include "re2/stringpiece.h" and use
// re2::StringPiece, which names the same type as absl::string_view.
|
||||
using StringPiece = absl::string_view;
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_STRINGPIECE_H_
|
||||
@@ -0,0 +1,274 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
|
||||
//
|
||||
// Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
|
||||
// except that it remembers where it has been, trading a lot of
|
||||
// memory for a lot of time. It exists only for testing purposes.
|
||||
//
|
||||
// Let me repeat that.
|
||||
//
|
||||
// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
|
||||
// - It uses a ton of memory.
|
||||
// - It uses a ton of stack.
|
||||
// - It uses ABSL_CHECK() and ABSL_LOG(FATAL).
|
||||
// - It implements unanchored search by repeated anchored search.
|
||||
//
|
||||
// On the other hand, it is very simple and a good reference
|
||||
// implementation for the more complicated regexp packages.
|
||||
//
|
||||
// In BUILD, this file is linked into the ":testing" library,
|
||||
// not the main library, in order to make it harder to pick up
|
||||
// accidentally.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "re2/pod_array.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Backtracker holds the state for a backtracking search.
|
||||
//
|
||||
// Excluding the search parameters, the main search state
|
||||
// is just the "capture registers", which record, for the
|
||||
// current execution, the string position at which each
|
||||
// parenthesis was passed. cap_[0] and cap_[1] are the
|
||||
// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
|
||||
//
|
||||
// To avoid infinite loops during backtracking on expressions
|
||||
// like (a*)*, the visited_[] bitmap marks the (state, string-position)
|
||||
// pairs that have already been explored and are thus not worth
|
||||
// re-exploring if we get there via another path. Modern backtracking
|
||||
// libraries engineer their program representation differently, to make
|
||||
// such infinite loops possible to avoid without keeping a giant visited_
|
||||
// bitmap, but visited_ works fine for a reference implementation
|
||||
// and it has the nice benefit of making the search run in linear time.
|
||||
class Backtracker {
|
||||
public:
|
||||
explicit Backtracker(Prog* prog);
|
||||
|
||||
bool Search(absl::string_view text, absl::string_view context, bool anchored,
|
||||
bool longest, absl::string_view* submatch, int nsubmatch);
|
||||
|
||||
private:
|
||||
// Explores from instruction id at string position p looking for a match.
|
||||
// Returns true if found (so that caller can stop trying other possibilities).
|
||||
bool Visit(int id, const char* p);
|
||||
|
||||
// Tries instruction id at string position p.
|
||||
// Returns true if a match is found.
|
||||
bool Try(int id, const char* p);
|
||||
|
||||
// Search parameters
|
||||
Prog* prog_; // program being run
|
||||
absl::string_view text_; // text being searched
|
||||
absl::string_view context_; // greater context of text being searched
|
||||
bool anchored_; // whether search is anchored at text.begin()
|
||||
bool longest_; // whether search wants leftmost-longest match
|
||||
bool endmatch_; // whether search must end at text.end()
|
||||
absl::string_view* submatch_; // submatches to fill in
|
||||
int nsubmatch_; // # of submatches to fill in
|
||||
|
||||
// Search state
|
||||
const char* cap_[64]; // capture registers
|
||||
PODArray<uint32_t> visited_; // bitmap: (Inst*, char*) pairs visited
|
||||
|
||||
Backtracker(const Backtracker&) = delete;
|
||||
Backtracker& operator=(const Backtracker&) = delete;
|
||||
};
|
||||
|
||||
// Binds the backtracker to prog and gives the per-search flags and submatch
// pointer neutral defaults; Search() sets them for real on every call.
Backtracker::Backtracker(Prog* prog)
    : prog_(prog),
      anchored_(false), longest_(false), endmatch_(false),
      submatch_(nullptr), nsubmatch_(0) {}
|
||||
|
||||
// Runs a backtracking search.
|
||||
bool Backtracker::Search(absl::string_view text, absl::string_view context,
|
||||
bool anchored, bool longest,
|
||||
absl::string_view* submatch, int nsubmatch) {
|
||||
text_ = text;
|
||||
context_ = context;
|
||||
if (context_.data() == NULL)
|
||||
context_ = text;
|
||||
if (prog_->anchor_start() && BeginPtr(text) > BeginPtr(context_))
|
||||
return false;
|
||||
if (prog_->anchor_end() && EndPtr(text) < EndPtr(context_))
|
||||
return false;
|
||||
anchored_ = anchored | prog_->anchor_start();
|
||||
longest_ = longest | prog_->anchor_end();
|
||||
endmatch_ = prog_->anchor_end();
|
||||
submatch_ = submatch;
|
||||
nsubmatch_ = nsubmatch;
|
||||
ABSL_CHECK_LT(2*nsubmatch_, static_cast<int>(ABSL_ARRAYSIZE(cap_)));
|
||||
memset(cap_, 0, sizeof cap_);
|
||||
|
||||
// We use submatch_[0] for our own bookkeeping,
|
||||
// so it had better exist.
|
||||
absl::string_view sp0;
|
||||
if (nsubmatch < 1) {
|
||||
submatch_ = &sp0;
|
||||
nsubmatch_ = 1;
|
||||
}
|
||||
submatch_[0] = absl::string_view();
|
||||
|
||||
// Allocate new visited_ bitmap -- size is proportional
|
||||
// to text, so have to reallocate on each call to Search.
|
||||
int nvisited = prog_->size() * static_cast<int>(text.size()+1);
|
||||
nvisited = (nvisited + 31) / 32;
|
||||
visited_ = PODArray<uint32_t>(nvisited);
|
||||
memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
|
||||
|
||||
// Anchored search must start at text.begin().
|
||||
if (anchored_) {
|
||||
cap_[0] = text.data();
|
||||
return Visit(prog_->start(), text.data());
|
||||
}
|
||||
|
||||
// Unanchored search, starting from each possible text position.
|
||||
// Notice that we have to try the empty string at the end of
|
||||
// the text, so the loop condition is p <= text.end(), not p < text.end().
|
||||
for (const char* p = text.data(); p <= text.data() + text.size(); p++) {
|
||||
cap_[0] = p;
|
||||
if (Visit(prog_->start(), p)) // Match must be leftmost; done.
|
||||
return true;
|
||||
// Avoid invoking undefined behavior (arithmetic on a null pointer)
|
||||
// by simply not continuing the loop.
|
||||
if (p == NULL)
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Explores from instruction id at string position p looking for a match.
|
||||
// Return true if found (so that caller can stop trying other possibilities).
|
||||
bool Backtracker::Visit(int id, const char* p) {
|
||||
// Check bitmap. If we've already explored from here,
|
||||
// either it didn't match or it did but we're hoping for a better match.
|
||||
// Either way, don't go down that road again.
|
||||
ABSL_CHECK(p <= text_.data() + text_.size());
|
||||
int n = id * static_cast<int>(text_.size()+1) +
|
||||
static_cast<int>(p-text_.data());
|
||||
ABSL_CHECK_LT(n/32, visited_.size());
|
||||
if (visited_[n/32] & (1 << (n&31)))
|
||||
return false;
|
||||
visited_[n/32] |= 1 << (n&31);
|
||||
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
if (Try(id, p)) {
|
||||
if (longest_ && !ip->last())
|
||||
Visit(id+1, p);
|
||||
return true;
|
||||
}
|
||||
if (!ip->last())
|
||||
return Visit(id+1, p);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Tries instruction id at string position p.
|
||||
// Returns true if a match is found.
|
||||
bool Backtracker::Try(int id, const char* p) {
|
||||
// Pick out byte at current position. If at end of string,
|
||||
// have to explore in hope of finishing a match. Use impossible byte -1.
|
||||
int c = -1;
|
||||
if (p < text_.data() + text_.size())
|
||||
c = *p & 0xFF;
|
||||
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
ABSL_LOG(FATAL) << "Unexpected opcode: " << ip->opcode();
|
||||
return false; // not reached
|
||||
|
||||
case kInstAltMatch:
|
||||
// Ignored.
|
||||
return false;
|
||||
|
||||
case kInstByteRange:
|
||||
if (ip->Matches(c))
|
||||
return Visit(ip->out(), p+1);
|
||||
return false;
|
||||
|
||||
case kInstCapture:
|
||||
if (0 <= ip->cap() &&
|
||||
ip->cap() < static_cast<int>(ABSL_ARRAYSIZE(cap_))) {
|
||||
// Capture p to register, but save old value.
|
||||
const char* q = cap_[ip->cap()];
|
||||
cap_[ip->cap()] = p;
|
||||
bool ret = Visit(ip->out(), p);
|
||||
// Restore old value as we backtrack.
|
||||
cap_[ip->cap()] = q;
|
||||
return ret;
|
||||
}
|
||||
return Visit(ip->out(), p);
|
||||
|
||||
case kInstEmptyWidth:
|
||||
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
|
||||
return false;
|
||||
return Visit(ip->out(), p);
|
||||
|
||||
case kInstNop:
|
||||
return Visit(ip->out(), p);
|
||||
|
||||
case kInstMatch:
|
||||
// We found a match. If it's the best so far, record the
|
||||
// parameters in the caller's submatch_ array.
|
||||
if (endmatch_ && p != context_.data() + context_.size())
|
||||
return false;
|
||||
cap_[1] = p;
|
||||
if (submatch_[0].data() == NULL ||
|
||||
(longest_ && p > submatch_[0].data() + submatch_[0].size())) {
|
||||
// First match so far - or better match.
|
||||
for (int i = 0; i < nsubmatch_; i++)
|
||||
submatch_[i] = absl::string_view(
|
||||
cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
|
||||
}
|
||||
return true;
|
||||
|
||||
case kInstFail:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Runs a backtracking search.
|
||||
bool Prog::UnsafeSearchBacktrack(absl::string_view text,
|
||||
absl::string_view context, Anchor anchor,
|
||||
MatchKind kind, absl::string_view* match,
|
||||
int nmatch) {
|
||||
// If full match, we ask for an anchored longest match
|
||||
// and then check that match[0] == text.
|
||||
// So make sure match[0] exists.
|
||||
absl::string_view sp0;
|
||||
if (kind == kFullMatch) {
|
||||
anchor = kAnchored;
|
||||
if (nmatch < 1) {
|
||||
match = &sp0;
|
||||
nmatch = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Run the search.
|
||||
Backtracker b(this);
|
||||
bool anchored = anchor == kAnchored;
|
||||
bool longest = kind != kFirstMatch;
|
||||
if (!b.Search(text, context, anchored, longest, match, nmatch))
|
||||
return false;
|
||||
if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,228 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Test character class manipulations.
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "util/utf.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
struct CCTest {
|
||||
struct {
|
||||
Rune lo;
|
||||
Rune hi;
|
||||
} add[10];
|
||||
int remove;
|
||||
struct {
|
||||
Rune lo;
|
||||
Rune hi;
|
||||
} final[10];
|
||||
};
|
||||
|
||||
// Table of character-class cases: { ranges to add, remove-above limit
// (-1 for none), expected final ranges }.  {-1} terminates each list.
static CCTest tests[] = {
  // Ranges that touch or overlap collapse into one.
  {
    { { 10, 20 }, {-1} },
    -1,
    { { 10, 20 }, {-1} }
  },
  {
    { { 10, 20 }, { 20, 30 }, {-1} },
    -1,
    { { 10, 30 }, {-1} }
  },
  {
    { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} },
    -1,
    { { 10, 40 }, {-1} }
  },
  {
    { { 0, 50 }, { 20, 30 }, {-1} },
    -1,
    { { 0, 50 }, {-1} }
  },

  // Disjoint ranges stay separate and come out sorted, whatever the
  // insertion order.
  {
    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} },
    -1,
    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }
  },
  {
    { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} },
    -1,
    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }
  },
  {
    { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} },
    -1,
    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }
  },

  // A late range that spans or bridges the earlier ones.
  {
    { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} },
    -1,
    { { 5, 25 }, {-1} }
  },
  {
    { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} },
    -1,
    { { 10, 23 }, {-1} }
  },

  // These check boundary cases during negation.
  {
    { { 0, Runemax }, {-1} },
    -1,
    { { 0, Runemax }, {-1} }
  },
  {
    { { 0, 50 }, {-1} },
    -1,
    { { 0, 50 }, {-1} }
  },
  {
    { { 50, Runemax }, {-1} },
    -1,
    { { 50, Runemax }, {-1} }
  },

  // Check RemoveAbove.
  {
    { { 50, Runemax }, {-1} },
    255,
    { { 50, 255 }, {-1} }
  },
  {
    { { 50, Runemax }, {-1} },
    65535,
    { { 50, 65535 }, {-1} }
  },
  {
    { { 50, Runemax }, {-1} },
    Runemax,
    { { 50, Runemax }, {-1} }
  },
  {
    { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} },
    255,
    { { 50, 60 }, { 250, 255 }, {-1} }
  },
  {
    { { 50, 60 }, {-1} },
    255,
    { { 50, 60 }, {-1} }
  },
  {
    { { 350, 360 }, {-1} },
    255,
    { {-1} }
  },
  {
    { {-1} },
    255,
    { {-1} }
  },
};
|
||||
|
||||
template <typename CharClass>
|
||||
static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
|
||||
if (t == NULL) {
|
||||
absl::PrintF("\t%s:", desc);
|
||||
} else {
|
||||
absl::PrintF("\n");
|
||||
absl::PrintF("CharClass added: [%s]", desc);
|
||||
for (int k = 0; t->add[k].lo >= 0; k++)
|
||||
absl::PrintF(" %d-%d", t->add[k].lo, t->add[k].hi);
|
||||
absl::PrintF("\n");
|
||||
if (t->remove >= 0)
|
||||
absl::PrintF("Removed > %d\n", t->remove);
|
||||
absl::PrintF("\twant:");
|
||||
for (int k = 0; t->final[k].lo >= 0; k++)
|
||||
absl::PrintF(" %d-%d", t->final[k].lo, t->final[k].hi);
|
||||
absl::PrintF("\n");
|
||||
absl::PrintF("\thave:");
|
||||
}
|
||||
|
||||
for (typename CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
|
||||
absl::PrintF(" %d-%d", it->lo, it->hi);
|
||||
absl::PrintF("\n");
|
||||
}
|
||||
|
||||
// Reports whether x lies inside one of the expected final ranges of t.
bool ShouldContain(CCTest *t, int x) {
  // The list ends at the first entry with a negative lo.
  for (auto* range = &t->final[0]; range->lo >= 0; ++range) {
    const bool below = x < range->lo;
    const bool above = range->hi < x;
    if (!below && !above)
      return true;
  }
  return false;
}
|
||||
|
||||
// Helpers to make templated CorrectCC work with both CharClass and CharClassBuilder.
|
||||
|
||||
CharClass* Negate(CharClass *cc) {
|
||||
return cc->Negate();
|
||||
}
|
||||
|
||||
void Delete(CharClass* cc) {
|
||||
cc->Delete();
|
||||
}
|
||||
|
||||
CharClassBuilder* Negate(CharClassBuilder* cc) {
|
||||
CharClassBuilder* ncc = cc->Copy();
|
||||
ncc->Negate();
|
||||
return ncc;
|
||||
}
|
||||
|
||||
void Delete(CharClassBuilder* cc) {
|
||||
delete cc;
|
||||
}
|
||||
|
||||
template <typename CharClass>
|
||||
bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
|
||||
typename CharClass::iterator it = cc->begin();
|
||||
int size = 0;
|
||||
for (int j = 0; t->final[j].lo >= 0; j++, ++it) {
|
||||
if (it == cc->end() ||
|
||||
it->lo != t->final[j].lo ||
|
||||
it->hi != t->final[j].hi) {
|
||||
Broke(desc, t, cc);
|
||||
return false;
|
||||
}
|
||||
size += it->hi - it->lo + 1;
|
||||
}
|
||||
if (it != cc->end()) {
|
||||
Broke(desc, t, cc);
|
||||
return false;
|
||||
}
|
||||
if (cc->size() != size) {
|
||||
Broke(desc, t, cc);
|
||||
absl::PrintF("wrong size: want %d have %d\n", size, cc->size());
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int j = 0; j < 101; j++) {
|
||||
if (j == 100)
|
||||
j = Runemax;
|
||||
if (ShouldContain(t, j) != cc->Contains(j)) {
|
||||
Broke(desc, t, cc);
|
||||
absl::PrintF("want contains(%d)=%d, got %d\n",
|
||||
j, ShouldContain(t, j), cc->Contains(j));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
CharClass* ncc = Negate(cc);
|
||||
for (int j = 0; j < 101; j++) {
|
||||
if (j == 100)
|
||||
j = Runemax;
|
||||
if (ShouldContain(t, j) == ncc->Contains(j)) {
|
||||
Broke(desc, t, cc);
|
||||
Broke("ncc", NULL, ncc);
|
||||
absl::PrintF("want ncc contains(%d)!=%d, got %d\n",
|
||||
j, ShouldContain(t, j), ncc->Contains(j));
|
||||
Delete(ncc);
|
||||
return false;
|
||||
}
|
||||
if (ncc->size() != Runemax+1 - cc->size()) {
|
||||
Broke(desc, t, cc);
|
||||
Broke("ncc", NULL, ncc);
|
||||
absl::PrintF("ncc size should be %d is %d\n",
|
||||
Runemax+1 - cc->size(), ncc->size());
|
||||
Delete(ncc);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
Delete(ncc);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Builds every case in tests[] with a CharClassBuilder and checks the
// builder, the CharClass obtained from it, and a copy of the builder.
TEST(TestCharClassBuilder, Adds) {
  int nfail = 0;
  for (CCTest& test : tests) {
    CCTest* t = &test;

    // Apply the additions and the optional RemoveAbove.
    CharClassBuilder ccb;
    for (int j = 0; t->add[j].lo >= 0; j++)
      ccb.AddRange(t->add[j].lo, t->add[j].hi);
    if (t->remove >= 0)
      ccb.RemoveAbove(t->remove);

    const bool builder_ok =
        CorrectCC(&ccb, t, "before copy (CharClassBuilder)");
    if (!builder_ok)
      nfail++;

    CharClass* cc = ccb.GetCharClass();
    const bool class_ok = CorrectCC(cc, t, "before copy (CharClass)");
    if (!class_ok)
      nfail++;
    cc->Delete();

    // Repeat the checks once a copy of the builder exists.
    // NOTE(review): the CharClass below is taken from ccb, not from the
    // copy ccb1 - confirm that this is the intended coverage.
    CharClassBuilder *ccb1 = ccb.Copy();
    const bool copy_ok = CorrectCC(ccb1, t, "after copy (CharClassBuilder)");
    if (!copy_ok)
      nfail++;

    cc = ccb.GetCharClass();
    const bool class_after_ok = CorrectCC(cc, t, "after copy (CharClass)");
    if (!class_after_ok)
      nfail++;
    cc->Delete();
    delete ccb1;
  }
  EXPECT_EQ(nfail, 0);
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,431 @@
|
||||
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Test prog.cc, compile.cc
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Simple input/output tests checking that
|
||||
// the regexp compiles to the expected code.
|
||||
// These are just to sanity check the basic implementation.
|
||||
// The real confidence tests happen by testing the NFA/DFA
|
||||
// that run the compiled code.
|
||||
|
||||
// One compile-test case: a regular expression paired with the program
// dump that compiling it is expected to produce.
struct Test {
|
||||
// Pattern handed to Regexp::Parse.
const char* regexp;
|
||||
// Expected text of Prog::Dump() for the compiled pattern.
const char* code;
|
||||
};
|
||||
|
||||
// Expected program dumps, one entry per pattern.  Each line of a dump is
// one instruction; the strings are compared verbatim against Prog::Dump().
static Test tests[] = {
  // Literals and alternation.
  {
    "a",
    "3. byte [61-61] 0 -> 4\n"
    "4. match! 0\n"
  },
  {
    "ab",
    "3. byte [61-61] 0 -> 4\n"
    "4. byte [62-62] 0 -> 5\n"
    "5. match! 0\n"
  },
  {
    "a|c",
    "3+ byte [61-61] 0 -> 5\n"
    "4. byte [63-63] 0 -> 5\n"
    "5. match! 0\n"
  },
  {
    "a|b",
    "3. byte [61-62] 0 -> 4\n"
    "4. match! 0\n"
  },
  {
    "[ab]",
    "3. byte [61-62] 0 -> 4\n"
    "4. match! 0\n"
  },

  // Repetition operators, greedy and non-greedy.
  {
    "a+",
    "3. byte [61-61] 0 -> 4\n"
    "4+ nop -> 3\n"
    "5. match! 0\n"
  },
  {
    "a+?",
    "3. byte [61-61] 0 -> 4\n"
    "4+ match! 0\n"
    "5. nop -> 3\n"
  },
  {
    "a*",
    "3+ byte [61-61] 1 -> 3\n"
    "4. match! 0\n"
  },
  {
    "a*?",
    "3+ match! 0\n"
    "4. byte [61-61] 0 -> 3\n"
  },
  {
    "a?",
    "3+ byte [61-61] 1 -> 5\n"
    "4. nop -> 5\n"
    "5. match! 0\n"
  },
  {
    "a??",
    "3+ nop -> 5\n"
    "4. byte [61-61] 0 -> 5\n"
    "5. match! 0\n"
  },
  {
    "a{4}",
    "3. byte [61-61] 0 -> 4\n"
    "4. byte [61-61] 0 -> 5\n"
    "5. byte [61-61] 0 -> 6\n"
    "6. byte [61-61] 0 -> 7\n"
    "7. match! 0\n"
  },

  // Groups and the empty pattern.
  {
    "(a)",
    "3. capture 2 -> 4\n"
    "4. byte [61-61] 0 -> 5\n"
    "5. capture 3 -> 6\n"
    "6. match! 0\n"
  },
  {
    "(?:a)",
    "3. byte [61-61] 0 -> 4\n"
    "4. match! 0\n"
  },
  {
    "",
    "3. match! 0\n"
  },

  // Dot and character classes.
  {
    ".",
    "3+ byte [00-09] 0 -> 5\n"
    "4. byte [0b-ff] 0 -> 5\n"
    "5. match! 0\n"
  },
  {
    "[^ab]",
    "3+ byte [00-09] 0 -> 6\n"
    "4+ byte [0b-60] 0 -> 6\n"
    "5. byte [63-ff] 0 -> 6\n"
    "6. match! 0\n"
  },
  {
    "[Aa]",
    "3. byte/i [61-61] 0 -> 4\n"
    "4. match! 0\n"
  },

  // \C (any byte) under repetition.
  {
    "\\C+",
    "3. byte [00-ff] 0 -> 4\n"
    "4+ altmatch -> 5 | 6\n"
    "5+ nop -> 3\n"
    "6. match! 0\n"
  },
  {
    "\\C*",
    "3+ altmatch -> 4 | 5\n"
    "4+ byte [00-ff] 1 -> 3\n"
    "5. match! 0\n"
  },
  {
    "\\C?",
    "3+ byte [00-ff] 1 -> 5\n"
    "4. nop -> 5\n"
    "5. match! 0\n"
  },

  // Issue 20992936
  {
    "[[-`]",
    "3. byte [5b-60] 0 -> 4\n"
    "4. match! 0\n"
  },

  // Issue 310
  {
    "(?:|a)*",
    "3+ nop -> 7\n"
    "4. nop -> 9\n"
    "5+ nop -> 7\n"
    "6. nop -> 9\n"
    "7+ nop -> 5\n"
    "8. byte [61-61] 0 -> 5\n"
    "9. match! 0\n"
  },
  {
    "(?:|a)+",
    "3+ nop -> 5\n"
    "4. byte [61-61] 0 -> 5\n"
    "5+ nop -> 3\n"
    "6. match! 0\n"
  },
};
|
||||
|
||||
// Compiles every pattern in tests[] (PerlX syntax, Latin-1 encoding) and
// compares the program dump with the expected text.  Mismatches and
// parse/compile failures are logged and counted; the count must be zero.
TEST(TestRegexpCompileToProg, Simple) {
  int failed = 0;
  for (const re2::Test& t : tests) {
    Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
    if (re == NULL) {
      ABSL_LOG(ERROR) << "Cannot parse: " << t.regexp;
      failed++;
      continue;
    }

    Prog* prog = re->CompileToProg(0);
    if (prog == NULL) {
      ABSL_LOG(ERROR) << "Cannot compile: " << t.regexp;
      re->Decref();
      failed++;
      continue;
    }

    // Compiling with a memory limit of 1 must not yield a program.
    ASSERT_TRUE(re->CompileToProg(1) == NULL);

    const std::string dumped = prog->Dump();
    const bool as_expected = !(dumped != t.code);
    if (!as_expected) {
      ABSL_LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
      ABSL_LOG(ERROR) << "Want:\n" << t.code;
      ABSL_LOG(ERROR) << "Got:\n" << dumped;
      failed++;
    }

    delete prog;
    re->Decref();
  }
  EXPECT_EQ(failed, 0);
}
|
||||
|
||||
// Parses pattern with flags, compiles it forward and stores the forward
// program's DumpByteMap() text in *bytemap.  The reverse program is then
// compiled too and its byte map is expected to equal the forward one.
//
// Parse or compile failures are reported as test failures and the affected
// step is skipped, so a null Regexp or Prog is never dereferenced.
// *bytemap is cleared first so that a value left over from an earlier call
// cannot satisfy the caller's expectation after a failure.
static void DumpByteMap(absl::string_view pattern, Regexp::ParseFlags flags,
                        std::string* bytemap) {
  bytemap->clear();

  Regexp* re = Regexp::Parse(pattern, flags, NULL);
  // Fatal: nothing below can run without a parsed regexp.  Nothing has
  // been allocated yet, so returning here leaks nothing.
  ASSERT_TRUE(re != NULL);

  {
    Prog* prog = re->CompileToProg(0);
    EXPECT_TRUE(prog != NULL);
    if (prog != NULL) {
      *bytemap = prog->DumpByteMap();
      delete prog;
    }
  }

  {
    Prog* prog = re->CompileToReverseProg(0);
    EXPECT_TRUE(prog != NULL);
    if (prog != NULL) {
      EXPECT_EQ(*bytemap, prog->DumpByteMap());
      delete prog;
    }
  }

  re->Decref();
}
|
||||
|
||||
// The distinct byte ranges involved in the Latin-1 dot ([^\n]).
TEST(TestCompile, Latin1Ranges) {
  const char kWant[] =
      "[00-09] -> 0\n"
      "[0a-0a] -> 1\n"
      "[0b-ff] -> 0\n";

  std::string bytemap;
  DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
  EXPECT_EQ(kWant, bytemap);
}
|
||||
|
||||
// Byte-map expectations for a few Latin-1 patterns.
TEST(TestCompile, OtherByteMapTests) {
  std::string bytemap;

  // Test that "absent" ranges are mapped to the same byte class.
  {
    const char kWant[] =
        "[00-2f] -> 0\n"
        "[30-39] -> 1\n"
        "[3a-40] -> 0\n"
        "[41-46] -> 1\n"
        "[47-60] -> 0\n"
        "[61-66] -> 1\n"
        "[67-ff] -> 0\n";
    DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
    EXPECT_EQ(kWant, bytemap);
  }

  // Test the byte classes for \b.
  {
    const char kWant[] =
        "[00-2f] -> 0\n"
        "[30-39] -> 1\n"
        "[3a-40] -> 0\n"
        "[41-5a] -> 1\n"
        "[5b-5e] -> 0\n"
        "[5f-5f] -> 1\n"
        "[60-60] -> 0\n"
        "[61-7a] -> 1\n"
        "[7b-ff] -> 0\n";
    DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
    EXPECT_EQ(kWant, bytemap);
  }

  // Bug in the ASCII case-folding optimization created too many byte classes.
  {
    const char kWant[] =
        "[00-5e] -> 0\n"
        "[5f-5f] -> 1\n"
        "[60-ff] -> 0\n";
    DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
    EXPECT_EQ(kWant, bytemap);
  }
}
|
||||
|
||||
// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
// Once, erroneously split between 0x3f and 0x40 because it is
// a 6-bit boundary.
TEST(TestCompile, UTF8Ranges) {
  const char kWant[] =
      "[00-09] -> 0\n"
      "[0a-0a] -> 1\n"
      "[0b-7f] -> 0\n"
      "[80-bf] -> 2\n"
      "[c0-c1] -> 1\n"
      "[c2-df] -> 3\n"
      "[e0-ef] -> 4\n"
      "[f0-f4] -> 5\n"
      "[f5-ff] -> 1\n";

  std::string bytemap;
  DumpByteMap(".", Regexp::PerlX, &bytemap);
  EXPECT_EQ(kWant, bytemap);
}
|
||||
|
||||
// Compiling with a memory budget of 850 is expected to fail for this
// pattern: CompileToProg should return NULL rather than a program.
TEST(TestCompile, InsufficientMemory) {
  Regexp* re = Regexp::Parse(
      "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
      Regexp::LikePerl, NULL);
  // Fatal: re is dereferenced on the next line.
  ASSERT_TRUE(re != NULL);
  Prog* prog = re->CompileToProg(850);
  // If the memory budget has been exhausted, compilation should fail
  // and return NULL instead of trying to do anything with NoMatch().
  EXPECT_TRUE(prog == NULL);
  // No-op in the expected case; releases the program if compilation
  // unexpectedly succeeded so the failing test does not also leak.
  delete prog;
  re->Decref();
}
|
||||
|
||||
// Parses pattern with flags and stores the Dump() text of the forward
// program in *forward and of the reverse program in *reverse.  Either
// output pointer may be NULL, in which case that program is not compiled.
//
// Parse or compile failures are reported as test failures and the affected
// step is skipped, so a null Regexp or Prog is never dereferenced.
// Requested outputs are cleared first so that a value left over from an
// earlier call cannot satisfy the caller's expectation after a failure.
static void Dump(absl::string_view pattern, Regexp::ParseFlags flags,
                 std::string* forward, std::string* reverse) {
  if (forward != NULL)
    forward->clear();
  if (reverse != NULL)
    reverse->clear();

  Regexp* re = Regexp::Parse(pattern, flags, NULL);
  // Fatal: nothing below can run without a parsed regexp.  Nothing has
  // been allocated yet, so returning here leaks nothing.
  ASSERT_TRUE(re != NULL);

  if (forward != NULL) {
    Prog* prog = re->CompileToProg(0);
    EXPECT_TRUE(prog != NULL);
    if (prog != NULL) {
      *forward = prog->Dump();
      delete prog;
    }
  }

  if (reverse != NULL) {
    Prog* prog = re->CompileToReverseProg(0);
    EXPECT_TRUE(prog != NULL);
    if (prog != NULL) {
      *reverse = prog->Dump();
      delete prog;
    }
  }

  re->Decref();
}
|
||||
|
||||
// Bug in the compiler caused inefficient bytecode to be generated for Unicode
// groups: common suffixes were cached, but common prefixes were not factored.
TEST(TestCompile, Bug26705922) {
  std::string forward, reverse;

  {
    const char kWantForward[] =
        "3. byte [f0-f0] 0 -> 4\n"
        "4. byte [90-90] 0 -> 5\n"
        "5. byte [80-80] 0 -> 6\n"
        "6+ byte [80-80] 0 -> 8\n"
        "7. byte [90-90] 0 -> 8\n"
        "8. match! 0\n";
    const char kWantReverse[] =
        "3+ byte [80-80] 0 -> 5\n"
        "4. byte [90-90] 0 -> 5\n"
        "5. byte [80-80] 0 -> 6\n"
        "6. byte [90-90] 0 -> 7\n"
        "7. byte [f0-f0] 0 -> 8\n"
        "8. match! 0\n";
    Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
    EXPECT_EQ(kWantForward, forward);
    EXPECT_EQ(kWantReverse, reverse);
  }

  {
    const char kWantForward[] =
        "3+ byte [e8-ef] 0 -> 5\n"
        "4. byte [f0-f0] 0 -> 8\n"
        "5. byte [80-bf] 0 -> 6\n"
        "6. byte [80-bf] 0 -> 7\n"
        "7. match! 0\n"
        "8. byte [90-90] 0 -> 5\n";
    const char kWantReverse[] =
        "3. byte [80-bf] 0 -> 4\n"
        "4. byte [80-bf] 0 -> 5\n"
        "5+ byte [e8-ef] 0 -> 7\n"
        "6. byte [90-90] 0 -> 8\n"
        "7. match! 0\n"
        "8. byte [f0-f0] 0 -> 7\n";
    Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
    EXPECT_EQ(kWantForward, forward);
    EXPECT_EQ(kWantReverse, reverse);
  }

  {
    const char kWantForward[] =
        "3+ byte [c2-df] 0 -> 6\n"
        "4+ byte [e0-ef] 0 -> 8\n"
        "5. byte [f0-f4] 0 -> 9\n"
        "6. byte [80-bf] 0 -> 7\n"
        "7. match! 0\n"
        "8. byte [80-bf] 0 -> 6\n"
        "9. byte [80-bf] 0 -> 8\n";
    const char kWantReverse[] =
        "3. byte [80-bf] 0 -> 4\n"
        "4+ byte [c2-df] 0 -> 6\n"
        "5. byte [80-bf] 0 -> 7\n"
        "6. match! 0\n"
        "7+ byte [e0-ef] 0 -> 6\n"
        "8. byte [80-bf] 0 -> 9\n"
        "9. byte [f0-f4] 0 -> 6\n";
    Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
    EXPECT_EQ(kWantForward, forward);
    EXPECT_EQ(kWantReverse, reverse);
  }
}
|
||||
|
||||
// Bug in the compiler caused inefficient bytecode to be generated for
// nested nullable subexpressions.
TEST(TestCompile, Bug35237384) {
  std::string forward;

  {
    const char kWant[] =
        "3+ byte [61-61] 1 -> 3\n"
        "4. nop -> 5\n"
        "5+ byte [61-61] 1 -> 5\n"
        "6. nop -> 7\n"
        "7+ byte [61-61] 1 -> 7\n"
        "8. match! 0\n";
    Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
    EXPECT_EQ(kWant, forward);
  }

  {
    const char kWant[] =
        "3+ nop -> 28\n"
        "4. nop -> 30\n"
        "5+ byte [61-61] 1 -> 5\n"
        "6. nop -> 32\n"
        "7+ byte [61-61] 1 -> 7\n"
        "8. nop -> 26\n"
        "9+ byte [61-61] 1 -> 9\n"
        "10. nop -> 20\n"
        "11+ byte [62-62] 1 -> 11\n"
        "12. nop -> 20\n"
        "13+ byte [62-62] 1 -> 13\n"
        "14. nop -> 26\n"
        "15+ byte [62-62] 1 -> 15\n"
        "16. nop -> 32\n"
        "17+ nop -> 9\n"
        "18. nop -> 11\n"
        "19. match! 0\n"
        "20+ nop -> 17\n"
        "21. nop -> 19\n"
        "22+ nop -> 7\n"
        "23. nop -> 13\n"
        "24+ nop -> 17\n"
        "25. nop -> 19\n"
        "26+ nop -> 22\n"
        "27. nop -> 24\n"
        "28+ nop -> 5\n"
        "29. nop -> 15\n"
        "30+ nop -> 22\n"
        "31. nop -> 24\n"
        "32+ nop -> 28\n"
        "33. nop -> 30\n";
    Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
    EXPECT_EQ(kWant, forward);
  }

  {
    const char kWant[] =
        "3+ nop -> 36\n"
        "4+ nop -> 31\n"
        "5. nop -> 33\n"
        "6+ byte [00-09] 0 -> 8\n"
        "7. byte [0b-ff] 0 -> 8\n"
        "8+ nop -> 6\n"
        "9+ nop -> 29\n"
        "10. nop -> 28\n"
        "11+ byte [00-09] 0 -> 13\n"
        "12. byte [0b-ff] 0 -> 13\n"
        "13+ nop -> 11\n"
        "14+ nop -> 26\n"
        "15. nop -> 28\n"
        "16+ byte [00-09] 0 -> 18\n"
        "17. byte [0b-ff] 0 -> 18\n"
        "18+ nop -> 16\n"
        "19+ nop -> 36\n"
        "20. nop -> 33\n"
        "21+ byte [00-09] 0 -> 23\n"
        "22. byte [0b-ff] 0 -> 23\n"
        "23+ nop -> 21\n"
        "24+ nop -> 31\n"
        "25. nop -> 33\n"
        "26+ nop -> 28\n"
        "27. byte [53-53] 0 -> 11\n"
        "28. match! 0\n"
        "29+ nop -> 28\n"
        "30. byte [53-53] 0 -> 6\n"
        "31+ nop -> 33\n"
        "32. byte [53-53] 0 -> 21\n"
        "33+ nop -> 29\n"
        "34+ nop -> 26\n"
        "35. nop -> 28\n"
        "36+ nop -> 33\n"
        "37. byte [53-53] 0 -> 16\n";
    Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
    EXPECT_EQ(kWant, forward);
  }
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,376 @@
|
||||
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/flags/flag.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/testing/string_generator.h"
|
||||
#include "util/malloc_counter.h"
|
||||
|
||||
// Gates the heap-usage assertions in the tests below; while it is false
// those assertions are skipped.
static const bool UsingMallocCounter = false;
|
||||
|
||||
// Number of "[ab]" repetitions in the pattern built by
// Multithreaded.BuildEntireDFA.
ABSL_FLAG(int, size, 8, "log2(number of DFA nodes)");
|
||||
// How many times the multithreaded tests repeat their thread fan-out.
ABSL_FLAG(int, repeat, 2, "Repetition count.");
|
||||
// How many threads the multithreaded tests start per repetition.
ABSL_FLAG(int, threads, 4, "number of threads");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Incremented by the DFA state cache reset hook installed below.
static int state_cache_resets = 0;
|
||||
// Incremented by the DFA search failure hook installed below.
static int search_failures = 0;
|
||||
|
||||
// Installs the two counting hooks.  The constructor runs when the
// set_hooks object defined at the end of the struct is initialized.
struct SetHooks {
|
||||
SetHooks() {
|
||||
// Count state cache resets; the hook argument is ignored.
hooks::SetDFAStateCacheResetHook([](const hooks::DFAStateCacheReset&) {
|
||||
++state_cache_resets;
|
||||
});
|
||||
// Count search failures; the hook argument is ignored.
hooks::SetDFASearchFailureHook([](const hooks::DFASearchFailure&) {
|
||||
++search_failures;
|
||||
});
|
||||
}
|
||||
} set_hooks;
|
||||
|
||||
// Check that multithreaded access to DFA class works.
|
||||
|
||||
// Helper function: builds the entire DFA for prog (first-match kind, no
// callback) and asserts that BuildEntireDFA returns a true value.
// Used as the thread body in Multithreaded.BuildEntireDFA.
|
||||
static void DoBuild(Prog* prog) {
|
||||
ASSERT_TRUE(prog->BuildEntireDFA(Prog::kFirstMatch, nullptr));
|
||||
}
|
||||
|
||||
// Builds the whole DFA for one program from several threads at once.
TEST(Multithreaded, BuildEntireDFA) {
  // Create regexp with 2^FLAGS_size states in DFA.
  const int size = absl::GetFlag(FLAGS_size);
  std::string pattern = "a";
  for (int i = 0; i < size; i++)
    pattern += "[ab]";
  pattern += "b";
  Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
  ASSERT_TRUE(re != NULL);

  // Baseline: a single worker thread building the DFA on its own.
  {
    Prog* prog = re->CompileToProg(0);
    ASSERT_TRUE(prog != NULL);

    std::thread worker(DoBuild, prog);
    worker.join();

    delete prog;
  }

  // Build the DFA simultaneously in a bunch of threads.
  const int repeat = absl::GetFlag(FLAGS_repeat);
  const int nthreads = absl::GetFlag(FLAGS_threads);
  for (int round = 0; round < repeat; round++) {
    Prog* prog = re->CompileToProg(0);
    ASSERT_TRUE(prog != NULL);

    std::vector<std::thread> workers;
    for (int j = 0; j < nthreads; j++)
      workers.emplace_back(DoBuild, prog);
    for (int j = 0; j < nthreads; j++)
      workers[j].join();

    // One more build on this thread, to make sure everything is okay.
    prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
    delete prog;
  }

  re->Decref();
}
|
||||
|
||||
// Check that DFA size requirements are followed.
|
||||
// BuildEntireDFA will, like SearchDFA, stop building out
|
||||
// the DFA once the memory limits are reached.
|
||||
TEST(SingleThreaded, BuildEntireDFA) {
|
||||
// Create regexp with 2^30 states in DFA.
|
||||
Regexp* re = Regexp::Parse("a[ab]{30}b", Regexp::LikePerl, NULL);
|
||||
ASSERT_TRUE(re != NULL);
|
||||
|
||||
for (int i = 17; i < 24; i++) {
|
||||
int64_t limit = int64_t{1}<<i;
|
||||
int64_t usage;
|
||||
//int64_t progusage, dfamem;
|
||||
{
|
||||
testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
|
||||
Prog* prog = re->CompileToProg(limit);
|
||||
ASSERT_TRUE(prog != NULL);
|
||||
//progusage = m.HeapGrowth();
|
||||
//dfamem = prog->dfa_mem();
|
||||
prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
|
||||
prog->BuildEntireDFA(Prog::kLongestMatch, nullptr);
|
||||
usage = m.HeapGrowth();
|
||||
delete prog;
|
||||
}
|
||||
if (UsingMallocCounter) {
|
||||
//ABSL_LOG(INFO) << "limit " << limit << ", "
|
||||
// << "prog usage " << progusage << ", "
|
||||
// << "DFA budget " << dfamem << ", "
|
||||
// << "total " << usage;
|
||||
// Tolerate +/- 10%.
|
||||
ASSERT_GT(usage, limit*9/10);
|
||||
ASSERT_LT(usage, limit*11/10);
|
||||
}
|
||||
}
|
||||
re->Decref();
|
||||
}
|
||||
|
||||
// Test that the DFA gets the right result even if it runs
|
||||
// out of memory during a search. The regular expression
|
||||
// 0[01]{n}$ matches a binary string of 0s and 1s only if
|
||||
// the (n+1)th-to-last character is a 0. Matching this in
|
||||
// a single forward pass (as done by the DFA) requires
|
||||
// keeping one bit for each of the last n+1 characters
|
||||
// (whether each was a 0), or 2^(n+1) possible states.
|
||||
// If we run this regexp to search in a string that contains
|
||||
// every possible n-character binary string as a substring,
|
||||
// then it will have to run through at least 2^n states.
|
||||
// States are big data structures -- certainly more than 1 byte --
|
||||
// so if the DFA can search correctly while staying within a
|
||||
// 2^n byte limit, it must be handling out-of-memory conditions
|
||||
// gracefully.
|
||||
TEST(SingleThreaded, SearchDFA) {
|
||||
// The De Bruijn string is the worst case input for this regexp.
|
||||
// By default, the DFA will notice that it is flushing its cache
|
||||
// too frequently and will bail out early, so that RE2 can use the
|
||||
// NFA implementation instead. (The DFA loses its speed advantage
|
||||
// if it can't get a good cache hit rate.)
|
||||
// Tell the DFA to trudge along instead.
|
||||
Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(false);
|
||||
state_cache_resets = 0;
|
||||
search_failures = 0;
|
||||
|
||||
// Choice of n is mostly arbitrary, except that:
|
||||
// * making n too big makes the test run for too long.
|
||||
// * making n too small makes the DFA refuse to run,
|
||||
// because it has so little memory compared to the program size.
|
||||
// Empirically, n = 18 is a good compromise between the two.
|
||||
const int n = 18;
|
||||
|
||||
Regexp* re = Regexp::Parse(absl::StrFormat("0[01]{%d}$", n),
|
||||
Regexp::LikePerl, NULL);
|
||||
ASSERT_TRUE(re != NULL);
|
||||
|
||||
// The De Bruijn string for n ends with a 1 followed by n 0s in a row,
|
||||
// which is not a match for 0[01]{n}$. Adding one more 0 is a match.
|
||||
std::string no_match = DeBruijnString(n);
|
||||
std::string match = no_match + "0";
|
||||
|
||||
int64_t usage;
|
||||
int64_t peak_usage;
|
||||
{
|
||||
testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
|
||||
Prog* prog = re->CompileToProg(1<<n);
|
||||
ASSERT_TRUE(prog != NULL);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
bool matched = false;
|
||||
bool failed = false;
|
||||
matched =
|
||||
prog->SearchDFA(match, absl::string_view(), Prog::kUnanchored,
|
||||
Prog::kFirstMatch, NULL, &failed, NULL);
|
||||
ASSERT_FALSE(failed);
|
||||
ASSERT_TRUE(matched);
|
||||
matched =
|
||||
prog->SearchDFA(no_match, absl::string_view(), Prog::kUnanchored,
|
||||
Prog::kFirstMatch, NULL, &failed, NULL);
|
||||
ASSERT_FALSE(failed);
|
||||
ASSERT_FALSE(matched);
|
||||
}
|
||||
usage = m.HeapGrowth();
|
||||
peak_usage = m.PeakHeapGrowth();
|
||||
delete prog;
|
||||
}
|
||||
if (UsingMallocCounter) {
|
||||
//ABSL_LOG(INFO) << "usage " << usage << ", "
|
||||
// << "peak usage " << peak_usage;
|
||||
ASSERT_LT(usage, 1<<n);
|
||||
ASSERT_LT(peak_usage, 1<<n);
|
||||
}
|
||||
re->Decref();
|
||||
|
||||
// Reset to original behaviour.
|
||||
Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(true);
|
||||
ASSERT_GT(state_cache_resets, 0);
|
||||
ASSERT_EQ(search_failures, 0);
|
||||
}
|
||||
|
||||
// Helper function: searches for match, which should match,
|
||||
// and no_match, which should not.
|
||||
static void DoSearch(Prog* prog, absl::string_view match,
|
||||
absl::string_view no_match) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
bool matched = false;
|
||||
bool failed = false;
|
||||
matched =
|
||||
prog->SearchDFA(match, absl::string_view(), Prog::kUnanchored,
|
||||
Prog::kFirstMatch, NULL, &failed, NULL);
|
||||
ASSERT_FALSE(failed);
|
||||
ASSERT_TRUE(matched);
|
||||
matched =
|
||||
prog->SearchDFA(no_match, absl::string_view(), Prog::kUnanchored,
|
||||
Prog::kFirstMatch, NULL, &failed, NULL);
|
||||
ASSERT_FALSE(failed);
|
||||
ASSERT_FALSE(matched);
|
||||
}
|
||||
}
|
||||
|
||||
// Runs the memory-constrained DFA search from SingleThreaded.SearchDFA
// concurrently from several threads sharing one program.
TEST(Multithreaded, SearchDFA) {
  Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(false);
  // The override above is process-wide.  Restore the original behaviour
  // on every exit path, including the early returns that a failing
  // ASSERT_* performs, so later tests are not affected.
  struct RestoreBailWhenSlow {
    ~RestoreBailWhenSlow() {
      Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(true);
    }
  } restore_bail_when_slow;
  state_cache_resets = 0;
  search_failures = 0;

  // Same as single-threaded test above.
  const int n = 18;
  Regexp* re = Regexp::Parse(absl::StrFormat("0[01]{%d}$", n),
                             Regexp::LikePerl, NULL);
  ASSERT_TRUE(re != NULL);
  std::string no_match = DeBruijnString(n);
  std::string match = no_match + "0";

  // Baseline: a single worker thread searching on its own.
  {
    Prog* prog = re->CompileToProg(1<<n);
    ASSERT_TRUE(prog != NULL);

    std::thread t(DoSearch, prog, match, no_match);
    t.join();

    delete prog;
  }

  // Run the search simultaneously in a bunch of threads.
  // Reuses the repeat/threads flags of Multithreaded.BuildEntireDFA above.
  for (int i = 0; i < absl::GetFlag(FLAGS_repeat); i++) {
    Prog* prog = re->CompileToProg(1<<n);
    ASSERT_TRUE(prog != NULL);

    std::vector<std::thread> threads;
    for (int j = 0; j < absl::GetFlag(FLAGS_threads); j++)
      threads.emplace_back(DoSearch, prog, match, no_match);
    for (int j = 0; j < absl::GetFlag(FLAGS_threads); j++)
      threads[j].join();

    delete prog;
  }

  re->Decref();

  // The searches must have reset the state cache at least once and must
  // never have been reported as failures by the hooks.
  ASSERT_GT(state_cache_resets, 0);
  ASSERT_EQ(search_failures, 0);
}
|
||||
|
||||
// One case for the reverse-DFA test: a pattern, the text to search,
// and whether the search is expected to report a match.
struct ReverseTest {
  const char* regexp;
  const char* text;
  bool match;
};

// Test that reverse DFA handles anchored/unanchored correctly.
// It's in the DFA interface but not used by RE2.
// The table is read-only test data, so it is file-local and immutable.
static const ReverseTest reverse_tests[] = {
  { "\\A(a|b)", "abc", true },
  { "(a|b)\\z", "cba", true },
  { "\\A(a|b)", "cba", false },
  { "(a|b)\\z", "abc", false },
};
|
||||
|
||||
// Runs each entry of reverse_tests through a reverse-compiled prog and
// counts the entries whose match result differs from the expected one.
TEST(DFA, ReverseMatch) {
  int mismatches = 0;
  for (const ReverseTest& t : reverse_tests) {
    Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
    ASSERT_TRUE(re != NULL);
    Prog* prog = re->CompileToReverseProg(0);
    ASSERT_TRUE(prog != NULL);

    bool failed = false;
    const bool matched =
        prog->SearchDFA(t.text, absl::string_view(), Prog::kUnanchored,
                        Prog::kFirstMatch, NULL, &failed, NULL);
    if (matched != t.match) {
      ABSL_LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
      mismatches++;
    }

    delete prog;
    re->Decref();
  }
  EXPECT_EQ(mismatches, 0);
}
|
||||
|
||||
// One case for the DFA callback test: a pattern and the expected
// textual dump of the DFA states reported through the callback.
struct CallbackTest {
  const char* regexp;
  const char* dump;
};

// Test that DFA::BuildAllStates() builds the expected DFA states
// and issues the expected callbacks. These test cases reflect the
// very compact encoding of the callbacks, but that also makes them
// very difficult to understand, so let's work through "\\Aa\\z".
// There are three slots per DFA state because the bytemap has two
// equivalence classes and there is a third slot for kByteEndText:
//   0: all bytes that are not 'a'
//   1: the byte 'a'
//   2: kByteEndText
// -1 means that there is no transition from that DFA state to any
// other DFA state for that slot. The valid transitions are thus:
//   state 0 --slot 1--> state 1
//   state 1 --slot 2--> state 2
// The double brackets indicate that state 2 is a matching state.
// Putting it together, this means that the DFA must consume the
// byte 'a' and then hit end of text. Q.E.D.
// The table is read-only test data, so it is file-local and immutable.
static const CallbackTest callback_tests[] = {
  { "\\Aa\\z", "[-1,1,-1] [-1,-1,2] [[-1,-1,-1]]" },
  { "\\Aab\\z", "[-1,1,-1,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
  { "\\Aa*b\\z", "[-1,0,1,-1] [-1,-1,-1,2] [[-1,-1,-1,-1]]" },
  { "\\Aa+b\\z", "[-1,1,-1,-1] [-1,1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
  { "\\Aa?b\\z", "[-1,1,2,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
  { "\\Aa\\C*\\z", "[-1,1,-1] [1,1,2] [[-1,-1,-1]]" },
  { "\\Aa\\C*", "[-1,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
  { "a\\C*", "[0,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
  { "\\C*", "[1,2] [[1,1]] [[-1,-1]]" },
  { "a", "[0,1,-1] [2,2,2] [[-1,-1,-1]]" },
};
|
||||
|
||||
// Builds the entire DFA for each entry of callback_tests, renders the
// callbacks as text and compares the rendering with the expected dump.
TEST(DFA, Callback) {
  int mismatches = 0;
  for (const CallbackTest& t : callback_tests) {
    Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
    ASSERT_TRUE(re != NULL);
    Prog* prog = re->CompileToProg(0);
    ASSERT_TRUE(prog != NULL);

    // Each callback contributes one bracketed, comma-separated group;
    // groups are separated by single spaces and matching states get
    // doubled brackets.
    std::string got;
    auto record = [&](const int* next, bool match) {
      ASSERT_TRUE(next != NULL);
      if (!got.empty())
        got += " ";
      got += match ? "[[" : "[";
      for (int slot = 0; slot < prog->bytemap_range() + 1; slot++)
        got += absl::StrFormat("%d,", next[slot]);
      got.pop_back();  // drop the trailing comma
      got += match ? "]]" : "]";
    };
    prog->BuildEntireDFA(Prog::kLongestMatch, record);

    if (got != t.dump) {
      ABSL_LOG(ERROR) << t.regexp << " bytemap:\n" << prog->DumpByteMap();
      ABSL_LOG(ERROR) << t.regexp << " dump:\n" << "got " << got << "\n"
                      << "want " << t.dump;
      mismatches++;
    }

    delete prog;
    re->Decref();
  }
  EXPECT_EQ(mismatches, 0);
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,172 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Dump the regexp into a string showing structure.
|
||||
// Tested by parse_unittest.cc
|
||||
|
||||
// This function traverses the regexp recursively,
|
||||
// meaning that on inputs like Regexp::Simplify of
|
||||
// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100},
|
||||
// it takes time and space exponential in the size of the
|
||||
// original regular expression. It can also use stack space
|
||||
// linear in the size of the regular expression for inputs
|
||||
// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*.
|
||||
// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE.
|
||||
// As a result, Dump is provided only in the testing
|
||||
// library (see BUILD).
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "util/utf.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Short names used by the dump, indexed by the value of Regexp::op().
// Both the strings and the table itself are immutable.
static const char* const kOpcodeNames[] = {
  "bad",
  "no",
  "emp",
  "lit",
  "str",
  "cat",
  "alt",
  "star",
  "plus",
  "que",
  "rep",
  "cap",
  "dot",
  "byte",
  "bol",
  "eol",
  "wb",   // kRegexpWordBoundary
  "nwb",  // kRegexpNoWordBoundary
  "bot",
  "eot",
  "cc",
  "match",
};
|
||||
|
||||
// Create string representation of regexp with explicit structure.
|
||||
// Nothing pretty, just for testing.
|
||||
static void DumpRegexpAppending(Regexp* re, std::string* s) {
|
||||
if (re->op() < 0 || re->op() >= ABSL_ARRAYSIZE(kOpcodeNames)) {
|
||||
*s += absl::StrFormat("op%d", re->op());
|
||||
} else {
|
||||
switch (re->op()) {
|
||||
default:
|
||||
break;
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
case kRegexpRepeat:
|
||||
if (re->parse_flags() & Regexp::NonGreedy)
|
||||
s->append("n");
|
||||
break;
|
||||
}
|
||||
s->append(kOpcodeNames[re->op()]);
|
||||
if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) {
|
||||
Rune r = re->rune();
|
||||
if ('a' <= r && r <= 'z')
|
||||
s->append("fold");
|
||||
}
|
||||
if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) {
|
||||
for (int i = 0; i < re->nrunes(); i++) {
|
||||
Rune r = re->runes()[i];
|
||||
if ('a' <= r && r <= 'z') {
|
||||
s->append("fold");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
s->append("{");
|
||||
switch (re->op()) {
|
||||
default:
|
||||
break;
|
||||
case kRegexpEndText:
|
||||
if (!(re->parse_flags() & Regexp::WasDollar)) {
|
||||
s->append("\\z");
|
||||
}
|
||||
break;
|
||||
case kRegexpLiteral: {
|
||||
Rune r = re->rune();
|
||||
if (re->parse_flags() & Regexp::Latin1) {
|
||||
s->push_back(r);
|
||||
} else {
|
||||
char buf[UTFmax+1];
|
||||
buf[runetochar(buf, &r)] = 0;
|
||||
s->append(buf);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case kRegexpLiteralString:
|
||||
for (int i = 0; i < re->nrunes(); i++) {
|
||||
Rune r = re->runes()[i];
|
||||
if (re->parse_flags() & Regexp::Latin1) {
|
||||
s->push_back(r);
|
||||
} else {
|
||||
char buf[UTFmax+1];
|
||||
buf[runetochar(buf, &r)] = 0;
|
||||
s->append(buf);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case kRegexpConcat:
|
||||
case kRegexpAlternate:
|
||||
for (int i = 0; i < re->nsub(); i++)
|
||||
DumpRegexpAppending(re->sub()[i], s);
|
||||
break;
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
DumpRegexpAppending(re->sub()[0], s);
|
||||
break;
|
||||
case kRegexpCapture:
|
||||
if (re->cap() == 0)
|
||||
ABSL_LOG(DFATAL) << "kRegexpCapture cap() == 0";
|
||||
if (re->name()) {
|
||||
s->append(*re->name());
|
||||
s->append(":");
|
||||
}
|
||||
DumpRegexpAppending(re->sub()[0], s);
|
||||
break;
|
||||
case kRegexpRepeat:
|
||||
s->append(absl::StrFormat("%d,%d ", re->min(), re->max()));
|
||||
DumpRegexpAppending(re->sub()[0], s);
|
||||
break;
|
||||
case kRegexpCharClass: {
|
||||
std::string sep;
|
||||
for (CharClass::iterator it = re->cc()->begin();
|
||||
it != re->cc()->end(); ++it) {
|
||||
RuneRange rr = *it;
|
||||
s->append(sep);
|
||||
if (rr.lo == rr.hi)
|
||||
s->append(absl::StrFormat("%#x", rr.lo));
|
||||
else
|
||||
s->append(absl::StrFormat("%#x-%#x", rr.lo, rr.hi));
|
||||
sep = " ";
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
s->append("}");
|
||||
}
|
||||
|
||||
// Returns the structural dump of this regexp as a string.
std::string Regexp::Dump() {
  // Make sure that we are being called from a unit test.
  // Should cause a link error if used outside of testing.
  ABSL_CHECK(!::testing::TempDir().empty());

  std::string dumped;
  DumpRegexpAppending(this, &dumped);
  return dumped;
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,40 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test simple repetition operators
|
||||
TEST(Repetition, Simple) {
|
||||
std::vector<std::string> ops = Split(" ",
|
||||
"%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
|
||||
"%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
|
||||
"%s* %s+ %s? %s*? %s+? %s??");
|
||||
ExhaustiveTest(3, 2, Explode("abc."), ops,
|
||||
6, Explode("ab"), "(?:%s)", "");
|
||||
ExhaustiveTest(3, 2, Explode("abc."), ops,
|
||||
40, Explode("a"), "(?:%s)", "");
|
||||
}
|
||||
|
||||
// Test capturing parens -- (a) -- inside repetition operators
|
||||
TEST(Repetition, Capturing) {
|
||||
std::vector<std::string> ops = Split(" ",
|
||||
"%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
|
||||
"%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
|
||||
"%s* %s+ %s? %s*? %s+? %s??");
|
||||
ExhaustiveTest(3, 2, Split(" ", "a (a) b"), ops,
|
||||
7, Explode("ab"), "(?:%s)", "");
|
||||
ExhaustiveTest(3, 2, Split(" ", "a (a)"), ops,
|
||||
50, Explode("a"), "(?:%s)", "");
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,72 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test empty string matches (aka "(?:)")
|
||||
TEST(EmptyString, Exhaustive) {
|
||||
ExhaustiveTest(2, 2, Split(" ", "(?:) a"),
|
||||
RegexpGenerator::EgrepOps(),
|
||||
5, Split("", "ab"), "", "");
|
||||
}
|
||||
|
||||
// Test escaped versions of regexp syntax.
|
||||
TEST(Punctuation, Literals) {
|
||||
std::vector<std::string> alphabet = Explode("()*+?{}[]\\^$.");
|
||||
std::vector<std::string> escaped = alphabet;
|
||||
for (size_t i = 0; i < escaped.size(); i++)
|
||||
escaped[i] = "\\" + escaped[i];
|
||||
ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
|
||||
2, alphabet, "", "");
|
||||
}
|
||||
|
||||
// Test ^ $ . \A \z in presence of line endings.
|
||||
// Have to wrap the empty-width ones in (?:) so that
|
||||
// they can be repeated -- PCRE rejects ^* but allows (?:^)*
|
||||
TEST(LineEnds, Exhaustive) {
|
||||
ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"),
|
||||
RegexpGenerator::EgrepOps(),
|
||||
4, Explode("ab\n"), "", "");
|
||||
}
|
||||
|
||||
// Test what does and does not match \n.
|
||||
// This would be a good test, except that PCRE seems to have a bug:
|
||||
// in single-byte character set mode (the default),
|
||||
// [^a] matches \n, but in UTF-8 mode it does not.
|
||||
// So when we run the test, the tester complains that
|
||||
// we don't agree with PCRE, but it's PCRE that is at fault.
|
||||
// For what it's worth, Perl gets this right (matches
|
||||
// regardless of whether UTF-8 input is selected):
|
||||
//
|
||||
// #!/usr/bin/perl
|
||||
// use POSIX qw(locale_h);
|
||||
// print "matches in latin1\n" if "\n" =~ /[^a]/;
|
||||
// setlocale("en_US.utf8");
|
||||
// print "matches in utf8\n" if "\n" =~ /[^a]/;
|
||||
//
|
||||
// The rule chosen for RE2 is that by default, like Perl,
|
||||
// dot does not match \n but negated character classes [^a] do.
|
||||
// (?s) will allow dot to match \n; there is no way in RE2
|
||||
// to stop [^a] from matching \n, though the underlying library
|
||||
// provides a mechanism, and RE2 could add new syntax if needed.
|
||||
//
|
||||
// TEST(Newlines, Exhaustive) {
|
||||
// std::vector<std::string> empty_vector;
|
||||
// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
|
||||
// RegexpGenerator::EgrepOps(),
|
||||
// 4, Explode("a\n"), "");
|
||||
// }
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,100 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
#include "util/utf.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test simple character classes by themselves.
|
||||
TEST(CharacterClasses, Exhaustive) {
|
||||
std::vector<std::string> atoms = Split(" ",
|
||||
"[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
|
||||
ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
|
||||
5, Explode("ab"), "", "");
|
||||
}
|
||||
|
||||
// Test simple character classes inside a___b (for example, a[a]b).
|
||||
TEST(CharacterClasses, ExhaustiveAB) {
|
||||
std::vector<std::string> atoms = Split(" ",
|
||||
"[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
|
||||
ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
|
||||
5, Explode("ab"), "a%sb", "");
|
||||
}
|
||||
|
||||
// Returns UTF8 for Rune r
|
||||
static std::string UTF8(Rune r) {
|
||||
char buf[UTFmax+1];
|
||||
buf[runetochar(buf, &r)] = 0;
|
||||
return std::string(buf);
|
||||
}
|
||||
|
||||
// Returns a vector of "interesting" UTF8 characters.
|
||||
// Unicode is now too big to just return all of them,
|
||||
// so UTF8Characters return a set likely to be good test cases.
|
||||
static const std::vector<std::string>& InterestingUTF8() {
|
||||
static bool init;
|
||||
static std::vector<std::string> v;
|
||||
|
||||
if (init)
|
||||
return v;
|
||||
|
||||
init = true;
|
||||
// All the Latin1 equivalents are interesting.
|
||||
for (int i = 1; i < 256; i++)
|
||||
v.push_back(UTF8(i));
|
||||
|
||||
// After that, the codes near bit boundaries are
|
||||
// interesting, because they span byte sequence lengths.
|
||||
for (int j = 0; j < 8; j++)
|
||||
v.push_back(UTF8(256 + j));
|
||||
for (int i = 512; i < Runemax; i <<= 1)
|
||||
for (int j = -8; j < 8; j++)
|
||||
v.push_back(UTF8(i + j));
|
||||
|
||||
// The codes near Runemax, including Runemax itself, are interesting.
|
||||
for (int j = -8; j <= 0; j++)
|
||||
v.push_back(UTF8(Runemax + j));
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
// Test interesting UTF-8 characters against character classes.
|
||||
TEST(InterestingUTF8, SingleOps) {
|
||||
std::vector<std::string> atoms = Split(" ",
|
||||
". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
|
||||
"[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
|
||||
"[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
|
||||
"[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
|
||||
std::vector<std::string> ops; // no ops
|
||||
ExhaustiveTest(1, 0, atoms, ops,
|
||||
1, InterestingUTF8(), "", "");
|
||||
}
|
||||
|
||||
// Test interesting UTF-8 characters against character classes,
|
||||
// but wrap everything inside AB.
|
||||
TEST(InterestingUTF8, AB) {
|
||||
std::vector<std::string> atoms = Split(" ",
|
||||
". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
|
||||
"[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
|
||||
"[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
|
||||
"[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
|
||||
std::vector<std::string> ops; // no ops
|
||||
std::vector<std::string> alpha = InterestingUTF8();
|
||||
for (size_t i = 0; i < alpha.size(); i++)
|
||||
alpha[i] = "a" + alpha[i] + "b";
|
||||
ExhaustiveTest(1, 0, atoms, ops,
|
||||
1, alpha, "a%sb", "");
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,35 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test very simple expressions.
|
||||
TEST(EgrepLiterals, Lowercase) {
|
||||
EgrepTest(3, 2, "abc.", 3, "abc", "");
|
||||
}
|
||||
|
||||
// Test mixed-case expressions.
|
||||
TEST(EgrepLiterals, MixedCase) {
|
||||
EgrepTest(3, 2, "AaBb.", 2, "AaBb", "");
|
||||
}
|
||||
|
||||
// Test mixed-case in case-insensitive mode.
|
||||
TEST(EgrepLiterals, FoldCase) {
|
||||
// The punctuation characters surround A-Z and a-z
|
||||
// in the ASCII table. This looks for bugs in the
|
||||
// bytemap range code in the DFA.
|
||||
EgrepTest(3, 2, "abAB.", 2, "aBc@_~", "(?i:%s)");
|
||||
}
|
||||
|
||||
// Test very simple expressions.
|
||||
TEST(EgrepLiterals, UTF8) {
|
||||
EgrepTest(3, 2, "ab.", 4, "a\xE2\x98\xBA", "");
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,204 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
// Each test picks an alphabet (e.g., "abc"), a maximum string length,
|
||||
// a maximum regular expression length, and a maximum number of letters
|
||||
// that can appear in the regular expression. Given these parameters,
|
||||
// it tries every possible regular expression and string, verifying that
|
||||
// the NFA, DFA, and a trivial backtracking implementation agree about
|
||||
// the location of the match.
|
||||
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/flags/flag.h"
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
#include "re2/testing/tester.h"
|
||||
|
||||
// For target `log' in the Makefile.
|
||||
#ifndef LOGGING
|
||||
#define LOGGING 0
|
||||
#endif
|
||||
|
||||
ABSL_FLAG(bool, show_regexps, false, "show regexps during testing");
|
||||
|
||||
ABSL_FLAG(int, max_bad_regexp_inputs, 1,
|
||||
"Stop testing a regular expression after finding this many "
|
||||
"strings that break it.");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Returns sp wrapped in double quotes, with backslash and double quote
// preceded by a backslash and newline written as "\n".
// The result lives in a static buffer that the next call overwrites;
// input too long for the buffer is reported with a FATAL log.
static char* escape(absl::string_view sp) {
  static char buf[512];
  char* const limit = buf + sizeof buf;
  char* out = buf;

  *out++ = '\"';
  for (const char c : sp) {
    // Keep room for an escaped character plus the closing quote and NUL.
    if (out+5 >= limit)
      ABSL_LOG(FATAL) << "ExhaustiveTester escape: too long";
    switch (c) {
      case '\\':
      case '\"':
        *out++ = '\\';
        *out++ = c;
        break;
      case '\n':
        *out++ = '\\';
        *out++ = 'n';
        break;
      default:
        *out++ = c;
        break;
    }
  }
  *out++ = '\"';
  *out = '\0';
  return buf;
}
|
||||
|
||||
static void PrintResult(const RE2& re, absl::string_view input,
|
||||
RE2::Anchor anchor, absl::string_view* m, int n) {
|
||||
if (!re.Match(input, 0, input.size(), anchor, m, n)) {
|
||||
absl::PrintF("-");
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < n; i++) {
|
||||
if (i > 0)
|
||||
absl::PrintF(" ");
|
||||
if (m[i].data() == NULL)
|
||||
absl::PrintF("-");
|
||||
else
|
||||
absl::PrintF("%d-%d",
|
||||
BeginPtr(m[i]) - BeginPtr(input),
|
||||
EndPtr(m[i]) - BeginPtr(input));
|
||||
}
|
||||
}
|
||||
|
||||
// Processes a single generated regexp.
|
||||
// Compiles it using Regexp interface and PCRE, and then
|
||||
// checks that NFA, DFA, and PCRE all return the same results.
|
||||
void ExhaustiveTester::HandleRegexp(const std::string& const_regexp) {
|
||||
regexps_++;
|
||||
std::string regexp = const_regexp;
|
||||
if (!topwrapper_.empty()) {
|
||||
auto fmt = absl::ParsedFormat<'s'>::New(topwrapper_);
|
||||
ABSL_CHECK(fmt != nullptr);
|
||||
regexp = absl::StrFormat(*fmt, regexp);
|
||||
}
|
||||
|
||||
if (absl::GetFlag(FLAGS_show_regexps)) {
|
||||
absl::PrintF("\r%s", regexp);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
if (LOGGING) {
|
||||
// Write out test cases and answers for use in testing
|
||||
// other implementations, such as Go's regexp package.
|
||||
if (randomstrings_)
|
||||
ABSL_LOG(ERROR) << "Cannot log with random strings.";
|
||||
if (regexps_ == 1) { // first
|
||||
absl::PrintF("strings\n");
|
||||
strgen_.Reset();
|
||||
while (strgen_.HasNext())
|
||||
absl::PrintF("%s\n", escape(strgen_.Next()));
|
||||
absl::PrintF("regexps\n");
|
||||
}
|
||||
absl::PrintF("%s\n", escape(regexp));
|
||||
|
||||
RE2 re(regexp);
|
||||
RE2::Options longest;
|
||||
longest.set_longest_match(true);
|
||||
RE2 relongest(regexp, longest);
|
||||
int ngroup = re.NumberOfCapturingGroups()+1;
|
||||
absl::string_view* group = new absl::string_view[ngroup];
|
||||
|
||||
strgen_.Reset();
|
||||
while (strgen_.HasNext()) {
|
||||
absl::string_view input = strgen_.Next();
|
||||
PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup);
|
||||
absl::PrintF(";");
|
||||
PrintResult(re, input, RE2::UNANCHORED, group, ngroup);
|
||||
absl::PrintF(";");
|
||||
PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup);
|
||||
absl::PrintF(";");
|
||||
PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup);
|
||||
absl::PrintF("\n");
|
||||
}
|
||||
delete[] group;
|
||||
return;
|
||||
}
|
||||
|
||||
Tester tester(regexp);
|
||||
if (tester.error())
|
||||
return;
|
||||
|
||||
strgen_.Reset();
|
||||
strgen_.GenerateNULL();
|
||||
if (randomstrings_)
|
||||
strgen_.Random(stringseed_, stringcount_);
|
||||
int bad_inputs = 0;
|
||||
while (strgen_.HasNext()) {
|
||||
tests_++;
|
||||
if (!tester.TestInput(strgen_.Next())) {
|
||||
failures_++;
|
||||
if (++bad_inputs >= absl::GetFlag(FLAGS_max_bad_regexp_inputs))
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Runs an exhaustive test on the given parameters.
|
||||
void ExhaustiveTest(int maxatoms, int maxops,
|
||||
const std::vector<std::string>& alphabet,
|
||||
const std::vector<std::string>& ops,
|
||||
int maxstrlen,
|
||||
const std::vector<std::string>& stralphabet,
|
||||
const std::string& wrapper,
|
||||
const std::string& topwrapper) {
|
||||
if (RE2_DEBUG_MODE) {
|
||||
if (maxatoms > 1)
|
||||
maxatoms--;
|
||||
if (maxops > 1)
|
||||
maxops--;
|
||||
if (maxstrlen > 1)
|
||||
maxstrlen--;
|
||||
}
|
||||
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
|
||||
maxstrlen, stralphabet, wrapper,
|
||||
topwrapper);
|
||||
t.Generate();
|
||||
if (!LOGGING) {
|
||||
absl::PrintF("%d regexps, %d tests, %d failures [%d/%d str]\n",
|
||||
t.regexps(), t.tests(), t.failures(), maxstrlen, stralphabet.size());
|
||||
}
|
||||
EXPECT_EQ(0, t.failures());
|
||||
}
|
||||
|
||||
// Runs an exhaustive test using the given parameters and
|
||||
// the basic egrep operators.
|
||||
void EgrepTest(int maxatoms, int maxops, const std::string& alphabet,
|
||||
int maxstrlen, const std::string& stralphabet,
|
||||
const std::string& wrapper) {
|
||||
const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" };
|
||||
|
||||
for (size_t i = 0; i < ABSL_ARRAYSIZE(tops); i++) {
|
||||
ExhaustiveTest(maxatoms, maxops,
|
||||
Split("", alphabet),
|
||||
RegexpGenerator::EgrepOps(),
|
||||
maxstrlen,
|
||||
Split("", stralphabet),
|
||||
wrapper,
|
||||
tops[i]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,105 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H_
|
||||
#define RE2_TESTING_EXHAUSTIVE_TESTER_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
#include "re2/testing/string_generator.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Doing this simplifies the logic below.
#ifndef __has_feature
#define __has_feature(x) 0
#endif

// RE2_DEBUG_MODE is true in debug builds and in builds running under a
// sanitizer.  Sanitizers are detected through __has_feature() where the
// compiler provides it, and through the __SANITIZE_ADDRESS__ and
// __SANITIZE_THREAD__ macros for compilers that define those instead.
#if !defined(NDEBUG)
// We are in a debug build.
const bool RE2_DEBUG_MODE = true;
#elif __has_feature(address_sanitizer) || __has_feature(memory_sanitizer) || __has_feature(thread_sanitizer) || defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
// Not a debug build, but still under sanitizers.
const bool RE2_DEBUG_MODE = true;
#else
const bool RE2_DEBUG_MODE = false;
#endif
|
||||
|
||||
// Exhaustive regular expression test: generate all regexps within parameters,
|
||||
// then generate all strings of a given length over a given alphabet,
|
||||
// then check that NFA, DFA, and PCRE agree about whether each regexp matches
|
||||
// each possible string, and if so, where the match is.
|
||||
//
|
||||
// Can also be used in a "random" mode that generates a given number
|
||||
// of random regexp and strings, allowing testing of larger expressions
|
||||
// and inputs.
|
||||
class ExhaustiveTester : public RegexpGenerator {
|
||||
public:
|
||||
ExhaustiveTester(int maxatoms,
|
||||
int maxops,
|
||||
const std::vector<std::string>& alphabet,
|
||||
const std::vector<std::string>& ops,
|
||||
int maxstrlen,
|
||||
const std::vector<std::string>& stralphabet,
|
||||
const std::string& wrapper,
|
||||
const std::string& topwrapper)
|
||||
: RegexpGenerator(maxatoms, maxops, alphabet, ops),
|
||||
strgen_(maxstrlen, stralphabet),
|
||||
wrapper_(wrapper),
|
||||
topwrapper_(topwrapper),
|
||||
regexps_(0), tests_(0), failures_(0),
|
||||
randomstrings_(0), stringseed_(0), stringcount_(0) { }
|
||||
|
||||
int regexps() { return regexps_; }
|
||||
int tests() { return tests_; }
|
||||
int failures() { return failures_; }
|
||||
|
||||
// Needed for RegexpGenerator interface.
|
||||
void HandleRegexp(const std::string& regexp);
|
||||
|
||||
// Causes testing to generate random input strings.
|
||||
void RandomStrings(int32_t seed, int32_t count) {
|
||||
randomstrings_ = true;
|
||||
stringseed_ = seed;
|
||||
stringcount_ = count;
|
||||
}
|
||||
|
||||
private:
|
||||
StringGenerator strgen_;
|
||||
std::string wrapper_; // Regexp wrapper - either empty or has one %s.
|
||||
std::string topwrapper_; // Regexp top-level wrapper.
|
||||
int regexps_; // Number of HandleRegexp calls
|
||||
int tests_; // Number of regexp tests.
|
||||
int failures_; // Number of tests failed.
|
||||
|
||||
bool randomstrings_; // Whether to use random strings
|
||||
int32_t stringseed_; // If so, the seed.
|
||||
int stringcount_; // If so, how many to generate.
|
||||
|
||||
ExhaustiveTester(const ExhaustiveTester&) = delete;
|
||||
ExhaustiveTester& operator=(const ExhaustiveTester&) = delete;
|
||||
};
|
||||
|
||||
// Runs an exhaustive test on the given parameters.
|
||||
void ExhaustiveTest(int maxatoms, int maxops,
|
||||
const std::vector<std::string>& alphabet,
|
||||
const std::vector<std::string>& ops,
|
||||
int maxstrlen,
|
||||
const std::vector<std::string>& stralphabet,
|
||||
const std::string& wrapper,
|
||||
const std::string& topwrapper);
|
||||
|
||||
// Runs an exhaustive test using the given parameters and
|
||||
// the basic egrep operators.
|
||||
void EgrepTest(int maxatoms, int maxops, const std::string& alphabet,
|
||||
int maxstrlen, const std::string& stralphabet,
|
||||
const std::string& wrapper);
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H_
|
||||
@@ -0,0 +1,343 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re2/filtered_re2.h"
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
struct FilterTestVars {
|
||||
FilterTestVars() {}
|
||||
explicit FilterTestVars(int min_atom_len) : f(min_atom_len) {}
|
||||
|
||||
std::vector<std::string> atoms;
|
||||
std::vector<int> atom_indices;
|
||||
std::vector<int> matches;
|
||||
RE2::Options opts;
|
||||
FilteredRE2 f;
|
||||
};
|
||||
|
||||
// Exercises Compile() and AllMatches() on a FilteredRE2 that has had
// no regexps added.
TEST(FilteredRE2Test, EmptyTest) {
  FilterTestVars v;
  FilteredRE2& filter = v.f;

  filter.Compile(&v.atoms);
  EXPECT_EQ(size_t{0}, v.atoms.size());

  // Compile has no effect at all when called before Add: it will not
  // record that it has been called and it will not clear the vector.
  // The second point does not matter here, but the first point means
  // that an error will be logged during the call to AllMatches.
  filter.AllMatches("foo", v.atom_indices, &v.matches);
  EXPECT_EQ(size_t{0}, v.matches.size());
}
|
||||
|
||||
// With the minimum atom length raised to 4, "(foo|bar)" is expected to
// produce no atoms, yet the regexp must still be reported as matching.
TEST(FilteredRE2Test, SmallOrTest) {
  FilterTestVars v(4);  // override the minimum atom length
  FilteredRE2& filter = v.f;

  int id;
  filter.Add("(foo|bar)", v.opts, &id);

  filter.Compile(&v.atoms);
  EXPECT_EQ(size_t{0}, v.atoms.size());

  filter.AllMatches("lemurs bar", v.atom_indices, &v.matches);
  EXPECT_EQ(size_t{1}, v.matches.size());
  EXPECT_EQ(id, v.matches[0]);
}
|
||||
|
||||
// Adds a Latin-1 pattern containing high bytes and an uppercase 'Q';
// the single resulting atom is expected to have the 'Q' lowercased and
// the other bytes unchanged.
TEST(FilteredRE2Test, SmallLatinTest) {
  FilterTestVars v;
  FilteredRE2& filter = v.f;
  v.opts.set_encoding(RE2::Options::EncodingLatin1);

  int id;
  filter.Add("\xde\xadQ\xbe\xef", v.opts, &id);
  filter.Compile(&v.atoms);
  EXPECT_EQ(size_t{1}, v.atoms.size());
  EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef");

  // Report atom 0 as present and check that the regexp matches.
  v.atom_indices.push_back(0);
  filter.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches);
  EXPECT_EQ(size_t{1}, v.matches.size());
  EXPECT_EQ(id, v.matches[0]);
}
|
||||
|
||||
// One atom-extraction case: a name, the regexps to add and the atoms
// expected back.  Unused array slots are left null.
struct AtomTest {
  const char* testname;
  // If any test needs more than this many regexps or atoms, increase
  // the size of the corresponding array.
  const char* regexps[20];
  const char* atoms[20];
};

AtomTest atom_tests[] = {
  // This test checks to make sure empty patterns are allowed.
  {
    "CheckEmptyPattern",
    {""},
    {}
  },

  // This test checks that all atoms of length greater than min length
  // are found, and no atoms that are of smaller length are found.
  {
    "AllAtomsGtMinLengthFound",
    {
      "(abc123|def456|ghi789).*mnop[x-z]+",
      "abc..yyy..zz",
      "mnmnpp[a-z]+PPP"
    },
    {
      "abc123",
      "def456",
      "ghi789",
      "mnop",
      "abc",
      "yyy",
      "mnmnpp",
      "ppp"
    }
  },

  // Test to make sure that any atoms that have another atom as a
  // substring in an OR are removed; that is, only the shortest
  // substring is kept.
  {
    "SubstrAtomRemovesSuperStrInOr",
    {
      "(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+",
      "abcd..yyy..yyyzzz",
      "mnmnpp[a-z]+PPP"
    },
    {
      "abc",
      "ghi789",
      "xyz",
      "abcd",
      "yyy",
      "yyyzzz",
      "mnmnpp",
      "ppp"
    }
  },

  // Test character class expansion.
  {
    "CharClassExpansion",
    {
      "m[a-c][d-f]n.*[x-z]+",
      "[x-y]bcde[ab]"
    },
    {
      "madn", "maen", "mafn",
      "mbdn", "mben", "mbfn",
      "mcdn", "mcen", "mcfn",
      "xbcdea", "xbcdeb",
      "ybcdea", "ybcdeb"
    }
  },

  // Test upper/lower of non-ASCII.
  {
    "UnicodeLower",
    {
      "(?i)ΔδΠϖπΣςσ",
      "ΛΜΝΟΠ",
      "ψρστυ",
    },
    {
      "δδπππσσσ",
      "λμνοπ",
      "ψρστυ",
    },
  },
};
|
||||
|
||||
void AddRegexpsAndCompile(const char* regexps[],
|
||||
size_t n,
|
||||
struct FilterTestVars* v) {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
int id;
|
||||
v->f.Add(regexps[i], v->opts, &id);
|
||||
}
|
||||
v->f.Compile(&v->atoms);
|
||||
}
|
||||
|
||||
bool CheckExpectedAtoms(const char* atoms[],
|
||||
size_t n,
|
||||
const char* testname,
|
||||
struct FilterTestVars* v) {
|
||||
std::vector<std::string> expected;
|
||||
for (size_t i = 0; i < n; i++)
|
||||
expected.push_back(atoms[i]);
|
||||
|
||||
bool pass = expected.size() == v->atoms.size();
|
||||
|
||||
std::sort(v->atoms.begin(), v->atoms.end());
|
||||
std::sort(expected.begin(), expected.end());
|
||||
for (size_t i = 0; pass && i < n; i++)
|
||||
pass = pass && expected[i] == v->atoms[i];
|
||||
|
||||
if (!pass) {
|
||||
ABSL_LOG(ERROR) << "Failed " << testname;
|
||||
ABSL_LOG(ERROR) << "Expected #atoms = " << expected.size();
|
||||
for (size_t i = 0; i < expected.size(); i++)
|
||||
ABSL_LOG(ERROR) << expected[i];
|
||||
ABSL_LOG(ERROR) << "Found #atoms = " << v->atoms.size();
|
||||
for (size_t i = 0; i < v->atoms.size(); i++)
|
||||
ABSL_LOG(ERROR) << v->atoms[i];
|
||||
}
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
// Runs every entry of atom_tests through a fresh filter and counts the
// entries whose compiled atoms differ from the expected ones.
TEST(FilteredRE2Test, AtomTests) {
  int failures = 0;
  for (AtomTest& test : atom_tests) {
    FilterTestVars vars;

    // Each list ends at the first NULL entry (or at the end of the array).
    size_t num_regexps = 0;
    while (num_regexps < ABSL_ARRAYSIZE(test.regexps) &&
           test.regexps[num_regexps] != NULL)
      num_regexps++;
    size_t num_atoms = 0;
    while (num_atoms < ABSL_ARRAYSIZE(test.atoms) &&
           test.atoms[num_atoms] != NULL)
      num_atoms++;

    AddRegexpsAndCompile(test.regexps, num_regexps, &vars);
    const bool ok =
        CheckExpectedAtoms(test.atoms, num_atoms, test.testname, &vars);
    if (!ok)
      failures++;
  }
  EXPECT_EQ(0, failures);
}
|
||||
|
||||
// Translates atom strings into their positions within atoms.
//
// *atom_indices is cleared first. Then, for each string of matched_atoms in
// order, the index of its first occurrence in atoms is appended; strings
// that do not occur in atoms contribute nothing.
void FindAtomIndices(const std::vector<std::string>& atoms,
                     const std::vector<std::string>& matched_atoms,
                     std::vector<int>* atom_indices) {
  atom_indices->clear();
  for (const std::string& wanted : matched_atoms) {
    size_t pos = 0;
    while (pos < atoms.size() && atoms[pos] != wanted)
      pos++;
    if (pos < atoms.size())
      atom_indices->push_back(static_cast<int>(pos));
  }
}
|
||||
|
||||
// Checks that a filter holding only the empty pattern reports regexp 0 as
// the first match even when no atoms are supplied.
TEST(FilteredRE2Test, MatchEmptyPattern) {
  FilterTestVars v;
  AtomTest* t = &atom_tests[0];
  // We are using the regexps used in one of the atom tests
  // for this test. Adding the EXPECT here to make sure
  // the index we use for the test is for the correct test.
  EXPECT_EQ("CheckEmptyPattern", std::string(t->testname));
  // The regexp list ends at the first NULL entry.
  size_t nregexp;
  for (nregexp = 0; nregexp < ABSL_ARRAYSIZE(t->regexps); nregexp++)
    if (t->regexps[nregexp] == NULL)
      break;
  AddRegexpsAndCompile(t->regexps, nregexp, &v);
  std::string text = "0123";
  std::vector<int> atom_ids;  // deliberately left empty
  EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
}
|
||||
|
||||
// Feeds three texts, each with a hand-picked list of matched atoms, to a
// filter built from the "SubstrAtomRemovesSuperStrInOr" regexps and checks
// how many regexps are reported for each.
TEST(FilteredRE2Test, MatchTests) {
  FilterTestVars vars;
  AtomTest* test = &atom_tests[2];
  // We are using the regexps used in one of the atom tests
  // for this test.
  EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", std::string(test->testname));
  size_t num_regexps = 0;
  while (num_regexps < ABSL_ARRAYSIZE(test->regexps) &&
         test->regexps[num_regexps] != NULL)
    num_regexps++;
  AddRegexpsAndCompile(test->regexps, num_regexps, &vars);

  std::vector<int> atom_ids;
  std::vector<int> matching_regexps;
  std::vector<std::string> atoms;

  // atoms = abc
  std::string text = "abc121212xyz";
  atoms = {"abc"};
  FindAtomIndices(vars.atoms, atoms, &atom_ids);
  vars.f.AllMatches(text, atom_ids, &matching_regexps);
  EXPECT_EQ(size_t{1}, matching_regexps.size());

  text = "abc12312yyyzzz";
  atoms = {"abc", "yyy", "yyyzzz"};
  FindAtomIndices(vars.atoms, atoms, &atom_ids);
  vars.f.AllMatches(text, atom_ids, &matching_regexps);
  EXPECT_EQ(size_t{1}, matching_regexps.size());

  text = "abcd12yyy32yyyzzz";
  atoms = {"abc", "abcd", "yyy", "yyyzzz"};
  FindAtomIndices(vars.atoms, atoms, &atom_ids);
  ABSL_LOG(INFO) << "S: " << atom_ids.size();
  for (size_t k = 0; k < atom_ids.size(); k++)
    ABSL_LOG(INFO) << "i: " << k << " : " << atom_ids[k];
  vars.f.AllMatches(text, atom_ids, &matching_regexps);
  EXPECT_EQ(size_t{2}, matching_regexps.size());
}
|
||||
|
||||
TEST(FilteredRE2Test, EmptyStringInStringSetBug) {
  // Bug due to find() finding "" at the start of everything in a string
  // set and thus SimplifyStringSet() would end up erasing everything.
  // In order to test this, we have to keep PrefilterTree from discarding
  // the OR entirely, so we have to make the minimum atom length zero.

  FilterTestVars vars(0);  // override the minimum atom length
  const char* regexps[] = {"-R.+(|ADD=;AA){12}}"};
  const char* atoms[] = {"", "-r", "add=;aa", "}"};
  AddRegexpsAndCompile(regexps, ABSL_ARRAYSIZE(regexps), &vars);
  const bool atoms_ok = CheckExpectedAtoms(
      atoms, ABSL_ARRAYSIZE(atoms), "EmptyStringInStringSetBug", &vars);
  EXPECT_TRUE(atoms_ok);
}
|
||||
|
||||
// Checks move-assignment of FilteredRE2: the target takes over the source's
// behaviour, the source becomes reusable, and assigning over a populated
// filter works.
//
// Every size check that guards a subsequent [0] access is fatal (ASSERT_EQ),
// so that a wrong size can never lead to an out-of-bounds read.
TEST(FilteredRE2Test, MoveSemantics) {
  FilterTestVars v1;
  int id;
  v1.f.Add("foo\\d+", v1.opts, &id);
  EXPECT_EQ(0, id);
  v1.f.Compile(&v1.atoms);
  ASSERT_EQ(size_t{1}, v1.atoms.size());
  EXPECT_EQ("foo", v1.atoms[0]);
  v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches);
  ASSERT_EQ(size_t{1}, v1.matches.size());
  EXPECT_EQ(0, v1.matches[0]);
  v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches);
  EXPECT_EQ(size_t{0}, v1.matches.size());

  // The moved-to object should do what the moved-from object did.
  FilterTestVars v2;
  v2.f = std::move(v1.f);
  v2.f.AllMatches("abc foo1 xyz", {0}, &v2.matches);
  ASSERT_EQ(size_t{1}, v2.matches.size());
  EXPECT_EQ(0, v2.matches[0]);
  v2.f.AllMatches("abc bar2 xyz", {0}, &v2.matches);
  EXPECT_EQ(size_t{0}, v2.matches.size());

  // The moved-from object should have been reset and be reusable.
  v1.f.Add("bar\\d+", v1.opts, &id);
  EXPECT_EQ(0, id);
  v1.f.Compile(&v1.atoms);
  ASSERT_EQ(size_t{1}, v1.atoms.size());
  EXPECT_EQ("bar", v1.atoms[0]);
  v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches);
  EXPECT_EQ(size_t{0}, v1.matches.size());
  v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches);
  ASSERT_EQ(size_t{1}, v1.matches.size());
  EXPECT_EQ(0, v1.matches[0]);

  // Verify that "overwriting" works and also doesn't leak memory.
  // (The latter will need a leak detector such as LeakSanitizer.)
  v1.f = std::move(v2.f);
  v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches);
  ASSERT_EQ(size_t{1}, v1.matches.size());
  EXPECT_EQ(0, v1.matches[0]);
  v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches);
  EXPECT_EQ(size_t{0}, v1.matches.size());
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,79 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// One MimicsPCRE case: a regexp and the value that Regexp::MimicsPCRE() is
// expected to return for it.
struct PCRETest {
  const char* regexp;
  bool should_match;  // expected MimicsPCRE() result
};
|
||||
|
||||
static PCRETest tests[] = {
|
||||
// Most things should behave exactly.
|
||||
{ "abc", true },
|
||||
{ "(a|b)c", true },
|
||||
{ "(a*|b)c", true },
|
||||
{ "(a|b*)c", true },
|
||||
{ "a(b|c)d", true },
|
||||
{ "a(()|())c", true },
|
||||
{ "ab*c", true },
|
||||
{ "ab+c", true },
|
||||
{ "a(b*|c*)d", true },
|
||||
{ "\\W", true },
|
||||
{ "\\W{1,2}", true },
|
||||
{ "\\d", true },
|
||||
|
||||
// Check that repeated empty strings do not.
|
||||
{ "(a*)*", false },
|
||||
{ "x(a*)*y", false },
|
||||
{ "(a*)+", false },
|
||||
{ "(a+)*", true },
|
||||
{ "(a+)+", true },
|
||||
{ "(a+)+", true },
|
||||
|
||||
// \v is the only character class that shouldn't.
|
||||
{ "\\b", true },
|
||||
{ "\\v", false },
|
||||
{ "\\d", true },
|
||||
|
||||
// The handling of ^ in multi-line mode is different, as is
|
||||
// the handling of $ in single-line mode. (Both involve
|
||||
// boundary cases if the string ends with \n.)
|
||||
{ "\\A", true },
|
||||
{ "\\z", true },
|
||||
{ "(?m)^", false },
|
||||
{ "(?m)$", true },
|
||||
{ "(?-m)^", true },
|
||||
{ "(?-m)$", false }, // In PCRE, == \Z
|
||||
{ "(?m)\\A", true },
|
||||
{ "(?m)\\z", true },
|
||||
{ "(?-m)\\A", true },
|
||||
{ "(?-m)\\z", true },
|
||||
};
|
||||
|
||||
// Parses every table entry twice -- first with Latin-1, then without -- and
// checks MimicsPCRE() against the expected value.
TEST(MimicsPCRE, SimpleTests) {
  for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
    const PCRETest& t = tests[i];
    for (size_t j = 0; j < 2; j++) {
      Regexp::ParseFlags flags = Regexp::LikePerl;
      if (j == 0)
        flags = flags | Regexp::Latin1;
      Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
      ASSERT_TRUE(re != NULL) << " " << t.regexp;
      // Take the result and drop our reference before asserting: a failing
      // ASSERT_EQ returns from the test and would otherwise leak re.
      const bool mimics = re->MimicsPCRE();
      re->Decref();
      ASSERT_EQ(t.should_match, mimics)
          << " " << t.regexp << " "
          << (j == 0 ? "latin1" : "utf");
    }
  }
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,48 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Null walker. For benchmarking the walker itself.
|
||||
|
||||
class NullWalker : public Regexp::Walker<bool> {
|
||||
public:
|
||||
NullWalker() {}
|
||||
|
||||
virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args);
|
||||
|
||||
virtual bool ShortVisit(Regexp* re, bool a) {
|
||||
// Should never be called: we use Walk(), not WalkExponential().
|
||||
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
||||
ABSL_LOG(DFATAL) << "NullWalker::ShortVisit called";
|
||||
#endif
|
||||
return a;
|
||||
}
|
||||
|
||||
private:
|
||||
NullWalker(const NullWalker&) = delete;
|
||||
NullWalker& operator=(const NullWalker&) = delete;
|
||||
};
|
||||
|
||||
// Called after visiting re's children. child_args contains the return
|
||||
// value from each of the children's PostVisits (i.e., whether each child
|
||||
// can match an empty string). Returns whether this clause can match an
|
||||
// empty string.
|
||||
bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns whether re can match an empty string.
|
||||
void Regexp::NullWalk() {
|
||||
NullWalker w;
|
||||
w.Walk(this, false);
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,586 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Test parse.cc, dump.cc, and tostring.cc.
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// In the past, we used 1<<30 here and zeroed the bit later, but that
|
||||
// has undefined behaviour, so now we use an internal-only flag because
|
||||
// otherwise we would have to introduce a new flag value just for this.
|
||||
static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar;
|
||||
|
||||
struct Test {
|
||||
const char* regexp;
|
||||
const char* parse;
|
||||
Regexp::ParseFlags flags;
|
||||
};
|
||||
|
||||
// Default parse flags for the main table of parse tests. Declared const:
// nothing should modify it after initialization.
static const Regexp::ParseFlags kTestFlags = Regexp::MatchNL |
                                             Regexp::PerlX |
                                             Regexp::PerlClasses |
                                             Regexp::UnicodeGroups;
|
||||
|
||||
static Test tests[] = {
|
||||
// Base cases
|
||||
{ "a", "lit{a}" },
|
||||
{ "a.", "cat{lit{a}dot{}}" },
|
||||
{ "a.b", "cat{lit{a}dot{}lit{b}}" },
|
||||
{ "ab", "str{ab}" },
|
||||
{ "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
|
||||
{ "abc", "str{abc}" },
|
||||
{ "a|^", "alt{lit{a}bol{}}" },
|
||||
{ "a|b", "cc{0x61-0x62}" },
|
||||
{ "(a)", "cap{lit{a}}" },
|
||||
{ "(a)|b", "alt{cap{lit{a}}lit{b}}" },
|
||||
{ "a*", "star{lit{a}}" },
|
||||
{ "a+", "plus{lit{a}}" },
|
||||
{ "a?", "que{lit{a}}" },
|
||||
{ "a{2}", "rep{2,2 lit{a}}" },
|
||||
{ "a{2,3}", "rep{2,3 lit{a}}" },
|
||||
{ "a{2,}", "rep{2,-1 lit{a}}" },
|
||||
{ "a*?", "nstar{lit{a}}" },
|
||||
{ "a+?", "nplus{lit{a}}" },
|
||||
{ "a??", "nque{lit{a}}" },
|
||||
{ "a{2}?", "nrep{2,2 lit{a}}" },
|
||||
{ "a{2,3}?", "nrep{2,3 lit{a}}" },
|
||||
{ "a{2,}?", "nrep{2,-1 lit{a}}" },
|
||||
{ "", "emp{}" },
|
||||
{ "|", "alt{emp{}emp{}}" },
|
||||
{ "|x|", "alt{emp{}lit{x}emp{}}" },
|
||||
{ ".", "dot{}" },
|
||||
{ "^", "bol{}" },
|
||||
{ "$", "eol{}" },
|
||||
{ "\\|", "lit{|}" },
|
||||
{ "\\(", "lit{(}" },
|
||||
{ "\\)", "lit{)}" },
|
||||
{ "\\*", "lit{*}" },
|
||||
{ "\\+", "lit{+}" },
|
||||
{ "\\?", "lit{?}" },
|
||||
{ "{", "lit{{}" },
|
||||
{ "}", "lit{}}" },
|
||||
{ "\\.", "lit{.}" },
|
||||
{ "\\^", "lit{^}" },
|
||||
{ "\\$", "lit{$}" },
|
||||
{ "\\\\", "lit{\\}" },
|
||||
{ "[ace]", "cc{0x61 0x63 0x65}" },
|
||||
{ "[abc]", "cc{0x61-0x63}" },
|
||||
{ "[a-z]", "cc{0x61-0x7a}" },
|
||||
{ "[a]", "lit{a}" },
|
||||
{ "\\-", "lit{-}" },
|
||||
{ "-", "lit{-}" },
|
||||
{ "\\_", "lit{_}" },
|
||||
|
||||
// Posix and Perl extensions
|
||||
{ "[[:lower:]]", "cc{0x61-0x7a}" },
|
||||
{ "[a-z]", "cc{0x61-0x7a}" },
|
||||
{ "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
|
||||
{ "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
|
||||
{ "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
|
||||
{ "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
|
||||
{ "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
|
||||
{ "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
|
||||
{ "\\d", "cc{0x30-0x39}" },
|
||||
{ "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
|
||||
{ "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
|
||||
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
|
||||
{ "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
|
||||
{ "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
|
||||
{ "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
|
||||
{ "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
|
||||
{ "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
|
||||
{ "\\C", "byte{}" },
|
||||
|
||||
// Unicode, negatives, and a double negative.
|
||||
{ "\\p{Braille}", "cc{0x2800-0x28ff}" },
|
||||
{ "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
|
||||
{ "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
|
||||
{ "\\P{^Braille}", "cc{0x2800-0x28ff}" },
|
||||
|
||||
// More interesting regular expressions.
|
||||
{ "a{,2}", "str{a{,2}}" },
|
||||
{ "\\.\\^\\$\\\\", "str{.^$\\}" },
|
||||
{ "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
|
||||
{ "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
|
||||
{ "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8
|
||||
{ "a*{", "cat{star{lit{a}}lit{{}}" },
|
||||
|
||||
// Test precedences
|
||||
{ "(?:ab)*", "star{str{ab}}" },
|
||||
{ "(ab)*", "star{cap{str{ab}}}" },
|
||||
{ "ab|cd", "alt{str{ab}str{cd}}" },
|
||||
{ "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
|
||||
|
||||
// Test squashing of **, ++, ?? et cetera.
|
||||
{ "(?:(?:a)*)*", "star{lit{a}}" },
|
||||
{ "(?:(?:a)+)+", "plus{lit{a}}" },
|
||||
{ "(?:(?:a)?)?", "que{lit{a}}" },
|
||||
{ "(?:(?:a)*)+", "star{lit{a}}" },
|
||||
{ "(?:(?:a)*)?", "star{lit{a}}" },
|
||||
{ "(?:(?:a)+)*", "star{lit{a}}" },
|
||||
{ "(?:(?:a)+)?", "star{lit{a}}" },
|
||||
{ "(?:(?:a)?)*", "star{lit{a}}" },
|
||||
{ "(?:(?:a)?)+", "star{lit{a}}" },
|
||||
|
||||
// Test flattening.
|
||||
{ "(?:a)", "lit{a}" },
|
||||
{ "(?:ab)(?:cd)", "str{abcd}" },
|
||||
{ "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
|
||||
{ "a|c", "cc{0x61 0x63}" },
|
||||
{ "a|[cd]", "cc{0x61 0x63-0x64}" },
|
||||
{ "a|.", "dot{}" },
|
||||
{ "[ab]|c", "cc{0x61-0x63}" },
|
||||
{ "[ab]|[cd]", "cc{0x61-0x64}" },
|
||||
{ "[ab]|.", "dot{}" },
|
||||
{ ".|c", "dot{}" },
|
||||
{ ".|[cd]", "dot{}" },
|
||||
{ ".|.", "dot{}" },
|
||||
|
||||
// Test Perl quoted literals
|
||||
{ "\\Q+|*?{[\\E", "str{+|*?{[}" },
|
||||
{ "\\Q+\\E+", "plus{lit{+}}" },
|
||||
{ "\\Q\\\\E", "lit{\\}" },
|
||||
{ "\\Q\\\\\\E", "str{\\\\}" },
|
||||
{ "\\Qa\\E*", "star{lit{a}}" },
|
||||
{ "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" },
|
||||
{ "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" },
|
||||
|
||||
// Test Perl \A and \z
|
||||
{ "(?m)^", "bol{}" },
|
||||
{ "(?m)$", "eol{}" },
|
||||
{ "(?-m)^", "bot{}" },
|
||||
{ "(?-m)$", "eot{}" },
|
||||
{ "(?m)\\A", "bot{}" },
|
||||
{ "(?m)\\z", "eot{\\z}" },
|
||||
{ "(?-m)\\A", "bot{}" },
|
||||
{ "(?-m)\\z", "eot{\\z}" },
|
||||
|
||||
// Test named captures
|
||||
{ "(?P<name>a)", "cap{name:lit{a}}" },
|
||||
{ "(?P<中文>a)", "cap{中文:lit{a}}" },
|
||||
{ "(?<name>a)", "cap{name:lit{a}}" },
|
||||
{ "(?<中文>a)", "cap{中文:lit{a}}" },
|
||||
|
||||
// Case-folded literals
|
||||
{ "[Aa]", "litfold{a}" },
|
||||
|
||||
// Strings
|
||||
{ "abcde", "str{abcde}" },
|
||||
{ "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
|
||||
|
||||
// Reported bug involving \n leaking in despite use of NeverNL.
|
||||
{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
|
||||
{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
|
||||
{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
|
||||
{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
|
||||
{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags },
|
||||
{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
|
||||
{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
|
||||
{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
|
||||
{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags },
|
||||
{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
|
||||
{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
|
||||
{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
|
||||
{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags },
|
||||
{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
|
||||
{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
|
||||
{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
|
||||
{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
|
||||
{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
|
||||
{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
|
||||
{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
|
||||
{ "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
|
||||
{ "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
|
||||
{ "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
|
||||
{ "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
|
||||
{ "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
|
||||
{ "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
|
||||
{ "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
|
||||
{ "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
|
||||
{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
|
||||
Regexp::PerlClasses },
|
||||
{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
|
||||
Regexp::PerlClasses | Regexp::FoldCase },
|
||||
{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
|
||||
Regexp::PerlClasses | Regexp::NeverNL },
|
||||
{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
|
||||
Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
|
||||
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
|
||||
Regexp::PerlClasses },
|
||||
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
|
||||
Regexp::PerlClasses | Regexp::FoldCase },
|
||||
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
|
||||
Regexp::PerlClasses | Regexp::NeverNL },
|
||||
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
|
||||
Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
|
||||
|
||||
// Bug in Regexp::ToString() that emitted [^], which
|
||||
// would (obviously) fail to parse when fed back in.
|
||||
{ "[\\s\\S]", "cc{0-0x10ffff}" },
|
||||
|
||||
// As per https://github.com/google/re2/issues/477,
|
||||
// there were long-standing bugs involving Latin-1.
|
||||
// Here, we exercise it WITHOUT case folding...
|
||||
{ "\xa5\x64\xd1", "str{\xa5""d\xd1}", Regexp::Latin1 },
|
||||
{ "\xa5\xd1\x64", "str{\xa5\xd1""d}", Regexp::Latin1 },
|
||||
{ "\xa5\x64[\xd1\xd2]", "cat{str{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 },
|
||||
{ "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}lit{d}}", Regexp::Latin1 },
|
||||
{ "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
|
||||
{ "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
|
||||
{ "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
|
||||
{ "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
|
||||
// Here, we exercise it WITH case folding...
|
||||
// 0x64 should fold to 0x44, but neither 0xD1 nor 0xD2
|
||||
// should fold to 0xF1 and 0xF2, respectively.
|
||||
{ "\xa5\x64\xd1", "strfold{\xa5""d\xd1}", Regexp::Latin1 | Regexp::FoldCase },
|
||||
{ "\xa5\xd1\x64", "strfold{\xa5\xd1""d}", Regexp::Latin1 | Regexp::FoldCase },
|
||||
{ "\xa5\x64[\xd1\xd2]", "cat{strfold{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
|
||||
{ "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}litfold{d}}", Regexp::Latin1 | Regexp::FoldCase },
|
||||
{ "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
|
||||
{ "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
|
||||
{ "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
|
||||
{ "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
|
||||
};
|
||||
|
||||
// Thin test-only wrapper: reports whatever Regexp::Equal() says about the
// two regexps.
bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
  const bool same = Regexp::Equal(a, b);
  return same;
}
|
||||
|
||||
void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
|
||||
const std::string& title) {
|
||||
Regexp** re = new Regexp*[ntests];
|
||||
for (int i = 0; i < ntests; i++) {
|
||||
RegexpStatus status;
|
||||
Regexp::ParseFlags f = flags;
|
||||
if (tests[i].flags != 0) {
|
||||
f = tests[i].flags & ~TestZeroFlags;
|
||||
}
|
||||
re[i] = Regexp::Parse(tests[i].regexp, f, &status);
|
||||
ASSERT_TRUE(re[i] != NULL)
|
||||
<< " " << tests[i].regexp << " " << status.Text();
|
||||
std::string s = re[i]->Dump();
|
||||
EXPECT_EQ(std::string(tests[i].parse), s)
|
||||
<< "Regexp: " << tests[i].regexp
|
||||
<< "\nparse: " << std::string(tests[i].parse)
|
||||
<< " s: " << s << " flag=" << f;
|
||||
}
|
||||
|
||||
for (int i = 0; i < ntests; i++) {
|
||||
for (int j = 0; j < ntests; j++) {
|
||||
EXPECT_EQ(std::string(tests[i].parse) == std::string(tests[j].parse),
|
||||
RegexpEqualTestingOnly(re[i], re[j]))
|
||||
<< "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < ntests; i++)
|
||||
re[i]->Decref();
|
||||
delete[] re;
|
||||
}
|
||||
|
||||
// Test that regexps parse to expected structures.
|
||||
TEST(TestParse, SimpleRegexps) {
|
||||
TestParse(tests, ABSL_ARRAYSIZE(tests), kTestFlags, "simple");
|
||||
}
|
||||
|
||||
// Cases parsed with Regexp::FoldCase. Internal linkage and const, matching
// the main `tests` table: the data is private to this file and never
// modified.
static const Test foldcase_tests[] = {
  { "AbCdE", "strfold{abcde}" },
  { "[Aa]", "litfold{a}" },
  { "a", "litfold{a}" },

  // 0x17F is an old English long s (looks like an f) and folds to s.
  // 0x212A is the Kelvin symbol and folds to k.
  { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" },  // [Aa][A-z...]
  { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
  { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
};
|
||||
|
||||
// Test that parsing with FoldCase works.
|
||||
TEST(TestParse, FoldCase) {
|
||||
TestParse(foldcase_tests, ABSL_ARRAYSIZE(foldcase_tests), Regexp::FoldCase, "foldcase");
|
||||
}
|
||||
|
||||
// Cases parsed with Regexp::Literal. Internal linkage and const, matching
// the main `tests` table.
static const Test literal_tests[] = {
  { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" },
};
|
||||
|
||||
// Test that parsing with Literal works.
|
||||
TEST(TestParse, Literal) {
|
||||
TestParse(literal_tests, ABSL_ARRAYSIZE(literal_tests), Regexp::Literal, "literal");
|
||||
}
|
||||
|
||||
// Cases parsed with Regexp::MatchNL. Internal linkage and const, matching
// the main `tests` table.
static const Test matchnl_tests[] = {
  { ".", "dot{}" },
  { "\n", "lit{\n}" },
  { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
  { "[a\\n]", "cc{0xa 0x61}" },
};
|
||||
|
||||
// Test that parsing with MatchNL works.
|
||||
// (Also tested above during simple cases.)
|
||||
TEST(TestParse, MatchNL) {
|
||||
TestParse(matchnl_tests, ABSL_ARRAYSIZE(matchnl_tests), Regexp::MatchNL, "with MatchNL");
|
||||
}
|
||||
|
||||
// Cases parsed with Regexp::NoParseFlags, i.e. without MatchNL. Internal
// linkage and const, matching the main `tests` table.
static const Test nomatchnl_tests[] = {
  { ".", "cc{0-0x9 0xb-0x10ffff}" },
  { "\n", "lit{\n}" },
  { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
  { "[a\\n]", "cc{0xa 0x61}" },
};
|
||||
|
||||
// Test that parsing without MatchNL works.
|
||||
TEST(TestParse, NoMatchNL) {
|
||||
TestParse(nomatchnl_tests, ABSL_ARRAYSIZE(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
|
||||
}
|
||||
|
||||
Test prefix_tests[] = {
|
||||
{ "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
|
||||
{ "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
|
||||
{ "abc|abd|aef|bcx|bcy",
|
||||
"alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
|
||||
"cat{str{bc}cc{0x78-0x79}}}" },
|
||||
{ "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
|
||||
{ "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
|
||||
{ "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
|
||||
{ ".c|.d", "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" },
|
||||
{ "\\Cc|\\Cd", "cat{byte{}cc{0x63-0x64}}" },
|
||||
{ "x{2}|x{2}[0-9]",
|
||||
"cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
|
||||
{ "x{2}y|x{2}[0-9]y",
|
||||
"cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
|
||||
{ "n|r|rs",
|
||||
"alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" },
|
||||
{ "n|rs|r",
|
||||
"alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" },
|
||||
{ "r|rs|n",
|
||||
"alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" },
|
||||
{ "rs|r|n",
|
||||
"alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" },
|
||||
{ "a\\C*?c|a\\C*?b",
|
||||
"cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" },
|
||||
{ "^/a/bc|^/a/de",
|
||||
"cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" },
|
||||
// In the past, factoring was limited to kFactorAlternationMaxDepth (8).
|
||||
{ "a|aa|aaa|aaaa|aaaaa|aaaaaa|aaaaaaa|aaaaaaaa|aaaaaaaaa|aaaaaaaaaa",
|
||||
"cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
|
||||
"cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
|
||||
"cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
|
||||
"lit{a}}}}}}}}}}}}}}}}}}}" },
|
||||
{ "a|aardvark|aardvarks|abaci|aback|abacus|abacuses|abaft|abalone|abalones",
|
||||
"cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}"
|
||||
"cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}"
|
||||
"str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" },
|
||||
// As per https://github.com/google/re2/issues/467,
|
||||
// these should factor identically, but they didn't
|
||||
// because AddFoldedRange() terminated prematurely.
|
||||
{ "0A|0[aA]", "cat{lit{0}cc{0x41 0x61}}" },
|
||||
{ "0a|0[aA]", "cat{lit{0}cc{0x41 0x61}}" },
|
||||
{ "0[aA]|0A", "cat{lit{0}cc{0x41 0x61}}" },
|
||||
{ "0[aA]|0a", "cat{lit{0}cc{0x41 0x61}}" },
|
||||
};
|
||||
|
||||
// Test that prefix factoring works.
|
||||
TEST(TestParse, Prefix) {
|
||||
TestParse(prefix_tests, ABSL_ARRAYSIZE(prefix_tests), Regexp::PerlX, "prefix");
|
||||
}
|
||||
|
||||
Test nested_tests[] = {
|
||||
{ "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))",
|
||||
"cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" },
|
||||
{ "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
|
||||
"cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" },
|
||||
{ "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
|
||||
"cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" },
|
||||
{ "((((((x{2}){2}){2}){5}){5}){5})",
|
||||
"cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" },
|
||||
};
|
||||
|
||||
// Test that nested repetition works.
|
||||
TEST(TestParse, Nested) {
|
||||
TestParse(nested_tests, ABSL_ARRAYSIZE(nested_tests), Regexp::PerlX, "nested");
|
||||
}
|
||||
|
||||
// Invalid regular expressions: expected to be rejected both with
// Regexp::PerlX and with Regexp::NoParseFlags.
// Internal linkage and fully const: the table is private to this file and
// never modified.
static const char* const badtests[] = {
  "(",
  ")",
  "(a",
  "(a|b|",
  "(a|b",
  "[a-z",
  "([a-z)",
  "x{1001}",
  "\xff",  // Invalid UTF-8
  "[\xff]",
  "[\\\xff]",
  "\\\xff",
  "(?P<name>a",
  "(?P<name>",
  "(?P<name",
  "(?P<x y>a)",
  "(?P<>a)",
  "(?<name>a",
  "(?<name>",
  "(?<name",
  "(?<x y>a)",
  "(?<>a)",
  "[a-Z]",
  "(?i)[a-Z]",
  "a{100000}",
  "a{100000,}",
  "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
  "(((x{7}){11}){13})",
  "\\Q\\E*",
};
|
||||
|
||||
// Valid in Perl, bad in POSIX.
// The InvalidRegexps test expects every entry to parse under Regexp::PerlX
// and to be rejected under Regexp::NoParseFlags.
const char* only_perl[] = {
  "[a-b-c]",
  "\\Qabc\\E",
  "\\Q*+?{[\\E",
  "\\Q\\\\E",
  "\\Q\\\\\\E",
  "\\Q\\\\\\\\E",
  "\\Q\\\\\\\\\\E",
  "(?:a)",
  "(?P<name>a)",
  "(?<name>a)",
};
|
||||
|
||||
// Valid in POSIX, bad in Perl.
// The InvalidRegexps test expects every entry (a repetition operator applied
// directly to another repetition) to parse under Regexp::NoParseFlags and to
// be rejected under Regexp::PerlX.
const char* only_posix[] = {
  "a++",
  "a**",
  "a?*",
  "a+*",
  "a{1}*",
};
|
||||
|
||||
// Test that parser rejects bad regexps.
|
||||
TEST(TestParse, InvalidRegexps) {
|
||||
for (size_t i = 0; i < ABSL_ARRAYSIZE(badtests); i++) {
|
||||
ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
|
||||
<< " " << badtests[i];
|
||||
ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
|
||||
<< " " << badtests[i];
|
||||
}
|
||||
for (size_t i = 0; i < ABSL_ARRAYSIZE(only_posix); i++) {
|
||||
ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
|
||||
<< " " << only_posix[i];
|
||||
Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
|
||||
ASSERT_TRUE(re != NULL) << " " << only_posix[i];
|
||||
re->Decref();
|
||||
}
|
||||
for (size_t i = 0; i < ABSL_ARRAYSIZE(only_perl); i++) {
|
||||
ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
|
||||
<< " " << only_perl[i];
|
||||
Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
|
||||
ASSERT_TRUE(re != NULL) << " " << only_perl[i];
|
||||
re->Decref();
|
||||
}
|
||||
}
|
||||
|
||||
// Test that ToString produces original regexp or equivalent one.
|
||||
TEST(TestToString, EquivalentParse) {
|
||||
for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
|
||||
RegexpStatus status;
|
||||
Regexp::ParseFlags f = kTestFlags;
|
||||
if (tests[i].flags != 0) {
|
||||
f = tests[i].flags & ~TestZeroFlags;
|
||||
}
|
||||
Regexp* re = Regexp::Parse(tests[i].regexp, f, &status);
|
||||
ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text();
|
||||
std::string s = re->Dump();
|
||||
EXPECT_EQ(std::string(tests[i].parse), s)
|
||||
<< "Regexp: " << tests[i].regexp
|
||||
<< "\nparse: " << std::string(tests[i].parse)
|
||||
<< " s: " << s << " flag=" << f;
|
||||
std::string t = re->ToString();
|
||||
if (t != tests[i].regexp) {
|
||||
// If ToString didn't return the original regexp,
|
||||
// it must have found one with fewer parens.
|
||||
// Unfortunately we can't check the length here, because
|
||||
// ToString produces "\\{" for a literal brace,
|
||||
// but "{" is a shorter equivalent.
|
||||
// ASSERT_LT(t.size(), strlen(tests[i].regexp))
|
||||
// << " t=" << t << " regexp=" << tests[i].regexp;
|
||||
|
||||
// Test that if we parse the new regexp we get the same structure.
|
||||
Regexp* nre = Regexp::Parse(t, f, &status);
|
||||
ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
|
||||
std::string ss = nre->Dump();
|
||||
std::string tt = nre->ToString();
|
||||
if (s != ss || t != tt)
|
||||
ABSL_LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
|
||||
EXPECT_EQ(s, ss);
|
||||
EXPECT_EQ(t, tt);
|
||||
nre->Decref();
|
||||
}
|
||||
re->Decref();
|
||||
}
|
||||
}
|
||||
|
||||
// Test that capture error args are correct.
|
||||
TEST(NamedCaptures, ErrorArgs) {
|
||||
RegexpStatus status;
|
||||
Regexp* re;
|
||||
|
||||
re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
|
||||
EXPECT_TRUE(re == NULL);
|
||||
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
|
||||
EXPECT_EQ(status.error_arg(), "(?P<name");
|
||||
|
||||
re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
|
||||
EXPECT_TRUE(re == NULL);
|
||||
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
|
||||
EXPECT_EQ(status.error_arg(), "(?P<space bar>");
|
||||
|
||||
re = Regexp::Parse("test(?<name", Regexp::LikePerl, &status);
|
||||
EXPECT_TRUE(re == NULL);
|
||||
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
|
||||
EXPECT_EQ(status.error_arg(), "(?<name");
|
||||
|
||||
re = Regexp::Parse("test(?<space bar>z)", Regexp::LikePerl, &status);
|
||||
EXPECT_TRUE(re == NULL);
|
||||
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
|
||||
EXPECT_EQ(status.error_arg(), "(?<space bar>");
|
||||
}
|
||||
|
||||
// Test that look-around error args are correct.
|
||||
TEST(LookAround, ErrorArgs) {
|
||||
RegexpStatus status;
|
||||
Regexp* re;
|
||||
|
||||
re = Regexp::Parse("(?=foo).*", Regexp::LikePerl, &status);
|
||||
EXPECT_TRUE(re == NULL);
|
||||
EXPECT_EQ(status.code(), kRegexpBadPerlOp);
|
||||
EXPECT_EQ(status.error_arg(), "(?=");
|
||||
|
||||
re = Regexp::Parse("(?!foo).*", Regexp::LikePerl, &status);
|
||||
EXPECT_TRUE(re == NULL);
|
||||
EXPECT_EQ(status.code(), kRegexpBadPerlOp);
|
||||
EXPECT_EQ(status.error_arg(), "(?!");
|
||||
|
||||
re = Regexp::Parse("(?<=foo).*", Regexp::LikePerl, &status);
|
||||
EXPECT_TRUE(re == NULL);
|
||||
EXPECT_EQ(status.code(), kRegexpBadPerlOp);
|
||||
EXPECT_EQ(status.error_arg(), "(?<=");
|
||||
|
||||
re = Regexp::Parse("(?<!foo).*", Regexp::LikePerl, &status);
|
||||
EXPECT_TRUE(re == NULL);
|
||||
EXPECT_EQ(status.code(), kRegexpBadPerlOp);
|
||||
EXPECT_EQ(status.error_arg(), "(?<!");
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,251 @@
|
||||
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/escaping.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
#include "re2/testing/string_generator.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test that C++ strings are compared as uint8s, not int8s.
|
||||
// PossibleMatchRange doesn't depend on this, but callers probably will.
|
||||
TEST(CplusplusStrings, EightBit) {
|
||||
std::string s = "\x70";
|
||||
std::string t = "\xA0";
|
||||
EXPECT_LT(s, t);
|
||||
}
|
||||
|
||||
// One hand-written PossibleMatchRange case.
struct PrefixTest {
  const char* regexp;  // Pattern to compile.
  int maxlen;          // Passed as the maxlen argument of PossibleMatchRange.
  const char* min;     // Expected min string.
  const char* max;     // Expected max string.
};
|
||||
|
||||
// Hand-written PossibleMatchRange cases: { regexp, maxlen, min, max }.
// The table has four groups: the plain patterns, the same patterns with a
// (?i) prefix, with a \A prefix, and with both prefixes.
static PrefixTest tests[] = {
  { "",                  10, "",       "",        },
  { "Abcdef",            10, "Abcdef", "Abcdef" },
  { "abc(def|ghi)",      10, "abcdef", "abcghi" },
  { "a+hello",           10, "aa",     "ahello" },
  { "a*hello",           10, "a",      "hello" },
  { "def|abc",           10, "abc",    "def" },
  { "a(b)(c)[d]",        10, "abcd",   "abcd" },
  { "ab(cab|cat)",       10, "abcab",  "abcat" },
  { "ab(cab|ca)x",       10, "abcabx", "abcax" },
  { "(ab|x)(c|de)",      10, "abc",    "xde" },
  { "(ab|x)?(c|z)?",     10, "",       "z" },
  { "[^\\s\\S]",         10, "",       "" },
  { "(abc)+",             5, "abc",    "abcac" },
  { "(abc)+",             2, "ab",     "ac" },
  { "(abc)+",             1, "a",      "b" },
  { "[a\xC3\xA1]",        4, "a",      "\xC3\xA1" },
  { "a*",                10, "",       "ab" },

  // Case-insensitive variants.
  { "(?i)Abcdef",        10, "ABCDEF", "abcdef" },
  { "(?i)abc(def|ghi)",  10, "ABCDEF", "abcghi" },
  { "(?i)a+hello",       10, "AA",     "ahello" },
  { "(?i)a*hello",       10, "A",      "hello" },
  { "(?i)def|abc",       10, "ABC",    "def" },
  { "(?i)a(b)(c)[d]",    10, "ABCD",   "abcd" },
  { "(?i)ab(cab|cat)",   10, "ABCAB",  "abcat" },
  { "(?i)ab(cab|ca)x",   10, "ABCABX", "abcax" },
  { "(?i)(ab|x)(c|de)",  10, "ABC",    "xde" },
  { "(?i)(ab|x)?(c|z)?", 10, "",       "z" },
  { "(?i)[^\\s\\S]",     10, "",       "" },
  { "(?i)(abc)+",         5, "ABC",    "abcac" },
  { "(?i)(abc)+",         2, "AB",     "ac" },
  { "(?i)(abc)+",         1, "A",      "b" },
  { "(?i)[a\xC3\xA1]",    4, "A",      "\xC3\xA1" },
  { "(?i)a*",            10, "",       "ab" },
  { "(?i)A*",            10, "",       "ab" },

  // Variants anchored with \A.
  { "\\AAbcdef",         10, "Abcdef", "Abcdef" },
  { "\\Aabc(def|ghi)",   10, "abcdef", "abcghi" },
  { "\\Aa+hello",        10, "aa",     "ahello" },
  { "\\Aa*hello",        10, "a",      "hello" },
  { "\\Adef|abc",        10, "abc",    "def" },
  { "\\Aa(b)(c)[d]",     10, "abcd",   "abcd" },
  { "\\Aab(cab|cat)",    10, "abcab",  "abcat" },
  { "\\Aab(cab|ca)x",    10, "abcabx", "abcax" },
  { "\\A(ab|x)(c|de)",   10, "abc",    "xde" },
  { "\\A(ab|x)?(c|z)?",  10, "",       "z" },
  { "\\A[^\\s\\S]",      10, "",       "" },
  { "\\A(abc)+",          5, "abc",    "abcac" },
  { "\\A(abc)+",          2, "ab",     "ac" },
  { "\\A(abc)+",          1, "a",      "b" },
  { "\\A[a\xC3\xA1]",     4, "a",      "\xC3\xA1" },
  { "\\Aa*",             10, "",       "ab" },

  // Case-insensitive variants anchored with \A.
  { "(?i)\\AAbcdef",        10, "ABCDEF", "abcdef" },
  { "(?i)\\Aabc(def|ghi)",  10, "ABCDEF", "abcghi" },
  { "(?i)\\Aa+hello",       10, "AA",     "ahello" },
  { "(?i)\\Aa*hello",       10, "A",      "hello" },
  { "(?i)\\Adef|abc",       10, "ABC",    "def" },
  { "(?i)\\Aa(b)(c)[d]",    10, "ABCD",   "abcd" },
  { "(?i)\\Aab(cab|cat)",   10, "ABCAB",  "abcat" },
  { "(?i)\\Aab(cab|ca)x",   10, "ABCABX", "abcax" },
  { "(?i)\\A(ab|x)(c|de)",  10, "ABC",    "xde" },
  { "(?i)\\A(ab|x)?(c|z)?", 10, "",       "z" },
  { "(?i)\\A[^\\s\\S]",     10, "",       "" },
  { "(?i)\\A(abc)+",         5, "ABC",    "abcac" },
  { "(?i)\\A(abc)+",         2, "AB",     "ac" },
  { "(?i)\\A(abc)+",         1, "A",      "b" },
  { "(?i)\\A[a\xC3\xA1]",    4, "A",      "\xC3\xA1" },
  { "(?i)\\Aa*",            10, "",       "ab" },
  { "(?i)\\AA*",            10, "",       "ab" },
};
|
||||
|
||||
// Checks every entry of the hand-written `tests` table twice:
//   j == 0: through Regexp::Parse + CompileToProg + Prog::PossibleMatchRange;
//   j == 1: through RE2::PossibleMatchRange.
// Both paths must succeed and produce the expected min and max strings.
TEST(PossibleMatchRange, HandWritten) {
  for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
    for (size_t j = 0; j < 2; j++) {
      const PrefixTest& t = tests[i];
      std::string min, max;
      if (j == 0) {
        ABSL_LOG(INFO) << "Checking regexp=" << absl::CEscape(t.regexp);
        Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
        ASSERT_TRUE(re != NULL);
        Prog* prog = re->CompileToProg(0);
        ASSERT_TRUE(prog != NULL);
        ASSERT_TRUE(prog->PossibleMatchRange(&min, &max, t.maxlen))
            << " " << t.regexp;
        // The Prog is owned here; the Regexp is reference counted.
        delete prog;
        re->Decref();
      } else {
        ASSERT_TRUE(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
      }
      EXPECT_EQ(t.min, min) << t.regexp;
      EXPECT_EQ(t.max, max) << t.regexp;
    }
  }
}
|
||||
|
||||
// Test cases where PossibleMatchRange should return false.
|
||||
TEST(PossibleMatchRange, Failures) {
|
||||
std::string min, max;
|
||||
|
||||
// Fails because no room to write max.
|
||||
EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
|
||||
|
||||
// Fails because there is no max -- any non-empty string matches
|
||||
// or begins a match. Have to use Latin-1 input, because there
|
||||
// are no valid UTF-8 strings beginning with byte 0xFF.
|
||||
EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
|
||||
PossibleMatchRange(&min, &max, 10))
|
||||
<< "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
|
||||
EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
|
||||
PossibleMatchRange(&min, &max, 10))
|
||||
<< "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
|
||||
EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
|
||||
PossibleMatchRange(&min, &max, 10))
|
||||
<< "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
|
||||
EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
|
||||
PossibleMatchRange(&min, &max, 10))
|
||||
<< "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
|
||||
EXPECT_FALSE(RE2(".*", RE2::Latin1).
|
||||
PossibleMatchRange(&min, &max, 10))
|
||||
<< "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
|
||||
EXPECT_FALSE(RE2("\\C*").
|
||||
PossibleMatchRange(&min, &max, 10))
|
||||
<< "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
|
||||
|
||||
// Fails because it's a malformed regexp.
|
||||
EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
|
||||
<< "min=" << absl::CEscape(min) << ", max=" << absl::CEscape(max);
|
||||
}
|
||||
|
||||
// Exhaustive test: generate all regexps within parameters,
|
||||
// then generate all strings of a given length over a given alphabet,
|
||||
// then check that the prefix information agrees with whether
|
||||
// the regexp matches each of the strings.
|
||||
class PossibleMatchTester : public RegexpGenerator {
|
||||
public:
|
||||
PossibleMatchTester(int maxatoms,
|
||||
int maxops,
|
||||
const std::vector<std::string>& alphabet,
|
||||
const std::vector<std::string>& ops,
|
||||
int maxstrlen,
|
||||
const std::vector<std::string>& stralphabet)
|
||||
: RegexpGenerator(maxatoms, maxops, alphabet, ops),
|
||||
strgen_(maxstrlen, stralphabet),
|
||||
regexps_(0), tests_(0) { }
|
||||
|
||||
int regexps() { return regexps_; }
|
||||
int tests() { return tests_; }
|
||||
|
||||
// Needed for RegexpGenerator interface.
|
||||
void HandleRegexp(const std::string& regexp);
|
||||
|
||||
private:
|
||||
StringGenerator strgen_;
|
||||
|
||||
int regexps_; // Number of HandleRegexp calls
|
||||
int tests_; // Number of regexp tests.
|
||||
|
||||
PossibleMatchTester(const PossibleMatchTester&) = delete;
|
||||
PossibleMatchTester& operator=(const PossibleMatchTester&) = delete;
|
||||
};
|
||||
|
||||
// Processes a single generated regexp.
|
||||
// Checks that all accepted strings agree with the prefix range.
|
||||
void PossibleMatchTester::HandleRegexp(const std::string& regexp) {
|
||||
regexps_++;
|
||||
|
||||
ABSL_VLOG(3) << absl::CEscape(regexp);
|
||||
|
||||
RE2 re(regexp, RE2::Latin1);
|
||||
ASSERT_EQ(re.error(), "");
|
||||
|
||||
std::string min, max;
|
||||
if(!re.PossibleMatchRange(&min, &max, 10)) {
|
||||
// There's no good max for "\\C*". Can't use strcmp
|
||||
// because sometimes it gets embedded in more
|
||||
// complicated expressions.
|
||||
if(strstr(regexp.c_str(), "\\C*"))
|
||||
return;
|
||||
ABSL_LOG(QFATAL) << "PossibleMatchRange failed on: "
|
||||
<< absl::CEscape(regexp);
|
||||
}
|
||||
|
||||
strgen_.Reset();
|
||||
while (strgen_.HasNext()) {
|
||||
absl::string_view s = strgen_.Next();
|
||||
tests_++;
|
||||
if (!RE2::FullMatch(s, re))
|
||||
continue;
|
||||
ASSERT_GE(s, min) << " regexp: " << regexp << " max: " << max;
|
||||
ASSERT_LE(s, max) << " regexp: " << regexp << " min: " << min;
|
||||
}
|
||||
}
|
||||
|
||||
// Runs PossibleMatchTester over regexps built from the atoms "a", "b" and
// "[0-9]" with the egrep operators, against strings over the alphabet "ab4".
// When RE2_DEBUG_MODE is set, fewer atoms and shorter strings are used.
TEST(PossibleMatchRange, Exhaustive) {
  int natom = 3;
  int noperator = 3;
  int stringlen = 5;
  if (RE2_DEBUG_MODE) {
    natom = 2;
    noperator = 3;
    stringlen = 3;
  }
  PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
                        RegexpGenerator::EgrepOps(),
                        stringlen, Explode("ab4"));
  t.Generate();
  ABSL_LOG(INFO) << t.regexps() << " regexps, "
                 << t.tests() << " tests";
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,102 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Random testing of regular expression matching.
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/flags/flag.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
|
||||
ABSL_FLAG(int, regexpseed, 404, "Random regexp seed.");
|
||||
ABSL_FLAG(int, regexpcount, 100, "How many random regexps to generate.");
|
||||
ABSL_FLAG(int, stringseed, 200, "Random string seed.");
|
||||
ABSL_FLAG(int, stringcount, 100, "How many random strings to generate.");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Runs a random test on the given parameters.
|
||||
// (Always uses the same random seeds for reproducibility.
|
||||
// Can give different seeds on command line.)
|
||||
static void RandomTest(int maxatoms, int maxops,
|
||||
const std::vector<std::string>& alphabet,
|
||||
const std::vector<std::string>& ops,
|
||||
int maxstrlen,
|
||||
const std::vector<std::string>& stralphabet,
|
||||
const std::string& wrapper) {
|
||||
// Limit to smaller test cases in debug mode,
|
||||
// because everything is so much slower.
|
||||
if (RE2_DEBUG_MODE) {
|
||||
maxatoms--;
|
||||
maxops--;
|
||||
maxstrlen /= 2;
|
||||
}
|
||||
|
||||
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
|
||||
maxstrlen, stralphabet, wrapper, "");
|
||||
t.RandomStrings(absl::GetFlag(FLAGS_stringseed),
|
||||
absl::GetFlag(FLAGS_stringcount));
|
||||
t.GenerateRandom(absl::GetFlag(FLAGS_regexpseed),
|
||||
absl::GetFlag(FLAGS_regexpcount));
|
||||
absl::PrintF("%d regexps, %d tests, %d failures [%d/%d str]\n",
|
||||
t.regexps(), t.tests(), t.failures(), maxstrlen, stralphabet.size());
|
||||
EXPECT_EQ(0, t.failures());
|
||||
}
|
||||
|
||||
// Tests random small regexps involving literals and egrep operators.
|
||||
TEST(Random, SmallEgrepLiterals) {
|
||||
RandomTest(5, 5, Explode("abc."), RegexpGenerator::EgrepOps(),
|
||||
15, Explode("abc"),
|
||||
"");
|
||||
}
|
||||
|
||||
// Tests random bigger regexps involving literals and egrep operators.
|
||||
TEST(Random, BigEgrepLiterals) {
|
||||
RandomTest(10, 10, Explode("abc."), RegexpGenerator::EgrepOps(),
|
||||
15, Explode("abc"),
|
||||
"");
|
||||
}
|
||||
|
||||
// Tests random small regexps involving literals, capturing parens,
|
||||
// and egrep operators.
|
||||
TEST(Random, SmallEgrepCaptures) {
|
||||
RandomTest(5, 5, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(),
|
||||
15, Explode("abc"),
|
||||
"");
|
||||
}
|
||||
|
||||
// Tests random bigger regexps involving literals, capturing parens,
|
||||
// and egrep operators.
|
||||
TEST(Random, BigEgrepCaptures) {
|
||||
RandomTest(10, 10, Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(),
|
||||
15, Explode("abc"),
|
||||
"");
|
||||
}
|
||||
|
||||
// Tests random large complicated expressions, using all the possible
|
||||
// operators, some literals, some parenthesized literals, and predefined
|
||||
// character classes like \d. (Adding larger character classes would
|
||||
// make for too many possibilities.)
|
||||
TEST(Random, Complicated) {
|
||||
std::vector<std::string> ops = Split(" ",
|
||||
"%s%s %s|%s %s* %s*? %s+ %s+? %s? %s?? "
|
||||
"%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} %s{1,2} "
|
||||
"%s{2} %s{2,} %s{3,4} %s{4,5}");
|
||||
|
||||
// Use (?:\b) and (?:\B) instead of \b and \B,
|
||||
// because PCRE rejects \b* but accepts (?:\b)*.
|
||||
// Ditto ^ and $.
|
||||
std::vector<std::string> atoms = Split(" ",
|
||||
". (?:^) (?:$) \\a \\f \\n \\r \\t \\v "
|
||||
"\\d \\D \\s \\S \\w \\W (?:\\b) (?:\\B) "
|
||||
"a (a) b c - \\\\");
|
||||
std::vector<std::string> alphabet = Explode("abc123\001\002\003\t\r\n\v\f\a");
|
||||
RandomTest(10, 10, atoms, ops, 20, alphabet, "");
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,182 @@
|
||||
// Copyright 2005 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// This tests to make sure numbers are parsed from strings
|
||||
// correctly.
|
||||
// Todo: Expand the test to validate strings parsed to the other types
|
||||
// supported by RE2::Arg class
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/types/optional.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// One row of the integer-parsing table.
struct SuccessTable {
  const char * value_string;  // Text handed to RE2::Arg::Parse.
  int64_t value;              // Expected value, cast to the type under test.
  bool success[6];            // Expected Parse result per type column:
                              // i16, u16, i32, u32, i64, u64.
};
|
||||
|
||||
// Test boundary cases for different integral sizes.
|
||||
// Specifically I want to make sure that values outside the boundries
|
||||
// of an integral type will fail and that negative numbers will fail
|
||||
// for unsigned types. The following table contains the boundaries for
|
||||
// the various integral types and has entries for whether or not each
|
||||
// type can contain the given value.
|
||||
const SuccessTable kSuccessTable[] = {
|
||||
// string integer value i16 u16 i32 u32 i64 u64
|
||||
// 0 to 2^7-1
|
||||
{ "0", 0, { true, true, true, true, true, true }},
|
||||
{ "127", 127, { true, true, true, true, true, true }},
|
||||
|
||||
// -1 to -2^7
|
||||
{ "-1", -1, { true, false, true, false, true, false }},
|
||||
{ "-128", -128, { true, false, true, false, true, false }},
|
||||
|
||||
// 2^7 to 2^8-1
|
||||
{ "128", 128, { true, true, true, true, true, true }},
|
||||
{ "255", 255, { true, true, true, true, true, true }},
|
||||
|
||||
// 2^8 to 2^15-1
|
||||
{ "256", 256, { true, true, true, true, true, true }},
|
||||
{ "32767", 32767, { true, true, true, true, true, true }},
|
||||
|
||||
// -2^7-1 to -2^15
|
||||
{ "-129", -129, { true, false, true, false, true, false }},
|
||||
{ "-32768", -32768, { true, false, true, false, true, false }},
|
||||
|
||||
// 2^15 to 2^16-1
|
||||
{ "32768", 32768, { false, true, true, true, true, true }},
|
||||
{ "65535", 65535, { false, true, true, true, true, true }},
|
||||
|
||||
// 2^16 to 2^31-1
|
||||
{ "65536", 65536, { false, false, true, true, true, true }},
|
||||
{ "2147483647", 2147483647, { false, false, true, true, true, true }},
|
||||
|
||||
// -2^15-1 to -2^31
|
||||
{ "-32769", -32769, { false, false, true, false, true, false }},
|
||||
{ "-2147483648", static_cast<int64_t>(0xFFFFFFFF80000000LL),
|
||||
{ false, false, true, false, true, false }},
|
||||
|
||||
// 2^31 to 2^32-1
|
||||
{ "2147483648", 2147483648U, { false, false, false, true, true, true }},
|
||||
{ "4294967295", 4294967295U, { false, false, false, true, true, true }},
|
||||
|
||||
// 2^32 to 2^63-1
|
||||
{ "4294967296", 4294967296LL, { false, false, false, false, true, true }},
|
||||
{ "9223372036854775807",
|
||||
9223372036854775807LL, { false, false, false, false, true, true }},
|
||||
|
||||
// -2^31-1 to -2^63
|
||||
{ "-2147483649", -2147483649LL, { false, false, false, false, true, false }},
|
||||
{ "-9223372036854775808", static_cast<int64_t>(0x8000000000000000LL),
|
||||
{ false, false, false, false, true, false }},
|
||||
|
||||
// 2^63 to 2^64-1
|
||||
{ "9223372036854775808", static_cast<int64_t>(9223372036854775808ULL),
|
||||
{ false, false, false, false, false, true }},
|
||||
{ "18446744073709551615", static_cast<int64_t>(18446744073709551615ULL),
|
||||
{ false, false, false, false, false, true }},
|
||||
|
||||
// >= 2^64
|
||||
{ "18446744073709551616", 0, { false, false, false, false, false, false }},
|
||||
};
|
||||
|
||||
const int kNumStrings = ABSL_ARRAYSIZE(kSuccessTable);
|
||||
|
||||
// It's ugly to use a macro, but we apparently can't use the EXPECT_EQ
// macro outside of a TEST block and this seems to be the only way to
// avoid code duplication.  I can also pull off a couple nice tricks
// using concatenation for the type I'm checking against.
//
// For every row of kSuccessTable: parse the row's string into a variable of
// `type` through RE2::Arg, check that the result of Parse matches the row's
// success[column] flag and, when success is expected, that the parsed value
// equals the row's value converted to `type`.
#define PARSE_FOR_TYPE(type, column) {                                   \
  type r;                                                                \
  for (int i = 0; i < kNumStrings; ++i) {                                \
    RE2::Arg arg(&r);                                                    \
    const char* const p = kSuccessTable[i].value_string;                 \
    bool retval = arg.Parse(p, strlen(p));                               \
    bool success = kSuccessTable[i].success[column];                     \
    EXPECT_EQ(retval, success)                                           \
        << "Parsing '" << p << "' for type " #type " should return "     \
        << success;                                                      \
    if (success) {                                                       \
      EXPECT_EQ(r, static_cast<type>(kSuccessTable[i].value));           \
    }                                                                    \
  }                                                                      \
}
|
||||
|
||||
// Checks int16_t parsing against column 0 of kSuccessTable.
TEST(RE2ArgTest, Int16Test) {
  PARSE_FOR_TYPE(int16_t, 0);
}
|
||||
|
||||
// Checks uint16_t parsing against column 1 of kSuccessTable.
TEST(RE2ArgTest, Uint16Test) {
  PARSE_FOR_TYPE(uint16_t, 1);
}
|
||||
|
||||
// Checks int32_t parsing against column 2 of kSuccessTable.
TEST(RE2ArgTest, Int32Test) {
  PARSE_FOR_TYPE(int32_t, 2);
}
|
||||
|
||||
// Checks uint32_t parsing against column 3 of kSuccessTable.
TEST(RE2ArgTest, Uint32Test) {
  PARSE_FOR_TYPE(uint32_t, 3);
}
|
||||
|
||||
// Checks int64_t parsing against column 4 of kSuccessTable.
TEST(RE2ArgTest, Int64Test) {
  PARSE_FOR_TYPE(int64_t, 4);
}
|
||||
|
||||
// Checks uint64_t parsing against column 5 of kSuccessTable.
TEST(RE2ArgTest, Uint64Test) {
  PARSE_FOR_TYPE(uint64_t, 5);
}
|
||||
|
||||
// Checks that RE2::Arg can wrap an object of a user-defined type that has a
// ParseFrom(const char*, size_t) member, and that Arg::Parse returns what
// that member returns.
TEST(RE2ArgTest, ParseFromTest) {
  // ParseFrom() that always succeeds.
  struct {
    bool ParseFrom(const char* str, size_t n) {
      ABSL_LOG(INFO) << "str = " << str << ", n = " << n;
      return true;
    }
  } obj1;
  RE2::Arg arg1(&obj1);
  EXPECT_TRUE(arg1.Parse("one", 3));

  // ParseFrom() that always fails.
  struct {
    bool ParseFrom(const char* str, size_t n) {
      ABSL_LOG(INFO) << "str = " << str << ", n = " << n;
      return false;
    }
    // Ensure that RE2::Arg works even with overloaded ParseFrom().
    void ParseFrom(const char* str) {}
  } obj2;
  RE2::Arg arg2(&obj2);
  EXPECT_FALSE(arg2.Parse("two", 3));
}
|
||||
|
||||
// Checks RE2::Arg wrapping an absl::optional<double>:
//   - a NULL input parses successfully and leaves the optional empty;
//   - an empty (non-NULL) input fails to parse;
//   - "28.30" parses successfully and stores 28.30.
TEST(RE2ArgTest, OptionalDoubleTest) {
  absl::optional<double> opt;
  RE2::Arg arg(&opt);
  EXPECT_TRUE(arg.Parse(NULL, 0));
  EXPECT_FALSE(opt.has_value());
  EXPECT_FALSE(arg.Parse("", 0));
  EXPECT_TRUE(arg.Parse("28.30", 5));
  EXPECT_TRUE(opt.has_value());
  EXPECT_EQ(*opt, 28.30);
}
|
||||
|
||||
// Checks RE2::CRadix wrapping an absl::optional<int>:
//   - a NULL input parses successfully and leaves the optional empty;
//   - an empty (non-NULL) input fails to parse;
//   - "0xb0e" parses successfully and stores 2830.
TEST(RE2ArgTest, OptionalIntWithCRadixTest) {
  absl::optional<int> opt;
  RE2::Arg arg = RE2::CRadix(&opt);
  EXPECT_TRUE(arg.Parse(NULL, 0));
  EXPECT_FALSE(opt.has_value());
  EXPECT_FALSE(arg.Parse("", 0));
  EXPECT_TRUE(arg.Parse("0xb0e", 5));
  EXPECT_TRUE(opt.has_value());
  EXPECT_EQ(*opt, 2830);
}
|
||||
|
||||
} // namespace re2
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,284 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Regular expression generator: generates all possible
|
||||
// regular expressions within parameters (see regexp_generator.h for details).
|
||||
|
||||
// The regexp generator first generates a sequence of commands in a simple
|
||||
// postfix language. Each command in the language is a string,
|
||||
// like "a" or "%s*" or "%s|%s".
|
||||
//
|
||||
// To evaluate a command, enough arguments are popped from the value stack to
|
||||
// plug into the %s slots. Then the result is pushed onto the stack.
|
||||
// For example, the command sequence
|
||||
// a b %s%s c
|
||||
// results in the stack
|
||||
// ab c
|
||||
//
|
||||
// GeneratePostfix generates all possible command sequences.
|
||||
// Then RunPostfix turns each sequence into a regular expression
|
||||
// and passes the regexp to HandleRegexp.
|
||||
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <stack>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/log/absl_check.h"
|
||||
#include "absl/log/absl_log.h"
|
||||
#include "absl/strings/escaping.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "util/utf.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Returns a vector of the egrep regexp operators.
|
||||
const std::vector<std::string>& RegexpGenerator::EgrepOps() {
|
||||
static const char *ops[] = {
|
||||
"%s%s",
|
||||
"%s|%s",
|
||||
"%s*",
|
||||
"%s+",
|
||||
"%s?",
|
||||
"%s\\C*",
|
||||
};
|
||||
static std::vector<std::string> v(ops, ops + ABSL_ARRAYSIZE(ops));
|
||||
return v;
|
||||
}
|
||||
|
||||
// Stores the generation limits and the atom and operator vocabularies.
// With an empty vocabulary the matching limit is forced to 0, so the
// generators never try to pick an element from an empty vector.
RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
                                 const std::vector<std::string>& atoms,
                                 const std::vector<std::string>& ops)
    : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) {
  // Degenerate case.
  if (atoms_.empty())
    maxatoms_ = 0;
  if (ops_.empty())
    maxops_ = 0;
}
|
||||
|
||||
// Generates all possible regular expressions (within the parameters),
|
||||
// calling HandleRegexp for each one.
|
||||
void RegexpGenerator::Generate() {
|
||||
std::vector<std::string> postfix;
|
||||
GeneratePostfix(&postfix, 0, 0, 0);
|
||||
}
|
||||
|
||||
// Generates random regular expressions, calling HandleRegexp for each one.
|
||||
void RegexpGenerator::GenerateRandom(int32_t seed, int n) {
|
||||
rng_.seed(seed);
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
std::vector<std::string> postfix;
|
||||
GenerateRandomPostfix(&postfix, 0, 0, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Counts and returns the number of occurrences of "%s" in s.
// Occurrences are counted left to right without overlap: after each match
// the scan resumes two characters further on.  The scan runs over
// s.c_str(), so it ends at the first NUL character.
static int CountArgs(const std::string& s) {
  const char *p = s.c_str();
  int n = 0;
  while ((p = strstr(p, "%s")) != nullptr) {
    p += 2;  // Skip past the "%s" just counted.
    n++;
  }
  return n;
}
|
||||
|
||||
// Generates all possible postfix command sequences.
|
||||
// Each sequence is handed off to RunPostfix to generate a regular expression.
|
||||
// The arguments are:
|
||||
// post: the current postfix sequence
|
||||
// nstk: the number of elements that would be on the stack after executing
|
||||
// the sequence
|
||||
// ops: the number of operators used in the sequence
|
||||
// atoms: the number of atoms used in the sequence
|
||||
// For example, if post were ["a", "b", "%s%s", "c"],
|
||||
// then nstk = 2, ops = 1, atoms = 3.
|
||||
//
|
||||
// The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
|
||||
//
|
||||
void RegexpGenerator::GeneratePostfix(std::vector<std::string>* post,
|
||||
int nstk, int ops, int atoms) {
|
||||
if (nstk == 1)
|
||||
RunPostfix(*post);
|
||||
|
||||
// Early out: if used too many operators or can't
|
||||
// get back down to a single expression on the stack
|
||||
// using binary operators, give up.
|
||||
if (ops + nstk - 1 > maxops_)
|
||||
return;
|
||||
|
||||
// Add atoms if there is room.
|
||||
if (atoms < maxatoms_) {
|
||||
for (size_t i = 0; i < atoms_.size(); i++) {
|
||||
post->push_back(atoms_[i]);
|
||||
GeneratePostfix(post, nstk + 1, ops, atoms + 1);
|
||||
post->pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
// Add operators if there are enough arguments.
|
||||
if (ops < maxops_) {
|
||||
for (size_t i = 0; i < ops_.size(); i++) {
|
||||
const std::string& fmt = ops_[i];
|
||||
int nargs = CountArgs(fmt);
|
||||
if (nargs <= nstk) {
|
||||
post->push_back(fmt);
|
||||
GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
|
||||
post->pop_back();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Generates a random postfix command sequence.
|
||||
// Stops and returns true once a single sequence has been generated.
|
||||
bool RegexpGenerator::GenerateRandomPostfix(std::vector<std::string>* post,
|
||||
int nstk, int ops, int atoms) {
|
||||
std::uniform_int_distribution<int> random_stop(0, maxatoms_ - atoms);
|
||||
std::uniform_int_distribution<int> random_bit(0, 1);
|
||||
std::uniform_int_distribution<int> random_ops_index(
|
||||
0, static_cast<int>(ops_.size()) - 1);
|
||||
std::uniform_int_distribution<int> random_atoms_index(
|
||||
0, static_cast<int>(atoms_.size()) - 1);
|
||||
|
||||
for (;;) {
|
||||
// Stop if we get to a single element, but only sometimes.
|
||||
if (nstk == 1 && random_stop(rng_) == 0) {
|
||||
RunPostfix(*post);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Early out: if used too many operators or can't
|
||||
// get back down to a single expression on the stack
|
||||
// using binary operators, give up.
|
||||
if (ops + nstk - 1 > maxops_)
|
||||
return false;
|
||||
|
||||
// Add operators if there are enough arguments.
|
||||
if (ops < maxops_ && random_bit(rng_) == 0) {
|
||||
const std::string& fmt = ops_[random_ops_index(rng_)];
|
||||
int nargs = CountArgs(fmt);
|
||||
if (nargs <= nstk) {
|
||||
post->push_back(fmt);
|
||||
bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
|
||||
ops + 1, atoms);
|
||||
post->pop_back();
|
||||
if (ret)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Add atoms if there is room.
|
||||
if (atoms < maxatoms_ && random_bit(rng_) == 0) {
|
||||
post->push_back(atoms_[random_atoms_index(rng_)]);
|
||||
bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
|
||||
post->pop_back();
|
||||
if (ret)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Interprets the postfix command sequence to create a regular expression
|
||||
// passed to HandleRegexp. The results of operators like %s|%s are wrapped
|
||||
// in (?: ) to avoid needing to maintain a precedence table.
|
||||
void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) {
|
||||
std::stack<std::string> regexps;
|
||||
for (size_t i = 0; i < post.size(); i++) {
|
||||
switch (CountArgs(post[i])) {
|
||||
default:
|
||||
ABSL_LOG(FATAL) << "Bad operator: " << post[i];
|
||||
case 0:
|
||||
regexps.push(post[i]);
|
||||
break;
|
||||
case 1: {
|
||||
auto fmt = absl::ParsedFormat<'s'>::New(post[i]);
|
||||
ABSL_CHECK(fmt != nullptr);
|
||||
std::string a = regexps.top();
|
||||
regexps.pop();
|
||||
regexps.push("(?:" + absl::StrFormat(*fmt, a) + ")");
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
auto fmt = absl::ParsedFormat<'s', 's'>::New(post[i]);
|
||||
ABSL_CHECK(fmt != nullptr);
|
||||
std::string b = regexps.top();
|
||||
regexps.pop();
|
||||
std::string a = regexps.top();
|
||||
regexps.pop();
|
||||
regexps.push("(?:" + absl::StrFormat(*fmt, a, b) + ")");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (regexps.size() != 1) {
|
||||
// Internal error - should never happen.
|
||||
absl::PrintF("Bad regexp program:\n");
|
||||
for (size_t i = 0; i < post.size(); i++) {
|
||||
absl::PrintF(" %s\n", absl::CEscape(post[i]));
|
||||
}
|
||||
absl::PrintF("Stack after running program:\n");
|
||||
while (!regexps.empty()) {
|
||||
absl::PrintF(" %s\n", absl::CEscape(regexps.top()));
|
||||
regexps.pop();
|
||||
}
|
||||
ABSL_LOG(FATAL) << "Bad regexp program.";
|
||||
}
|
||||
|
||||
HandleRegexp(regexps.top());
|
||||
HandleRegexp("^(?:" + regexps.top() + ")$");
|
||||
HandleRegexp("^(?:" + regexps.top() + ")");
|
||||
HandleRegexp("(?:" + regexps.top() + ")$");
|
||||
}
|
||||
|
||||
// Split s into an vector of strings, one for each UTF-8 character.
|
||||
std::vector<std::string> Explode(absl::string_view s) {
|
||||
std::vector<std::string> v;
|
||||
|
||||
for (const char *q = s.data(); q < s.data() + s.size(); ) {
|
||||
const char* p = q;
|
||||
Rune r;
|
||||
q += chartorune(&r, q);
|
||||
v.push_back(std::string(p, q - p));
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
// Split string everywhere a substring is found, returning
|
||||
// vector of pieces.
|
||||
std::vector<std::string> Split(absl::string_view sep, absl::string_view s) {
|
||||
std::vector<std::string> v;
|
||||
|
||||
if (sep.empty())
|
||||
return Explode(s);
|
||||
|
||||
const char *p = s.data();
|
||||
for (const char *q = s.data(); q + sep.size() <= s.data() + s.size(); q++) {
|
||||
if (absl::string_view(q, sep.size()) == sep) {
|
||||
v.push_back(std::string(p, q - p));
|
||||
p = q + sep.size();
|
||||
q = p - 1; // -1 for ++ in loop
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (p < s.data() + s.size())
|
||||
v.push_back(std::string(p, s.data() + s.size() - p));
|
||||
return v;
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,77 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_TESTING_REGEXP_GENERATOR_H_
|
||||
#define RE2_TESTING_REGEXP_GENERATOR_H_
|
||||
|
||||
// Regular expression generator: generates all possible
|
||||
// regular expressions within given parameters (see below for details).
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Regular expression generator.
//
// Given a set of atom expressions like "a", "b", or "."
// and operators like "%s*", generates all possible regular expressions
// using at most maxatoms atom expressions and maxops operators.
// For each such expression re, calls HandleRegexp(re).
//
// Callers are expected to subclass RegexpGenerator and provide HandleRegexp.
//
class RegexpGenerator {
 public:
  RegexpGenerator(int maxatoms, int maxops,
                  const std::vector<std::string>& atoms,
                  const std::vector<std::string>& ops);
  virtual ~RegexpGenerator() = default;

  // Generates all the regular expressions, calling HandleRegexp(re) for each.
  void Generate();

  // Generates n random regular expressions, calling HandleRegexp(re) for each.
  void GenerateRandom(int32_t seed, int n);

  // Handles a regular expression.  Must be provided by subclass.
  virtual void HandleRegexp(const std::string& regexp) = 0;

  // The egrep regexp operators: * + ? | and concatenation.
  static const std::vector<std::string>& EgrepOps();

 private:
  // Evaluates a postfix command sequence and reports the resulting regexp.
  void RunPostfix(const std::vector<std::string>& post);
  // Enumerates all postfix sequences extending *post.
  void GeneratePostfix(std::vector<std::string>* post,
                       int nstk, int ops, int atoms);
  // Builds one random postfix sequence extending *post; true on success.
  bool GenerateRandomPostfix(std::vector<std::string>* post,
                             int nstk, int ops, int atoms);

  int maxatoms_;                    // Maximum number of atoms allowed in expr.
  int maxops_;                      // Maximum number of ops allowed in expr.
  std::vector<std::string> atoms_;  // Possible atoms.
  std::vector<std::string> ops_;    // Possible ops.
  std::minstd_rand0 rng_;           // Random number generator.

  RegexpGenerator(const RegexpGenerator&) = delete;
  RegexpGenerator& operator=(const RegexpGenerator&) = delete;
};
|
||||
|
||||
// Helpers for preparing arguments to RegexpGenerator constructor.
|
||||
|
||||
// Returns one string for each character in s.
|
||||
std::vector<std::string> Explode(absl::string_view s);
|
||||
|
||||
// Splits string everywhere sep is found, returning
|
||||
// vector of pieces.
|
||||
std::vector<std::string> Split(absl::string_view sep, absl::string_view s);
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_TESTING_REGEXP_GENERATOR_H_
|
||||
@@ -0,0 +1,87 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Test parse.cc, dump.cc, and tostring.cc.
|
||||
|
||||
#include "re2/regexp.h"
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test that overflowed ref counts work.
|
||||
TEST(Regexp, BigRef) {
|
||||
Regexp* re;
|
||||
re = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
|
||||
for (int i = 0; i < 100000; i++)
|
||||
re->Incref();
|
||||
for (int i = 0; i < 100000; i++)
|
||||
re->Decref();
|
||||
ASSERT_EQ(re->Ref(), 1);
|
||||
re->Decref();
|
||||
}
|
||||
|
||||
// Test that very large Concats work.
|
||||
// Depends on overflowed ref counts working.
|
||||
TEST(Regexp, BigConcat) {
|
||||
Regexp* x;
|
||||
x = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
|
||||
std::vector<Regexp*> v(90000, x); // ToString bails out at 100000
|
||||
for (size_t i = 0; i < v.size(); i++)
|
||||
x->Incref();
|
||||
ASSERT_EQ(x->Ref(), 1 + static_cast<int>(v.size())) << x->Ref();
|
||||
Regexp* re = Regexp::Concat(v.data(), static_cast<int>(v.size()),
|
||||
Regexp::NoParseFlags);
|
||||
ASSERT_EQ(re->ToString(), std::string(v.size(), 'x'));
|
||||
re->Decref();
|
||||
ASSERT_EQ(x->Ref(), 1) << x->Ref();
|
||||
x->Decref();
|
||||
}
|
||||
|
||||
// Checks the name -> index map returned by NamedCaptures() for a regexp
// in which the name g1 is used by two different groups.
TEST(Regexp, NamedCaptures) {
  RegexpStatus status;
  Regexp* x = Regexp::Parse(
      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
  EXPECT_TRUE(status.ok());
  // Fatal: everything below dereferences x.
  ASSERT_TRUE(x != nullptr);
  EXPECT_EQ(4, x->NumCaptures());
  const std::map<std::string, int>* have = x->NamedCaptures();
  // Fatal: everything below dereferences have.
  ASSERT_TRUE(have != nullptr);
  // there are only two named groups in the regexp: 'g1' and 'g2'.
  EXPECT_EQ(size_t{2}, have->size());
  std::map<std::string, int> want;
  want["g1"] = 1;
  want["g2"] = 3;
  EXPECT_EQ(want, *have);
  x->Decref();
  delete have;
}
|
||||
|
||||
// Checks the index -> name map returned by CaptureNames() for the same
// regexp: three of the four groups are named, and g1 appears twice.
TEST(Regexp, CaptureNames) {
  RegexpStatus status;
  Regexp* x = Regexp::Parse(
      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
  EXPECT_TRUE(status.ok());
  // Fatal: everything below dereferences x.
  ASSERT_TRUE(x != nullptr);
  EXPECT_EQ(4, x->NumCaptures());
  const std::map<int, std::string>* have = x->CaptureNames();
  // Fatal: everything below dereferences have.
  ASSERT_TRUE(have != nullptr);
  EXPECT_EQ(size_t{3}, have->size());
  std::map<int, std::string> want;
  want[1] = "g1";
  want[3] = "g2";
  want[4] = "g1";

  EXPECT_EQ(want, *have);
  x->Decref();
  delete have;
}
|
||||
|
||||
} // namespace re2
|
||||
@@ -0,0 +1,201 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// One table-driven case for the required-prefix tests.
// Fields omitted from a braced initializer are value-initialized
// (null / false); they are only examined when return_value is true.
struct PrefixTest {
  const char* regexp;   // Pattern to parse.
  bool return_value;    // Expected result of the prefix query.
  const char* prefix;   // Expected prefix string.
  bool foldcase;        // Expected case-folding flag.
  const char* suffix;   // Expected ToString() of the remaining regexp.
};
|
||||
|
||||
// Cases for Regexp::RequiredPrefix().
static PrefixTest tests[] = {
  // Empty cases.
  { "", false },
  { "(?m)^", false },
  { "(?-m)^", false },

  // If the regexp has no ^, there's no required prefix.
  { "abc", false },

  // If the regexp immediately goes into
  // something not a literal match, there's no required prefix.
  { "^a*", false },
  { "^(abc)", false },

  // Otherwise, it should work.
  { "^abc$", true, "abc", false, "(?-m:$)" },
  { "^abc", true, "abc", false, "" },
  { "^(?i)abc", true, "abc", true, "" },
  { "^abcd*", true, "abc", false, "d*" },
  { "^[Aa][Bb]cd*", true, "ab", true, "cd*" },
  { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" },
  { "^☺abc", true, "☺abc", false, "" },
};
|
||||
|
||||
// Runs every entry of tests[] through Regexp::RequiredPrefix(), once with
// Latin-1 parsing and once with the default (UTF-8) parsing.
TEST(RequiredPrefix, SimpleTests) {
  for (const PrefixTest& t : tests) {
    for (size_t j = 0; j < 2; j++) {
      const bool latin1 = (j == 0);
      const char* encoding = latin1 ? "latin1" : "utf8";

      Regexp::ParseFlags flags = Regexp::LikePerl;
      if (latin1)
        flags = flags | Regexp::Latin1;
      Regexp* re = Regexp::Parse(t.regexp, flags, nullptr);
      ASSERT_TRUE(re != nullptr) << " " << t.regexp;

      // Out-parameters; initialized so a failed query leaves no garbage.
      std::string p;
      bool f = false;
      Regexp* s = nullptr;
      ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s))
          << " " << t.regexp << " " << encoding
          << " " << re->Dump();
      if (t.return_value) {
        ASSERT_EQ(p, std::string(t.prefix))
            << " " << t.regexp << " " << encoding;
        ASSERT_EQ(f, t.foldcase)
            << " " << t.regexp << " " << encoding;
        ASSERT_EQ(s->ToString(), std::string(t.suffix))
            << " " << t.regexp << " " << encoding;
        // The suffix is handed back as a reference the caller must drop.
        s->Decref();
      }
      re->Decref();
    }
  }
}
|
||||
|
||||
// Cases for Regexp::RequiredPrefixForAccel(); the suffix field is unused.
static PrefixTest for_accel_tests[] = {
  // Empty cases.
  { "", false },
  { "(?m)^", false },
  { "(?-m)^", false },

  // If the regexp has a ^, there's no required prefix.
  { "^abc", false },

  // If the regexp immediately goes into
  // something not a literal match, there's no required prefix.
  { "a*", false },

  // Unlike RequiredPrefix(), RequiredPrefixForAccel() can "see through"
  // capturing groups, but doesn't try to glue prefix fragments together.
  { "(a?)def", false },
  { "(ab?)def", true, "a", false },
  { "(abc?)def", true, "ab", false },
  { "(()a)def", false },
  { "((a)b)def", true, "a", false },
  { "((ab)c)def", true, "ab", false },

  // Otherwise, it should work.
  { "abc$", true, "abc", false },
  { "abc", true, "abc", false },
  { "(?i)abc", true, "abc", true },
  { "abcd*", true, "abc", false },
  { "[Aa][Bb]cd*", true, "ab", true },
  { "ab[Cc]d*", true, "ab", false },
  { "☺abc", true, "☺abc", false },
};
|
||||
|
||||
// Runs every entry of for_accel_tests[] through
// Regexp::RequiredPrefixForAccel(), once with Latin-1 parsing and once
// with the default (UTF-8) parsing.
TEST(RequiredPrefixForAccel, SimpleTests) {
  for (const PrefixTest& t : for_accel_tests) {
    for (size_t j = 0; j < 2; j++) {
      const bool latin1 = (j == 0);
      const char* encoding = latin1 ? "latin1" : "utf8";

      Regexp::ParseFlags flags = Regexp::LikePerl;
      if (latin1)
        flags = flags | Regexp::Latin1;
      Regexp* re = Regexp::Parse(t.regexp, flags, nullptr);
      ASSERT_TRUE(re != nullptr) << " " << t.regexp;

      // Out-parameters; initialized so a failed query leaves no garbage.
      std::string p;
      bool f = false;
      ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f))
          << " " << t.regexp << " " << encoding
          << " " << re->Dump();
      if (t.return_value) {
        ASSERT_EQ(p, std::string(t.prefix))
            << " " << t.regexp << " " << encoding;
        ASSERT_EQ(f, t.foldcase)
            << " " << t.regexp << " " << encoding;
      }
      re->Decref();
    }
  }
}
|
||||
|
||||
// Checks how case-folded prefixes containing 'k' and 's' are reported
// under the two encodings.
TEST(RequiredPrefixForAccel, CaseFoldingForKAndS) {
  struct Case {
    const char* pattern;  // Case-insensitive pattern to parse.
    const char* folded;   // Prefix expected under Latin-1.
  };
  static const Case kCases[] = {
    { "(?i)KLM", "klm" },
    { "(?i)STU", "stu" },
  };

  for (const Case& c : kCases) {
    std::string p;
    bool f = false;

    // With Latin-1 encoding, `(?i)` prefixes can include 'k' and 's'.
    Regexp* re = Regexp::Parse(c.pattern, Regexp::LikePerl|Regexp::Latin1,
                               nullptr);
    ASSERT_TRUE(re != nullptr) << c.pattern;
    ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f)) << c.pattern;
    ASSERT_EQ(p, c.folded);
    ASSERT_EQ(f, true);
    re->Decref();

    // With UTF-8 encoding, `(?i)` prefixes can't include 'k' and 's'.
    // This is because they match U+212A and U+017F, respectively, and
    // so the parser ends up emitting character classes, not literals.
    re = Regexp::Parse(c.pattern, Regexp::LikePerl, nullptr);
    ASSERT_TRUE(re != nullptr) << c.pattern;
    ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f)) << c.pattern;
    re->Decref();
  }
}
|
||||
|
||||
// Patterns for the PrefixAccel test; the test feeds each of them text
// built around the literal "aababc".
static const char* prefix_accel_tests[] = {
  "aababc\\d+",
  "(?i)AABABC\\d+",
};
|
||||
|
||||
// For each pattern, checks that Prog::PrefixAccel() returns null when the
// text is only a run of 'a' and otherwise returns a pointer to offset j,
// where j is the number of 'a' characters preceding "aababc".
TEST(PrefixAccel, SimpleTests) {
  for (const char* pattern : prefix_accel_tests) {
    Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, nullptr);
    ASSERT_TRUE(re != nullptr);
    Prog* prog = re->CompileToProg(0);
    ASSERT_TRUE(prog != nullptr);
    ASSERT_TRUE(prog->can_prefix_accel());
    for (int j = 0; j < 100; j++) {
      // j leading 'a's and nothing else: no prefix to find.
      std::string text(j, 'a');
      const char* p = reinterpret_cast<const char*>(
          prog->PrefixAccel(text.data(), text.size()));
      EXPECT_TRUE(p == nullptr);
      text.append("aababc");
      for (int k = 0; k < 100; k++) {
        // Trailing padding must not move the reported match position.
        text.append(k, 'a');
        p = reinterpret_cast<const char*>(
            prog->PrefixAccel(text.data(), text.size()));
        // Don't subtract from a null pointer if the search fails.
        EXPECT_TRUE(p != nullptr);
        if (p != nullptr) {
          EXPECT_EQ(j, p - text.data());
        }
      }
    }
    delete prog;
    re->Decref();
  }
}
|
||||
|
||||
} // namespace re2
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user