third_party_rust_regex/scripts/generate-unicode-tables
Andrew Gallant 8fe3716e29 scripts: update docs for 'generate-unicode-tables'
The docs are now updated to work with Unicode 14. (In particular,
emoji-data.txt no longer needs to be downloaded separately.) We also
include a note about adding a new case for "age" in regex-syntax.
2022-07-05 13:00:10 -04:00

67 lines
2.4 KiB
Bash
Executable File

#!/bin/sh
# This script is responsible for generating some of the Unicode tables used
# in regex-syntax.
#
# Usage is simple. First, download the Unicode data:
#
# $ mkdir ucd
# $ cd ucd
# $ curl -LO https://www.unicode.org/Public/zipped/14.0.0/UCD.zip
# $ unzip UCD.zip
#
# And then run this script from the root of this repository by pointing it at
# the data directory downloaded above:
#
# $ ./scripts/generate-unicode-tables path/to/ucd
#
# Once complete, if you are upgrading to a new version of Unicode,
# you'll need to add a new "age" value to the 'ages' routine in
# regex-syntax/src/unicode.rs.
# Abort on the first failing command or use of an unset variable, so that a
# failed step is never silently ignored and 'cargo fmt' never runs over
# half-written tables.
set -eu

if [ "$#" -ne 1 ]; then
  echo "Usage: $(basename "$0") <ucd-data-directory>" >&2
  exit 1
fi

# Directory containing the unzipped UCD data files (the sole argument).
ucddir="$1"
# Destination of the generated Rust tables, relative to the repository root.
out="regex-syntax/src/unicode_tables"

# Validate the environment up front, before any table is touched.
if [ ! -d "$ucddir" ]; then
  echo "error: UCD data directory not found: $ucddir" >&2
  exit 1
fi
if [ ! -d "$out" ]; then
  echo "error: $out not found; run this script from the repository root" >&2
  exit 1
fi
if ! command -v ucd-generate > /dev/null 2>&1; then
  echo "error: ucd-generate not found in PATH" >&2
  exit 1
fi

# generate <table-file> <subcommand> [ucd-generate flags...]
#
# Runs 'ucd-generate <subcommand> "$ucddir" [flags...]' and stores its stdout
# in "$out/<table-file>".
#
# The output is first written to a temporary file next to the destination and
# only moved into place when ucd-generate succeeds. Redirecting straight into
# the destination would truncate the existing table before the generator even
# starts, leaving an empty or partial file behind on failure.
#
# Note: POSIX sh has no 'local', so 'table' and 'subcommand' are globals.
generate() {
  table="$1"
  subcommand="$2"
  shift 2
  if ! ucd-generate "$subcommand" "$ucddir" "$@" > "$out/$table.tmp"; then
    rm -f "$out/$table.tmp"
    echo "error: 'ucd-generate $subcommand' failed; $out/$table left unchanged" >&2
    exit 1
  fi
  mv "$out/$table.tmp" "$out/$table"
}

generate age.rs age --chars
generate case_folding_simple.rs case-folding-simple --chars --all-pairs
generate general_category.rs general-category --chars --exclude surrogate
generate grapheme_cluster_break.rs grapheme-cluster-break --chars
generate property_bool.rs property-bool --chars
generate property_names.rs property-names
generate property_values.rs property-values \
  --include gc,script,scx,age,gcb,wb,sb
generate script.rs script --chars
generate script_extension.rs script-extension --chars
generate sentence_break.rs sentence-break --chars
generate word_break.rs word-break --chars

# These generate the \w, \d and \s Unicode-aware character classes. \d and \s
# are technically part of the general category and boolean properties generated
# above. However, these are generated separately to make it possible to enable
# or disable them via Cargo features independently of whether all boolean
# properties or general categories are enabled or disabled. The crate ensures
# that only one copy is compiled.
generate perl_word.rs perl-word --chars
generate perl_decimal.rs general-category --chars --include decimalnumber
generate perl_space.rs property-bool --chars --include whitespace

# Make sure everything is formatted.
cargo +stable fmt --all