8229831: Upgrade Character.isUnicodeIdentifierStart/Part() methods to the latest standard

Reviewed-by: rriggs
This commit is contained in:
Naoto Sato 2019-09-05 17:38:54 -07:00
parent 4d70cdac4f
commit 2aac0e925d
13 changed files with 12096 additions and 75 deletions

View File

@ -115,13 +115,14 @@ class CharacterData00 extends CharacterData {
}
boolean isUnicodeIdentifierStart(int ch) {
int props = getProperties(ch);
return ((props & $$maskIdentifierInfo) == $$valueUnicodeStart);
return (getPropertiesEx(ch) & $$maskIDStart) != 0 ||
ch == 0x2E2F;
}
boolean isUnicodeIdentifierPart(int ch) {
int props = getProperties(ch);
return ((props & $$maskUnicodePart) != 0);
return (getPropertiesEx(ch) & $$maskIDContinue) != 0 ||
isIdentifierIgnorable(ch) ||
ch == 0x2E2F;
}
boolean isIdentifierIgnorable(int ch) {

View File

@ -114,13 +114,14 @@ class CharacterData01 extends CharacterData {
}
boolean isUnicodeIdentifierStart(int ch) {
int props = getProperties(ch);
return ((props & $$maskIdentifierInfo) == $$valueUnicodeStart);
return (getPropertiesEx(ch) & $$maskIDStart) != 0 ||
ch == 0x2E2F;
}
boolean isUnicodeIdentifierPart(int ch) {
int props = getProperties(ch);
return ((props & $$maskUnicodePart) != 0);
return (getPropertiesEx(ch) & $$maskIDContinue) != 0 ||
isIdentifierIgnorable(ch) ||
ch == 0x2E2F;
}
boolean isIdentifierIgnorable(int ch) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -113,13 +113,14 @@ class CharacterData02 extends CharacterData {
}
boolean isUnicodeIdentifierStart(int ch) {
int props = getProperties(ch);
return ((props & $$maskIdentifierInfo) == $$valueUnicodeStart);
return (getPropertiesEx(ch) & $$maskIDStart) != 0 ||
ch == 0x2E2F;
}
boolean isUnicodeIdentifierPart(int ch) {
int props = getProperties(ch);
return ((props & $$maskUnicodePart) != 0);
return (getPropertiesEx(ch) & $$maskIDContinue) != 0 ||
isIdentifierIgnorable(ch) ||
ch == 0x2E2F;
}
boolean isIdentifierIgnorable(int ch) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -113,15 +113,16 @@ class CharacterData0E extends CharacterData {
}
boolean isUnicodeIdentifierStart(int ch) {
int props = getProperties(ch);
return ((props & $$maskIdentifierInfo) == $$valueUnicodeStart);
return (getPropertiesEx(ch) & $$maskIDStart) != 0 ||
ch == 0x2E2F;
}
boolean isUnicodeIdentifierPart(int ch) {
int props = getProperties(ch);
return ((props & $$maskUnicodePart) != 0);
return (getPropertiesEx(ch) & $$maskIDContinue) != 0 ||
isIdentifierIgnorable(ch) ||
ch == 0x2E2F;
}
boolean isIdentifierIgnorable(int ch) {
int props = getProperties(ch);
return ((props & $$maskIdentifierInfo) == $$valueIgnorable);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -133,13 +133,14 @@ class CharacterDataLatin1 extends CharacterData {
}
boolean isUnicodeIdentifierStart(int ch) {
int props = getProperties(ch);
return ((props & $$maskIdentifierInfo) == $$valueUnicodeStart);
return (getPropertiesEx(ch) & $$maskIDStart) != 0 ||
ch == 0x2E2F;
}
boolean isUnicodeIdentifierPart(int ch) {
int props = getProperties(ch);
return ((props & $$maskUnicodePart) != 0);
return (getPropertiesEx(ch) & $$maskIDContinue) != 0 ||
isIdentifierIgnorable(ch) ||
ch == 0x2E2F;
}
boolean isIdentifierIgnorable(int ch) {

File diff suppressed because it is too large Load Diff

View File

@ -42,6 +42,7 @@ define SetupCharacterData
-spec $(UNICODEDATA)/UnicodeData.txt \
-specialcasing $(UNICODEDATA)/SpecialCasing.txt \
-proplist $(UNICODEDATA)/PropList.txt \
-derivedprops $(UNICODEDATA)/DerivedCoreProperties.txt \
-o $(SUPPORT_OUTPUTDIR)/gensrc/java.base/java/lang/$1.java \
-usecharforbyte $3

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -73,6 +73,7 @@ public class GenerateCharacter {
static String DefaultUnicodeSpecFileName = ROOT + "UnicodeData.txt";
static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
static String DefaultPropListFileName = ROOT + "PropList.txt";
static String DefaultDerivedPropsFileName = ROOT + "DerivedCoreProperties.txt";
static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
static String DefaultJavaOutputFileName = ROOT + "Character.java";
static String DefaultCTemplateFileName = ROOT + "Character.c.template";
@ -159,6 +160,8 @@ public class GenerateCharacter {
1 bit Other_Math property
1 bit Ideographic property
1 bit Noncharacter codepoint property
1 bit ID_Start property
1 bit ID_Continue property
*/
@ -190,7 +193,7 @@ public class GenerateCharacter {
// maskMirrored needs to be long, if up 16-bit
private static final long maskMirrored = 0x80000000L;
// bit masks identify the 16-bit priperty field described above, in B
// bit masks identify the 16-bit property field described above, in B
// table
private static final long
maskOtherLowercase = 0x100000000L,
@ -198,7 +201,9 @@ public class GenerateCharacter {
maskOtherAlphabetic = 0x400000000L,
maskOtherMath = 0x800000000L,
maskIdeographic = 0x1000000000L,
maskNoncharacterCP = 0x2000000000L;
maskNoncharacterCP = 0x2000000000L,
maskIDStart = 0x4000000000L,
maskIDContinue = 0x8000000000L;
// Can compare masked values with these to determine
// numeric or lexical types.
@ -367,6 +372,8 @@ public class GenerateCharacter {
addExProp(result, propList, "Ideographic", maskIdeographic);
//addExProp(result, propList, "Other_Math", maskOtherMath);
//addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP);
addExProp(result, propList, "ID_Start", maskIDStart);
addExProp(result, propList, "ID_Continue", maskIDContinue);
return result;
}
@ -780,6 +787,8 @@ OUTER: for (int i = 0; i < n; i += m) {
if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);
if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);
if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
if (x.equals("maskIDStart")) return "0x" + hex4(maskIDStart >> 32);
if (x.equals("maskIDContinue")) return "0x" + hex4(maskIDContinue >> 32);
if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
@ -1612,6 +1621,7 @@ OUTER: for (int i = 0; i < n; i += m) {
static String UnicodeSpecFileName = null; // liu
static String SpecialCasingFileName = null;
static String PropListFileName = null;
static String DerivedPropsFileName = null;
static boolean useCharForByte = false;
static int[] sizes;
static int bins = 0; // liu; if > 0, then perform search
@ -1739,6 +1749,14 @@ OUTER: for (int i = 0; i < n; i += m) {
PropListFileName = args[++j];
}
}
else if (args[j].equals("-derivedprops")) {
if (j == args.length -1) {
FAIL("File name missing after -derivedprops");
}
else {
DerivedPropsFileName = args[++j];
}
}
else if (args[j].equals("-plane")) {
if (j == args.length -1) {
FAIL("Plane number missing after -plane");
@ -1803,6 +1821,10 @@ OUTER: for (int i = 0; i < n; i += m) {
PropListFileName = DefaultPropListFileName;
desc.append(" [-proplist " + PropListFileName + ']');
}
if (DerivedPropsFileName == null) {
DerivedPropsFileName = DefaultDerivedPropsFileName;
desc.append(" [-derivedprops " + DerivedPropsFileName + ']');
}
if (TemplateFileName == null) {
TemplateFileName = (Csyntax ? DefaultCTemplateFileName
: DefaultJavaTemplateFileName);
@ -1954,6 +1976,7 @@ OUTER: for (int i = 0; i < n; i += m) {
UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
propList.putAll(PropList.readSpecFile(new File(DerivedPropsFileName), plane));
if (verbose) {
System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -31,7 +31,8 @@ import java.io.*;
/**
* A PropList object contains the lists of code points that have
* the same Unicode property defined in PropList.txt
* the same Unicode property defined in PropList.txt and
* DerivedCoreProperties.txt
*
* @author Xueming Shen
*/
@ -51,8 +52,13 @@ public class PropList {
return propMap.keySet();
}
private Map<String, ArrayList<Integer>> propMap =
new LinkedHashMap<String, ArrayList<Integer>>();
/**
 * Merges the properties of another {@code PropList} into this one.
 *
 * For every name reported by {@code pl.names()}, the value returned by
 * {@code pl.codepoints(name)} is stored under that name in this object's
 * map as-is; an entry already present under the same name is replaced.
 *
 * @param pl the property list whose entries are added to this one
 */
public void putAll(PropList pl) {
    for (String propName : pl.names()) {
        propMap.put(propName, pl.codepoints(propName));
    }
}
private Map<String, List<Integer>> propMap =
new LinkedHashMap<String, List<Integer>>();
private PropList(File file, int plane) throws IOException {
@ -78,7 +84,7 @@ public class PropList {
start &= 0xffff;
end &= 0xffff;
ArrayList<Integer> list = propMap.get(name);
List<Integer> list = propMap.get(name);
if (list == null) {
list = new ArrayList<Integer>();
propMap.put(name, list);

View File

@ -9917,7 +9917,18 @@ class Character implements java.io.Serializable, Comparable<Character> {
* <li> {@link #isLetter(char) isLetter(ch)} returns {@code true}
* <li> {@link #getType(char) getType(ch)} returns
* {@code LETTER_NUMBER}.
* <li> it is an <a href="http://www.unicode.org/reports/tr44/#Other_ID_Start">
* {@code Other_ID_Start}</a> character.
* </ul>
* <p>
* This method conforms to <a href="https://unicode.org/reports/tr31/#R1">
* UAX31-R1: Default Identifiers</a> requirement of the Unicode Standard,
* with the following profile of UAX31:
* <pre>
* Start := ID_Start + 'VERTICAL TILDE' (U+2E2F)
* </pre>
* {@code 'VERTICAL TILDE'} is added to {@code Start} for backward
* compatibility.
*
* <p><b>Note:</b> This method cannot handle <a
* href="#supplementary"> supplementary characters</a>. To support
@ -9947,7 +9958,19 @@ class Character implements java.io.Serializable, Comparable<Character> {
* returns {@code true}
* <li> {@link #getType(int) getType(codePoint)}
* returns {@code LETTER_NUMBER}.
* <li> it is an <a href="http://www.unicode.org/reports/tr44/#Other_ID_Start">
* {@code Other_ID_Start}</a> character.
* </ul>
* <p>
* This method conforms to <a href="https://unicode.org/reports/tr31/#R1">
* UAX31-R1: Default Identifiers</a> requirement of the Unicode Standard,
* with the following profile of UAX31:
* <pre>
* Start := ID_Start + 'VERTICAL TILDE' (U+2E2F)
* </pre>
* {@code 'VERTICAL TILDE'} is added to {@code Start} for backward
* compatibility.
*
* @param codePoint the character (Unicode code point) to be tested.
* @return {@code true} if the character may start a Unicode
* identifier; {@code false} otherwise.
@ -9975,7 +9998,22 @@ class Character implements java.io.Serializable, Comparable<Character> {
* <li> it is a non-spacing mark
* <li> {@code isIdentifierIgnorable} returns
* {@code true} for this character.
* <li> it is an <a href="http://www.unicode.org/reports/tr44/#Other_ID_Start">
* {@code Other_ID_Start}</a> character.
* <li> it is an <a href="http://www.unicode.org/reports/tr44/#Other_ID_Continue">
* {@code Other_ID_Continue}</a> character.
* </ul>
* <p>
* This method conforms to <a href="https://unicode.org/reports/tr31/#R1">
* UAX31-R1: Default Identifiers</a> requirement of the Unicode Standard,
* with the following profile of UAX31:
* <pre>
* Continue := Start + ID_Continue + ignorable
* Medial := empty
* ignorable := isIdentifierIgnorable(char) returns true for the character
* </pre>
* {@code ignorable} is added to {@code Continue} for backward
* compatibility.
*
* <p><b>Note:</b> This method cannot handle <a
* href="#supplementary"> supplementary characters</a>. To support
@ -10010,7 +10048,23 @@ class Character implements java.io.Serializable, Comparable<Character> {
* <li> it is a non-spacing mark
* <li> {@code isIdentifierIgnorable} returns
* {@code true} for this character.
* <li> it is an <a href="http://www.unicode.org/reports/tr44/#Other_ID_Start">
* {@code Other_ID_Start}</a> character.
* <li> it is an <a href="http://www.unicode.org/reports/tr44/#Other_ID_Continue">
* {@code Other_ID_Continue}</a> character.
* </ul>
* <p>
* This method conforms to <a href="https://unicode.org/reports/tr31/#R1">
* UAX31-R1: Default Identifiers</a> requirement of the Unicode Standard,
* with the following profile of UAX31:
* <pre>
* Continue := Start + ID_Continue + ignorable
* Medial := empty
* ignorable := isIdentifierIgnorable(int) returns true for the character
* </pre>
* {@code ignorable} is added to {@code Continue} for backward
* compatibility.
*
* @param codePoint the character (Unicode code point) to be tested.
* @return {@code true} if the character may be part of a
* Unicode identifier; {@code false} otherwise.

View File

@ -23,7 +23,7 @@
/*
* @test
* @bug 8202771 8221431
* @bug 8202771 8221431 8229831
* @summary Check j.l.Character.isDigit/isLetter/isLetterOrDigit/isSpaceChar
* /isWhitespace/isTitleCase/isISOControl/isIdentifierIgnorable
* /isJavaIdentifierStart/isJavaIdentifierPart/isUnicodeIdentifierStart
@ -182,7 +182,7 @@ public class CharPropTest {
private static void isUnicodeIdentifierStartTest(int codePoint, String category) {
boolean actual = Character.isUnicodeIdentifierStart(codePoint);
boolean expected = isUnicodeIdentifierStart(category);
boolean expected = isUnicodeIdentifierStart(codePoint, category);
if (actual != expected) {
printDiff(codePoint, "isUnicodeIdentifierStart", actual, expected);
}
@ -266,14 +266,33 @@ public class CharPropTest {
|| isIdentifierIgnorable(codePoint, category);
}
private static boolean isUnicodeIdentifierStart(String category) {
return isLetter(category) || category.equals("Nl");
private static boolean isUnicodeIdentifierStart(int codePoint, String category) {
return isLetter(category) || category.equals("Nl")
|| isOtherIDStart(codePoint);
}
private static boolean isUnicodeIdentifierPart(int codePoint, String category) {
return isLetter(category) || category.equals("Pc") || category.equals("Nd")
|| category.equals("Nl") || category.equals("Mc") || category.equals("Mn")
|| isIdentifierIgnorable(codePoint, category);
|| isIdentifierIgnorable(codePoint, category)
|| isOtherIDStart(codePoint)
|| isOtherIDContinue(codePoint);
}
/**
 * Tells whether {@code codePoint} is one of the six code points this test
 * hard-codes for its Other_ID_Start check: U+1885, U+1886, U+2118,
 * U+212E, U+309B and U+309C.
 *
 * @param codePoint the code point to test
 * @return {@code true} if the code point is in the hard-coded set
 */
private static boolean isOtherIDStart(int codePoint) {
    switch (codePoint) {
        case 0x1885:
        case 0x1886:
        case 0x2118:
        case 0x212E:
        case 0x309B:
        case 0x309C:
            return true;
        default:
            return false;
    }
}
/**
 * Tells whether {@code codePoint} is one of the code points this test
 * hard-codes for its Other_ID_Continue check: U+00B7, U+0387,
 * the inclusive range U+1369..U+1371, and U+19DA.
 *
 * @param codePoint the code point to test
 * @return {@code true} if the code point is in the hard-coded set
 */
private static boolean isOtherIDContinue(int codePoint) {
    // The only contiguous run in the set.
    if (codePoint >= 0x1369 && codePoint <= 0x1371) {
        return true;
    }
    switch (codePoint) {
        case 0x00B7:
        case 0x0387:
        case 0x19DA:
            return true;
        default:
            return false;
    }
}
private static void printDiff(int codePoint, String method, boolean actual, boolean expected) {

View File

@ -24,8 +24,9 @@
/**
* @test
* @bug 7037261 7070436 7198195 8032446 8072600 8221431
* @summary Check j.l.Character.isLowerCase/isUppercase/isAlphabetic/isIdeographic
* @bug 7037261 7070436 7198195 8032446 8072600 8221431 8229831
* @summary Check j.l.Character.isLowerCase/isUppercase/isAlphabetic/isIdeographic/
* isUnicodeIdentifierStart/isUnicodeIdentifierPart
* @library /lib/testlibrary/java/lang
*/
@ -36,47 +37,17 @@ import static java.lang.Character.*;
public class CheckProp {
public static void main(String[] args) throws IOException {
File fPropList = UCDFiles.PROP_LIST.toFile();
int i, j;
BufferedReader sbfr = new BufferedReader(new FileReader(fPropList));
Matcher m = Pattern.compile("(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s*;\\s+(\\w+)\\s+#.*").matcher("");
Map<String, ArrayList<Integer>> propMap = new LinkedHashMap<>();
String line = null;
int lineNo = 0;
while ((line = sbfr.readLine()) != null) {
lineNo++;
if (line.length() <= 1 || line.charAt(0) == '#') {
continue;
}
m.reset(line);
if (m.matches()) {
int start = Integer.parseInt(m.group(1), 16);
int end = (m.group(2)==null)?start
:Integer.parseInt(m.group(2), 16);
String name = m.group(3);
ArrayList<Integer> list = propMap.get(name);
if (list == null) {
list = new ArrayList<Integer>();
propMap.put(name, list);
}
while (start <= end)
list.add(start++);
} else {
System.out.printf("Warning: Unrecognized line %d <%s>%n", lineNo, line);
}
}
sbfr.close();
//for (String name: propMap.keySet()) {
// System.out.printf("%s %d%n", name, propMap.get(name).size());
//}
public static void main(String[] args) {
Map<String, List<Integer>> propMap = new LinkedHashMap<>();
List.of(UCDFiles.PROP_LIST.toFile(), UCDFiles.DERIVED_PROPS.toFile()).stream()
.forEach(f -> readPropMap(propMap, f));
Integer[] otherLowercase = propMap.get("Other_Lowercase").toArray(new Integer[0]);
Integer[] otherUppercase = propMap.get("Other_Uppercase").toArray(new Integer[0]);
Integer[] otherAlphabetic = propMap.get("Other_Alphabetic").toArray(new Integer[0]);
Integer[] ideographic = propMap.get("Ideographic").toArray(new Integer[0]);
Integer[] IDStart = propMap.get("ID_Start").toArray(new Integer[0]);
Integer[] IDContinue = propMap.get("ID_Continue").toArray(new Integer[0]);
int fails = 0;
for (int cp = MIN_CODE_POINT; cp < MAX_CODE_POINT; cp++) {
@ -111,8 +82,63 @@ public class CheckProp {
fails++;
System.err.printf("Wrong isIdeographic(U+%04x)\n", cp);
}
if (isUnicodeIdentifierStart(cp) !=
(cp == 0x2E2F ||
Arrays.binarySearch(IDStart, cp) >= 0))
{
fails++;
System.err.printf("Wrong isUnicodeIdentifierStart(U+%04x)\n", cp);
}
if (isUnicodeIdentifierPart(cp) !=
(isIdentifierIgnorable(cp) ||
cp == 0x2E2F ||
Arrays.binarySearch(IDContinue, cp) >= 0))
{
fails++;
System.err.printf("Wrong isUnicodeIdentifierPart(U+%04x)\n", cp);
}
}
if (fails != 0)
throw new RuntimeException("CheckProp failed=" + fails);
}
/**
 * Parses one property file and adds its code points to {@code propMap}.
 *
 * Lines that are at most one character long or that start with '#' are
 * skipped. Every other line is expected to have the form
 * {@code XXXX[..YYYY] ; Property_Name # comment}; each code point of the
 * inclusive range is appended to the list stored under the property name,
 * and the list is created on first use. A line that does not match is
 * reported on {@code System.out} and otherwise ignored.
 *
 * @param propMap   map from property name to code points, updated in place
 * @param fPropList the property file to read
 * @throws UncheckedIOException if the file cannot be opened, read or closed
 */
private static void readPropMap(Map<String, List<Integer>> propMap, File fPropList) {
    Matcher m = Pattern.compile("(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s*;\\s+(\\w+)\\s+#.*").matcher("");
    // try-with-resources: the reader is closed even when readLine() or
    // Integer.parseInt() throws, which the explicit close() did not cover.
    try (BufferedReader sbfr = new BufferedReader(new FileReader(fPropList))) {
        String line;
        int lineNo = 0;
        while ((line = sbfr.readLine()) != null) {
            lineNo++;
            if (line.length() <= 1 || line.charAt(0) == '#') {
                continue;
            }
            m.reset(line);
            if (!m.matches()) {
                System.out.printf("Warning: Unrecognized line %d <%s>%n", lineNo, line);
                continue;
            }
            int start = Integer.parseInt(m.group(1), 16);
            // A missing second group means a single code point, not a range.
            int end = (m.group(2) == null) ? start
                                           : Integer.parseInt(m.group(2), 16);
            List<Integer> list = propMap.computeIfAbsent(m.group(3), k -> new ArrayList<>());
            while (start <= end) {
                list.add(start++);
            }
        }
    } catch (IOException ioe) {
        throw new UncheckedIOException(ioe);
    }
    //for (String name: propMap.keySet()) {
    //    System.out.printf("%s %d%n", name, propMap.get(name).size());
    //}
}
}

View File

@ -36,6 +36,8 @@ public class UCDFiles {
public static Path BLOCKS =
UCD_DIR.resolve("Blocks.txt");
public static Path DERIVED_PROPS =
UCD_DIR.resolve("DerivedCoreProperties.txt");
public static Path GRAPHEME_BREAK_PROPERTY =
UCD_DIR.resolve("auxiliary").resolve("GraphemeBreakProperty.txt");
public static Path GRAPHEME_BREAK_TEST =