gecko-dev/layout/mathml/updateOperatorDictionary.pl
Frederic Wang 246c6b03a7 Bug 1246657 - The Operator Dictionary lacks entries for U+1EEF0 and U+1EEF1. r=emilio
This commit ensures that the following operators use category I from
MathML Core's operator dictionary [1] [2]:

  U+1EEF0 ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL
  U+1EEF1 ARABIC MATHEMATICAL OPERATOR HAH WITH DAL

which corresponds to zero lspace/rspace and stretchy. There should
already be exhaustive WPT tests operator-dictionary-* to check
these and other properties, but they may be shadowed by existing
failures or Firefox bugs, so add some more specific reftests for
spacing and stretching. However, nsMathMLmoFrame and nsMathMLChar
don't handle non-BMP characters very well, so only the first one
currently passes.

Also tweak updateOperatorDictionary.pl to ignore these special
operators.

[1] https://w3c.github.io/mathml-core/#dfn-algorithm-to-determine-the-category-of-an-operator
[2] https://w3c.github.io/mathml-core/#operator-dictionary-categories-values

Differential Revision: https://phabricator.services.mozilla.com/D157788
2022-09-23 08:08:43 +00:00

460 lines
14 KiB
Perl
Executable File

#!/usr/bin/perl
# -*- Mode: Perl; tab-width: 2; indent-tabs-mode: nil; -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
use XML::LibXSLT;
use XML::LibXML;
use LWP::Simple;
# Names of the files written by this script.
$FILE_UNICODE = "unicode.xml";
$FILE_DICTIONARY = "dictionary.xml";
$FILE_DIFFERENCES = "differences.txt";
$FILE_NEW_DICTIONARY = "new_dictionary.txt";
$FILE_SYNTAX_ERRORS = "syntax_errors.txt";

# Our own dictionary (a property file).
$MOZ_DICTIONARY = "mathfont.properties";

# Dictionary provided by the W3C in "XML Entity Definitions for Characters".
$WG_DICTIONARY_URL = "https://raw.githubusercontent.com/w3c/xml-entities/gh-pages/unicode.xml";

# XSL stylesheet used to extract the relevant data from the W3C dictionary.
$DICTIONARY_XSL = "operatorDictionary.xsl";

# W3C dictionary after transformation with operatorDictionary.xsl.
$WG_DICTIONARY = $FILE_DICTIONARY;

# Command-line validation. Each accepted mode is mapped to the highest value
# of $#ARGV it tolerates: 1 means "one optional extra argument", 0 means
# "the mode name only". Anything else prints the usage text and quits.
my %highest_argv_index_for = (
  "download" => 1,
  "compare"  => 1,
  "check"    => 0,
  "clean"    => 0,
);
my $command_line_ok = $#ARGV >= 0
  && exists($highest_argv_index_for{$ARGV[0]})
  && $#ARGV <= $highest_argv_index_for{$ARGV[0]};
usage() unless ($command_line_ok);
if ($ARGV[0] eq "download") {
  # Mode "download": fetch the W3C unicode.xml (or the URL given as second
  # argument), transform it with $DICTIONARY_XSL and write the result to
  # $FILE_DICTIONARY, then quit.
  if ($#ARGV == 1) {
    $WG_DICTIONARY_URL = $ARGV[1];
  }

  print "Downloading $WG_DICTIONARY_URL...\n";
  # getstore() returns the HTTP status code. Without this check a failed
  # download would go unnoticed and a stale or missing $FILE_UNICODE would be
  # handed to the XSLT step below.
  my $status = getstore($WG_DICTIONARY_URL, $FILE_UNICODE);
  if (!is_success($status)) {
    die ("Couldn't download $WG_DICTIONARY_URL (status $status)!\n");
  }

  print "Converting $FILE_UNICODE into $FILE_DICTIONARY...\n";
  my $xslt = XML::LibXSLT->new();
  my $source = XML::LibXML->load_xml(location => $FILE_UNICODE);
  my $style_doc = XML::LibXML->load_xml(location => $DICTIONARY_XSL,
                                        no_cdata=>1);
  my $stylesheet = $xslt->parse_stylesheet($style_doc);
  my $results = $stylesheet->transform($source);

  open(my $dictionary_fh, ">", $FILE_DICTIONARY) ||
    die ("Couldn't open $FILE_DICTIONARY!");
  print $dictionary_fh $stylesheet->output_as_bytes($results);
  # Buffered write errors only surface when the handle is closed.
  close($dictionary_fh) || die ("Couldn't write $FILE_DICTIONARY!");
  exit 0;
}
if ($ARGV[0] eq "clean") {
  # Mode "clean": remove every file this script can generate, then quit.
  my @generated_files = (
    $FILE_UNICODE,
    $FILE_DICTIONARY,
    $FILE_DIFFERENCES,
    $FILE_NEW_DICTIONARY,
    $FILE_SYNTAX_ERRORS,
  );
  unlink(@generated_files);
  exit 0;
}

# Mode "compare" takes an optional second argument: the transformed W3C
# dictionary to read instead of the default $FILE_DICTIONARY.
$WG_DICTIONARY = $ARGV[1] if ($ARGV[0] eq "compare" && $#ARGV == 1);
################################################################################
# structure of the dictionary used by this script:
# - key: same as in mathfont.properties
# - table:
# index | value
# 0 | description
# 1 | lspace
# 2 | rspace
# 4 | largeop
# 5 | movablelimits
# 6 | stretchy
# 7 | separator
# 8 | accent
# 9 | fence
# 10 | symmetric
# 13 | direction
# (indices 3, 11 and 12 are never assigned by this script)

# 1) build %moz_hash from $MOZ_DICTIONARY
print "loading $MOZ_DICTIONARY...\n";
open(my $moz_file, "<", $MOZ_DICTIONARY) ||
  die ("Couldn't open $MOZ_DICTIONARY!");
print "building dictionary...\n";
while (my $line = <$moz_file>) {
  # Only lines starting with "operator." describe dictionary entries.
  next unless ($line =~ m/^operator\./);

  # An entry has the form "<key> = <properties> # <description>". The match
  # must be checked: on failure $1..$3 would still hold the captures of an
  # earlier successful match and a junk key would be stored.
  unless ($line =~ m/^([\w|\.|\\]*)\s=\s(.*)\s#\s(.*)$/) {
    warn "warning: ignoring malformed entry at $MOZ_DICTIONARY line $.: $line";
    next;
  }

  # 1.1) build the key
  my $key = $1;

  # 1.2) build the array
  my $properties = $2;
  my @value = ();
  $value[0] = $3;
  # lspace and rspace are single digits and default to "5" when absent.
  if ($properties =~ m/^(.*)lspace:(\d)(.*)$/) { $value[1] = $2; }
  else { $value[1] = "5"; }
  if ($properties =~ m/^(.*)rspace:(\d)(.*)$/) { $value[2] = $2; }
  else { $value[2] = "5"; }
  # Boolean properties: true when the keyword appears in the property list.
  $value[4] = ($properties =~ m/^(.*)largeop(.*)$/);
  $value[5] = ($properties =~ m/^(.*)movablelimits(.*)$/);
  $value[6] = ($properties =~ m/^(.*)stretchy(.*)$/);
  $value[7] = ($properties =~ m/^(.*)separator(.*)$/);
  $value[8] = ($properties =~ m/^(.*)accent(.*)$/);
  $value[9] = ($properties =~ m/^(.*)fence(.*)$/);
  $value[10] = ($properties =~ m/^(.*)symmetric(.*)$/);
  # direction is the empty string when not specified.
  if ($properties =~ m/^(.*)direction:([a-z]*)(.*)$/) { $value[13] = $2; }
  else { $value[13] = ""; }

  # 1.3) save the key and value
  $moz_hash{$key} = [ @value ];
}
close($moz_file);
################################################################################
# 2) If mode "check", verify validity of our operator dictionary and quit.
# If mode "compare", go to step 3)
if ($ARGV[0] eq "check") {
  print "checking operator dictionary...\n";
  open(my $errors_fh, ">", $FILE_SYNTAX_ERRORS) ||
    die ("Couldn't open $FILE_SYNTAX_ERRORS!");
  my $nb_errors = 0;
  my $nb_warnings = 0;

  # check the validity of our private data
  my @moz_keys = (keys %moz_hash);
  while (my $key = pop(@moz_keys)) {
    # The flag has to be reset before the first check of the iteration;
    # resetting it later would discard the failures found on the key and the
    # offending entry would not be printed.
    my $valid = 1;
    my @moz = @{ $moz_hash{$key} };
    my $entry = generateEntry($key, @moz);

    if ($key =~ /\\u.+\\u.+\\u.+/) {
      $valid = 0;
      $nb_errors++;
      print $errors_fh "error: \"$key\" has more than 2 characters\n";
    }
    if ($key =~ /\\u20D2\./ || $key =~ /\\u0338\./) {
      $valid = 0;
      $nb_errors++;
      print $errors_fh "error: \"$key\" ends with character U+20D2 or U+0338\n";
    }
    if (!($moz[13] eq "" ||
          $moz[13] eq "horizontal" ||
          $moz[13] eq "vertical")) {
      $valid = 0;
      $nb_errors++;
      print $errors_fh "error: invalid direction \"$moz[13]\"\n";
    }
    if ($moz[4] && !($moz[13] eq "vertical")) {
      $valid = 0;
      $nb_errors++;
      print $errors_fh "error: operator is largeop but does not have vertical direction\n";
    }
    if (!$valid) {
      print $errors_fh $entry;
      print $errors_fh "\n";
    }
  }

  # check that all forms have the same direction.
  @moz_keys = (keys %moz_hash);
  while (my $key = pop(@moz_keys)) {
    # Skip operators already removed from the hash table while handling one
    # of their other forms.
    next unless (exists($moz_hash{$key}));
    # Without a recognised form suffix there are no sibling forms to compare.
    next unless ($key =~ m/^([\w|\.|\\]*)\.(prefix|infix|postfix)$/);
    my $operator = $1;

    # Keys of the forms of this operator that are present in the dictionary,
    # in the order prefix, infix, postfix.
    my @form_keys = grep { exists($moz_hash{$_}) }
                    ("$operator.prefix", "$operator.infix", "$operator.postfix");

    # The forms disagree as soon as two distinct directions are seen.
    my %seen_direction = ();
    foreach my $form_key (@form_keys) {
      $seen_direction{ $moz_hash{$form_key}[13] } = 1;
    }

    if (scalar(keys %seen_direction) > 1) {
      $nb_errors++;
      print $errors_fh
        "error: operator has a stretchy form, but all forms";
      print $errors_fh
        " have not the same direction\n";
      foreach my $form_key (@form_keys) {
        my $text = generateEntry($form_key, @{ $moz_hash{$form_key} });
        print $errors_fh $text;
      }
      print $errors_fh "\n";
    }

    # Remove every form of this operator so that it is examined only once.
    delete @moz_hash{@form_keys};
  }

  close($errors_fh);
  print "\n";
  if ($nb_errors > 0 || $nb_warnings > 0) {
    print "$nb_errors error(s) found\n";
    print "$nb_warnings warning(s) found\n";
    print "See output file $FILE_SYNTAX_ERRORS.\n\n";
  } else {
    print "No error found.\n\n";
  }
  exit 0;
}
################################################################################
# 3) build %wg_hash and @wg_keys from the page $WG_DICTIONARY
print "loading $WG_DICTIONARY...\n";
my $parser = XML::LibXML->new();
my $doc = $parser->parse_file($WG_DICTIONARY);
print "building dictionary...\n";
@wg_keys = ();
foreach my $entry ($doc->findnodes('/root/entry')) {
  # 3.1) build the key
  my $codepoints = $entry->getAttribute("unicode");

  # Skip non-BMP Arabic characters that are handled specially.
  # This has to be a string comparison (eq): with the numeric operator (==)
  # both operands evaluate to 0, the test is always true and every entry of
  # the dictionary is skipped.
  if ($codepoints eq "U1EEF0" || $codepoints eq "U1EEF1") {
    next;
  }

  my $key = "operator.";
  my $remaining = "$codepoints-";
  while ($remaining =~ m/^U?0(\w*)-(.*)$/) {
    # Concatenate .\uNNNN
    $key = "$key\\u$1";
    $remaining = $2;
  }
  my $form = $entry->getAttribute("form"); # "Form"
  $key = "$key.$form";

  # 3.2) build the array
  my @value = ();
  $value[0] = lc($entry->getAttribute("description"));
  # lspace and rspace default to "5" when the attribute is missing or empty.
  $value[1] = $entry->getAttribute("lspace");
  if ($value[1] eq "") { $value[1] = "5"; }
  $value[2] = $entry->getAttribute("rspace");
  if ($value[2] eq "") { $value[2] = "5"; }
  # Boolean properties: true when the keyword appears in the property list.
  my $properties = $entry->getAttribute("properties");
  $properties = "" unless (defined($properties));
  $value[4] = ($properties =~ m/^(.*)largeop(.*)$/);
  $value[5] = ($properties =~ m/^(.*)movablelimits(.*)$/);
  $value[6] = ($properties =~ m/^(.*)stretchy(.*)$/);
  $value[7] = ($properties =~ m/^(.*)separator(.*)$/);
  $value[9] = ($properties =~ m/^(.*)fence(.*)$/);
  $value[10] = ($properties =~ m/^(.*)symmetric(.*)$/);
  # not stored in the WG dictionary
  $value[8] = ""; # accent
  $value[13] = ""; # direction

  # 3.3) save the key and value
  push(@wg_keys, $key);
  $wg_hash{$key} = [ @value ];
}
# Step 4 consumes @wg_keys with pop(); reversing here makes it visit the
# entries in document order.
@wg_keys = reverse(@wg_keys);
################################################################################
# 4) Compare the two dictionaries and output the result
print "comparing dictionaries...\n";
open(my $differences_fh, ">", $FILE_DIFFERENCES) ||
  die ("Couldn't open $FILE_DIFFERENCES!");
open(my $new_dictionary_fh, ">", $FILE_NEW_DICTIONARY) ||
  die ("Couldn't open $FILE_NEW_DICTIONARY!");
my ($conflicting, $conflicting_stretching) = (0, 0);
my ($new, $new_stretching) = (0, 0);
my ($obsolete, $obsolete_stretching) = (0, 0);
my $unchanged = 0;

# 4.1) look to the entries of the WG dictionary
while (my $key = pop(@wg_keys)) {
  my @wg = @{ $wg_hash{$key} };
  delete $wg_hash{$key};
  my $wg_value = generateCommon(@wg);
  my $new_line;

  if (exists($moz_hash{$key})) {
    # entry is in both dictionary
    my @moz = @{ $moz_hash{$key} };
    # Entries removed here are not reported as obsolete in step 4.2.
    delete $moz_hash{$key};
    my $moz_value = generateCommon(@moz);

    if ($moz_value ne $wg_value) {
      # conflicting entry
      print $differences_fh "[conflict]";
      $conflicting++;
      # The stretchy flags are booleans: compare their truth values rather
      # than comparing possibly empty strings numerically.
      if ((!$moz[6]) ne (!$wg[6])) {
        print $differences_fh "[stretching]";
        $conflicting_stretching++;
      }
      print $differences_fh " - $key ($wg[0])\n";
      print $differences_fh "-$moz_value\n+$wg_value\n\n";
    } else {
      # unchanged entry
      $unchanged++;
    }
    # In both cases the new dictionary gets the WG values completed with our
    # private data.
    $new_line = completeCommon($wg_value, $key, @moz, @wg);
  } else {
    # we don't have this entry in our dictionary yet
    print $differences_fh "[new entry]";
    $new++;
    if ($wg[6]) {
      print $differences_fh "[stretching]";
      $new_stretching++;
    }
    print $differences_fh " - $key ($wg[0])\n";
    print $differences_fh "-\n+$wg_value\n\n";
    # There is no private data for this entry: only the WG record is passed.
    $new_line = completeCommon($wg_value, $key, @wg);
  }
  print $new_dictionary_fh $new_line;
}

print $new_dictionary_fh
  "\n# Entries below are not part of the official MathML dictionary\n\n";

# 4.2) look in our dictionary the remaining entries
foreach my $key (sort(keys %moz_hash)) {
  my @moz = @{ $moz_hash{$key} };
  my $moz_value = generateCommon(@moz);
  print $differences_fh "[obsolete entry]";
  $obsolete++;
  if ($moz[6]) {
    print $differences_fh "[stretching]";
    $obsolete_stretching++;
  }
  print $differences_fh " - $key ($moz[0])\n";
  print $differences_fh "-$moz_value\n+\n\n";
  # Only our own record is passed, so its private data is kept.
  my $new_line = completeCommon($moz_value, $key, @moz);
  print $new_dictionary_fh $new_line;
}

close($differences_fh);
close($new_dictionary_fh);

print "\n";
print "- $obsolete obsolete entries ";
print "($obsolete_stretching of them are related to stretching)\n";
print "- $unchanged unchanged entries\n";
print "- $conflicting conflicting entries ";
print "($conflicting_stretching of them are related to stretching)\n";
print "- $new new entries ";
print "($new_stretching of them are related to stretching)\n";
print "\nSee output files $FILE_DIFFERENCES and $FILE_NEW_DICTIONARY.\n\n";
# The command below matches the one shown by the usage text.
print "After having modified the dictionary, please run ";
print "./updateOperatorDictionary.pl check\n\n";
exit 0;
################################################################################
sub usage {
  # Print the command syntax, one line per supported mode, then terminate
  # the script with exit status 0.
  my @synopsis = (
    "usage:",
    " ./updateOperatorDictionary.pl download [unicode.xml]",
    " ./updateOperatorDictionary.pl compare [dictionary.xml]",
    " ./updateOperatorDictionary.pl check",
    " ./updateOperatorDictionary.pl clean",
  );
  print join("\n", @synopsis), "\n";
  exit 0;
}
sub generateCommon {
  # Helper function to generate the string of data shared by both
  # dictionaries.
  #
  # Arguments: the elements of one dictionary record, indexed as in the
  #   table at the top of the script (1 = lspace, 2 = rspace, 4 = largeop,
  #   5 = movablelimits, 6 = stretchy, 7 = separator, 9 = fence,
  #   10 = symmetric). The other indices are ignored here.
  # Returns: "lspace:<n> rspace:<n>" followed by the name of every boolean
  #   property that is true, separated by single spaces.
  my (@v) = @_;

  # Boolean properties in output order: [index in the record, keyword].
  my @flags = (
    [ 4,  "largeop" ],
    [ 5,  "movablelimits" ],
    [ 6,  "stretchy" ],
    [ 7,  "separator" ],
    [ 9,  "fence" ],
    [ 10, "symmetric" ],
  );

  # A lexical is used so that the package variable $entry of the caller is
  # left untouched.
  my $entry = "lspace:$v[1] rspace:$v[2]";
  foreach my $flag (@flags) {
    my ($index, $keyword) = @{ $flag };
    $entry = "$entry $keyword" if ($v[$index]);
  }
  return $entry;
}
sub completeCommon {
  # Helper to add key and private data to the string built by generateCommon.
  #
  # Arguments:
  #   $entry - string of shared data, as returned by generateCommon
  #   $key   - key of the entry, as in mathfont.properties
  #   then one or two dictionary records passed as flat lists: our own
  #   record (private data: 0 = description, 8 = accent, 13 = direction),
  #   optionally followed by the WG record (0 = description).
  # Returns: the line "<key> = <entry>[ accent][ direction:<d>] # <desc>\n".
  #
  # Perl flattens the argument list, so two arrays cannot be received as two
  # separate array parameters: the first one would take every element and the
  # second one would always be empty. Records built by this script always
  # have 14 slots (indices 0 to 13), hence the first 14 elements are our
  # record and whatever follows is the WG record.
  my ($entry, $key, @records) = @_;
  my $record_size = 14;
  my @v_moz = splice(@records, 0, $record_size);
  my @v_wg = @records;

  $entry = "$key = $entry";
  if ($v_moz[8]) { $entry = "$entry accent"; }
  if ($v_moz[13]) { $entry = "$entry direction:$v_moz[13]"; }

  # keep our previous comment, otherwise use the description given by the WG
  my $description = $v_moz[0] ? $v_moz[0] : $v_wg[0];
  $description = "" unless (defined($description));

  return "$entry # $description\n";
}
sub generateEntry {
  # Helper function to generate an entry of our operator dictionary.
  #
  # Arguments:
  #   $key - key of the entry, as in mathfont.properties
  #   @moz - the record stored for that key
  # Returns: the complete line for that entry, as built by completeCommon
  #   from the shared data produced by generateCommon.
  my ($key, @moz) = @_;

  # Lexical on purpose: the package variable $entry of the caller must not
  # be overwritten as a side effect.
  my $common = generateCommon(@moz);

  # The same record is passed twice: once as our own data and once where the
  # WG record is expected.
  return completeCommon($common, $key, @moz, @moz);
}