diff --git a/intl/unicharutil/tools/genSpecialCasingData.pl b/intl/unicharutil/tools/genSpecialCasingData.pl new file mode 100755 index 000000000000..3d9207c4a7ac --- /dev/null +++ b/intl/unicharutil/tools/genSpecialCasingData.pl @@ -0,0 +1,287 @@ +#!/usr/bin/env perl + +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +# This tool is used to extract "special" (one-to-many) case mappings +# into a form that can be used by nsTextRunTransformations. + +use strict; + +if ($#ARGV != 1) { + print <<__EOT; +# Run this tool using a command line of the form +# +# perl genSpecialCasingData.pl UnicodeData.txt SpecialCasing.txt +# +# The nsSpecialCasingData.cpp file will be written to standard output. +# +# This tool will also write up-to-date versions of the test files +# all-{upper,lower,title}.html +# and corresponding -ref files in the current directory. +# +__EOT + exit 0; +} + +my %allLower; +my %allUpper; +my %allTitle; +my %compositions; +my %gc; +open FH, "< $ARGV[0]" or die "can't open $ARGV[0] (should be UnicodeData.txt)\n"; +while () { + chomp; + my @fields = split /;/; + next if ($fields[1] =~ /) { + chomp; + m/#\s*(.+)$/; + my $comment = $1; + if ($comment =~ /^(SpecialCasing-|Date:)/) { + push @headerLines, $comment; + next; + } + s/#.*//; + s/;\s*$//; + next if $_ eq ''; + my @fields = split /; */; + next unless (scalar @fields) == 4; + my $usv = hex "0x$fields[0]"; + addIfSpecial(\%specialLower, $usv, $fields[1]); + addIfSpecial(\%specialTitle, $usv, $fields[2]); + addIfSpecial(\%specialUpper, $usv, $fields[3]); + $charName{$usv} = $comment; +} +close FH; + +print <<__END__; +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* Auto-generated from files in the Unicode Character Database + by genSpecialCasingData.pl - do not edit! */ + +#include "nsSpecialCasingData.h" +#include "mozilla/Util.h" // for ArrayLength +#include // for bsearch + +__END__ +map { print "/* $_ */\n" } @headerLines; + +print <<__END__; + +using mozilla::unicode::MultiCharMapping; + +__END__ + +printMappings('Lower', \%specialLower); +printMappings('Upper', \%specialUpper); +printMappings('Title', \%specialTitle); + +print <<__END__; +static int CompareMCM(const void* aKey, const void* aElement) +{ + const PRUint32 ch = *static_cast(aKey); + const MultiCharMapping* mcm = static_cast(aElement); + return int(ch) - int(mcm->mOriginalChar); +} + +#define MAKE_SPECIAL_CASE_ACCESSOR(which) \\ + const MultiCharMapping* \\ + Special##which(PRUint32 aChar) \\ + { \\ + const void* p = bsearch(&aChar, CaseSpecials_##which, \\ + mozilla::ArrayLength(CaseSpecials_##which), \\ + sizeof(MultiCharMapping), CompareMCM); \\ + return static_cast(p); \\ + } + +namespace mozilla { +namespace unicode { + +MAKE_SPECIAL_CASE_ACCESSOR(Lower) +MAKE_SPECIAL_CASE_ACCESSOR(Upper) +MAKE_SPECIAL_CASE_ACCESSOR(Title) + +} // namespace unicode +} // namespace mozilla +__END__ + +addSpecialsTo(\%allLower, \%specialLower); +addSpecialsTo(\%allUpper, \%specialUpper); +addSpecialsTo(\%allTitle, \%specialTitle); + +my $testFont = "../fonts/dejavu-sans/DejaVuSans.ttf"; +genTest('lower', \%allLower); +genTest('upper', \%allUpper); +genTitleTest(); + +sub printMappings { + my ($whichMapping, $hash) = @_; + print "static const MultiCharMapping CaseSpecials_${whichMapping}[] = {\n"; + foreach my $key (sort { $a <=> $b } keys %$hash) { + my @chars = split(/ /, $hash->{$key}); + printf " { 0x%04x, {0x%04x, 0x%04x, 0x%04x} }, // %s\n", $key, + hex "0x0$chars[0]", hex "0x0$chars[1]", hex "0x0$chars[2]", + "$charName{$key}"; + } + print "};\n\n"; +}; + +sub addIfSpecial { + my ($hash, $usv, $mapping) = @_; + return unless $mapping =~ / /; + # only do compositions that start with the initial char + foreach (keys %compositions) { + $mapping =~ s/^$_/$compositions{$_}/; + } + $hash->{$usv} = $mapping; +}; + +sub addSpecialsTo { + my ($hash, $specials) = @_; + foreach my $key (keys %$specials) { + $hash->{$key} = $specials->{$key}; + } +}; + +sub genTest { + my ($whichMapping, $hash) = @_; + open OUT, "> all-$whichMapping.html"; + print OUT <<__END__; + + + + + + + +

+__END__ + foreach my $key (sort { $a <=> $b } keys %$hash) { + printf OUT "&#x%04X;", $key; + print OUT " " if exists $charName{$key}; + print OUT "\n"; + } + print OUT <<__END__; +

+ + +__END__ + close OUT; + + open OUT, "> all-$whichMapping-ref.html"; + print OUT <<__END__; + + + + + + + +

+__END__ + foreach my $key (sort { $a <=> $b } keys %$hash) { + print OUT join('', map { sprintf("&#x%s;", $_) } split(/ /, $hash->{$key})); + print OUT " " if exists $charName{$key}; + print OUT "\n"; + } + print OUT <<__END__; +

+ + +__END__ + close OUT; +}; + +sub genTitleTest { + open OUT, "> all-title.html"; + print OUT <<__END__; + + + + + + + +

+__END__ + foreach my $key (sort { $a <=> $b } keys %allTitle) { + printf OUT "&#x%04X;x", $key; + print OUT " " if exists $charName{$key}; + print OUT "\n"; + } + print OUT <<__END__; +

+ + +__END__ + close OUT; + + open OUT, "> all-title-ref.html"; + print OUT <<__END__; + + + + + + + +

+__END__ + foreach my $key (sort { $a <=> $b } keys %allTitle) { + # capitalize is only applied to characters with GC=L* or N*... + if ($gc{$key} =~ /^[LN]/) { + # ...and those that are already uppercase are not transformed + if (exists $allUpper{$key}) { + print OUT join('', map { sprintf("&#x%s;", $_) } split(/ /, $allTitle{$key})); + } else { + printf OUT "&#x%04X;", $key; + } + print OUT "x"; + } else { + printf OUT "&#x%04X;X", $key; + } + print OUT " " if exists $charName{$key}; + print OUT "\n"; + } + print OUT <<__END__; +

+ + +__END__ + close OUT; +}; diff --git a/intl/unicharutil/util/Makefile.in b/intl/unicharutil/util/Makefile.in index 6bd276a94aa9..37415c4d27ac 100644 --- a/intl/unicharutil/util/Makefile.in +++ b/intl/unicharutil/util/Makefile.in @@ -62,6 +62,7 @@ SDK_HEADERS = \ EXPORTS = \ nsBidiUtils.h \ + nsSpecialCasingData.h \ nsUnicodeProperties.h \ nsUnicodeScriptCodes.h \ $(NULL) @@ -69,6 +70,7 @@ EXPORTS = \ CPPSRCS = \ nsUnicharUtils.cpp \ nsBidiUtils.cpp \ + nsSpecialCasingData.cpp \ nsUnicodeProperties.cpp \ $(NULL) diff --git a/intl/unicharutil/util/nsSpecialCasingData.cpp b/intl/unicharutil/util/nsSpecialCasingData.cpp new file mode 100644 index 000000000000..fccb79215ad0 --- /dev/null +++ b/intl/unicharutil/util/nsSpecialCasingData.cpp @@ -0,0 +1,202 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* Auto-generated from files in the Unicode Character Database + by genSpecialCasingData.pl - do not edit! */ + +#include "nsSpecialCasingData.h" +#include "mozilla/Util.h" // for ArrayLength +#include // for bsearch + +/* SpecialCasing-6.1.0.txt */ +/* Date: 2011-11-27, 05:10:51 GMT [MD] */ + +using mozilla::unicode::MultiCharMapping; + +static const MultiCharMapping CaseSpecials_Lower[] = { + { 0x0130, {0x0069, 0x0307, 0x0000} }, // LATIN CAPITAL LETTER I WITH DOT ABOVE +}; + +static const MultiCharMapping CaseSpecials_Upper[] = { + { 0x00df, {0x0053, 0x0053, 0x0000} }, // LATIN SMALL LETTER SHARP S + { 0x0149, {0x02bc, 0x004e, 0x0000} }, // LATIN SMALL LETTER N PRECEDED BY APOSTROPHE + { 0x01f0, {0x004a, 0x030c, 0x0000} }, // LATIN SMALL LETTER J WITH CARON + { 0x0390, {0x03aa, 0x0301, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + { 0x03b0, {0x03ab, 0x0301, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + { 0x0587, {0x0535, 0x0552, 0x0000} }, // ARMENIAN SMALL LIGATURE ECH YIWN + { 0x1e96, {0x0048, 0x0331, 0x0000} }, // LATIN SMALL LETTER H WITH LINE BELOW + { 0x1e97, {0x0054, 0x0308, 0x0000} }, // LATIN SMALL LETTER T WITH DIAERESIS + { 0x1e98, {0x0057, 0x030a, 0x0000} }, // LATIN SMALL LETTER W WITH RING ABOVE + { 0x1e99, {0x0059, 0x030a, 0x0000} }, // LATIN SMALL LETTER Y WITH RING ABOVE + { 0x1e9a, {0x0041, 0x02be, 0x0000} }, // LATIN SMALL LETTER A WITH RIGHT HALF RING + { 0x1f50, {0x03a5, 0x0313, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH PSILI + { 0x1f52, {0x03a5, 0x0313, 0x0300} }, // GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA + { 0x1f54, {0x03a5, 0x0313, 0x0301} }, // GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA + { 0x1f56, {0x03a5, 0x0313, 0x0342} }, // GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI + { 0x1f80, {0x1f08, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI + { 0x1f81, {0x1f09, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI + { 0x1f82, {0x1f0a, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI + { 0x1f83, {0x1f0b, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI + { 0x1f84, {0x1f0c, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI + { 0x1f85, {0x1f0d, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI + { 0x1f86, {0x1f0e, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI + { 0x1f87, {0x1f0f, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI + { 0x1f88, {0x1f08, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI + { 0x1f89, {0x1f09, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI + { 0x1f8a, {0x1f0a, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI + { 0x1f8b, {0x1f0b, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI + { 0x1f8c, {0x1f0c, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI + { 0x1f8d, {0x1f0d, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI + { 0x1f8e, {0x1f0e, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1f8f, {0x1f0f, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1f90, {0x1f28, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI + { 0x1f91, {0x1f29, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI + { 0x1f92, {0x1f2a, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI + { 0x1f93, {0x1f2b, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI + { 0x1f94, {0x1f2c, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI + { 0x1f95, {0x1f2d, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI + { 0x1f96, {0x1f2e, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI + { 0x1f97, {0x1f2f, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI + { 0x1f98, {0x1f28, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI + { 0x1f99, {0x1f29, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI + { 0x1f9a, {0x1f2a, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI + { 0x1f9b, {0x1f2b, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI + { 0x1f9c, {0x1f2c, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI + { 0x1f9d, {0x1f2d, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI + { 0x1f9e, {0x1f2e, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1f9f, {0x1f2f, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1fa0, {0x1f68, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI + { 0x1fa1, {0x1f69, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI + { 0x1fa2, {0x1f6a, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI + { 0x1fa3, {0x1f6b, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI + { 0x1fa4, {0x1f6c, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI + { 0x1fa5, {0x1f6d, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI + { 0x1fa6, {0x1f6e, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI + { 0x1fa7, {0x1f6f, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI + { 0x1fa8, {0x1f68, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI + { 0x1fa9, {0x1f69, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI + { 0x1faa, {0x1f6a, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI + { 0x1fab, {0x1f6b, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI + { 0x1fac, {0x1f6c, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI + { 0x1fad, {0x1f6d, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI + { 0x1fae, {0x1f6e, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1faf, {0x1f6f, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1fb2, {0x1fba, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI + { 0x1fb3, {0x0391, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI + { 0x1fb4, {0x0386, 0x0399, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI + { 0x1fb6, {0x0391, 0x0342, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH PERISPOMENI + { 0x1fb7, {0x0391, 0x0342, 0x0399} }, // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI + { 0x1fbc, {0x0391, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI + { 0x1fc2, {0x1fca, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI + { 0x1fc3, {0x0397, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI + { 0x1fc4, {0x0389, 0x0399, 0x0000} }, // GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI + { 0x1fc6, {0x0397, 0x0342, 0x0000} }, // GREEK SMALL LETTER ETA WITH PERISPOMENI + { 0x1fc7, {0x0397, 0x0342, 0x0399} }, // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI + { 0x1fcc, {0x0397, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI + { 0x1fd2, {0x03aa, 0x0300, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA + { 0x1fd3, {0x03aa, 0x0301, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA + { 0x1fd6, {0x0399, 0x0342, 0x0000} }, // GREEK SMALL LETTER IOTA WITH PERISPOMENI + { 0x1fd7, {0x03aa, 0x0342, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI + { 0x1fe2, {0x03ab, 0x0300, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA + { 0x1fe3, {0x03ab, 0x0301, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA + { 0x1fe4, {0x03a1, 0x0313, 0x0000} }, // GREEK SMALL LETTER RHO WITH PSILI + { 0x1fe6, {0x03a5, 0x0342, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH PERISPOMENI + { 0x1fe7, {0x03ab, 0x0342, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI + { 0x1ff2, {0x1ffa, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI + { 0x1ff3, {0x03a9, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI + { 0x1ff4, {0x038f, 0x0399, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI + { 0x1ff6, {0x03a9, 0x0342, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH PERISPOMENI + { 0x1ff7, {0x03a9, 0x0342, 0x0399} }, // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI + { 0x1ffc, {0x03a9, 0x0399, 0x0000} }, // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + { 0xfb00, {0x0046, 0x0046, 0x0000} }, // LATIN SMALL LIGATURE FF + { 0xfb01, {0x0046, 0x0049, 0x0000} }, // LATIN SMALL LIGATURE FI + { 0xfb02, {0x0046, 0x004c, 0x0000} }, // LATIN SMALL LIGATURE FL + { 0xfb03, {0x0046, 0x0046, 0x0049} }, // LATIN SMALL LIGATURE FFI + { 0xfb04, {0x0046, 0x0046, 0x004c} }, // LATIN SMALL LIGATURE FFL + { 0xfb05, {0x0053, 0x0054, 0x0000} }, // LATIN SMALL LIGATURE LONG S T + { 0xfb06, {0x0053, 0x0054, 0x0000} }, // LATIN SMALL LIGATURE ST + { 0xfb13, {0x0544, 0x0546, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN NOW + { 0xfb14, {0x0544, 0x0535, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN ECH + { 0xfb15, {0x0544, 0x053b, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN INI + { 0xfb16, {0x054e, 0x0546, 0x0000} }, // ARMENIAN SMALL LIGATURE VEW NOW + { 0xfb17, {0x0544, 0x053d, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN XEH +}; + +static const MultiCharMapping CaseSpecials_Title[] = { + { 0x00df, {0x0053, 0x0073, 0x0000} }, // LATIN SMALL LETTER SHARP S + { 0x0149, {0x02bc, 0x004e, 0x0000} }, // LATIN SMALL LETTER N PRECEDED BY APOSTROPHE + { 0x01f0, {0x004a, 0x030c, 0x0000} }, // LATIN SMALL LETTER J WITH CARON + { 0x0390, {0x03aa, 0x0301, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + { 0x03b0, {0x03ab, 0x0301, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + { 0x0587, {0x0535, 0x0582, 0x0000} }, // ARMENIAN SMALL LIGATURE ECH YIWN + { 0x1e96, {0x0048, 0x0331, 0x0000} }, // LATIN SMALL LETTER H WITH LINE BELOW + { 0x1e97, {0x0054, 0x0308, 0x0000} }, // LATIN SMALL LETTER T WITH DIAERESIS + { 0x1e98, {0x0057, 0x030a, 0x0000} }, // LATIN SMALL LETTER W WITH RING ABOVE + { 0x1e99, {0x0059, 0x030a, 0x0000} }, // LATIN SMALL LETTER Y WITH RING ABOVE + { 0x1e9a, {0x0041, 0x02be, 0x0000} }, // LATIN SMALL LETTER A WITH RIGHT HALF RING + { 0x1f50, {0x03a5, 0x0313, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH PSILI + { 0x1f52, {0x03a5, 0x0313, 0x0300} }, // GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA + { 0x1f54, {0x03a5, 0x0313, 0x0301} }, // GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA + { 0x1f56, {0x03a5, 0x0313, 0x0342} }, // GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI + { 0x1fb2, {0x1fba, 0x0345, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI + { 0x1fb4, {0x0386, 0x0345, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI + { 0x1fb6, {0x0391, 0x0342, 0x0000} }, // GREEK SMALL LETTER ALPHA WITH PERISPOMENI + { 0x1fb7, {0x0391, 0x0342, 0x0345} }, // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI + { 0x1fc2, {0x1fca, 0x0345, 0x0000} }, // GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI + { 0x1fc4, {0x0389, 0x0345, 0x0000} }, // GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI + { 0x1fc6, {0x0397, 0x0342, 0x0000} }, // GREEK SMALL LETTER ETA WITH PERISPOMENI + { 0x1fc7, {0x0397, 0x0342, 0x0345} }, // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI + { 0x1fd2, {0x03aa, 0x0300, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA + { 0x1fd3, {0x03aa, 0x0301, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA + { 0x1fd6, {0x0399, 0x0342, 0x0000} }, // GREEK SMALL LETTER IOTA WITH PERISPOMENI + { 0x1fd7, {0x03aa, 0x0342, 0x0000} }, // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI + { 0x1fe2, {0x03ab, 0x0300, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA + { 0x1fe3, {0x03ab, 0x0301, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA + { 0x1fe4, {0x03a1, 0x0313, 0x0000} }, // GREEK SMALL LETTER RHO WITH PSILI + { 0x1fe6, {0x03a5, 0x0342, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH PERISPOMENI + { 0x1fe7, {0x03ab, 0x0342, 0x0000} }, // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI + { 0x1ff2, {0x1ffa, 0x0345, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI + { 0x1ff4, {0x038f, 0x0345, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI + { 0x1ff6, {0x03a9, 0x0342, 0x0000} }, // GREEK SMALL LETTER OMEGA WITH PERISPOMENI + { 0x1ff7, {0x03a9, 0x0342, 0x0345} }, // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI + { 0xfb00, {0x0046, 0x0066, 0x0000} }, // LATIN SMALL LIGATURE FF + { 0xfb01, {0x0046, 0x0069, 0x0000} }, // LATIN SMALL LIGATURE FI + { 0xfb02, {0x0046, 0x006c, 0x0000} }, // LATIN SMALL LIGATURE FL + { 0xfb03, {0x0046, 0x0066, 0x0069} }, // LATIN SMALL LIGATURE FFI + { 0xfb04, {0x0046, 0x0066, 0x006c} }, // LATIN SMALL LIGATURE FFL + { 0xfb05, {0x0053, 0x0074, 0x0000} }, // LATIN SMALL LIGATURE LONG S T + { 0xfb06, {0x0053, 0x0074, 0x0000} }, // LATIN SMALL LIGATURE ST + { 0xfb13, {0x0544, 0x0576, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN NOW + { 0xfb14, {0x0544, 0x0565, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN ECH + { 0xfb15, {0x0544, 0x056b, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN INI + { 0xfb16, {0x054e, 0x0576, 0x0000} }, // ARMENIAN SMALL LIGATURE VEW NOW + { 0xfb17, {0x0544, 0x056d, 0x0000} }, // ARMENIAN SMALL LIGATURE MEN XEH +}; + +static int CompareMCM(const void* aKey, const void* aElement) +{ + const PRUint32 ch = *static_cast(aKey); + const MultiCharMapping* mcm = static_cast(aElement); + return int(ch) - int(mcm->mOriginalChar); +} + +#define MAKE_SPECIAL_CASE_ACCESSOR(which) \ + const MultiCharMapping* \ + Special##which(PRUint32 aChar) \ + { \ + const void* p = bsearch(&aChar, CaseSpecials_##which, \ + mozilla::ArrayLength(CaseSpecials_##which), \ + sizeof(MultiCharMapping), CompareMCM); \ + return static_cast(p); \ + } + +namespace mozilla { +namespace unicode { + +MAKE_SPECIAL_CASE_ACCESSOR(Lower) +MAKE_SPECIAL_CASE_ACCESSOR(Upper) +MAKE_SPECIAL_CASE_ACCESSOR(Title) + +} // namespace unicode +} // namespace mozilla diff --git a/intl/unicharutil/util/nsSpecialCasingData.h b/intl/unicharutil/util/nsSpecialCasingData.h new file mode 100644 index 000000000000..fe2fc5220271 --- /dev/null +++ b/intl/unicharutil/util/nsSpecialCasingData.h @@ -0,0 +1,26 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "prtypes.h" + +namespace mozilla { +namespace unicode { + +// Multi-character mappings (from SpecialCasing.txt) map a single Unicode +// value to a sequence of 2 or 3 Unicode characters. There are currently none +// defined outside the BMP, so we can use PRUnichar here. Unused trailing +// positions in mMappedChars are set to 0. +struct MultiCharMapping { + PRUnichar mOriginalChar; + PRUnichar mMappedChars[3]; +}; + +// Return a pointer to the special case mapping for the given character; +// returns NULL if no such mapping is defined. +const MultiCharMapping* SpecialUpper(PRUint32 aCh); +const MultiCharMapping* SpecialLower(PRUint32 aCh); +const MultiCharMapping* SpecialTitle(PRUint32 aCh); + +} // namespace unicode +} // namespace mozilla diff --git a/intl/unicharutil/util/objs.mk b/intl/unicharutil/util/objs.mk index 69f78cce7f18..4c75e08e617b 100644 --- a/intl/unicharutil/util/objs.mk +++ b/intl/unicharutil/util/objs.mk @@ -37,6 +37,7 @@ INTL_UNICHARUTIL_UTIL_LCPPSRCS = \ nsUnicharUtils.cpp \ nsBidiUtils.cpp \ + nsSpecialCasingData.cpp \ nsUnicodeProperties.cpp \ $(NULL) diff --git a/layout/generic/nsTextRunTransformations.cpp b/layout/generic/nsTextRunTransformations.cpp index b9aaaa2acb42..6c05638479c2 100644 --- a/layout/generic/nsTextRunTransformations.cpp +++ b/layout/generic/nsTextRunTransformations.cpp @@ -47,8 +47,7 @@ #include "nsContentUtils.h" #include "nsUnicharUtils.h" #include "nsUnicodeProperties.h" - -#define SZLIG 0x00DF +#include "nsSpecialCasingData.h" // Unicode characters needing special casing treatment in tr/az languages #define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE 0x0130 @@ -158,11 +157,18 @@ nsTransformingTextRunFactory::MakeTextRun(const PRUint8* aString, PRUint32 aLeng * are identical. * * This is used for text-transform:uppercase when we encounter a SZLIG, - * whose uppercase form is "SS". + * whose uppercase form is "SS", or other ligature or precomposed form + * that expands to multiple codepoints during case transformation. * * This function is unable to merge characters when they occur in different - * glyph runs. It's hard to see how this could happen, but if it does, we just - * discard the characters-to-merge. + * glyph runs. This only happens in tricky edge cases where a character was + * decomposed by case-mapping (e.g. there's no precomposed uppercase version + * of an accented lowercase letter), and then font-matching caused the + * diacritics to be assigned to a different font than the base character. + * In this situation, the diacritic(s) get discarded, which is less than + * ideal, but they probably weren't going to render very well anyway. + * Bug 543200 will improve this by making font-matching operate on entire + * clusters instead of individual codepoints. * * For simplicity, this produces a textrun containing all DetailedGlyphs, * no simple glyphs. So don't call it unless you really have merging to do. @@ -188,9 +194,11 @@ MergeCharactersInTextRun(gfxTextRun* aDest, gfxTextRun* aSrc, bool anyMissing = false; PRUint32 mergeRunStart = iter.GetStringStart(); - PRUint32 k; - for (k = iter.GetStringStart(); k < iter.GetStringEnd(); ++k) { - const gfxTextRun::CompressedGlyph g = aSrc->GetCharacterGlyphs()[k]; + const gfxTextRun::CompressedGlyph *srcGlyphs = aSrc->GetCharacterGlyphs(); + gfxTextRun::CompressedGlyph mergedGlyph = srcGlyphs[mergeRunStart]; + PRUint32 stringEnd = iter.GetStringEnd(); + for (PRUint32 k = iter.GetStringStart(); k < stringEnd; ++k) { + const gfxTextRun::CompressedGlyph g = srcGlyphs[k]; if (g.IsSimpleGlyph()) { if (!anyMissing) { gfxTextRun::DetailedGlyph details; @@ -210,40 +218,39 @@ MergeCharactersInTextRun(gfxTextRun* aDest, gfxTextRun* aSrc, } } - // We could teach this method to handle merging of characters that aren't - // cluster starts or ligature group starts, but this is really only used - // to merge S's (uppercase ß), so it's not worth it. - if (k + 1 < iter.GetStringEnd() && aCharsToMerge[k + 1]) { - NS_ASSERTION(g.IsClusterStart() && g.IsLigatureGroupStart(), - "Don't know how to merge this stuff"); + // next char is supposed to merge with current, so loop without + // writing current merged glyph to the destination continue; } - NS_ASSERTION(mergeRunStart == k || - (g.IsClusterStart() && g.IsLigatureGroupStart()), - "Don't know how to merge this stuff"); - // If the start of the merge run is actually a character that should // have been merged with the previous character (this can happen - // if there's a font change in the middle of a szlig, for example), + // if there's a font change in the middle of a case-mapped character, + // that decomposed into a sequence of base+diacritics, for example), // just discard the entire merge run. See comment at start of this // function. + NS_WARN_IF_FALSE(!aCharsToMerge[mergeRunStart], + "unable to merge across a glyph run boundary, " + "glyph(s) discarded"); if (!aCharsToMerge[mergeRunStart]) { - gfxTextRun::CompressedGlyph mergedGlyphs = - aSrc->GetCharacterGlyphs()[mergeRunStart]; if (anyMissing) { - mergedGlyphs.SetMissing(glyphs.Length()); + mergedGlyph.SetMissing(glyphs.Length()); } else { - mergedGlyphs.SetComplex(true, true, glyphs.Length()); + mergedGlyph.SetComplex(mergedGlyph.IsClusterStart(), + mergedGlyph.IsLigatureGroupStart(), + glyphs.Length()); } - aDest->SetGlyphs(offset, mergedGlyphs, glyphs.Elements()); + aDest->SetGlyphs(offset, mergedGlyph, glyphs.Elements()); ++offset; } glyphs.Clear(); anyMissing = false; mergeRunStart = k + 1; + if (mergeRunStart < stringEnd) { + mergedGlyph = srcGlyphs[mergeRunStart]; + } } NS_ASSERTION(glyphs.Length() == 0, "Leftover glyphs, don't request merging of the last character with its next!"); @@ -310,7 +317,7 @@ nsFontVariantTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun, ch = SURROGATE_TO_UCS4(ch, str[i + 1]); } PRUint32 ch2 = ToUpperCase(ch); - isLowercase = ch != ch2 || ch == SZLIG; + isLowercase = ch != ch2 || mozilla::unicode::SpecialUpper(ch); } else { // Don't transform the character! I.e., pretend that it's not lowercase } @@ -399,7 +406,8 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun, PRUint8 style = mAllUppercase ? NS_STYLE_TEXT_TRANSFORM_UPPERCASE : styleContext->GetStyleText()->mTextTransform; - bool extraChar = false; + int extraChars = 0; + const mozilla::unicode::MultiCharMapping *mcm; if (NS_IS_HIGH_SURROGATE(ch) && i < length - 1 && NS_IS_LOW_SURROGATE(str[i + 1])) { ch = SURROGATE_TO_UCS4(ch, str[i + 1]); @@ -420,11 +428,19 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun, switch (style) { case NS_STYLE_TEXT_TRANSFORM_LOWERCASE: - if (languageSpecificCasing == eTurkish && ch == 'I') { - ch = LATIN_SMALL_LETTER_DOTLESS_I; - prevIsLetter = true; - sigmaIndex = PRUint32(-1); - break; + if (languageSpecificCasing == eTurkish) { + if (ch == 'I') { + ch = LATIN_SMALL_LETTER_DOTLESS_I; + prevIsLetter = true; + sigmaIndex = PRUint32(-1); + break; + } + if (ch == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { + ch = 'i'; + prevIsLetter = true; + sigmaIndex = PRUint32(-1); + break; + } } // Special lowercasing behavior for Greek Sigma: note that this is listed @@ -473,8 +489,6 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun, break; } - ch = ToLowerCase(ch); - // ignore diacritics for the purpose of contextual sigma mapping; // otherwise, reset prevIsLetter appropriately and clear the // sigmaIndex marker @@ -482,19 +496,40 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun, prevIsLetter = (cat == nsIUGenCategory::kLetter); sigmaIndex = PRUint32(-1); } + + mcm = mozilla::unicode::SpecialLower(ch); + if (mcm) { + int j = 0; + while (j < 2 && mcm->mMappedChars[j + 1]) { + convertedString.Append(mcm->mMappedChars[j]); + ++extraChars; + ++j; + } + ch = mcm->mMappedChars[j]; + break; + } + + ch = ToLowerCase(ch); break; case NS_STYLE_TEXT_TRANSFORM_UPPERCASE: - if (ch == SZLIG) { - convertedString.Append('S'); - extraChar = true; - ch = 'S'; - break; - } if (languageSpecificCasing == eTurkish && ch == 'i') { ch = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; break; } + + mcm = mozilla::unicode::SpecialUpper(ch); + if (mcm) { + int j = 0; + while (j < 2 && mcm->mMappedChars[j + 1]) { + convertedString.Append(mcm->mMappedChars[j]); + ++extraChars; + ++j; + } + ch = mcm->mMappedChars[j]; + break; + } + ch = ToUpperCase(ch); break; @@ -506,12 +541,6 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun, } capitalizeDutchIJ = false; if (i < aTextRun->mCapitalize.Length() && aTextRun->mCapitalize[i]) { - if (ch == SZLIG) { - convertedString.Append('S'); - extraChar = true; - ch = 'S'; - break; - } if (languageSpecificCasing == eTurkish && ch == 'i') { ch = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; break; @@ -521,6 +550,19 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun, capitalizeDutchIJ = true; break; } + + mcm = mozilla::unicode::SpecialTitle(ch); + if (mcm) { + int j = 0; + while (j < 2 && mcm->mMappedChars[j + 1]) { + convertedString.Append(mcm->mMappedChars[j]); + ++extraChars; + ++j; + } + ch = mcm->mMappedChars[j]; + break; + } + ch = ToTitleCase(ch); } break; @@ -540,11 +582,12 @@ nsCaseTransformTextRunFactory::RebuildTextRun(nsTransformedTextRun* aTextRun, canBreakBeforeArray.AppendElement(false); } - if (extraChar) { + while (extraChars > 0) { ++extraCharsCount; charsToMergeArray.AppendElement(true); styleArray.AppendElement(styleContext); canBreakBeforeArray.AppendElement(false); + --extraChars; } }