mirror of
https://github.com/openharmony/third_party_tex-hyphen.git
synced 2026-07-01 22:24:02 -04:00
275 lines
8.8 KiB
Ruby
Executable File
275 lines
8.8 KiB
Ruby
Executable File
#!/usr/bin/env ruby
# encoding: utf-8

# Encodings to process. Each name selects data/enc/<name>.enc in the main loop.
# Only "ec" is currently enabled; the longer list is kept for reference:
#   encodings = ["ec", "qx", "latin7x", "t8m", "lth"]
encodings = ["ec"]
# "texnansi", "t5", "lt"

# Input locations, relative to the current working directory.
$path_data = "data"
$filename_AGL = File.join($path_data, "aglfn13.txt")
$filename_unicode_data = File.join($path_data, "UnicodeData.txt")

# Glyph name => Unicode code point (hex string), filled from the Adobe glyph list.
$AGL_names = Hash.new()

# Read the Adobe glyph list. Only records that start with a hex code point are
# used; the first two ';'-separated fields are the code point and the glyph name.
# File.foreach closes the file when iteration ends (the previous
# File.open(...).grep form left the handle open).
File.foreach($filename_AGL) do |line|
  next unless line =~ /^[0-9A-F]+/

  unicode, pdfname = line.split(/;/)
  $AGL_names[pdfname] = unicode
end
# Code point (hex string, as spelled in UnicodeData.txt) => true/false flag
# telling the main loop whether to mark the character as a lowercase letter.
$lowercase_letter = Hash.new()

# Sample UnicodeData.txt records:
#   00F0;LATIN SMALL LETTER ETH;Ll;0;L;;;;;N;;Icelandic;00D0;;00D0
#   FB01;LATIN SMALL LIGATURE FI;Ll;0;L;<compat> 0066 0069;;;;N;;;;;
#
# Every record is scanned (not only those with category Ll) because the Thai
# and Georgian ranges below are accepted with other categories.
# File.foreach closes the file when iteration ends.
File.foreach($filename_unicode_data) do |line|
  next unless line =~ /^([0-9A-F]*);.*$/

  # Field 3 is the general category, field 6 the decomposition mapping.
  unicode, _name, category, _dummy1, _dummy2, decomposition = line.split(/;/)
  codepoint = unicode.hex

  if category == "Ll"
    # Lowercase letters, except compatibility characters such as ligatures.
    $lowercase_letter[unicode] = true unless decomposition.include?("compat")
  elsif codepoint >= 0x0E01 && codepoint <= 0x0E5B
    # Thai: accept letters (Lo) and non-spacing marks (Mn).
    $lowercase_letter[unicode] = true if category =~ /(Lo|Mn)/
  elsif codepoint >= 0x10D0 && codepoint <= 0x10FA
    # Georgian lowercase (its category is 'Lo', not 'Ll').
    $lowercase_letter[unicode] = true
  end
end
# Manual corrections applied on top of the data loaded from UnicodeData.txt and
# the Adobe glyph list.

# ij: forced to count as a lowercase letter
# (presumably skipped by the loader because of its <compat> decomposition -- verify).
$lowercase_letter["0133"] = true
# florin: never treated as a lowercase letter
$lowercase_letter["0192"] = false
# ell: never treated as a lowercase letter
$lowercase_letter["2113"] = false

# Glyph names used by the .enc files that are missing from the glyph list.
# The value "?" means "no Unicode mapping"; the main loop prints U+.... for it.
$AGL_names["hyphenchar"] = $AGL_names["hyphen"]
$AGL_names["sfthyphen"] = "00AD"
$AGL_names["hyphen.alt"] = "00AD"

$AGL_names["dotlessj"] = "0237"
$AGL_names["tcedilla"] = "0163"
$AGL_names["Tcedilla"] = "0162"

$AGL_names["ff"] = "FB00" # = 0066 + 0066
$AGL_names["fi"] = "FB01" # = 0066 + 0069
$AGL_names["fl"] = "FB02" # = 0066 + 006C
$AGL_names["ffi"] = "FB03" # = 0066 + 0066 + 0069
$AGL_names["ffl"] = "FB04" # = 0066 + 0066 + 006C

$AGL_names["cwm"] = "200B"
$AGL_names["zerowidthspace"] = "200B"
# The original author marked this mapping as "actually wrong": U+2030 is only a
# stand-in. An earlier "?" assignment for the same key was always overwritten
# by this value, so the two have been merged into this single line.
$AGL_names["perthousandzero"] = "2030"
$AGL_names["visiblespace"] = "2423"
#$AGL_names["nbspace"] = "00A0"
$AGL_names["nonbreakingspace"] = "00A0"
$AGL_names["Germandbls"] = "1E9E" # = 0053 + 0053
$AGL_names["ell"] = "2113"

$AGL_names[".notdef"] = "?"

$AGL_names["onesuperior"] = "00B9"
$AGL_names["twosuperior"] = "00B2"
$AGL_names["threesuperior"] = "00B3"

$AGL_names["anglearc"] = "2222"
$AGL_names["diameter"] = "2300"
$AGL_names["dottedcircle"] = "25CC"
$AGL_names["threequartersemdash"] = "?"
$AGL_names["f_k"] = "?"
# ASCII punctuation: code point (hex string) => wiki link body. The main loop
# wraps each value in [[...]], so "Title|label" links to "Title" while
# displaying "label"; a value without '|' is used as both.
#
# The quotation-mark and backslash entries use single-quoted literals so that
# the '"' and '\' characters cannot terminate or escape the string.
$punct = {
  "0020" => "space character|SP",
  "0021" => "Exclamation mark|!",
  "0022" => 'Quotation mark|"',
  "0023" => "Number sign|#",
  "0024" => "Dollar sign|$",
  "0025" => "Percent sign|%",
  "0026" => "Ampersand|&",
  "0027" => "Apostrophe|'",
  "0028" => "Bracket|(",
  "0029" => "Bracket|)",
  "002A" => "Asterisk|*",
  "002B" => "Plus and minus signs|+",
  "002C" => "Comma (punctuation)|,",
  "002D" => "Plus and minus signs|-",
  "002E" => "Full stop|.",
  "002F" => "Slash (punctuation)|/",
  "003A" => "colon (punctuation)|:",
  "003B" => "semicolon|;",
  "003C" => "less-than sign|<",
  "003D" => "equals sign|=",
  "003E" => "greater-than sign|>",
  "003F" => "question mark|?",
  "0040" => "@",
  "005B" => "Square bracket|[",
  "005C" => 'Backslash|\\',
  "005D" => "Square bracket|]",
  "005E" => "Circumflex|^",
  "005F" => "Underscore|_",
  "0060" => "Grave accent|`",
  "007B" => "Brace (punctuation)|{",
  "007C" => "Vertical bar||",
  "007D" => "Brace (punctuation)|}",
  "007E" => "Tilde|~"
}
# Non-ASCII punctuation, accents and ligatures: code point (hex string) =>
# wiki link body, in the same "Title|label" form as $punct. The main loop
# wraps each value in [[...]].
$ext_punct = {
  "00A0" => "Non-breaking space|NBSP",
  "00A4" => "Currency (typography)|¤",
  "00A7" => "Section sign|§",
  "00A8" => "¨",
  "00AD" => "Soft hyphen|SHY",
  "00B0" => "Degree symbol|°",
  "02DB" => "Ogonek|˛",
  "00B4" => "Acute accent|´",
  "02C7" => "Caron|ˇ",
  "00B8" => "Cedilla|¸",
  "02DD" => "Double acute accent|˝",
  "00D7" => "Multiplication sign|×",
  "00F7" => "Obelus|÷",
  "02D9" => "Dot (diacritic)|˙",

  "02C6" => "Circumflex|ˆ",
  "02DC" => "Tilde|˜",
  "02DA" => "Ring_(diacritic)|˚",
  "02D8" => "Breve|˘",
  "00AF" => "Macron|¯",

  "201A" => "‚",
  "2039" => "Guillemet|‹",
  "203A" => "Guillemet|›",

  # NOTE(review): these three have an empty link target before the '|'.
  "201C" => "|“",
  "201D" => "|”",
  "201E" => "|„",
  "2019" => "’",
  "2018" => "‘",

  "00AB" => "Guillemet|«",
  "00BB" => "Guillemet|»",
  "2013" => "–",
  "2014" => "—",
  "200B" => "Zero-width space|ZWSP",
  "2030" => "Per mil|‰",
  # TODO: this is letter!!!
  "0131" => "Dotted and dotless I|ı",
  "0237" => "Dotless j|ȷ",
  # TODO: intl-var
  "FB00" => "Typographic ligature#Stylistic ligatures|ff",
  "FB01" => "Typographic ligature#Stylistic ligatures|fi",
  "FB02" => "Typographic ligature#Stylistic ligatures|fl",
  "FB03" => "Typographic ligature#Stylistic ligatures|ffi",
  "FB04" => "Typographic ligature#Stylistic ligatures|ffl",
  "2423" => "Space (punctuation)|␣"
}

# TODO: Eth and permil need a -box and a footnote
# TODO: A1 and BF are punctuation
# Returns the "lowercase" column written to the .dat file: "1" when the code
# point is flagged in $lowercase_letter and lies above 127, "" otherwise.
def lowercase_flag(enc, i, unicode_point)
  return "" unless $lowercase_letter[unicode_point] && unicode_point.hex > 127
  # exception: in Thai, we don't want any character below 0xA0
  return "" if enc == "lth" && i < 0xA0

  "1"
end

# Picks the chset colour class and the wiki link body for slot +i+.
# Returns [type, cell]. The slot index decides alpha/digit/intl; only the
# remaining slots below 128 are looked up in $punct and $ext_punct.
def chset_type_and_cell(i, unicode_point, unichar)
  if (i >= 'a'.ord && i <= 'z'.ord) || (i >= 'A'.ord && i <= 'Z'.ord)
    ["alpha", unichar]
  elsif i >= '0'.ord && i <= '9'.ord
    ["digit", sprintf("%s (number)|%s", unichar, unichar)]
  elsif i >= 128
    ["intl", unichar]
  elsif $punct[unicode_point]
    ["punct", $punct[unicode_point]]
  elsif $ext_punct[unicode_point]
    ["ext-punct", $ext_punct[unicode_point]]
  else
    ["TODO", unichar]
  end
end

# Handles one glyph name +w+ found at slot +i+ of encoding +enc+: writes one
# record to +out+ (the encoding-to-Unicode file) and one wiki table cell to
# standard output. Unknown glyph names only produce an error line on stdout.
def write_encoding_slot(enc, i, w, out)
  # Adobe Glyph List doesn't contain uniXXXX names,
  # so we add that particular uniXXXX to our list for easier handling later on
  uni = /^uni(.*)$/.match(w)
  $AGL_names[w] = uni[1] if uni

  unicode_point = $AGL_names[w]

  # if the glyph is not in AGL and isn't uniXXXX, print a warning
  if unicode_point.nil?
    puts sprintf(">> error: %s unknown (index 0x%02X)", w, i)
    return
  end

  # "?" marks glyphs without a Unicode mapping; a value longer than four
  # characters is a somewhat unreliable way to filter out uniXXXX.something
  if unicode_point == "?" || unicode_point.size > 4
    out.printf("0x%02X\tU+....\t\t%s\n", i, w)
    printf("0x%02X\tU+....\t\t%s\n", i, w)
    return
  end

  lowercase = lowercase_flag(enc, i, unicode_point)
  unichar = [unicode_point.hex].pack('U')
  type, cell = chset_type_and_cell(i, unicode_point, unichar)

  out.printf("0x%02X\tU+%s\t%s\t%s\n", i, unicode_point, lowercase, w)
  printf("|{{chset-color-%s}}|{{chset-cell3|%s|[[%s]]|%d|%o}}\n", type, unicode_point, cell, i, i) # w - character name
end

# For every selected encoding: read data/enc/<enc>.enc, write /tmp/<enc>.dat
# and print a wiki character-set table to standard output.
encodings.each do |enc|
  puts "Writing files for encoding '#{enc}'"

  $filename_encoding = File.join($path_data, "enc/#{enc}.enc")
  $filename_encoding2unicode = "/tmp/#{enc}.dat"

  # NOTE(review): the header ends in "}}}" (one brace more than the template
  # call needs) and always says "Cork encoding", whatever +enc+ is. Left
  # byte-identical here; confirm whether either is intended.
  printf("{| {{chset-tableformat}}\n{{chset-table-header|Cork encoding}}}\n")

  # Block form so the output file is closed even if processing raises.
  File.open($filename_encoding2unicode, "w") do |out|
    $file_encoding2unicode = out

    i = 0
    # Only lines containing a "/glyphname" token are of interest.
    # File.foreach closes the encoding file when iteration ends.
    File.foreach($filename_encoding) do |line|
      next unless line =~ /\/[_a-zA-Z0-9\.]+/

      # ignore comments
      line.gsub!(/%.*/, '')
      # encoding name should not be considered
      line.gsub!(/.*\[/, '')
      # nor the ending definition
      line.gsub!(/\].*/, '')

      line.scan(/[_a-zA-Z0-9\.]+/) do |w|
        # start a new table row every 16 slots
        printf("|-\n!{{chset-left|%X}}\n", i / 16) if i % 16 == 0
        write_encoding_slot(enc, i, w, out)
        i += 1
      end
    end

    printf("{{chset-table-footer}}\n|}\n")
  end
end