mirror of
https://github.com/openharmony/third_party_tex-hyphen.git
synced 2026-06-30 22:28:02 -04:00
9fbfff2665
In Thai encoding still some issues.
162 lines
5.0 KiB
Ruby
Executable File
162 lines
5.0 KiB
Ruby
Executable File
#!/usr/bin/env ruby

# Encodings processed by this script.
# Further encoding names, currently not part of the list: "texnansi", "t5", "lt"
encodings = ["ec", "qx", "latin7x", "t8m", "lth"]

# Base directory of the data files used by this script.
$path_data = "data"
# Adobe glyph list; read to build the glyph name => code point table.
$filename_AGL = File.join($path_data, "aglfn13.txt")
# Unicode character data; read to build the table of lowercase letters.
$filename_unicode_data = File.join($path_data, "UnicodeData.txt")
# Glyph name => Unicode code point (hexadecimal string).
$AGL_names = {}

# Read the Adobe glyph list. Only lines that start with a hexadecimal number
# are data lines; their first two semicolon-separated fields are the code
# point and the glyph name. File.foreach closes the file after the last line,
# so no file handle is left open.
File.foreach($filename_AGL) do |line|
  next unless line =~ /^[0-9A-F]+/

  unicode, pdfname = line.split(/;/)
  $AGL_names[pdfname] = unicode
end
# Code point (hexadecimal string) => true for every character that is treated
# as a lowercase letter.
$lowercase_letter = {}

# Sample lines from UnicodeData.txt:
#   00F0;LATIN SMALL LETTER ETH;Ll;0;L;;;;;N;;Icelandic;00D0;;00D0
#   FB01;LATIN SMALL LIGATURE FI;Ll;0;L;<compat> 0066 0069;;;;N;;;;;
#
# The third field is the general category and the sixth the decomposition.
# File.foreach closes the file after the last line, so no file handle is
# left open.
File.foreach($filename_unicode_data) do |line|
  next unless line =~ /^([0-9A-F]*);.*$/

  unicode, _name, category, _combining, _bidi, decomposition = line.split(/;/)
  code = unicode.hex

  if category == "Ll"
    # lowercase letters, except those with a <compat> decomposition
    $lowercase_letter[unicode] = true unless decomposition.include?("compat")
  elsif code >= 0x0E01 && code <= 0x0E5B
    # Thai: only characters of category Lo or Mn
    $lowercase_letter[unicode] = true if category =~ /(Lo|Mn)/
  elsif code >= 0x10D0 && code <= 0x10FA
    # Georgian lowercase (category: 'Lo')
    $lowercase_letter[unicode] = true
  end
end
# Manual corrections to the lowercase table.
$lowercase_letter.update(
  "0133" => true,  # ij
  "0192" => false, # florin
  "2113" => false  # ell
)
# Manual additions and overrides to the glyph name => code point table.
# The value "?" marks a glyph that has no Unicode mapping; the loop that
# writes the mapping files reports such glyphs as
# "no Unicode mapping assigned".
$AGL_names.update(
  "hyphenchar" => $AGL_names["hyphen"],
  "sfthyphen" => "00AD",
  "hyphen.alt" => "00AD",

  "dotlessj" => "0237",
  "tcedilla" => "0163",
  "Tcedilla" => "0162",

  "ff" => "FB00",  # = 0066 + 0066
  "fi" => "FB01",  # = 0066 + 0069
  "fl" => "FB02",  # = 0066 + 006C
  "ffi" => "FB03", # = 0066 + 0066 + 0069
  "ffl" => "FB04", # = 0066 + 0066 + 006C

  "cwm" => "200B",
  "zerowidthspace" => "200B",
  "perthousandzero" => "?",
  "visiblespace" => "2423",
  # "nbspace" => "00A0" (disabled)
  "nonbreakingspace" => "00A0",
  "Germandbls" => "1E9E", # = 0053 + 0053
  "ell" => "2113",

  ".notdef" => "?",

  "onesuperior" => "00B9",
  "twosuperior" => "00B2",
  "threesuperior" => "00B3",

  "anglearc" => "2222",
  "diameter" => "2300",
  "dottedcircle" => "25CC",
  "threequartersemdash" => "?",
  "f_k" => "?"
)
# For every encoding, read <path_data>/enc/<enc>.enc and write three files:
#   <path_data>/map/<enc>.map         - mapping between byte values and Unicode
#   <path_data>/enc/<enc>-new.enc     - glyph list annotated with slot numbers
#   <path_data>/enc2unicode/<enc>.dat - slot, code point, lowercase flag, glyph name
encodings.each do |enc|
  puts "Writing files for encoding '#{enc}'"

  $filename_encoding = File.join($path_data, "enc/#{enc}.enc")
  $filename_xetex_mapping = File.join($path_data, "map/#{enc}.map")
  $filename_encoding2unicode = File.join($path_data, "enc2unicode/#{enc}.dat")

  $file_map = File.open($filename_xetex_mapping, "w")
  # FIXME
  $file_fixed_enc = File.open(File.join($path_data, "enc/#{enc}-new.enc"), "w")
  $file_encoding2unicode = File.open($filename_encoding2unicode, "w")

  begin
    $file_map.print("EncodingName \"TeX-#{enc}\"\n\n")
    $file_map.print("pass(Byte_Unicode)\n\n")

    # Slot number of the current glyph; advanced for every glyph name found,
    # whether or not the name is known.
    i = 0

    # Read the encoding file; only lines containing a /glyphname are of
    # interest. File.foreach closes the file after the last line.
    File.foreach($filename_encoding) do |line|
      next unless line =~ /\/[_a-zA-Z0-9\.]+/

      glyphs = line.gsub(/%.*/, '')    # ignore comments
      glyphs = glyphs.gsub(/.*\[/, '') # encoding name should not be considered
      glyphs = glyphs.gsub(/\].*/, '') # nor the ending definition

      glyphs.scan(/[_a-zA-Z0-9\.]+/) do |w|
        # Adobe Glyph List doesn't contain uniXXXX names,
        # so we add that particular uniXXXX to our list for easier handling later on
        $AGL_names[w] = $1 if w =~ /^uni(.*)$/

        unicode_point = $AGL_names[w]

        if unicode_point.nil?
          # the glyph is not in AGL and isn't uniXXXX: print a warning
          puts sprintf(">> error: %s unknown (index 0x%02X)", w, i)
        elsif unicode_point == "?"
          $file_map.printf("; %-20s: no Unicode mapping assigned\n", w)
          $file_fixed_enc.printf("/%-15s %% 0x%02X\n", w, i)
          $file_encoding2unicode.printf("0x%02X\tU+....\t\t%s\n", i, w)
        elsif unicode_point.size > 4
          # somewhat unreliable way to filter out uniXXXX.something
          $file_map.printf("; %-20s: no unique way to map to Unicode\n", w)
          $file_fixed_enc.printf("/%-15s %% 0x%02X U+%s\n", w, i, unicode_point)
          $file_encoding2unicode.printf("0x%02X\tU+....\t\t%s\n", i, w)
        else
          $file_map.printf("%d\t<>\tU+%s\t; %s\n", i, unicode_point, w)

          # The code point is only noted when it differs from the slot number.
          if i != unicode_point.hex
            $file_fixed_enc.printf("/%-15s %% 0x%02X U+%s\n", w, i, unicode_point)
          else
            $file_fixed_enc.printf("/%-15s %% 0x%02X\n", w, i)
          end

          # Only lowercase letters above code point 127 are flagged.
          is_lowercase = $lowercase_letter[unicode_point] && unicode_point.hex > 127
          # exception: in Thai, we don't want any character below 0xA0
          is_lowercase = false if enc == "lth" && i < 0xA0
          lowercase = is_lowercase ? "1" : ""

          $file_encoding2unicode.printf("0x%02X\tU+%s\t%s\t%s\n", i, unicode_point, lowercase, w)
        end

        i += 1
      end
    end
  ensure
    # Close all three output files, also when processing fails half-way.
    [$file_map, $file_fixed_enc, $file_encoding2unicode].each(&:close)
  end
end