POC for JP sotn_strs (#986)

This is a proof of concept for Japanese sotn_strs. The `_SJ` macro is
used to designate them. I only converted one string in config_jp to
serve as a test for the pipeline. I moved sotn_str into its own folder and
added some tests. I renamed it from sotn-str because Python cannot import
modules with `-` in the name. The code could be cleaner and the conversion
table needs more work, but I think this is a reasonable start.
This commit is contained in:
sozud 2024-01-17 00:27:17 -08:00 committed by GitHub
parent 3a8fd22bb4
commit abca5d44d2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 1206 additions and 835 deletions

View File

@ -60,7 +60,7 @@ M2C_DIR := $(TOOLS_DIR)/m2c
M2C_APP := $(M2C_DIR)/m2c.py
M2C := $(PYTHON) $(M2C_APP)
M2C_ARGS := -P 4
SOTNSTR := $(PYTHON) $(TOOLS_DIR)/sotn-str.py process
SOTNSTR := $(PYTHON) $(TOOLS_DIR)/sotn_str/sotn_str.py process
MASPSX_DIR := $(TOOLS_DIR)/maspsx
MASPSX_APP := $(MASPSX_DIR)/maspsx.py
MASPSX := $(PYTHON) $(MASPSX_APP) --no-macro-inc --expand-div --expand-li

View File

@ -231,10 +231,10 @@ typedef struct Primitive {
#define ANIMSET_OVL(x) ((x) | ANIMSET_OVL_FLAG)
#ifndef SOTN_STR
// Decorator to re-encode strings with tools/sotn-str.py when building the game.
// Certain strings in SOTN do not follow the ASCII encoding and each character
// is offseted by 0x20. This is only for strings that use the 8x8 font.
// e.g. _S("I am a Symphony of the Night encoded string")
// Decorator to re-encode strings with tools/sotn_str/sotn_str.py when building
// the game. Certain strings in SOTN do not follow the ASCII encoding; each
// character is offset by 0x20. This is only for strings that use the 8x8
// font. e.g. _S("I am a Symphony of the Night encoded string")
#define _S(x) (x)
#endif

File diff suppressed because it is too large Load Diff

286
tools/sotn_str/jp.py Normal file
View File

@ -0,0 +1,286 @@
# Look up the replacement string for a mark glyph `chr` applied to the glyph
# `prev` that precedes it. convert_j calls this as
# dakuten(get_chr(mark_byte), get_chr(byte_before_the_0xFF_escape)).
#
# chr:  glyph string of the mark; the two `if chr == ...` tests pick one of
#       two lookup groups.
# prev: glyph string the mark applies to; it picks the returned string inside
#       the selected group.
# Returns the string paired with (chr, prev).
# A pair that is not listed is printed and then hits `assert False`, i.e. an
# AssertionError unless Python runs with -O (asserts are stripped there).
#
# NOTE(review): every string literal in this function shows up as "" in this
# listing, so the individual comparisons cannot be told apart here --
# presumably non-ASCII glyphs were lost when the page was rendered; confirm
# against the file in the repository before touching this logic.
def dakuten(chr, prev):
# first lookup group
if chr == "":
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
# no entry matched: dump the offending pair for debugging, then fail
print(chr, prev)
assert False
# second lookup group
if chr == "":
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
if prev == "":
return ""
# no entry matched: dump the offending pair for debugging, then fail
print(chr, prev)
assert False
# `chr` matched neither group
print(chr, prev)
assert False
# Glyph table for the game's font encoding: the position of an entry in this
# list is the byte value used for that glyph (get_chr indexes it directly and
# utf8_to_byte_literals emits utf8_to_index[...] values as bytes).
# NOTE(review): many entries show up as "" in this listing -- presumably
# non-ASCII glyphs lost when the page was rendered; confirm against the file
# in the repository.
table = [
# fmt: off
# 0 1 2 3 4 5 6 7 8 9 A B C D E F
" ", "!", "\"", "#", "$", "%", "&", "'", "(", ")", "", "+", ",", "-", ".", "/",
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", "", "", "=", "", "?",
"", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O",
"P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "[", "", "]", "", "_",
"", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
"p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "", "", "", "~", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", # dakuten and handakuten are lower in the graphic than seen here and get shifted by the code upwards when printing
"", "", "", "", "", "", "","", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "×", "", "", "", "", "", "",
"i0", "i1", "i2" , "i3", "i4", "i5", "i6", "i7", "i8", "i9", "i10", "i11", "", "", "", "",
# off by one somewhere for バルザイのえん月刀
"???"
]
# fmt: on
# Reverse lookup: glyph string -> index in `table`. When the same string
# occurs more than once in `table`, the last occurrence wins because later
# iterations overwrite earlier ones.
utf8_to_index = {}
for index, value in enumerate(table):
utf8_to_index[value] = index
def get_chr(chr):
    """Return the glyph string stored in ``table`` at index ``chr``."""
    glyph = table[chr]
    return glyph
def convert_j(f):
    """Decode an indexable sequence of byte values into a glyph string.

    Ordinary bytes are translated with ``get_chr``. ``0xFF`` is an escape
    that consumes the following byte:

    * ``0xFF 0x00`` ends the string;
    * ``0xFF 0xFF`` appends ``get_chr(0xFF)``;
    * ``0xFF <other>`` drops the last character produced so far and appends
      ``dakuten(get_chr(<other>), get_chr(<byte before the escape>))``.
      Bytes 158 and 159 are not appended again afterwards.

    Decoding only stops at ``0xFF 0x00``; input without that terminator ends
    up indexing past the end of ``f``.
    """
    decoded = ""
    cursor = 0
    last_byte = None
    while True:
        byte = f[cursor]
        cursor += 1
        # Remember the byte that came before the one just read.
        before_last, last_byte = last_byte, byte
        if int(byte) == 0xFF:
            byte = f[cursor]
            cursor += 1
            if byte == 0:
                return decoded
            if byte != 0xFF:
                # A mark replaces the previously emitted character.
                decoded = decoded[:-1]
                decoded += dakuten(get_chr(byte), get_chr(before_last))
        if byte not in (158, 159):
            decoded += get_chr(byte)
def parse_string_to_int_array(input_str):
    """Turn an ``_SJ(\\xNN\\xNN...)`` literal into a list of ints.

    The ``_SJ(`` prefix and every ``)`` are dropped, each ``\\x``-prefixed
    field is parsed as hexadecimal, and a terminating ``0`` is appended.
    """
    payload = input_str.replace("_SJ(", "").replace(")", "")
    # Whatever precedes the first "\x" is not a byte and is discarded.
    _, *hex_fields = payload.split("\\x")
    values = [int(field, 16) for field in hex_fields]
    values.append(0)  # null termination
    return values
# ten ten
# Return True when `utf8_char` is one of the strings listed in `chars`.
# utf8_to_byte_literals uses this to decide whether a character has to be
# emitted through dakuten_to_bytes.
# NOTE(review): every entry of `chars` shows up as "" in this listing --
# presumably non-ASCII glyphs lost when the page was rendered; confirm
# against the file in the repository.
def has_dakuten(utf8_char):
# fmt: off
chars = [
"", "", "", "", "",
"", "", "", "", "",
"", "", "", "", "",
"", "", "", "", "",
"", "", "", "", "",
"", "", "", "", "",
"", "", "", "", "",
"", "", "", "", "",
""]
# fmt: on
return utf8_char in chars
# maru
# Return True when `utf8_char` is one of the strings listed in `chars`.
# dakuten_to_bytes uses this to choose mark byte 159 instead of 158.
# NOTE(review): every entry of `chars` shows up as "" in this listing --
# presumably non-ASCII glyphs lost when the page was rendered; confirm
# against the file in the repository.
def has_handakuten(utf8_char):
# fmt: off
chars = [
"", "", "", "", "",
"", "", "", "", ""
]
# fmt: on
return utf8_char in chars
def remove_dakuten_handakuten(utf8_char):
table = {
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": ""
}
return table[utf8_char]
def dakuten_to_bytes(input_chr):
    """Encode a marked character as ``[base index, 0xFF, mark]``.

    The base index is ``utf8_to_index`` of the character returned by
    ``remove_dakuten_handakuten``. The mark byte is 159 when
    ``has_handakuten(input_chr)`` is true and 158 otherwise.
    """
    base_index = utf8_to_index[remove_dakuten_handakuten(input_chr)]
    mark = 159 if has_handakuten(input_chr) else 158
    return [base_index, 0xFF, mark]
# Encode `input_str` into a list of byte values in the game's glyph encoding
# and return that list; 0xFF is always appended as the final element.
# Raises KeyError for a character that is not a key of utf8_to_index.
def utf8_to_byte_literals(input_str):
# Strip an optional "_SJ(" wrapper. NOTE(review): this also removes every
# ")" that occurs inside the text itself, not just a closing one.
clean_str = input_str.replace("_SJ(", "").replace(")", "")
# NOTE(review): the local name `bytes` shadows the builtin in this function.
bytes = []
for char in clean_str:
# marked characters expand to three values: base index, 0xFF, mark
if has_dakuten(char) or has_handakuten(char):
bytes += dakuten_to_bytes(char)
# NOTE(review): the compared literal shows up as '' in this listing, which a
# single iterated character can never equal -- presumably a non-ASCII glyph
# was lost when the page was rendered; confirm against the repository file.
elif char == '':
bytes += [0xff, 0xff]
else:
bytes.append(utf8_to_index[char])
# terminator
bytes.append(0xFF)
# NOTE(review): hex_list is built but never used or returned.
hex_list = [hex(num) for num in bytes]
return bytes
def utf8_to_byte_literals_wrapped(input):
    """Encode *input* and return it as an ``_SJ(\\xNN\\xNN...)`` literal.

    *input* is passed to ``utf8_to_byte_literals``; every resulting value is
    rendered as a two-digit upper-case ``\\xNN`` escape and the escapes are
    wrapped in ``_SJ(...)``.
    """
    encoded = utf8_to_byte_literals(input)
    escaped = "".join(f"\\x{val:02X}" for val in encoded)
    return f"_SJ({escaped})"
def utf8_to_byte_literals_escaped(input):
    """Encode *input* and return the values as concatenated ``\\xNN`` escapes.

    The values come from ``utf8_to_byte_literals``; each one is written as a
    two-digit upper-case hexadecimal escape.
    """
    pieces = []
    for val in utf8_to_byte_literals(input):
        pieces.append(f"\\x{val:02X}")
    return "".join(pieces)

View File

@ -3,6 +3,7 @@
import argparse
import re
import sys
from jp import utf8_to_byte_literals
def parse(filename, str_offset):
@ -18,24 +19,40 @@ def parse(filename, str_offset):
print(f'_S("{r}")')
def process(filename):
def process_string(match: re.Match[str]):
s = match.group(1)
r = ""
for ch in s.encode("shift_jis"):
# TODO at the moment this only works well with ASCII, not with Shift-JIS
r += f"\\x{ch - 0x20:02X}"
return f'"{r}\\xFF"'
def process_string(match: re.Match[str]):
    """Build the re-encoded C string literal for one ``_S("...")`` match.

    Group 1 of *match* is encoded as Shift-JIS, every byte is lowered by
    0x20 and written as a ``\\xNN`` escape, and ``\\xFF`` is appended as the
    terminator. The result includes the surrounding double quotes.
    """
    raw = match.group(1).encode("shift_jis")
    # TODO at the moment this only works well with ASCII, not with Shift-JIS
    body = "".join(f"\\x{byte - 0x20:02X}" for byte in raw)
    return f'"{body}\\xFF"'
def process_string_jp(match: re.Match[str]):
    """Build the re-encoded C string literal for one matched string.

    Group 1 of *match* is encoded with ``utf8_to_byte_literals`` and each
    resulting value is written as a ``\\xNN`` escape. The result includes
    the surrounding double quotes.
    """
    encoded = utf8_to_byte_literals(match.group(1))
    hex_escapes = [f"\\x{val:02X}" for val in encoded]
    return '"' + "".join(hex_escapes) + '"'
def do_sub(line):
pattern = r'_S\("([^"]*)"\)'
# english_str_processed = re.sub(pattern, process_string, line)
# pattern_jp = r'_SJ\("([^"]+)"\)'
jp_str_processed = re.sub(pattern, process_string_jp, line)
jp_str_processed = jp_str_processed.replace("_S(\"\")", "\"\\xFF\"")
return jp_str_processed
# Read `filename` line by line and write the processed lines to stdout.
# An empty/None filename or "-" selects standard input instead of a file.
# NOTE(review): this listing comes from a diff view and appears to interleave
# removed and added lines (two consecutive writes of the same line, one via
# re.sub(pattern, process_string, ...) and one via do_sub) -- confirm which
# statements belong to the current version before editing.
def process(filename):
if not filename or filename == "-":
fn = sys.stdin
else:
fn = open(filename, "r")
# the with-block closes the stream when done (this includes sys.stdin)
with fn as f:
pattern = r'_S\("([^"]*)"\)'
for line in f:
sys.stdout.write(re.sub(pattern, process_string, line))
sys.stdout.write(do_sub(line))
if __name__ == "__main__":

68
tools/sotn_str/test.py Normal file
View File

@ -0,0 +1,68 @@
from jp import *
import unittest
from sotn_str import *
# Unit tests for the helpers star-imported from jp.
# NOTE(review): a few single-character literals below show up as "" or ''
# in this listing -- presumably non-ASCII glyphs lost when the page was
# rendered; confirm against the file in the repository.
class TestingJp(unittest.TestCase):
# escaped literal -> list of ints, with the trailing 0 added by the parser
def test_parse_string_to_int_array(self):
input = "_SJ(\\xBD\\xC3\\xFF\\x9E\\xFF)"
out = parse_string_to_int_array(input)
assert out == [0xBD, 0xC3, 0xFF, 0x9E, 0xFF, 0x00]
# decoding: the 0xFF 0x9E escape must merge into the preceding character
def test_convert_dakuten(self):
input = "_SJ(\\xBD\\xC3\\xFF\\x9E\\xFF)"
split = parse_string_to_int_array(input)
converted = convert_j(split)
assert converted == "すで"
def test_remove_dakuten_handakuten(self):
assert remove_dakuten_handakuten("") == ""
# a marked character encodes as base index, 0xFF, mark byte
def test_dakuten_to_bytes(self):
bytes = dakuten_to_bytes("")
assert bytes == [0xC3, 0xFF, 0x9E]
# encoding round trip of the string decoded in test_convert_dakuten
def test_utf8_to_byte_literals_wrapped_dakuten(self):
input = "_SJ(すで)"
out = utf8_to_byte_literals_wrapped(input)
assert out == "_SJ(\\xBD\\xC3\\xFF\\x9E\\xFF)"
def test_utf8_to_byte_literals_wrapped_kanji(self):
input = "_SJ(あかつきの剣)"
out = utf8_to_byte_literals_wrapped(input)
assert out == "_SJ(\\xB1\\xB6\\xC2\\xB7\\xC9\\x3C\\xFF)"
# NOTE(review): this takes no `self` and its name does not start with
# "test", so unittest's default discovery does not run it.
def check_sei():
assert(utf8_to_index[''] == 222)
def test_glasses(self):
input = "_SJ(聖なるめがね)"
out = utf8_to_byte_literals_wrapped(input)
assert out == "_SJ(\\xEE\\xC5\\xD9\\xD2\\xB6\\xFF\\x9E\\xC8\\xFF)"
def test_moon(self):
input = "_SJ(バルザイのえん月刀)"
out = utf8_to_byte_literals_wrapped(input)
assert out == "_SJ(\\x8A\\xFF\\x9E\\x99\\x7B\\xFF\\x9E\\x72\\xC9\\xB4\\xDD\\xFF\\xFF\\xED\\xFF)"
# plain ASCII goes through the same table (no _SJ wrapper on the input)
def test_str_potion(self):
input = "Str. potion"
out = utf8_to_byte_literals_escaped(input)
assert out == "\\x33\\x54\\x52\\x0E\\x00\\x50\\x4F\\x54\\x49\\x4F\\x4E\\xFF"
class TestingSotnStr(unittest.TestCase):
    """Checks for ``do_sub``, the per-line substitution step.

    ``assertEqual`` is used instead of a bare ``assert`` so that the checks
    still run under ``python -O`` and a failure reports both values.
    """

    def test_do_sub_jp(self):
        """A ``_SJ`` literal is re-encoded; the rest of the line is kept."""
        line = '{_SJ("すで"), "装備なし(素手)", 0, 0, 0, 3, 255, 0, 0, 36, 42, 0, 5, 128, 0, 0, false, 8, 0, 0, 0, 0, 4, 2, 1, 1, 1, 1, 0},'
        expected = '{"\\xBD\\xC3\\xFF\\x9E\\xFF", "装備なし(素手)", 0, 0, 0, 3, 255, 0, 0, 36, 42, 0, 5, 128, 0, 0, false, 8, 0, 0, 0, 0, 4, 2, 1, 1, 1, 1, 0},'
        self.assertEqual(do_sub(line), expected)

    def test_jp_empty(self):
        """An empty ``_SJ`` literal becomes just the terminator byte."""
        line = "_SJ(\"\")"
        expected = '\"\\xFF\"'
        self.assertEqual(do_sub(line), expected)
# Run the test cases defined in this module when it is executed as a script.
if __name__ == "__main__":
unittest.main()