POC for JP sotn_strs (#986)

This is a proof of concept for Japanese sotn_strs, which are designated with the `_SJ` macro. Only one string in config_jp has been converted, to serve as a test for the pipeline. sotn_str was moved into its own folder, with some tests added, and renamed because Python cannot import modules with `-` in the name. The code could be cleaner and the conversion table needs more work, but I think this is a reasonable start.
2024-11-23 13:09:44 +00:00 · 2024-01-17 00:27:17 -08:00 · 2024-01-17 00:27:17 -08:00 · abca5d44d2
commit abca5d44d2
parent 3a8fd22bb4
6 changed files with 1206 additions and 835 deletions
--- a/2
+++ b/2
@ -60,7 +60,7 @@ M2C_DIR         := $(TOOLS_DIR)/m2c
 M2C_APP         := $(M2C_DIR)/m2c.py
 M2C             := $(PYTHON) $(M2C_APP)
 M2C_ARGS        := -P 4
-SOTNSTR			:= $(PYTHON) $(TOOLS_DIR)/sotn-str.py process
+SOTNSTR			:= $(PYTHON) $(TOOLS_DIR)/sotn_str/sotn_str.py process
 MASPSX_DIR      := $(TOOLS_DIR)/maspsx
 MASPSX_APP      := $(MASPSX_DIR)/maspsx.py
 MASPSX          := $(PYTHON) $(MASPSX_APP) --no-macro-inc --expand-div --expand-li
--- a/include/game.h
+++ b/include/game.h
@ -231,10 +231,10 @@ typedef struct Primitive {
 #define ANIMSET_OVL(x) ((x) | ANIMSET_OVL_FLAG)

 #ifndef SOTN_STR
-// Decorator to re-encode strings with tools/sotn-str.py when building the game.
-// Certain strings in SOTN do not follow the ASCII encoding and each character
-// is offseted by 0x20. This is only for strings that use the 8x8 font.
-// e.g. _S("I am a Symphony of the Night encoded string")
+// Decorator to re-encode strings with tools/sotn_str/sotn_str.py when building
+// the game. Certain strings in SOTN do not follow the ASCII encoding: each
+// character is offset by 0x20. This applies only to strings that use the 8x8
+// font, e.g. _S("I am a Symphony of the Night encoded string")
 #define _S(x) (x)
 #endif

--- a/src/dra/config_jp.c
+++ b/src/dra/config_jp.c
--- a/tools/sotn_str/jp.py
+++ b/tools/sotn_str/jp.py
@ -0,0 +1,286 @@
+def dakuten(chr, prev):
+    if chr == "ﾞ":
+        if prev == "シ":
+            return "ジ"
+        if prev == "ク":
+            return "グ"
+        if prev == "て":
+            return "で"
+        if prev == "ト":
+            return "ド"
+        if prev == "サ":
+            return "ザ"
+        if prev == "タ":
+            return "ダ"
+        if prev == "か":
+            return "が"
+        if prev == "テ":
+            return "デ"
+        if prev == "ハ":
+            return "バ"
+        if prev == "セ":
+            return "ゼ"
+        if prev == "ホ":
+            return "ボ"
+        if prev == "ヒ":
+            return "ビ"
+        if prev == "こ":
+            return "ご"
+        if prev == "ふ":
+            return "ぶ"
+        if prev == "と":
+            return "ど"
+        if prev == "へ":
+            return "べ"
+        if prev == "ヘ":
+            return "ベ"
+        if prev == "ス":
+            return "ズ"
+        if prev == "カ":
+            return "ガ"
+        if prev == "ケ":
+            return "ゲ"
+        if prev == "シ":
+            return "ジ"
+        if prev == "し":
+            return "じ"
+        if prev == "き":
+            return "ぎ"
+        if prev == "は":
+            return "ば"
+        if prev == "フ":
+            return "ブ"
+        if prev == "ウ":
+            return "ヴ"
+        if prev == "さ":
+            return "ざ"
+        if prev == "ひ":
+            return "び"
+        if prev == "せ":
+            return "ぜ"
+        if prev == "コ":
+            return "ゴ"
+        if prev == "ほ":
+            return "ぼ"
+        if prev == "キ":
+            return "ギ"
+        if prev == "そ":
+            return "ぞ"
+        if prev == "た":
+            return "だ"
+        if prev == "ソ":
+            return "ゾ"
+        if prev == "く":
+            return "ぐ"
+        print(chr, prev)
+        assert False
+    if chr == "ﾟ":
+        if prev == "フ":
+            return "プ"
+        if prev == "ヒ":
+            return "ピ"
+        if prev == "ハ":
+            return "パ"
+        if prev == "ヘ":
+            return "ペ"
+        if prev == "ホ":
+            return "ポ"
+        print(chr, prev)
+        assert False
+    print(chr, prev)
+    assert False
+
+
+# Character table for the Japanese text encoding handled by this module: an
+# entry's position in the list is the byte value used for it in encoded
+# strings (see get_chr and utf8_to_index). Rows are laid out 16 entries wide,
+# so row N holds byte values 0xN0-0xNF.
+# NOTE(review): "人" appears twice (0x1B and 0xA3); the reverse map built below
+# keeps the later index because later entries overwrite earlier ones.
+# NOTE(review): the trailing "???" placeholder lands at index 0x100, which
+# does not fit in a single byte — confirm whether it is still needed.
+table = [
+    # fmt: off
+    # 0      1      2      3      4      5      6      7      8      9      A      B      C      D      E      F
+    " ",     "!",  "\"", "#",   "$",   "%", "&", "'", "(", ")", "男", "+", ",", "-", ".", "/",
+    "0",     "1",   "2",   "3",   "4", "5", "6", "7", "8", "9", ":", "人", "手", "=", "玉", "?",
+    "石",    "A",   "B",   "C",   "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O",
+    "P",     "Q",   "R",   "S",   "T", "U", "V", "W", "X", "Y", "Z", "[", "剣", "]", "盾", "_",
+    "書",   "a",   "b",   "c",   "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
+    "p",     "q",   "r",   "s",   "t", "u", "v", "w", "x", "y", "z", "炎", "氷", "雷", "~", "女",
+    "力",    "。",   "「",   "」", "、", "・", "ヲ", "ァ", "ィ", "ゥ", "ェ", "ォ", "ャ", "ュ", "ョ", "ッ",
+    "ー",    "ア",   "イ",   "ウ", "エ", "オ", "カ", "キ", "ク", "ケ", "コ", "サ", "シ", "ス", "セ", "ソ",
+    "タ",    "チ",   "ツ",   "テ", "ト", "ナ", "ニ", "ヌ", "ネ", "ノ", "ハ", "ヒ", "フ", "ヘ", "ホ", "マ",
+    "ミ",    "ム",   "メ",   "モ", "ヤ", "ユ", "ヨ", "ラ", "リ", "ル", "レ", "ロ", "ワ", "ン", "ﾞ", "ﾟ", # dakuten and handakuten are lower in the graphic than seen here and get shifted by the code upwards when printing
+    "子",    "悪",   "魔",   "人", "妖", "精", "を","ぁ", "ぃ", "ぅ", "ぇ", "ぉ", "ゃ", "ゅ", "ょ", "っ",
+    "金",    "あ",   "い",   "う", "え", "お", "か", "き", "く", "け", "こ", "さ", "し", "す", "せ", "そ",
+    "た",    "ち",   "つ",  "て",  "と", "な", "に", "ぬ", "ね", "の", "は", "ひ", "ふ", "へ", "ほ", "ま",
+    "み",    "む",   "め",  "も",  "や", "ゆ", "よ", "ら", "り", "る", "れ", "ろ", "わ", "ん", "指", "輪",
+    "←",     "↖",   "↑",   "↗",   "→", "↘", "↓", "↙", "○", "×", "□", "△", "名", "刀", "聖", "血",
+    "i0",    "i1",  "i2" , "i3", "i4", "i5", "i6", "i7", "i8", "i9", "i10", "i11", "大", "光", "邪", "月", 
+    # off by one somewhere for バルザイのえん月刀
+    "???"
+]
+# fmt: on
+
+# Reverse lookup: table entry (display character) -> its index in `table`.
+utf8_to_index = {}
+for index, value in enumerate(table):
+    utf8_to_index[value] = index
+
+
+def get_chr(chr):
+    return table[chr]
+
+
+def convert_j(f):
+    """Decode a sequence of encoded byte values into a Python string.
+
+    `f` is indexed by position and must yield integer byte values, such as
+    the list returned by parse_string_to_int_array. Decoding stops at the
+    two-byte sequence 0xFF 0x00. There is no bounds check, so a sequence
+    without that terminator is read past its end (IndexError for a list).
+    """
+    pos = 0
+    str = ""
+    prev = None
+    prev_prev = None
+    while True:
+        ch = f[pos]
+        # Keep the last two raw bytes: when an 0xFF escape is read, the kana
+        # a voicing mark applies to is the byte read just before it.
+        prev_prev = prev
+        prev = ch
+        pos += 1
+        if int(ch) == 0xFF:
+            # 0xFF is an escape byte; the byte after it selects the meaning.
+            ch = f[pos]
+            pos += 1
+
+            # 0xFF 0x00 terminates the string.
+            if ch == 0:
+                break
+
+            # 0xFF 0xFF falls through and emits table[0xFF] as a literal.
+            # Any other escaped byte is handed to dakuten() as a mark.
+            if ch != 0xFF:
+                # The unvoiced kana was already appended on the previous
+                # iteration; drop it and append the combined form instead.
+                str = str[:-1]
+                str += dakuten(get_chr(ch), get_chr(prev_prev))
+        # 158 / 159 are the table indices of the dakuten / handakuten marks;
+        # they were consumed above and must not be emitted on their own.
+        if ch != 158 and ch != 159:
+            str += get_chr(ch)
+    return str
+
+
+def parse_string_to_int_array(input_str):
+    # Remove "_SJ(" and ")" from the input string
+    clean_str = input_str.replace("_SJ(", "").replace(")", "")
+
+    # Parse each byte and convert to integer
+    int_array = [int(byte, 16) for byte in clean_str.split("\\x")[1:]]
+
+    # null termination
+    int_array.append(0)
+
+    return int_array
+
+
+# ten ten
+def has_dakuten(utf8_char):
+    # fmt: off
+    chars = [
+        "が", "ぎ", "ぐ", "げ", "ご",
+	    "ざ", "じ",	"ず", "ぜ", "ぞ",
+	    "だ", "ぢ", "づ", "で",	"ど",
+        "ば", "び", "ぶ", "べ", "ぼ",
+        "ガ", "ギ", "グ", "ゲ", "ゴ",
+	    "ザ", "ジ", "ズ", "ゼ", "ゾ",
+	    "ダ", "ヂ", "ヅ", "デ", "ド",
+        "バ", "ビ", "ブ", "ベ", "ボ",
+        "ヴ"]
+    # fmt: on
+    return utf8_char in chars
+
+
+# maru
+def has_handakuten(utf8_char):
+    # fmt: off
+    chars = [
+        "ぱ", "ぴ", "ぷ", "ぺ", "ぽ",
+        "パ", "ピ", "プ", "ペ", "ポ"
+    ]
+    # fmt: on
+    return utf8_char in chars
+
+
+def remove_dakuten_handakuten(utf8_char):
+    table = {
+        "が": "か",
+        "ぎ": "き",
+        "ぐ": "く",
+        "げ": "け",
+        "ご": "こ",
+        "ざ": "さ",
+        "じ": "し",
+        "ず": "す",
+        "ぜ": "せ",
+        "ぞ": "そ",
+        "だ": "た",
+        "ぢ": "ち",
+        "づ": "つ",
+        "で": "て",
+        "ど": "と",
+        "ば": "は",
+        "び": "ひ",
+        "ぶ": "ふ",
+        "べ": "へ",
+        "ぼ": "ほ",
+        "ぱ": "は",
+        "ぴ": "ひ",
+        "ぷ": "ふ",
+        "ぺ": "へ",
+        "ぽ": "ほ",
+        "ガ": "カ",
+        "ギ": "キ",
+        "グ": "ク",
+        "ゲ": "ケ",
+        "ゴ": "コ",
+        "ザ": "サ",
+        "ジ": "シ",
+        "ズ": "ス",
+        "ゼ": "セ",
+        "ゾ": "ソ",
+        "ダ": "タ",
+        "ヂ": "チ",
+        "ヅ": "ツ",
+        "デ": "テ",
+        "ド": "ト",
+        "バ": "ハ",
+        "ビ": "ヒ",
+        "ブ": "フ",
+        "ベ": "ヘ",
+        "ボ": "ホ",
+        "パ": "ハ",
+        "ピ": "ヒ",
+        "プ": "フ",
+        "ペ": "ヘ",
+        "ポ": "ホ",
+        "ヴ": "ウ"
+    }
+    return table[utf8_char]
+
+
+def dakuten_to_bytes(input_chr):
+    no_dakuten = remove_dakuten_handakuten(input_chr)
+    no_dakuten_bytes = utf8_to_index[no_dakuten]
+    byte = 158
+    if has_handakuten(input_chr):
+        byte = 159
+
+    return [no_dakuten_bytes, 0xFF, byte]
+
+
+def utf8_to_byte_literals(input_str):
+    clean_str = input_str.replace("_SJ(", "").replace(")", "")
+    bytes = []
+    for char in clean_str:
+        if has_dakuten(char) or has_handakuten(char):
+            bytes += dakuten_to_bytes(char)
+        elif char == '月':
+            bytes += [0xff, 0xff]
+        else:
+            bytes.append(utf8_to_index[char])
+    bytes.append(0xFF)
+    hex_list = [hex(num) for num in bytes]
+    return bytes
+
+
+def utf8_to_byte_literals_wrapped(input):
+    out = utf8_to_byte_literals(input)
+    str = f"_SJ()"
+    escaped_string = "".join([f"\\x{val:02X}" for val in out])
+    out = f"_SJ({escaped_string})"
+    return out
+
+def utf8_to_byte_literals_escaped(input):
+    out = utf8_to_byte_literals(input)
+    escaped_string = "".join([f"\\x{val:02X}" for val in out])
+    return escaped_string
--- a/tools/sotn_str/sotn_str.py
+++ b/tools/sotn_str/sotn_str.py
@ -3,6 +3,7 @@
 import argparse
 import re
 import sys
+from jp import utf8_to_byte_literals


 def parse(filename, str_offset):
@ -18,24 +19,40 @@ def parse(filename, str_offset):
    print(f'_S("{r}")')


-def process(filename):
-    def process_string(match: re.Match[str]):
-        s = match.group(1)
-        r = ""
-        for ch in s.encode("shift_jis"):
-            # TODO at the moment this only works well with ASCII, not with Shift-JIS
-            r += f"\\x{ch - 0x20:02X}"
-        return f'"{r}\\xFF"'
+def process_string(match: re.Match[str]):
+    s = match.group(1)
+    r = ""
+    for ch in s.encode("shift_jis"):
+        # TODO at the moment this only works well with ASCII, not with Shift-JIS
+        r += f"\\x{ch - 0x20:02X}"
+    return f'"{r}\\xFF"'

+
+def process_string_jp(match: re.Match[str]):
+    s = match.group(1)
+    out = utf8_to_byte_literals(s)
+    escaped = "".join([f"\\x{val:02X}" for val in out])
+    return f'"{escaped}"'
+
+
+def do_sub(line):
+    pattern = r'_S\("([^"]*)"\)'
+    # english_str_processed = re.sub(pattern, process_string, line)
+    # pattern_jp = r'_SJ\("([^"]+)"\)'
+    jp_str_processed = re.sub(pattern, process_string_jp, line)
+    jp_str_processed = jp_str_processed.replace("_S(\"\")", "\"\\xFF\"")
+    return jp_str_processed
+
+
+def process(filename):
    if not filename or filename == "-":
        fn = sys.stdin
    else:
        fn = open(filename, "r")

    with fn as f:
-        pattern = r'_S\("([^"]*)"\)'
        for line in f:
-            sys.stdout.write(re.sub(pattern, process_string, line))
+            sys.stdout.write(do_sub(line))


 if __name__ == "__main__":
--- a/tools/sotn_str/test.py
+++ b/tools/sotn_str/test.py
@ -0,0 +1,68 @@
+from jp import *
+import unittest
+from sotn_str import *
+
+
+class TestingJp(unittest.TestCase):
+    def test_parse_string_to_int_array(self):
+        input = "_SJ(\\xBD\\xC3\\xFF\\x9E\\xFF)"
+        out = parse_string_to_int_array(input)
+        assert out == [0xBD, 0xC3, 0xFF, 0x9E, 0xFF, 0x00]
+
+    def test_convert_dakuten(self):
+        input = "_SJ(\\xBD\\xC3\\xFF\\x9E\\xFF)"
+        split = parse_string_to_int_array(input)
+        converted = convert_j(split)
+        assert converted == "すで"
+
+    def test_remove_dakuten_handakuten(self):
+        assert remove_dakuten_handakuten("が") == "か"
+
+    def test_dakuten_to_bytes(self):
+        bytes = dakuten_to_bytes("で")
+        assert bytes == [0xC3, 0xFF, 0x9E]
+
+    def test_utf8_to_byte_literals_wrapped_dakuten(self):
+        input = "_SJ(すで)"
+        out = utf8_to_byte_literals_wrapped(input)
+        assert out == "_SJ(\\xBD\\xC3\\xFF\\x9E\\xFF)"
+
+    def test_utf8_to_byte_literals_wrapped_kanji(self):
+        input = "_SJ(あかつきの剣)"
+        out = utf8_to_byte_literals_wrapped(input)
+        assert out == "_SJ(\\xB1\\xB6\\xC2\\xB7\\xC9\\x3C\\xFF)"
+
+    def check_sei():
+        assert(utf8_to_index['聖'] == 222)
+
+    def test_glasses(self):
+        input = "_SJ(聖なるめがね)"
+        out = utf8_to_byte_literals_wrapped(input)
+        assert out == "_SJ(\\xEE\\xC5\\xD9\\xD2\\xB6\\xFF\\x9E\\xC8\\xFF)"
+
+    def test_moon(self):
+        input = "_SJ(バルザイのえん月刀)"
+        out = utf8_to_byte_literals_wrapped(input)
+        assert out == "_SJ(\\x8A\\xFF\\x9E\\x99\\x7B\\xFF\\x9E\\x72\\xC9\\xB4\\xDD\\xFF\\xFF\\xED\\xFF)"
+
+    def test_str_potion(self):
+        input = "Str. potion"
+        out = utf8_to_byte_literals_escaped(input)
+        assert out == "\\x33\\x54\\x52\\x0E\\x00\\x50\\x4F\\x54\\x49\\x4F\\x4E\\xFF"
+
+class TestingSotnStr(unittest.TestCase):
+    def test_do_sub_jp(self):
+        line = '{_SJ("すで"), "装備なし（素手）", 0, 0, 0, 3, 255, 0, 0, 36, 42, 0, 5, 128, 0, 0, false, 8, 0, 0, 0, 0, 4, 2, 1, 1, 1, 1, 0},'
+        out = do_sub(line)
+        expected = '{"\\xBD\\xC3\\xFF\\x9E\\xFF", "装備なし（素手）", 0, 0, 0, 3, 255, 0, 0, 36, 42, 0, 5, 128, 0, 0, false, 8, 0, 0, 0, 0, 4, 2, 1, 1, 1, 1, 0},'
+        assert out == expected
+
+    def test_jp_empty(self):
+        line = "_SJ(\"\")"
+        out = do_sub(line)
+        expected = '\"\\xFF\"'
+        assert out == expected
+
+
+if __name__ == "__main__":
+    unittest.main()