decoder: include operands in metadata

Stores operand bit positions and types in the decoder metadata. This
makes it way easier to emit IR instructions.

Signed-off-by: Ronald Caesar <github43132@proton.me>
This commit is contained in:
Ronald Caesar
2026-01-18 02:44:40 -04:00
parent 2596a20f31
commit 3b9703f316
4 changed files with 32892 additions and 17040 deletions

View File

@@ -11,6 +11,7 @@
#include "bal_attributes.h"
#include "bal_types.h"
#include <assert.h>
#include <stdint.h>
#ifdef __cplusplus
@@ -18,6 +19,32 @@ extern "C"
{
#endif
/// The type of an instruction operand.
///
/// Values are stored in the 5-bit `type` field of the operand descriptor,
/// so every enumerator must stay below 32.
typedef enum
{
    /// No operand; used to fill unused operand slots.
    BAL_OPERAND_TYPE_NONE         = 0,
    /// Operand is a 32-bit register.
    BAL_OPERAND_TYPE_REGISTER_32  = 1,
    /// Operand is a 64-bit register.
    BAL_OPERAND_TYPE_REGISTER_64  = 2,
    /// Operand is a 128-bit register.
    BAL_OPERAND_TYPE_REGISTER_128 = 3,
    /// Operand is an immediate value encoded in the instruction.
    BAL_OPERAND_TYPE_IMMEDIATE    = 4,
    /// Operand is a condition code.
    BAL_OPERAND_TYPE_CONDITION    = 5,
} bal_decoder_operand_type_t;
/// Descriptor for a single operand.
///
/// The three bit-fields add up to 16 bits (5 + 6 + 5); the static assertion
/// below checks that the compiler really packs them into 2 bytes.
typedef struct
{
    /// Operand type. See [`bal_decoder_operand_type_t`].
    uint16_t type : 5;

    /// Bit position in the instruction.
    uint16_t bit_position : 6;

    /// Bit width of the field.
    /// NOTE(review): 5 bits can only represent widths 0..31, so a field
    /// spanning a full 32-bit word would not fit here.
    uint16_t bit_width : 5;
} bal_decoder_operand_t;

// Guards the packing assumption the bit-field layout above relies on.
static_assert(2 == sizeof(bal_decoder_operand_t), "Expected operand struct to be 2 bytes.");
/// Represents static metadata associated with a specific ARM instruction.
BAL_ALIGNED(32) typedef struct
{
@@ -38,10 +65,12 @@ extern "C"
/// The IR opcode equivalent to this instruction's mnemonic.
bal_opcode_t ir_opcode;
/// Padding to maintain 32 byte alignment.
char _pad[8];
/// Descriptors for up to 4 operands.
bal_decoder_operand_t operand[4];
} bal_decoder_instruction_metadata_t;
static_assert(32 == sizeof(bal_decoder_instruction_metadata_t), "Expected decoder metadata struct to be 32 bytes.");
/// Decodes a raw ARM64 instruction.
///
/// Returns a pointer to [`bal_decoder_instruction_metadata_t`] describing

File diff suppressed because it is too large Load Diff

View File

@@ -7,7 +7,7 @@
#include <stdint.h>
#include <stddef.h>
#define BAL_DECODER_ARM64_INSTRUCTIONS_SIZE 3945
#define BAL_DECODER_ARM64_INSTRUCTIONS_SIZE 3608
typedef struct {
const bal_decoder_instruction_metadata_t *const *instructions;

View File

@@ -12,6 +12,7 @@ Generated ARM decoder table source file -> ../src/decoder_table_gen.c
import io
import os
import re
import sys
import glob
import argparse
@@ -45,6 +46,13 @@ GENERATED_FILE_WARNING = """/*
*/"""
@dataclass
class Operand:
    """One instruction operand: its type and where it sits in the encoding."""

    # Name of the C enumerator written to the generated table
    # (one of the "BAL_OPERAND_TYPE_*" strings).
    type_enum: str
    # Index of the lowest bit of the field within the instruction word.
    bit_position: int
    # Number of bits the field occupies.
    bit_width: int
@dataclass
class A64Instruction:
mnemonic: str
@@ -52,20 +60,45 @@ class A64Instruction:
value: int
priority: int # Higher number of set bits in mask = higher priority.
array_index: int # Position in the hash table bucket.
operands: List[Operand]
def parse_register_diagram(register_diagram: ET.Element) -> Dict[str, Tuple[int, int]]:
    """Map each named field of a register diagram to ``(bit_position, bit_width)``.

    Args:
        register_diagram: The ``<regdiagram>`` element whose direct ``<box>``
            children describe the bit fields of an encoding.

    Returns:
        A dictionary keyed by the box ``name`` attribute. ``bit_position`` is
        the lowest bit of the field (``hibit - width + 1``). When a name occurs
        more than once, the first occurrence wins.

    Raises:
        ValueError: If ``hibit`` or ``width`` is present but not an integer.
    """
    fields: Dict[str, Tuple[int, int]] = {}

    for box in register_diagram.findall("box"):
        name: Optional[str] = box.get("name")
        hibit_str: Optional[str] = box.get("hibit")

        # Anonymous boxes hold fixed bits rather than operands, and a box
        # without `hibit` cannot be placed in the word. Skipping the latter
        # avoids the TypeError that `int(None)` would raise.
        if name is None or hibit_str is None:
            continue

        hibit: int = int(hibit_str)
        width: int = int(box.get("width", "1"))

        # setdefault keeps the first definition of a repeated field name.
        fields.setdefault(name, (hibit - width + 1, width))

    return fields
def process_box(box: Element, current_mask: int, current_value: int) -> Tuple[int, int]:
"""Process a specific bit-field box from the XML diagram."""
# The high-bit position in the 32-bit word.
hibit_str: Optional[str] = box.attrib.get("hibit")
if hibit_str is None:
return (current_mask, current_value)
hibit: int = int(hibit_str)
# The width of this bitfield.
width_str: Optional[int] = int(box.attrib.get("width", "1"))
if width_str is None:
return (current_mask, current_value)
width: int = int(width_str)
if hibit >= 32:
@@ -76,6 +109,7 @@ def process_box(box: Element, current_mask: int, current_value: int) -> Tuple[in
c_elements: List[Element[str]] = box.findall("c")
content: str = ""
for c_element in c_elements:
content += c_element.text if c_element.text is not None else "x"
@@ -99,23 +133,166 @@ def process_box(box: Element, current_mask: int, current_value: int) -> Tuple[in
return (current_mask, current_value)
def parse_explanations(root: Element) -> Dict[str, str]:
    """
    Build a lookup from symbol links (e.g. 'sa_rd') to the name of the
    bitfield that encodes them (e.g. 'Rd').

    Every <explanation> element below `root` is inspected. The link comes
    from its <symbol> child; the field name comes from the `encodedin`
    attribute of its <account> child or, when that yields nothing, of its
    <definition> child. Explanations without a link or without an
    `encodedin` attribute are left out. A later explanation with the same
    link replaces an earlier one.
    """
    link_to_field: Dict[str, str] = {}

    for entry in root.findall(".//explanation"):
        symbol: Optional[Element] = entry.find("symbol")
        link: Optional[str] = symbol.get("link") if symbol is not None else None
        if link is None:
            continue

        # <account> takes precedence; <definition> is consulted only when
        # no `encodedin` attribute has been found yet.
        field_name: Optional[str] = None
        for tag in ("account", "definition"):
            if field_name is not None:
                break
            source: Optional[Element] = entry.find(tag)
            if source is not None:
                field_name = source.get("encodedin")

        if field_name is not None:
            link_to_field[link] = field_name

    return link_to_field
def _register_type_from_prefix(name: str) -> Optional[str]:
    """Return the register operand type implied by the first letter of an
    upper-cased symbol name, or None when the letter is not a register class."""
    # W = 32-bit GP, S = 32-bit FP, P = SVE predicate.
    if name.startswith(("W", "S", "P")):
        return "BAL_OPERAND_TYPE_REGISTER_32"

    # X = 64-bit GP, D = 64-bit FP.
    if name.startswith(("X", "D")):
        return "BAL_OPERAND_TYPE_REGISTER_64"

    # V = Vector, Q = 128-bit FP, Z = SVE Vector,
    # B = 8-bit FP/Vector, H = 16-bit FP/Vector (treated as SIMD/FP registers).
    if name.startswith(("V", "Q", "Z", "B", "H")):
        return "BAL_OPERAND_TYPE_REGISTER_128"

    return None


def derive_operand_type(text: str, hover: str) -> str:
    """Derives the BAL_OPERAND_TYPE based on syntax and description.

    Args:
        text: The operand symbol as written in the assembly template,
            e.g. ``<Wd>`` or ``<Xn|SP>``.
        hover: The human-readable description attached to the symbol.

    Returns:
        The name of a ``BAL_OPERAND_TYPE_*`` enumerator, or
        ``"BAL_OPERAND_TYPE_NONE"`` when nothing matches.
    """
    # Normalize text: <Wd> -> WD, <Wd|WSP> -> WD|WSP. Entities that are still
    # escaped are stripped as well.
    symbol = text.strip()
    for marker in ("&lt;", "&gt;", "<", ">"):
        symbol = symbol.replace(marker, "")
    t = symbol.upper()
    h = hover.lower()

    # The bare stack-pointer names must be tested before the prefix rules:
    # "SP" starts with "S" and would otherwise be reported as a 32-bit register.
    if t == "SP":
        return "BAL_OPERAND_TYPE_REGISTER_64"
    if t == "WSP":
        return "BAL_OPERAND_TYPE_REGISTER_32"

    # A capitalized register symbol whose description calls it a register is
    # a register even when the description also mentions "index", "offset"
    # or "shift amount" (e.g. <Xm>, "general-purpose index register").
    if symbol[:1].isupper() and "register" in h:
        described_register = _register_type_from_prefix(t)
        if described_register is not None:
            return described_register

    # Immediate checks
    immediate_keywords = ("immediate", "amount", "offset", "index", "label", "shift")
    if t.startswith("#") or any(keyword in h for keyword in immediate_keywords):
        return "BAL_OPERAND_TYPE_IMMEDIATE"

    # Register checks by name prefix
    prefixed_register = _register_type_from_prefix(t)
    if prefixed_register is not None:
        return prefixed_register

    # Fallback to hover text analysis
    mentions_register = "general-purpose" in h or "register" in h
    if "32-bit" in h and mentions_register:
        return "BAL_OPERAND_TYPE_REGISTER_32"
    if "64-bit" in h and mentions_register:
        return "BAL_OPERAND_TYPE_REGISTER_64"
    if any(word in h for word in ("128-bit", "simd", "vector", "scalable")):
        return "BAL_OPERAND_TYPE_REGISTER_128"

    if "condition" in h or "COND" in t:
        return "BAL_OPERAND_TYPE_CONDITION"

    return "BAL_OPERAND_TYPE_NONE"
def parse_operands(
    asmtemplate: ET.Element,
    field_map: Dict[str, Tuple[int, int]],
    explanation_map: Dict[str, str],
) -> List[Operand]:
    """
    Collect the operands referenced by `asmtemplate` together with the bit
    field that encodes each of them.

    Each direct <a> child carrying a `link` attribute is resolved to a field
    name, first through `explanation_map` and otherwise by matching the
    anchor text (with and without angle brackets) against `field_map`.
    Anchors that resolve to no known field, or whose derived type is
    BAL_OPERAND_TYPE_NONE, are skipped. Duplicate operands are dropped and
    at most four operands are returned.
    """
    found: List[Operand] = []

    for anchor in asmtemplate.findall("a"):
        link: Optional[str] = anchor.get("link")
        if link is None:
            continue

        hover: str = anchor.get("hover", "")
        text: str = anchor.text or ""

        field_name: Optional[str] = explanation_map.get(link)
        if field_name is None:
            # No explanation entry: fall back to the anchor text itself.
            stripped = text.strip().replace("<", "").replace(">", "")
            if stripped in field_map:
                field_name = stripped
            elif text in field_map:
                field_name = text

        if field_name is None or field_name not in field_map:
            continue

        kind = derive_operand_type(text, hover)
        if kind == "BAL_OPERAND_TYPE_NONE":
            continue

        position, width = field_map[field_name]
        candidate: Operand = Operand(kind, position, width)
        if candidate not in found:
            found.append(candidate)

        # The metadata struct has room for four operand descriptors.
        if len(found) >= 4:
            break

    return found
def get_mnemonic_from_element(element: Element) -> Optional[str]:
    """
    Look up the 'mnemonic' docvar of an element.

    Scans the <docvar> entries of every direct <docvars> child in document
    order and returns the `value` attribute of the first one whose `key` is
    "mnemonic". Returns None when there is no such entry.
    """
    mnemonic_values = (
        docvar.get("value")
        for docvars in element.findall("docvars")
        for docvar in docvars.findall("docvar")
        if docvar.get("key") == "mnemonic"
    )
    return next(mnemonic_values, None)
def parse_xml_file(filepath: str) -> List[A64Instruction]:
try:
tree: Optional[ElementTree[Element[str]]] = ET.parse(filepath)
if tree is None:
raise ET.ParseError
@@ -133,8 +310,10 @@ def parse_xml_file(filepath: str) -> List[A64Instruction]:
# If no docvar, try the Heading/Title as a fallback for the file default.
if file_mnemonic is None:
heading: Optional[Element] = root.find(".//heading")
if heading is not None and heading.text is not None:
candidate: str = heading.text.split()[0]
if "<" not in candidate:
file_mnemonic = candidate
@@ -142,17 +321,19 @@ def parse_xml_file(filepath: str) -> List[A64Instruction]:
# Dict[(mask, value), instruction]
instructions: Dict[Tuple[int, int], A64Instruction] = {}
# Extract mask and value.
explanation_map: Dict[str, str] = parse_explanations(root)
iclass: Element[str]
for iclass in root.findall(".//iclass"):
# The diagram box contains the bit definitions.
box_diagram: Optional[Element[str]] = iclass.find("regdiagram")
if box_diagram is None:
# The diagram contains the bit definitions.
register_diagram: Optional[Element[str]] = iclass.find("regdiagram")
if register_diagram is None:
continue
# Is 32-bit instruction?
if box_diagram.get("form") != "32":
# Is not a 32-bit instruction?
if register_diagram.get("form") != "32":
continue
# Determine the mnemonic for this specific class.
@@ -164,12 +345,14 @@ def parse_xml_file(filepath: str) -> List[A64Instruction]:
if class_mnemonic is None:
class_mnemonic = "[UNKNOWN]"
field_map: Dict[str, Tuple[int, int]] = parse_register_diagram(register_diagram)
class_mask: int = 0
class_value: int = 0
# Process global diagram bits inherited by all encoding class.
try:
for box in box_diagram.findall("box"):
for box in register_diagram.findall("box"):
(class_mask, class_value) = process_box(box, class_mask, class_value)
except ValueError as e:
print(f"Skipping malformed box in {filepath}: {e}", file=sys.stderr)
@@ -177,23 +360,14 @@ def parse_xml_file(filepath: str) -> List[A64Instruction]:
priority: int = bin(class_mask).count("1")
# Check for negatives
class_instruction = A64Instruction(
mnemonic=class_mnemonic,
mask=class_mask,
value=class_value,
priority=priority,
array_index=0,
)
instructions[(class_mask, class_value)] = class_instruction
# Refine with specific encoding bits.
# <encoding> blocks often override specific boxes to different variants.
encoding: Element[str]
for encoding in iclass.findall("encoding"):
asm_template = encoding.find("asmtemplate")
if asm_template is None:
asmtemplate = encoding.find("asmtemplate")
if asmtemplate is None:
continue
# Check if Encoding overrides mnemonic.
@@ -215,68 +389,67 @@ def parse_xml_file(filepath: str) -> List[A64Instruction]:
except ValueError:
continue
operands: List[Operand] = parse_operands(
asmtemplate, field_map, explanation_map
)
key: Tuple[int, int] = (encoding_mask, encoding_value)
priority = bin(encoding_mask).count("1")
encoding_instruction = A64Instruction(
if key not in instructions:
instructions[key] = A64Instruction(
mnemonic=encoding_mnemonic,
mask=encoding_mask,
value=encoding_value,
priority=priority,
array_index=0,
operands=operands,
)
instructions[(encoding_mask, encoding_value)] = encoding_instruction
return list(instructions.values())
def derive_opcode(mnemonic: str) -> str:
"""Maps an ARM mnemonic to a Ballistic IR Opcode."""
m = mnemonic.upper()
if m in ["MOVZ", "MOVN"]:
if m in ["MOVZ", "MOVN", "MOVK"]:
return "OPCODE_CONST"
# ORR is often used for register moves.
if m == "ORR" or m == "MOV":
if m in ["ORR", "MOV"]:
return "OPCODE_MOV"
if m.startswith("ADD"): return "OPCODE_ADD"
if m.startswith("SUB"): return "OPCODE_SUB"
if m.startswith("MUL") or m.startswith("MADD"): return "OPCODE_MUL"
if m.startswith("SDIV") or m.startswith("UDIV"): return "OPCODE_DIV"
if m.startswith("AND"): return "OPCODE_AND"
if m.startswith("EOR"): return "OPCODE_XOR"
# ORR is handled above, but ORN (Or Not) is distinct
if m.startswith("ORN"): return "OPCODE_OR_NOT"
if m in ["LSL", "LSR", "ASR", "ROR"]:
return "OPCODE_SHIFT"
if m.startswith("LDR") or m.startswith("LDU") or m.startswith("LDP"):
if m.startswith("ADD"):
return "OPCODE_ADD"
if m.startswith("SUB"):
return "OPCODE_SUB"
if m.startswith("MUL") or m.startswith("MADD"):
return "OPCODE_MUL"
if m.startswith("SDIV") or m.startswith("UDIV"):
return "OPCODE_DIV"
if m.startswith("AND"):
return "OPCODE_AND"
if m.startswith("EOR"):
return "OPCODE_XOR"
if m.startswith("LDR") or m.startswith("LDP"):
return "OPCODE_LOAD"
if m.startswith("STR") or m.startswith("STP"):
return "OPCODE_STORE"
if m == "B": return "OPCODE_JUMP"
if m == "BL": return "OPCODE_CALL"
if m == "RET": return "OPCODE_RETURN"
if m == "CBZ": return "OPCODE_BRANCH_ZERO"
if m == "CBNZ": return "OPCODE_BRANCH_NOT_ZERO"
if m.startswith("TBZ"): return "OPCODE_TEST_BIT_ZERO"
if m.startswith("CMP") or m.startswith("CMN"):
if m == "B":
return "OPCODE_JUMP"
if m == "BL":
return "OPCODE_CALL"
if m == "RET":
return "OPCODE_RETURN"
if m.startswith("CMP"):
return "OPCODE_CMP"
if m.startswith("CCMP"):
return "OPCODE_CMP_COND"
# If we don't know what it is, map it to TRAP.
return "OPCODE_TRAP"
def generate_hash_table(instructions: List[A64Instruction]) -> Dict[int, List[A64Instruction]]:
buckets: Dict[int, List[A64Instruction]] = {i: [] for i in range(DECODER_HASH_TABLE_SIZE)}
def generate_hash_table(
instructions: List[A64Instruction],
) -> Dict[int, List[A64Instruction]]:
buckets: Dict[int, List[A64Instruction]] = {
i: [] for i in range(DECODER_HASH_TABLE_SIZE)
}
# Iterate over every possible hash index to determine which instructions
# belong in it
@@ -292,7 +465,9 @@ def generate_hash_table(instructions: List[A64Instruction]) -> Dict[int, List[A6
if (probe_val & mask) == value:
buckets[i].append(inst)
buckets[i].sort(key=lambda x: x.priority, reverse=True)
return buckets
@@ -356,13 +531,16 @@ if __name__ == "__main__":
# Process XML Files
# -------------------------------------------------------------------------
# Collect every XML file that sits directly in the input directory.
files = glob.glob(os.path.join(xml_directory, "*.xml"))

# Nothing to generate from: report it and stop with a failure status.
if not files:
    print(f"No XML files found in {xml_directory}")
    sys.exit(1)

print(f"Found {len(files)} XML files")

all_instructions: List[A64Instruction] = []

# The directory and the file name are passed as separate components so that
# os.path.join inserts the path separator. Concatenating them first yields
# "<dir>onebigfile.xml", which never equals a path returned by glob.
files_to_ignore: List[str] = [os.path.join(xml_directory, "onebigfile.xml")]
for f in files:
# Skip index and shared pseudo-code files.
if "index" in f or "shared" in f:
@@ -404,10 +582,12 @@ if __name__ == "__main__":
# Generate Source File
# -------------------------------------------------------------------------
decoder_generated_header_name: str = DEFAULT_DECODER_GENERATED_HEADER_NAME
if args.output_header is not None:
decoder_generated_header_name = args.output_header
buckets: Dict[int, List[A64Instruction]] = generate_hash_table(all_instructions)
with open(output_source_path, "w", encoding="utf-8") as f:
f.write(f"{GENERATED_FILE_WARNING}\n\n")
f.write(f"/* Generated {len(all_instructions)} instructions */\n")
@@ -417,11 +597,22 @@ if __name__ == "__main__":
f.write(
f"const {DECODER_METADATA_STRUCT_NAME} {DECODER_ARM64_GLOBAL_INSTRUCTIONS_ARRAY_NAME}[{DECODER_ARM64_INSTRUCTIONS_SIZE_NAME}] = {{\n"
)
for inst in all_instructions:
ir_opcode: str = derive_opcode(inst.mnemonic)
operands_str: str = ""
for i in range(4):
if i < len(inst.operands):
operand: Operand = inst.operands[i]
operands_str += f"{{ {operand.type_enum}, {operand.bit_position}, {operand.bit_width} }},\n"
else:
operands_str += "{ BAL_OPERAND_TYPE_NONE, 0, 0 },\n"
f.write(
f' {{ "{inst.mnemonic}", 0x{inst.mask:08X}, 0x{inst.value:08X}, {ir_opcode} }}, \n'
f' {{ "{inst.mnemonic}", 0x{inst.mask:08X}, 0x{inst.value:08X}, {ir_opcode},\n{{ {operands_str} }} }},\n'
)
f.write("};")
# Generate the lookup table arrays first