diff --git a/scripts/trace_opcodes.py b/scripts/trace_opcodes.py new file mode 100644 index 0000000..6edd5ff --- /dev/null +++ b/scripts/trace_opcodes.py @@ -0,0 +1,621 @@ +#!/usr/bin/env python3 +"""Z-Machine bytecode disassembler and opcode tracer. + +Performs recursive-descent disassembly of a z-machine story file, +following all reachable code paths to catalog every opcode used. +Cross-references findings against zvm implementation status. +""" + +import argparse +import struct +import sys +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path + + +@dataclass +class OpcodeInfo: + """Metadata about an opcode.""" + + name: str + stores: bool = False + branches: bool = False + terminal: bool = False + inline_string: bool = False + is_call: bool = False + + +# Complete V3 opcode definitions +OP2_OPCODES = { + 1: OpcodeInfo("je", branches=True), + 2: OpcodeInfo("jl", branches=True), + 3: OpcodeInfo("jg", branches=True), + 4: OpcodeInfo("dec_chk", branches=True), + 5: OpcodeInfo("inc_chk", branches=True), + 6: OpcodeInfo("jin", branches=True), + 7: OpcodeInfo("test", branches=True), + 8: OpcodeInfo("or", stores=True), + 9: OpcodeInfo("and", stores=True), + 10: OpcodeInfo("test_attr", branches=True), + 11: OpcodeInfo("set_attr"), + 12: OpcodeInfo("clear_attr"), + 13: OpcodeInfo("store"), + 14: OpcodeInfo("insert_obj"), + 15: OpcodeInfo("loadw", stores=True), + 16: OpcodeInfo("loadb", stores=True), + 17: OpcodeInfo("get_prop", stores=True), + 18: OpcodeInfo("get_prop_addr", stores=True), + 19: OpcodeInfo("get_next_prop", stores=True), + 20: OpcodeInfo("add", stores=True), + 21: OpcodeInfo("sub", stores=True), + 22: OpcodeInfo("mul", stores=True), + 23: OpcodeInfo("div", stores=True), + 24: OpcodeInfo("mod", stores=True), +} + +OP1_OPCODES = { + 0: OpcodeInfo("jz", branches=True), + 1: OpcodeInfo("get_sibling", stores=True, branches=True), + 2: OpcodeInfo("get_child", stores=True, branches=True), + 3: OpcodeInfo("get_parent", stores=True), + 4: OpcodeInfo("get_prop_len", stores=True), + 5: OpcodeInfo("inc"), + 6: OpcodeInfo("dec"), + 7: OpcodeInfo("print_addr"), + 8: OpcodeInfo("call_1s", stores=True, is_call=True), + 9: OpcodeInfo("remove_obj"), + 10: OpcodeInfo("print_obj"), + 11: OpcodeInfo("ret", terminal=True), + 12: OpcodeInfo("jump", terminal=True), + 13: OpcodeInfo("print_paddr"), + 14: OpcodeInfo("load", stores=True), + 15: OpcodeInfo("not", stores=True), +} + +OP0_OPCODES = { + 0: OpcodeInfo("rtrue", terminal=True), + 1: OpcodeInfo("rfalse", terminal=True), + 2: OpcodeInfo("print", inline_string=True), + 3: OpcodeInfo("print_ret", inline_string=True, terminal=True), + 4: OpcodeInfo("nop"), + 5: OpcodeInfo("save", branches=True), + 6: OpcodeInfo("restore", branches=True), + 7: OpcodeInfo("restart", terminal=True), + 8: OpcodeInfo("ret_popped", terminal=True), + 9: OpcodeInfo("pop"), + 10: OpcodeInfo("quit", terminal=True), + 11: OpcodeInfo("new_line"), + 12: OpcodeInfo("show_status"), + 13: OpcodeInfo("verify", branches=True), +} + +VAR_OPCODES = { + 0: OpcodeInfo("call_vs", stores=True, is_call=True), + 1: OpcodeInfo("storew"), + 2: OpcodeInfo("storeb"), + 3: OpcodeInfo("put_prop"), + 4: OpcodeInfo("sread"), + 5: OpcodeInfo("print_char"), + 6: OpcodeInfo("print_num"), + 7: OpcodeInfo("random", stores=True), + 8: OpcodeInfo("push"), + 9: OpcodeInfo("pull"), + 10: OpcodeInfo("split_window"), + 11: OpcodeInfo("set_window"), + 19: OpcodeInfo("output_stream"), + 20: OpcodeInfo("input_stream"), + 21: OpcodeInfo("sound_effect"), + 22: OpcodeInfo("read_char", stores=True), +} + +# ZVM implementation status (opcodes with real logic, not stubs) +ZVM_IMPLEMENTED = { + ("2OP", 1), + ("2OP", 2), + ("2OP", 4), + ("2OP", 5), + ("2OP", 8), + ("2OP", 9), + ("2OP", 13), + ("2OP", 14), + ("2OP", 15), + ("2OP", 16), + ("2OP", 17), + ("2OP", 20), + ("2OP", 21), + ("2OP", 22), + ("2OP", 23), + ("1OP", 0), + ("1OP", 2), + ("1OP", 3), + ("1OP", 5), + ("1OP", 8), + ("1OP", 12), + ("1OP", 13), + ("0OP", 0), + ("0OP", 1), + ("0OP", 2), + ("0OP", 3), + ("VAR", 0), + ("VAR", 1), + ("VAR", 3), + ("VAR", 5), + ("VAR", 7), + ("VAR", 8), + ("VAR", 10), + ("VAR", 11), + ("VAR", 19), + ("VAR", 22), +} + + +class ZMachine: + """Z-Machine story file reader and disassembler.""" + + def __init__(self, story_path: Path, verbose: bool = False): + self.story_path = story_path + self.verbose = verbose + self.data = story_path.read_bytes() + self.version = self.data[0] + self.entry_point = self.read_word(0x06) + self.static_mem_base = self.read_word(0x0E) + + # Disassembly state + self.visited_addrs = set() + self.visited_routines = set() + self.worklist = [] + self.opcode_counts = defaultdict(int) + self.instruction_count = 0 + self.routines_from_entrypoint = 0 + self.routines_from_scan = 0 + + def read_byte(self, addr: int) -> int: + """Read a single byte.""" + return self.data[addr] + + def read_word(self, addr: int) -> int: + """Read a 16-bit big-endian word.""" + return struct.unpack_from(">H", self.data, addr)[0] + + def read_signed_word(self, addr: int) -> int: + """Read a 16-bit signed big-endian word.""" + val = self.read_word(addr) + return val if val < 0x8000 else val - 0x10000 + + def unpack_routine_addr(self, packed: int) -> int: + """Convert packed routine address to byte address.""" + if self.version <= 3: + return packed * 2 + elif self.version <= 5: + return packed * 4 + else: + return packed * 8 + + def parse_operands( + self, pc: int, opcode_byte: int + ) -> tuple[list[tuple[int, bool]], int]: + """Parse operands and return (operands, bytes_consumed). + + Each operand is (value, is_constant) tuple. + """ + operands = [] + pos = pc + + if opcode_byte < 0x80: + # Long form 2OP + op1_type = (opcode_byte >> 6) & 1 + op2_type = (opcode_byte >> 5) & 1 + + if op1_type == 0: # small constant + operands.append((self.read_byte(pos), True)) + pos += 1 + else: # variable + operands.append((self.read_byte(pos), False)) + pos += 1 + + if op2_type == 0: # small constant + operands.append((self.read_byte(pos), True)) + pos += 1 + else: # variable + operands.append((self.read_byte(pos), False)) + pos += 1 + + elif opcode_byte < 0xB0: + # Short form 1OP or 0OP + op_type = (opcode_byte >> 4) & 3 + + if op_type == 0: # large constant + operands.append((self.read_word(pos), True)) + pos += 2 + elif op_type == 1: # small constant + operands.append((self.read_byte(pos), True)) + pos += 1 + elif op_type == 2: # variable + operands.append((self.read_byte(pos), False)) + pos += 1 + # op_type == 3: 0OP, no operands + + else: + # Variable form + types_byte = self.read_byte(pos) + pos += 1 + + for i in range(4): + op_type = (types_byte >> (6 - i * 2)) & 3 + if op_type == 3: # omitted + break + elif op_type == 0: # large constant + operands.append((self.read_word(pos), True)) + pos += 2 + elif op_type == 1: # small constant + operands.append((self.read_byte(pos), True)) + pos += 1 + elif op_type == 2: # variable + operands.append((self.read_byte(pos), False)) + pos += 1 + + return operands, pos - pc + + def parse_zstring(self, addr: int) -> int: + """Parse z-string and return length in bytes.""" + pos = addr + while True: + word = self.read_word(pos) + pos += 2 + if word & 0x8000: + break + return pos - addr + + def parse_branch(self, pc: int) -> tuple[int | None, int]: + """Parse branch data and return (target_addr, bytes_consumed).""" + branch_byte = self.read_byte(pc) + short_form = (branch_byte & 0x40) != 0 + + if short_form: + offset = branch_byte & 0x3F + bytes_consumed = 1 + else: + offset = ((branch_byte & 0x3F) << 8) | self.read_byte(pc + 1) + if offset >= 0x2000: + offset -= 0x4000 + bytes_consumed = 2 + + # Compute target + if offset == 0 or offset == 1: + # Return true/false - terminal for this path + return None, bytes_consumed + else: + target = pc + bytes_consumed + offset - 2 + return target, bytes_consumed + + def decode_instruction(self, addr: int) -> tuple[str, str, int, list[int]]: + """Decode instruction at addr. + + Returns (opclass, name, next_addr, targets). + targets is a list of addresses to visit next. + """ + if addr in self.visited_addrs: + return "", "", addr, [] + + self.visited_addrs.add(addr) + self.instruction_count += 1 + + opcode_byte = self.read_byte(addr) + pc = addr + 1 + + # Determine form and opcode + if opcode_byte < 0x80: + # Long form 2OP + opcode_num = opcode_byte & 0x1F + opclass = "2OP" + info = OP2_OPCODES.get(opcode_num) + elif opcode_byte < 0xC0: + # Short form (includes 0xB0-0xBF which are 0OP) + op_type = (opcode_byte >> 4) & 3 + if op_type == 3: + # 0OP + opcode_num = opcode_byte & 0x0F + opclass = "0OP" + info = OP0_OPCODES.get(opcode_num) + else: + # 1OP + opcode_num = opcode_byte & 0x0F + opclass = "1OP" + info = OP1_OPCODES.get(opcode_num) + elif opcode_byte < 0xE0: + # Variable form 2OP (0xC0-0xDF) + opcode_num = opcode_byte & 0x1F + opclass = "2OP" + info = OP2_OPCODES.get(opcode_num) + else: + # Variable form VAR (0xE0-0xFF) + opcode_num = opcode_byte & 0x1F + opclass = "VAR" + info = VAR_OPCODES.get(opcode_num) + + if info is None: + # Unknown opcode + return opclass, f"unknown_{opcode_num}", pc, [] + + # Track opcode + self.opcode_counts[(opclass, opcode_num)] += 1 + + if self.verbose: + print( + f" {addr:05x}: {opclass}:{opcode_num:02d} {info.name}", + file=sys.stderr, + ) + + # Parse operands + operands, operand_bytes = self.parse_operands(pc, opcode_byte) + pc += operand_bytes + + # Handle inline z-string + if info.inline_string: + string_bytes = self.parse_zstring(pc) + pc += string_bytes + + # Handle store byte + if info.stores: + pc += 1 + + # Handle branch + targets = [] + if info.branches: + branch_target, branch_bytes = self.parse_branch(pc) + pc += branch_bytes + if branch_target is not None: + targets.append(branch_target) + + # Handle call (follow routine + continue after call) + if info.is_call and operands: + packed_addr, is_const = operands[0] + if is_const and packed_addr != 0: + routine_addr = self.unpack_routine_addr(packed_addr) + if routine_addr not in self.visited_routines and routine_addr < len( + self.data + ): + self.visited_routines.add(routine_addr) + targets.append(routine_addr) + + # Handle jump (terminal but has target) + if opcode_num == 12 and opclass == "1OP" and operands: + offset, _ = operands[0] + if offset >= 0x8000: + offset -= 0x10000 + jump_target = pc + offset - 2 + targets.append(jump_target) + return opclass, info.name, pc, targets + + # Add fall-through if not terminal + if not info.terminal: + targets.append(pc) + + return opclass, info.name, pc, targets + + def disassemble_routine(self, addr: int): + """Disassemble a routine starting at addr.""" + if addr >= len(self.data): + return + + # Parse routine header + num_locals = self.read_byte(addr) + pc = addr + 1 + + # Skip local variable initial values (V3 only) + if self.version <= 3: + pc += num_locals * 2 + + # Add first instruction to worklist + self.worklist.append(pc) + + def scan_data_for_routines(self): + """Scan dynamic memory for packed addresses pointing to routines. + + Globals and object property tables live in dynamic memory and + contain packed routine addresses. This catches routines reachable + only through indirect calls (variable operands in CALL opcodes). + """ + high_mem = self.read_word(0x04) + found = 0 + + # Scan every word in dynamic memory (globals, property tables) + for offset in range(0, self.static_mem_base - 1, 2): + packed = self.read_word(offset) + if packed == 0: + continue + + addr = self.unpack_routine_addr(packed) + + # Must point into the code region + if addr < high_mem or addr >= len(self.data) - 1: + continue + + # Must not already be a known routine + if addr in self.visited_routines: + continue + + # Must look like a valid routine header (local count 0-15) + num_locals = self.read_byte(addr) + if num_locals > 15: + continue + + # First instruction must be within bounds + first_instr = addr + 1 + if self.version <= 3: + first_instr += num_locals * 2 + if first_instr >= len(self.data): + continue + + self.visited_routines.add(addr) + self.disassemble_routine(addr) + found += 1 + + return found + + def _process_worklist(self): + """Process the instruction worklist until empty.""" + while self.worklist: + addr = self.worklist.pop() + + if addr in self.visited_addrs or addr >= len(self.data): + continue + + opclass, name, next_addr, targets = self.decode_instruction(addr) + + for target in targets: + if target >= len(self.data) or target < 0: + continue + if target in self.visited_routines: + num_locals = self.read_byte(target) + first_instr = target + 1 + if self.version <= 3: + first_instr += num_locals * 2 + if first_instr not in self.visited_addrs: + self.worklist.append(first_instr) + else: + if target not in self.visited_addrs: + self.worklist.append(target) + + def disassemble_all(self): + """Perform complete recursive-descent disassembly.""" + # Entry point is a byte address of the first instruction (V1-5) + self.worklist.append(self.entry_point) + + # Phase 1: recursive descent from entry point + self._process_worklist() + self.reachable_routines = len(self.visited_routines) + + # Phase 2: scan dynamic memory for packed routine addresses + self.scanned_routines = self.scan_data_for_routines() + self._process_worklist() + + def generate_report(self) -> str: + """Generate analysis report.""" + lines = [] + lines.append("=" * 70) + lines.append("Z-MACHINE OPCODE TRACE REPORT") + lines.append("=" * 70) + lines.append("") + lines.append(f"Story file: {self.story_path}") + lines.append(f"Version: {self.version}") + lines.append(f"Entry point: ${self.entry_point:04x}") + lines.append(f"Story size: {len(self.data)} bytes") + lines.append("") + lines.append("DISASSEMBLY STATISTICS") + lines.append("-" * 70) + lines.append( + f"Routines found (entry-point reachable): {self.reachable_routines}" + ) + lines.append(f"Routines found (data scan): {self.scanned_routines}") + lines.append(f"Total routines: {len(self.visited_routines)}") + lines.append(f"Instructions decoded: {self.instruction_count}") + lines.append(f"Unique opcodes: {len(self.opcode_counts)}") + lines.append("") + + # Opcodes by class + lines.append("OPCODES FOUND IN STORY") + lines.append("-" * 70) + + for opclass_name, opcode_dict in [ + ("2OP", OP2_OPCODES), + ("1OP", OP1_OPCODES), + ("0OP", OP0_OPCODES), + ("VAR", VAR_OPCODES), + ]: + lines.append(f"\n{opclass_name} opcodes:") + found = [ + (num, opcode_dict[num].name, self.opcode_counts[(opclass_name, num)]) + for num in sorted(opcode_dict.keys()) + if (opclass_name, num) in self.opcode_counts + ] + if found: + for num, name, count in found: + lines.append(f" {num:2d} {name:20s} (used {count} times)") + else: + lines.append(" (none)") + + lines.append("") + lines.append("GAP ANALYSIS: ZVM IMPLEMENTATION STATUS") + lines.append("-" * 70) + + implemented = [] + missing = [] + + for (opclass, opcode_num), count in sorted(self.opcode_counts.items()): + if opclass == "2OP": + name = OP2_OPCODES[opcode_num].name + elif opclass == "1OP": + name = OP1_OPCODES[opcode_num].name + elif opclass == "0OP": + name = OP0_OPCODES[opcode_num].name + elif opclass == "VAR": + name = VAR_OPCODES[opcode_num].name + else: + name = "unknown" + + key = (opclass, opcode_num) + if key in ZVM_IMPLEMENTED: + implemented.append((opclass, opcode_num, name, count)) + else: + missing.append((opclass, opcode_num, name, count)) + + lines.append("\nImplemented in zvm:") + for opclass, num, name, count in implemented: + lines.append(f" {opclass}:{num:2d} {name:20s} (used {count} times)") + + lines.append("\nMissing from zvm (need porting):") + for opclass, num, name, count in missing: + lines.append(f" {opclass}:{num:2d} {name:20s} (used {count} times)") + + lines.append("") + lines.append("SUMMARY") + lines.append("-" * 70) + total = len(self.opcode_counts) + impl_count = len(implemented) + missing_count = len(missing) + lines.append( + f"{total} unique opcodes found in story, " + f"{impl_count} already in zvm, {missing_count} need porting" + ) + lines.append("") + + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description="Trace z-machine opcodes in a story file" + ) + parser.add_argument( + "story", + nargs="?", + default="content/stories/zork1.z3", + help="Path to z-machine story file (default: content/stories/zork1.z3)", + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Print each opcode as it's found", + ) + + args = parser.parse_args() + story_path = Path(args.story) + + if not story_path.exists(): + print(f"Error: Story file not found: {story_path}", file=sys.stderr) + sys.exit(1) + + if args.verbose: + print(f"Disassembling {story_path}...", file=sys.stderr) + + zm = ZMachine(story_path, verbose=args.verbose) + zm.disassemble_all() + + print(zm.generate_report()) + + +if __name__ == "__main__": + main()