621 lines
20 KiB
Python
621 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
"""Z-Machine bytecode disassembler and opcode tracer.
|
|
|
|
Performs recursive-descent disassembly of a z-machine story file,
|
|
following all reachable code paths to catalog every opcode used.
|
|
Cross-references findings against zvm implementation status.
|
|
"""
|
|
|
|
import argparse
|
|
import struct
|
|
import sys
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
|
|
@dataclass
class OpcodeInfo:
    """Static decoding metadata for a single opcode.

    The disassembler consults these flags to work out how many bytes
    follow the operands and where control flow can go next.
    """

    # Mnemonic used in verbose traces and in the final report.
    name: str
    # A one-byte store target follows the operands.
    stores: bool = False
    # Branch data (one or two bytes) follows the operands / store byte.
    branches: bool = False
    # Execution never falls through to the next instruction.
    terminal: bool = False
    # A z-string is embedded directly after the opcode.
    inline_string: bool = False
    # The first operand is a packed routine address to follow.
    is_call: bool = False
|
|
|
|
|
|
# V3 opcode definitions, one table per opcode class.
# Two-operand (2OP) opcodes, keyed by opcode number.
OP2_OPCODES = {
    number: OpcodeInfo(mnemonic, **flags)
    for number, mnemonic, flags in (
        (1, "je", {"branches": True}),
        (2, "jl", {"branches": True}),
        (3, "jg", {"branches": True}),
        (4, "dec_chk", {"branches": True}),
        (5, "inc_chk", {"branches": True}),
        (6, "jin", {"branches": True}),
        (7, "test", {"branches": True}),
        (8, "or", {"stores": True}),
        (9, "and", {"stores": True}),
        (10, "test_attr", {"branches": True}),
        (11, "set_attr", {}),
        (12, "clear_attr", {}),
        (13, "store", {}),
        (14, "insert_obj", {}),
        (15, "loadw", {"stores": True}),
        (16, "loadb", {"stores": True}),
        (17, "get_prop", {"stores": True}),
        (18, "get_prop_addr", {"stores": True}),
        (19, "get_next_prop", {"stores": True}),
        (20, "add", {"stores": True}),
        (21, "sub", {"stores": True}),
        (22, "mul", {"stores": True}),
        (23, "div", {"stores": True}),
        (24, "mod", {"stores": True}),
    )
}
|
|
|
|
# One-operand (1OP) opcodes, keyed by opcode number.
OP1_OPCODES = {
    number: OpcodeInfo(mnemonic, **flags)
    for number, mnemonic, flags in (
        (0, "jz", {"branches": True}),
        (1, "get_sibling", {"stores": True, "branches": True}),
        (2, "get_child", {"stores": True, "branches": True}),
        (3, "get_parent", {"stores": True}),
        (4, "get_prop_len", {"stores": True}),
        (5, "inc", {}),
        (6, "dec", {}),
        (7, "print_addr", {}),
        (8, "call_1s", {"stores": True, "is_call": True}),
        (9, "remove_obj", {}),
        (10, "print_obj", {}),
        (11, "ret", {"terminal": True}),
        (12, "jump", {"terminal": True}),
        (13, "print_paddr", {}),
        (14, "load", {"stores": True}),
        (15, "not", {"stores": True}),
    )
}
|
|
|
|
# Zero-operand (0OP) opcodes, keyed by opcode number.
OP0_OPCODES = {
    number: OpcodeInfo(mnemonic, **flags)
    for number, mnemonic, flags in (
        (0, "rtrue", {"terminal": True}),
        (1, "rfalse", {"terminal": True}),
        (2, "print", {"inline_string": True}),
        (3, "print_ret", {"inline_string": True, "terminal": True}),
        (4, "nop", {}),
        (5, "save", {"branches": True}),
        (6, "restore", {"branches": True}),
        (7, "restart", {"terminal": True}),
        (8, "ret_popped", {"terminal": True}),
        (9, "pop", {}),
        (10, "quit", {"terminal": True}),
        (11, "new_line", {}),
        (12, "show_status", {}),
        (13, "verify", {"branches": True}),
    )
}
|
|
|
|
# Variable-operand (VAR) opcodes, keyed by opcode number.
VAR_OPCODES = {
    number: OpcodeInfo(mnemonic, **flags)
    for number, mnemonic, flags in (
        (0, "call_vs", {"stores": True, "is_call": True}),
        (1, "storew", {}),
        (2, "storeb", {}),
        (3, "put_prop", {}),
        (4, "sread", {}),
        (5, "print_char", {}),
        (6, "print_num", {}),
        (7, "random", {"stores": True}),
        (8, "push", {}),
        (9, "pull", {}),
        (10, "split_window", {}),
        (11, "set_window", {}),
        (19, "output_stream", {}),
        (20, "input_stream", {}),
        (21, "sound_effect", {}),
        (22, "read_char", {"stores": True}),
    )
}
|
|
|
|
# Opcodes that zvm implements with real logic (stubs are deliberately left
# out), stored as (opcode class, opcode number) pairs.
ZVM_IMPLEMENTED = {
    (opclass, number)
    for opclass, numbers in (
        ("2OP", (1, 2, 4, 5, 8, 9, 13, 14, 15, 16, 17, 20, 21, 22, 23)),
        ("1OP", (0, 2, 3, 5, 8, 12, 13)),
        ("0OP", (0, 1, 2, 3)),
        ("VAR", (0, 1, 3, 5, 7, 8, 10, 11, 19, 22)),
    )
    for number in numbers
}
|
|
|
|
|
|
class ZMachine:
|
|
"""Z-Machine story file reader and disassembler."""
|
|
|
|
def __init__(self, story_path: Path, verbose: bool = False):
|
|
self.story_path = story_path
|
|
self.verbose = verbose
|
|
self.data = story_path.read_bytes()
|
|
self.version = self.data[0]
|
|
self.entry_point = self.read_word(0x06)
|
|
self.static_mem_base = self.read_word(0x0E)
|
|
|
|
# Disassembly state
|
|
self.visited_addrs = set()
|
|
self.visited_routines = set()
|
|
self.worklist = []
|
|
self.opcode_counts = defaultdict(int)
|
|
self.instruction_count = 0
|
|
self.routines_from_entrypoint = 0
|
|
self.routines_from_scan = 0
|
|
|
|
def read_byte(self, addr: int) -> int:
|
|
"""Read a single byte."""
|
|
return self.data[addr]
|
|
|
|
def read_word(self, addr: int) -> int:
|
|
"""Read a 16-bit big-endian word."""
|
|
return struct.unpack_from(">H", self.data, addr)[0]
|
|
|
|
def read_signed_word(self, addr: int) -> int:
|
|
"""Read a 16-bit signed big-endian word."""
|
|
val = self.read_word(addr)
|
|
return val if val < 0x8000 else val - 0x10000
|
|
|
|
def unpack_routine_addr(self, packed: int) -> int:
|
|
"""Convert packed routine address to byte address."""
|
|
if self.version <= 3:
|
|
return packed * 2
|
|
elif self.version <= 5:
|
|
return packed * 4
|
|
else:
|
|
return packed * 8
|
|
|
|
def parse_operands(
|
|
self, pc: int, opcode_byte: int
|
|
) -> tuple[list[tuple[int, bool]], int]:
|
|
"""Parse operands and return (operands, bytes_consumed).
|
|
|
|
Each operand is (value, is_constant) tuple.
|
|
"""
|
|
operands = []
|
|
pos = pc
|
|
|
|
if opcode_byte < 0x80:
|
|
# Long form 2OP
|
|
op1_type = (opcode_byte >> 6) & 1
|
|
op2_type = (opcode_byte >> 5) & 1
|
|
|
|
if op1_type == 0: # small constant
|
|
operands.append((self.read_byte(pos), True))
|
|
pos += 1
|
|
else: # variable
|
|
operands.append((self.read_byte(pos), False))
|
|
pos += 1
|
|
|
|
if op2_type == 0: # small constant
|
|
operands.append((self.read_byte(pos), True))
|
|
pos += 1
|
|
else: # variable
|
|
operands.append((self.read_byte(pos), False))
|
|
pos += 1
|
|
|
|
elif opcode_byte < 0xB0:
|
|
# Short form 1OP or 0OP
|
|
op_type = (opcode_byte >> 4) & 3
|
|
|
|
if op_type == 0: # large constant
|
|
operands.append((self.read_word(pos), True))
|
|
pos += 2
|
|
elif op_type == 1: # small constant
|
|
operands.append((self.read_byte(pos), True))
|
|
pos += 1
|
|
elif op_type == 2: # variable
|
|
operands.append((self.read_byte(pos), False))
|
|
pos += 1
|
|
# op_type == 3: 0OP, no operands
|
|
|
|
else:
|
|
# Variable form
|
|
types_byte = self.read_byte(pos)
|
|
pos += 1
|
|
|
|
for i in range(4):
|
|
op_type = (types_byte >> (6 - i * 2)) & 3
|
|
if op_type == 3: # omitted
|
|
break
|
|
elif op_type == 0: # large constant
|
|
operands.append((self.read_word(pos), True))
|
|
pos += 2
|
|
elif op_type == 1: # small constant
|
|
operands.append((self.read_byte(pos), True))
|
|
pos += 1
|
|
elif op_type == 2: # variable
|
|
operands.append((self.read_byte(pos), False))
|
|
pos += 1
|
|
|
|
return operands, pos - pc
|
|
|
|
def parse_zstring(self, addr: int) -> int:
|
|
"""Parse z-string and return length in bytes."""
|
|
pos = addr
|
|
while True:
|
|
word = self.read_word(pos)
|
|
pos += 2
|
|
if word & 0x8000:
|
|
break
|
|
return pos - addr
|
|
|
|
def parse_branch(self, pc: int) -> tuple[int | None, int]:
|
|
"""Parse branch data and return (target_addr, bytes_consumed)."""
|
|
branch_byte = self.read_byte(pc)
|
|
short_form = (branch_byte & 0x40) != 0
|
|
|
|
if short_form:
|
|
offset = branch_byte & 0x3F
|
|
bytes_consumed = 1
|
|
else:
|
|
offset = ((branch_byte & 0x3F) << 8) | self.read_byte(pc + 1)
|
|
if offset >= 0x2000:
|
|
offset -= 0x4000
|
|
bytes_consumed = 2
|
|
|
|
# Compute target
|
|
if offset == 0 or offset == 1:
|
|
# Return true/false - terminal for this path
|
|
return None, bytes_consumed
|
|
else:
|
|
target = pc + bytes_consumed + offset - 2
|
|
return target, bytes_consumed
|
|
|
|
def decode_instruction(self, addr: int) -> tuple[str, str, int, list[int]]:
|
|
"""Decode instruction at addr.
|
|
|
|
Returns (opclass, name, next_addr, targets).
|
|
targets is a list of addresses to visit next.
|
|
"""
|
|
if addr in self.visited_addrs:
|
|
return "", "", addr, []
|
|
|
|
self.visited_addrs.add(addr)
|
|
self.instruction_count += 1
|
|
|
|
opcode_byte = self.read_byte(addr)
|
|
pc = addr + 1
|
|
|
|
# Determine form and opcode
|
|
if opcode_byte < 0x80:
|
|
# Long form 2OP
|
|
opcode_num = opcode_byte & 0x1F
|
|
opclass = "2OP"
|
|
info = OP2_OPCODES.get(opcode_num)
|
|
elif opcode_byte < 0xC0:
|
|
# Short form (includes 0xB0-0xBF which are 0OP)
|
|
op_type = (opcode_byte >> 4) & 3
|
|
if op_type == 3:
|
|
# 0OP
|
|
opcode_num = opcode_byte & 0x0F
|
|
opclass = "0OP"
|
|
info = OP0_OPCODES.get(opcode_num)
|
|
else:
|
|
# 1OP
|
|
opcode_num = opcode_byte & 0x0F
|
|
opclass = "1OP"
|
|
info = OP1_OPCODES.get(opcode_num)
|
|
elif opcode_byte < 0xE0:
|
|
# Variable form 2OP (0xC0-0xDF)
|
|
opcode_num = opcode_byte & 0x1F
|
|
opclass = "2OP"
|
|
info = OP2_OPCODES.get(opcode_num)
|
|
else:
|
|
# Variable form VAR (0xE0-0xFF)
|
|
opcode_num = opcode_byte & 0x1F
|
|
opclass = "VAR"
|
|
info = VAR_OPCODES.get(opcode_num)
|
|
|
|
if info is None:
|
|
# Unknown opcode
|
|
return opclass, f"unknown_{opcode_num}", pc, []
|
|
|
|
# Track opcode
|
|
self.opcode_counts[(opclass, opcode_num)] += 1
|
|
|
|
if self.verbose:
|
|
print(
|
|
f" {addr:05x}: {opclass}:{opcode_num:02d} {info.name}",
|
|
file=sys.stderr,
|
|
)
|
|
|
|
# Parse operands
|
|
operands, operand_bytes = self.parse_operands(pc, opcode_byte)
|
|
pc += operand_bytes
|
|
|
|
# Handle inline z-string
|
|
if info.inline_string:
|
|
string_bytes = self.parse_zstring(pc)
|
|
pc += string_bytes
|
|
|
|
# Handle store byte
|
|
if info.stores:
|
|
pc += 1
|
|
|
|
# Handle branch
|
|
targets = []
|
|
if info.branches:
|
|
branch_target, branch_bytes = self.parse_branch(pc)
|
|
pc += branch_bytes
|
|
if branch_target is not None:
|
|
targets.append(branch_target)
|
|
|
|
# Handle call (follow routine + continue after call)
|
|
if info.is_call and operands:
|
|
packed_addr, is_const = operands[0]
|
|
if is_const and packed_addr != 0:
|
|
routine_addr = self.unpack_routine_addr(packed_addr)
|
|
if routine_addr not in self.visited_routines and routine_addr < len(
|
|
self.data
|
|
):
|
|
self.visited_routines.add(routine_addr)
|
|
targets.append(routine_addr)
|
|
|
|
# Handle jump (terminal but has target)
|
|
if opcode_num == 12 and opclass == "1OP" and operands:
|
|
offset, _ = operands[0]
|
|
if offset >= 0x8000:
|
|
offset -= 0x10000
|
|
jump_target = pc + offset - 2
|
|
targets.append(jump_target)
|
|
return opclass, info.name, pc, targets
|
|
|
|
# Add fall-through if not terminal
|
|
if not info.terminal:
|
|
targets.append(pc)
|
|
|
|
return opclass, info.name, pc, targets
|
|
|
|
def disassemble_routine(self, addr: int):
|
|
"""Disassemble a routine starting at addr."""
|
|
if addr >= len(self.data):
|
|
return
|
|
|
|
# Parse routine header
|
|
num_locals = self.read_byte(addr)
|
|
pc = addr + 1
|
|
|
|
# Skip local variable initial values (V3 only)
|
|
if self.version <= 3:
|
|
pc += num_locals * 2
|
|
|
|
# Add first instruction to worklist
|
|
self.worklist.append(pc)
|
|
|
|
def scan_data_for_routines(self):
|
|
"""Scan dynamic memory for packed addresses pointing to routines.
|
|
|
|
Globals and object property tables live in dynamic memory and
|
|
contain packed routine addresses. This catches routines reachable
|
|
only through indirect calls (variable operands in CALL opcodes).
|
|
"""
|
|
high_mem = self.read_word(0x04)
|
|
found = 0
|
|
|
|
# Scan every word in dynamic memory (globals, property tables)
|
|
for offset in range(0, self.static_mem_base - 1, 2):
|
|
packed = self.read_word(offset)
|
|
if packed == 0:
|
|
continue
|
|
|
|
addr = self.unpack_routine_addr(packed)
|
|
|
|
# Must point into the code region
|
|
if addr < high_mem or addr >= len(self.data) - 1:
|
|
continue
|
|
|
|
# Must not already be a known routine
|
|
if addr in self.visited_routines:
|
|
continue
|
|
|
|
# Must look like a valid routine header (local count 0-15)
|
|
num_locals = self.read_byte(addr)
|
|
if num_locals > 15:
|
|
continue
|
|
|
|
# First instruction must be within bounds
|
|
first_instr = addr + 1
|
|
if self.version <= 3:
|
|
first_instr += num_locals * 2
|
|
if first_instr >= len(self.data):
|
|
continue
|
|
|
|
self.visited_routines.add(addr)
|
|
self.disassemble_routine(addr)
|
|
found += 1
|
|
|
|
return found
|
|
|
|
def _process_worklist(self):
|
|
"""Process the instruction worklist until empty."""
|
|
while self.worklist:
|
|
addr = self.worklist.pop()
|
|
|
|
if addr in self.visited_addrs or addr >= len(self.data):
|
|
continue
|
|
|
|
opclass, name, next_addr, targets = self.decode_instruction(addr)
|
|
|
|
for target in targets:
|
|
if target >= len(self.data) or target < 0:
|
|
continue
|
|
if target in self.visited_routines:
|
|
num_locals = self.read_byte(target)
|
|
first_instr = target + 1
|
|
if self.version <= 3:
|
|
first_instr += num_locals * 2
|
|
if first_instr not in self.visited_addrs:
|
|
self.worklist.append(first_instr)
|
|
else:
|
|
if target not in self.visited_addrs:
|
|
self.worklist.append(target)
|
|
|
|
def disassemble_all(self):
|
|
"""Perform complete recursive-descent disassembly."""
|
|
# Entry point is a byte address of the first instruction (V1-5)
|
|
self.worklist.append(self.entry_point)
|
|
|
|
# Phase 1: recursive descent from entry point
|
|
self._process_worklist()
|
|
self.reachable_routines = len(self.visited_routines)
|
|
|
|
# Phase 2: scan dynamic memory for packed routine addresses
|
|
self.scanned_routines = self.scan_data_for_routines()
|
|
self._process_worklist()
|
|
|
|
def generate_report(self) -> str:
|
|
"""Generate analysis report."""
|
|
lines = []
|
|
lines.append("=" * 70)
|
|
lines.append("Z-MACHINE OPCODE TRACE REPORT")
|
|
lines.append("=" * 70)
|
|
lines.append("")
|
|
lines.append(f"Story file: {self.story_path}")
|
|
lines.append(f"Version: {self.version}")
|
|
lines.append(f"Entry point: ${self.entry_point:04x}")
|
|
lines.append(f"Story size: {len(self.data)} bytes")
|
|
lines.append("")
|
|
lines.append("DISASSEMBLY STATISTICS")
|
|
lines.append("-" * 70)
|
|
lines.append(
|
|
f"Routines found (entry-point reachable): {self.reachable_routines}"
|
|
)
|
|
lines.append(f"Routines found (data scan): {self.scanned_routines}")
|
|
lines.append(f"Total routines: {len(self.visited_routines)}")
|
|
lines.append(f"Instructions decoded: {self.instruction_count}")
|
|
lines.append(f"Unique opcodes: {len(self.opcode_counts)}")
|
|
lines.append("")
|
|
|
|
# Opcodes by class
|
|
lines.append("OPCODES FOUND IN STORY")
|
|
lines.append("-" * 70)
|
|
|
|
for opclass_name, opcode_dict in [
|
|
("2OP", OP2_OPCODES),
|
|
("1OP", OP1_OPCODES),
|
|
("0OP", OP0_OPCODES),
|
|
("VAR", VAR_OPCODES),
|
|
]:
|
|
lines.append(f"\n{opclass_name} opcodes:")
|
|
found = [
|
|
(num, opcode_dict[num].name, self.opcode_counts[(opclass_name, num)])
|
|
for num in sorted(opcode_dict.keys())
|
|
if (opclass_name, num) in self.opcode_counts
|
|
]
|
|
if found:
|
|
for num, name, count in found:
|
|
lines.append(f" {num:2d} {name:20s} (used {count} times)")
|
|
else:
|
|
lines.append(" (none)")
|
|
|
|
lines.append("")
|
|
lines.append("GAP ANALYSIS: ZVM IMPLEMENTATION STATUS")
|
|
lines.append("-" * 70)
|
|
|
|
implemented = []
|
|
missing = []
|
|
|
|
for (opclass, opcode_num), count in sorted(self.opcode_counts.items()):
|
|
if opclass == "2OP":
|
|
name = OP2_OPCODES[opcode_num].name
|
|
elif opclass == "1OP":
|
|
name = OP1_OPCODES[opcode_num].name
|
|
elif opclass == "0OP":
|
|
name = OP0_OPCODES[opcode_num].name
|
|
elif opclass == "VAR":
|
|
name = VAR_OPCODES[opcode_num].name
|
|
else:
|
|
name = "unknown"
|
|
|
|
key = (opclass, opcode_num)
|
|
if key in ZVM_IMPLEMENTED:
|
|
implemented.append((opclass, opcode_num, name, count))
|
|
else:
|
|
missing.append((opclass, opcode_num, name, count))
|
|
|
|
lines.append("\nImplemented in zvm:")
|
|
for opclass, num, name, count in implemented:
|
|
lines.append(f" {opclass}:{num:2d} {name:20s} (used {count} times)")
|
|
|
|
lines.append("\nMissing from zvm (need porting):")
|
|
for opclass, num, name, count in missing:
|
|
lines.append(f" {opclass}:{num:2d} {name:20s} (used {count} times)")
|
|
|
|
lines.append("")
|
|
lines.append("SUMMARY")
|
|
lines.append("-" * 70)
|
|
total = len(self.opcode_counts)
|
|
impl_count = len(implemented)
|
|
missing_count = len(missing)
|
|
lines.append(
|
|
f"{total} unique opcodes found in story, "
|
|
f"{impl_count} already in zvm, {missing_count} need porting"
|
|
)
|
|
lines.append("")
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def main():
    """Command-line entry point: disassemble a story and print the report.

    Exits with status 1 when the story path does not exist.  With
    ``--verbose`` a progress line and every decoded opcode go to stderr;
    the report itself is printed to stdout.
    """
    cli = argparse.ArgumentParser(
        description="Trace z-machine opcodes in a story file"
    )
    cli.add_argument(
        "story",
        nargs="?",
        default="content/stories/zork1.z3",
        help="Path to z-machine story file (default: content/stories/zork1.z3)",
    )
    cli.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print each opcode as it's found",
    )
    options = cli.parse_args()

    story_path = Path(options.story)
    if not story_path.exists():
        print(f"Error: Story file not found: {story_path}", file=sys.stderr)
        sys.exit(1)

    if options.verbose:
        print(f"Disassembling {story_path}...", file=sys.stderr)

    machine = ZMachine(story_path, verbose=options.verbose)
    machine.disassemble_all()
    report = machine.generate_report()
    print(report)


if __name__ == "__main__":
    main()
|