mud/src/mudlib/zmachine/zstring.py

#
# A ZString-to-Unicode Universal Translator.
#
# For the license of this file, please consult the LICENSE file in the
# root directory of this distribution.
#

import itertools

from .zlogging import log


class ZStringEndOfString(Exception):
    """Signal that the last Z-character of a Z-string has been consumed."""


class ZStringIllegalAbbrevInString(Exception):
    """Signal an abbreviation reference found while decoding a string
    for which abbreviations are not permitted (for instance, inside
    the text of another abbreviation)."""


class ZStringTranslator:
    """Unpack a Z-string stored in memory into its 5-bit Z-characters.

    A Z-string is read as a run of 16-bit words. Each word holds three
    5-bit characters, and bit 15 of a word flags it as the last word of
    the string. While decoding, the read position is carried around as
    a ``(word address, BitField of that word, character index 0-2)``
    tuple.
    """

    def __init__(self, zmem):
        """Keep a reference to the memory object strings are read from."""
        self._mem = zmem

    def get(self, addr):
        """Return the list of Z-characters of the string starting at
        the given address."""
        from .bitfield import BitField

        cursor = (addr, BitField(self._mem.read_word(addr)), 0)
        zchars = []
        while True:
            zchars.append(self._read_char(cursor))
            try:
                cursor = self._next_pos(cursor)
            except ZStringEndOfString:
                # The final word has been fully consumed.
                return zchars

    def _read_char(self, pos):
        """Return the 5-bit character the given position points at."""
        _, word, index = pos
        # Character 0 occupies the highest bits (10-14), character 2
        # the lowest (0-4).
        low_bit = 5 * (2 - index)
        return word[low_bit : low_bit + 5]

    def _is_final(self, pos):
        """Tell whether the word at the given position has its
        end-of-string bit (bit 15) set."""
        word = pos[1]
        return word[15] == 1

    def _next_pos(self, pos):
        """Return the position following the given one, or raise
        ZStringEndOfString when the given position was the last
        character of the final word."""
        from .bitfield import BitField

        addr, word, index = pos
        following = index + 1

        # Still inside the current word: only the character index moves.
        if following != 3:
            return (addr, word, following)

        # The current word is exhausted; stop if it was the last one.
        if self._is_final(pos):
            raise ZStringEndOfString

        # Otherwise carry on with the first character of the next word.
        next_addr = addr + 2
        return (next_addr, BitField(self._mem.read_word(next_addr)), 0)


class ZCharTranslator:
    """Translate sequences of 5-bit Z-characters into ZSCII codes.

    The translator keeps track of the current alphabet (A0, A1 or A2),
    of the version-specific meaning of the special codes 1-5 (newline,
    alphabet shifts, abbreviations) and of the A2/6 escape, which
    introduces a 10-bit ZSCII code carried by the next two
    Z-characters.
    """

    # The default alphabet tables for ZChar translation.
    # As the codes 0-5 are special, alphabets start with code 0x6.
    DEFAULT_A0 = [ord(x) for x in "abcdefghijklmnopqrstuvwxyz"]
    DEFAULT_A1 = [ord(x) for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
    # A2 also has 0x6 as special char, so they start at 0x7.
    DEFAULT_A2 = [ord(x) for x in "\n0123456789.,!?_#'\"/\\-:()"]
    DEFAULT_A2_V5 = [ord(x) for x in "\n0123456789.,!?_#'\"/\\-:()"]

    ALPHA = (DEFAULT_A0, DEFAULT_A1, DEFAULT_A2)
    ALPHA_V5 = (DEFAULT_A0, DEFAULT_A1, DEFAULT_A2_V5)

    def __init__(self, zmem):
        """Set up the alphabets, the special code handlers and the
        abbreviation tables for the story held in the given memory."""
        self._mem = zmem

        # Initialize the alphabets
        if self._mem.version == 5:
            self._alphabet = self._load_custom_alphabet() or self.ALPHA_V5
        else:
            self._alphabet = self.ALPHA

        # Initialize the special state handlers. This must happen
        # before the abbreviations are loaded, since loading them
        # decodes strings through get().
        self._load_specials()

        # Initialize the abbreviations (if supported)
        self._load_abbrev_tables()

    def _load_custom_alphabet(self):
        """Check for the existence of a custom alphabet, and load it
        if it does exist. Return the custom alphabet if it was found,
        None otherwise.

        The returned alphabet has the same layout as the default one:
        A0 and A1 hold 26 codes starting at Z-character 6, while A2
        holds 25 codes starting at Z-character 7.
        """
        # The custom alphabet table address is the word at 0x34 in the
        # header. The whole word has to be tested: a table stored below
        # address 0x100 has a zero high byte.
        alph_addr = self._mem.read_word(0x34)
        if alph_addr == 0:
            return None

        # The table is 3 rows of 26 ZSCII codes, for Z-characters 6-31.
        table = list(self._mem[alph_addr : alph_addr + 78])

        # get() indexes A2 from Z-character 7, because A2/6 is the
        # 10-bit escape: drop the table's A2/6 slot so that the row
        # lines up. A2/7 remains a newline whatever the table says, so
        # the newline code of the default alphabet is used for it.
        a2 = [self.DEFAULT_A2_V5[0]] + table[54:78]
        return [table[0:26], table[26:52], a2]

    def _load_abbrev_tables(self):
        """Decode every abbreviation of the story into self._abbrevs,
        keyed by (subtable number, index within the subtable)."""
        self._abbrevs = {}

        # If the ZM doesn't do abbrevs, just return an empty dict.
        if self._mem.version == 1:
            return

        # Build ourselves a ZStringTranslator for the abbrevs.
        xlator = ZStringTranslator(self._mem)

        def _load_subtable(num, base):
            # Each subtable is 32 word-sized entries (64 bytes), each
            # entry being the word address of the abbreviation text.
            for i in range(32):
                zaddr = self._mem.read_word(base + (num * 64) + (i * 2))
                zstr = xlator.get(self._mem.word_address(zaddr))
                zchr = self.get(zstr, allow_abbreviations=False)
                self._abbrevs[(num, i)] = zchr

        abbrev_base = self._mem.read_word(0x18)
        _load_subtable(0, abbrev_base)

        # Does this ZM support the extended abbrev tables?
        if self._mem.version >= 3:
            _load_subtable(1, abbrev_base)
            _load_subtable(2, abbrev_base)

    def _load_specials(self):
        """Load the special character code handlers for the current
        machine version.
        """

        # The following three functions define the three possible
        # special character code handlers.
        def newline(state):
            """Append ZSCII 13 (newline) to the output."""
            state["zscii"].append(13)

        def shift_alphabet(state, direction, lock):
            """Shift the current alphabet up or down. If lock is
            False, the alphabet will revert to the previous alphabet
            after outputting 1 character. Else, the alphabet will
            remain unchanged until the next shift.
            """
            state["curr_alpha"] = (state["curr_alpha"] + direction) % 3
            if lock:
                state["prev_alpha"] = state["curr_alpha"]

        def abbreviation(state, abbrev):
            """Insert the given abbreviation from the given table into
            the output stream.

            This character was an abbreviation table number. The next
            character will be the offset within that table of the
            abbreviation. Set up a state handler to intercept the next
            character and output the right abbreviation."""

            def write_abbreviation(state, c, subtable):
                state["zscii"] += self._abbrevs[(subtable, c)]
                del state["state_handler"]

            # If we're parsing an abbreviation, there should be no
            # nested abbreviations. So this is just a sanity check for
            # people feeding us bad stories.
            if not state["allow_abbreviations"]:
                raise ZStringIllegalAbbrevInString

            state["state_handler"] = lambda s, c: write_abbreviation(s, c, abbrev)

        # Register the specials handlers depending on machine version.
        if self._mem.version == 1:
            self._specials = {
                1: newline,
                2: lambda s: shift_alphabet(s, +1, False),
                3: lambda s: shift_alphabet(s, -1, False),
                4: lambda s: shift_alphabet(s, +1, True),
                5: lambda s: shift_alphabet(s, -1, True),
            }
        elif self._mem.version == 2:
            self._specials = {
                1: lambda s: abbreviation(s, 0),
                2: lambda s: shift_alphabet(s, +1, False),
                3: lambda s: shift_alphabet(s, -1, False),
                4: lambda s: shift_alphabet(s, +1, True),
                5: lambda s: shift_alphabet(s, -1, True),
            }
        else:  # ZM v3-5
            self._specials = {
                1: lambda s: abbreviation(s, 0),
                2: lambda s: abbreviation(s, 1),
                3: lambda s: abbreviation(s, 2),
                4: lambda s: shift_alphabet(s, +1, False),
                5: lambda s: shift_alphabet(s, -1, False),
            }

    def _special_zscii(self, state, char):
        """State handler for the A2/6 escape: combine the next two
        Z-characters into one 10-bit ZSCII code (high 5 bits first)
        and append it to the output."""
        if "zscii_char" not in state:
            # First call: remember the high half and wait for the low.
            state["zscii_char"] = char
        else:
            zchar = (state["zscii_char"] << 5) + char
            state["zscii"].append(zchar)
            del state["zscii_char"]
            del state["state_handler"]

    def get(self, zstr, allow_abbreviations=True):
        """Translate the given iterable of Z-characters and return the
        resulting list of ZSCII codes.

        If allow_abbreviations is False, ZStringIllegalAbbrevInString
        is raised when the string refers to an abbreviation.
        """
        state = {
            "curr_alpha": 0,
            "prev_alpha": 0,
            "zscii": [],
            "allow_abbreviations": allow_abbreviations,
        }

        for c in zstr:
            if "state_handler" in state:
                # If a special handler has registered itself, then hand
                # processing over to it.
                state["state_handler"](state, c)  # type: ignore[call-non-callable]
            elif c in self._specials:
                # Hand off per-ZM version special char handling.
                self._specials[c](state)
            elif state["curr_alpha"] == 2 and c == 6:
                # Handle the strange A2/6 character
                state["state_handler"] = self._special_zscii
                # The escape is the one character a temporary shift
                # into A2 applies to, so the alphabet reverts here just
                # as it does after an ordinary character. Without this,
                # whatever follows the escape would still be decoded
                # from A2.
                state["curr_alpha"] = state["prev_alpha"]
            else:
                # Do the usual Thing: append a zscii code to the
                # decoded sequence and revert to the "previous"
                # alphabet (or not, if it hasn't recently changed or
                # was locked)
                if c == 0:
                    # Append a space.
                    z = 32
                elif state["curr_alpha"] == 2:
                    # The symbol alphabet table only has 25 chars
                    # because of the A2/6 special char, so we need to
                    # adjust differently.
                    z = self._alphabet[state["curr_alpha"]][c - 7]
                else:
                    z = self._alphabet[state["curr_alpha"]][c - 6]
                state["zscii"].append(z)
                state["curr_alpha"] = state["prev_alpha"]

        return state["zscii"]


class ZsciiTranslator:
    """Translate between ZSCII codes and Unicode characters.

    Output translation (ZSCII to Unicode) goes through ztou() and
    get(); input translation (Unicode, or one of the special input
    codes listed in Input, to ZSCII) goes through utoz().
    """

    # The default Unicode Translation Table. Its entries are the
    # unicode codepoints of the "extra characters", assigned to
    # consecutive ZSCII codes starting at UTT_OFFSET.
    DEFAULT_UTT = [
        chr(x)
        for x in (
            0xE4,
            0xF6,
            0xFC,
            0xC4,
            0xD6,
            0xDC,
            0xDF,
            0xBB,
            0xAB,
            0xEB,
            0xEF,
            0xFF,
            0xCB,
            0xCF,
            0xE1,
            0xE9,
            0xED,
            0xF3,
            0xFA,
            0xFD,
            0xC1,
            0xC9,
            0xCD,
            0xD3,
            0xDA,
            0xDD,
            0xE0,
            0xE8,
            0xEC,
            0xF2,
            0xF9,
            0xC0,
            0xC8,
            0xCC,
            0xD2,
            0xD9,
            0xE2,
            0xEA,
            0xEE,
            0xF4,
            0xFB,
            0xC2,
            0xCA,
            0xCE,
            0xD4,
            0xDB,
            0xE5,
            0xC5,
            0xF8,
            0xD8,
            0xE3,
            0xF1,
            0xF5,
            0xC3,
            0xD1,
            0xD5,
            0xE6,
            0xC6,
            0xE7,
            0xC7,
            0xFE,
            0xF0,
            0xDE,
            0xD0,
            0xA3,
            0x153,
            0x152,
            0xA1,
            0xBF,
        )
    ]
    # And here is the offset at which the Unicode Translation Table
    # starts.
    UTT_OFFSET = 155

    # This nested class just lists all the "special" character codes
    # that are capturable from an input stream. They're just there so
    # that the user of the virtual machine can give them a nice name.
    class Input:
        DELETE = 8
        ESCAPE = 27
        # The cursor pad
        CUR_UP = 129
        CUR_DOWN = 130
        CUR_LEFT = 131
        CUR_RIGHT = 132
        # The Function keys
        F1 = 133
        F2 = 134
        F3 = 135
        F4 = 136
        F5 = 137
        F6 = 138
        F7 = 139
        F8 = 140
        F9 = 141
        F10 = 142
        F11 = 143
        F12 = 144
        # The numpad (keypad) keys.
        KP_0 = 145
        KP_1 = 146
        KP_2 = 147
        KP_3 = 148
        KP_4 = 149
        KP_5 = 150
        KP_6 = 151
        KP_7 = 152
        KP_8 = 153
        KP_9 = 154

    def __init__(self, zmem):
        """Build the input and output translation tables for the story
        held in the given memory."""
        self._mem = zmem
        self._output_table = {0: "", 10: "\n"}
        self._input_table = {"\n": 10}

        self._load_unicode_table()

        # Populate the input and output tables with the printable
        # ASCII characters. This runs after the unicode table has been
        # loaded, so for a character present in both, the ASCII code is
        # the one the input table ends up with.
        for code in range(32, 127):
            char = chr(code)
            self._output_table[code] = char
            self._input_table[char] = code

        # Populate the input table with the extra "special" input
        # codes.  The cool trick we use here, is that all these values
        # are in fact numbers, so utoz will hand back the correct code
        # if you pass it a special symbol instead of a character to
        # translate!
        #
        # Oh and we also pull the items from the nested class into this
        # instance, so as to make reference to these special codes
        # easier.
        for name, code in vars(self.Input).items():
            if name.startswith("__"):
                continue
            self._input_table[code] = code
            setattr(self, name, code)

        # The only special support required for ZSCII: ZM v5 defines
        # an extra character code to represent a mouse click. If we're
        # booting a v5 ZM, define this.
        if self._mem.version == 5:
            self.MOUSE_CLICK = 254
            self._input_table[254] = 254

    def _load_unicode_table(self):
        """Register the "extra characters" (ZSCII codes from UTT_OFFSET
        upwards) in the input and output tables.

        A v5 story may supply its own unicode translation table through
        the header extension table. In every other case, including
        stories of earlier versions, the default table is used.
        """
        utt = self.DEFAULT_UTT

        if self._mem.version == 5:
            # Read the header extension table address
            ext_table_addr = self._mem.read_word(0x36)

            # A custom table exists only if:
            #  - The extension header's address is non-null
            #  - There are at least 3 words in the extension header
            #    (the unicode translation table is the third word)
            #  - The 3rd word (unicode translation table address) is
            #    non-null
            if ext_table_addr != 0 and self._mem.read_word(ext_table_addr) >= 3:
                utt_addr = self._mem.read_word(ext_table_addr + 6)
                if utt_addr != 0:
                    # The first byte is the number of unicode
                    # characters in the table; the characters follow
                    # as one word each.
                    utt_len = self._mem[utt_addr]
                    first = utt_addr + 1
                    utt = [
                        chr(self._mem.read_word(i))
                        for i in range(first, first + (utt_len * 2), 2)
                    ]

        # One way or another, we have a unicode translation
        # table. Add all the characters in it to the input and
        # output translation tables.
        for zscii, unichar in enumerate(utt, start=self.UTT_OFFSET):
            self._output_table[zscii] = unichar
            self._input_table[unichar] = zscii

    def ztou(self, index):
        """Translate the given ZSCII code into the corresponding
        output Unicode character and return it.

        Codes without a translation never raise: codes below 32 yield
        an empty string, and any other unknown code is logged and
        yields "?".
        """
        try:
            return self._output_table[index]
        except KeyError:
            # Control characters without a translation are dropped.
            if index < 32:
                return ""
            # For undefined or unmapped characters, return a placeholder
            log(f"Warning: undefined ZSCII character code {index}, using '?'")
            return "?"

    def utoz(self, char):
        """Translate the given Unicode character (or special input
        code) into the corresponding input ZSCII code and return it,
        or raise IndexError if it has no translation."""
        try:
            return self._input_table[char]
        except KeyError:
            raise IndexError("No such input character") from None

    def get(self, zscii):
        """Translate an iterable of ZSCII codes into a Unicode string."""
        return "".join([self.ztou(c) for c in zscii])


class ZStringFactory:
    """Decode strings stored in Z-machine memory into Unicode text.

    Three translators are chained: packed words to Z-characters
    (zstr), Z-characters to ZSCII codes (zchr), and ZSCII codes to
    Unicode (zscii).
    """

    def __init__(self, zmem):
        """Create the three translation stages over the given memory."""
        self._mem = zmem
        self.zstr = ZStringTranslator(zmem)
        self.zchr = ZCharTranslator(zmem)
        self.zscii = ZsciiTranslator(zmem)

    def get(self, addr):
        """Return the Unicode text of the string stored at the given
        address."""
        return self.zscii.get(self.zchr.get(self.zstr.get(addr)))