From cab26b8ed213e830d8f51111a5c0dcdf92625bad Mon Sep 17 00:00:00 2001 From: Lucian Mogosanu Date: Mon, 29 Jun 2026 23:27:58 +0300 Subject: [PATCH] Reversing text files --- CHECKLIST.md | 6 +++- JOURNAL.md | 28 +++++++++++++-- spec/formats/text-string-table.md | 69 +++++++++++++++++++++++++++++++++++++ tools/openpt/text.py | 65 ++++++++++++++++++++++++++++++++++ 4 files changed, 165 insertions(+), 3 deletions(-) create mode 100644 spec/formats/text-string-table.md create mode 100644 tools/openpt/text.py diff --git a/CHECKLIST.md b/CHECKLIST.md index c57ac64..54fcb11 100644 --- a/CHECKLIST.md +++ b/CHECKLIST.md @@ -23,7 +23,11 @@ Living map of work across sessions. Status: `[ ]` todo · `[~]` in progress · - [ ] `.FNT` bitmap fonts (`FONTS/*.FNT`) - [ ] `.MIS` mission definitions (`M/*.MIS`) - [ ] `.PIZ` hiscores (`D/HISCORES.PIZ`) -- [ ] `T/*.E` text/dialog strings → structured extraction +- [x] `T/*.E` text/dialog strings → `spec/formats/text-string-table.md`; all 28 + extracted to `artifacts/text/*.json` (positional LF tables, cp437) +- [ ] `Z/ARTIKEL*.E` newspaper articles (ISO-8859, `$` markup, leading code byte) + — different/richer format +- [?] String index→in-game-use mapping (lives in `PT.EXE`; needs disassembly) - [ ] Confirm whether `.PC`/`.BOB` reuse the VGA RLE codec ## Phase 2 — DOS environment spec (`spec/environment/`) diff --git a/JOURNAL.md b/JOURNAL.md index 403ade5..5eb0350 100644 --- a/JOURNAL.md +++ b/JOURNAL.md @@ -46,5 +46,29 @@ high-level status map is in [CHECKLIST.md](CHECKLIST.md). (not positional); the transparency/colorkey index; whether `.PC`/`.BOB` reuse this RLE codec. -**Artifacts:** `artifacts/*.pgm` (index-as-gray) and `*.ppm` (color). Tools: -`tools/openpt/{rle,vga,palette,pnm,decode_vga,verify}.py`. +**Artifacts:** `artifacts/vga/*.pgm` (index-as-gray). Tools: +`tools/openpt/{rle,vga,palette,pnm,decode_vga,verify,build}.py`. + +**Tooling invocation settled:** run from repo root as +`python3 -m tools.openpt. 
PIZZA` (defaults `PIZZA`/`artifacts` relative to +root). Getting-started in `tools/README.md`. (User rejected an auto path-resolution +helper — keep invocation explicit/simple.) + +--- + +## 2026-06-29 — Session 1 (cont.): text string tables + +**`T/*.E` string tables — DONE (high confidence):** `spec/formats/text-string-table.md`. +- Positional table: LF(0x0A)-delimited, record N (0-based) = string index N. + Empty lines are **empty slots** (preserve indexing), not separators. Trailing + LF doesn't add a slot. No headers/markup; value substitution done in engine + code (no `%`-specifiers anywhere). +- Extension = locale (`.E` = English). Files pure 7-bit ASCII; cp437 assumed for + localized builds (inferred). +- Extracted all 28 tables to `artifacts/text/*.json` via `tools/openpt/text.py` + (also reports anomalies). `T/MAESTRO.E` flagged: one stray CR + one 0x81 byte + (harmless data glitch). +- `.E` is overloaded: `GFX/PALETTE.E` is binary; `Z/ARTIKEL*.E` are ISO-8859 + newspaper articles with `$` markup + leading code byte — a separate, richer + format, deferred (in CHECKLIST). +- String index→in-game-use mapping is in `PT.EXE` (needs disassembly; out of scope). diff --git a/spec/formats/text-string-table.md b/spec/formats/text-string-table.md new file mode 100644 index 0000000..7004bea --- /dev/null +++ b/spec/formats/text-string-table.md @@ -0,0 +1,69 @@ +# Text string table (`T/*.E`) + +Positional tables of localized game strings: dialog, menu labels, messages. +One file per subsystem (`BANK.E`, `WAFFEN.E`, `MARKT.E`, …), 28 files under `T/`. + +The file **extension is the locale**: `.E` = English. Other language builds use +the corresponding suffix (cf. the in-binary templates `chr\c%de.pc`, +`bob\typ%de.bob`, where the trailing `e` is the same English marker). *(inferred +from naming + the engine's per-locale filename templates; confidence medium.)* + +## Layout *(provenance: observed; confidence: high)* + +Plain text. 
A file is a sequence of **records terminated by `LF` (0x0A)**: + +- Record *N* (0-based, counting from the start of the file) is **string index + *N***. The game looks strings up by this position. +- An **empty record** (an `LF` with nothing before it) is an **empty string + slot** that preserves indexing — not a separator. (E.g. `T/PERS.E` indices + 12–17 are intentionally empty.) +- Files end with a trailing `LF`; that final terminator does **not** add an + extra empty slot. +- No length prefix, no count header, no in-text markup. Value substitution + (amounts, names) is performed by the engine concatenating around a string, + not via format codes in the text. *(observed: e.g. `BANK.E` "...fixed cost + is" with the number appended in code; no `%`-specifiers occur in any file.)* + +### Encoding + +DOS **code page 437**. The shipped English (`.E`) files are pure 7-bit ASCII (a +subset of cp437), so this only matters for accented characters in localized +builds. *(ASCII observed; cp437 inferred; confidence medium.)* + +## Algorithm (pseudocode) + +``` +parse(bytes) -> strings: + text = decode(bytes, codepage_437) + lines = text.split("\n") + if lines is non-empty and lines[-1] == "": + remove lines[-1] # the trailing-LF slot, not a real entry + return lines # strings[i] is string index i +``` + +## Test vector + +Input bytes `6f 66 20 35 30 2c 0a 6f 66 20 35 30 30 2c 0a` +(`"of 50,\nof 500,\n"`) → `["of 50,", "of 500,"]`. + +Input `61 0a 0a 62 0a` (`"a\n\nb\n"`) → `["a", "", "b"]` (index 1 is an empty +slot). These mirror real entries in `T/BANK.E` (indices 8–9) and the empty +slots in `T/PERS.E`. + +## Anomalies / open questions + +- `T/MAESTRO.E` contains one stray `CR` (0x0D) and one non-ASCII byte (0x81) — + almost certainly data glitches in the original; parsing is unaffected. + *(observed.)* +- The `.E` extension is **overloaded** and these are *not* the only `.E` files: + - `GFX/PALETTE.E` is binary (the palette bank — see [vga-image.md](vga-image.md)). 
+ - `Z/ARTIKEL*.E` are ISO-8859 newspaper-article text with inline markup (a + leading code byte, `$` field markers) — a **different, richer format**, to + be documented separately. Tracked in [CHECKLIST](../../CHECKLIST.md). +- Mapping of string **indices to their in-game uses** lives in `PT.EXE` and is + out of scope here (needs disassembly). + +## Reference tooling + +- `tools/openpt/text.py` — parser + extractor + anomaly scan; writes + `artifacts/text/*.json` (`{file, strings[]}`). diff --git a/tools/openpt/text.py b/tools/openpt/text.py new file mode 100644 index 0000000..7ff23b6 --- /dev/null +++ b/tools/openpt/text.py @@ -0,0 +1,65 @@ +"""Parser/extractor for Pizza Tycoon's string tables (T/*.E). + +A string table is a positional list of game strings: LF-delimited, one string +per line, indexed by 0-based line number. Empty lines are empty string slots +that preserve indexing. See spec/formats/text-string-table.md. + +The shipped English tables (the `E` suffix = English locale) are pure ASCII; +text is decoded as DOS code page 437, which is a superset of ASCII and is the +correct codepage for the localized (e.g. German) builds. + +Run from the repo root: + python3 -m tools.openpt.text [PIZZA_DIR] [ARTIFACTS_DIR] +will scan every T/*.E, report anomalies, and write artifacts/text/*.json.
import glob
import json
import os
import sys

# DOS code page 437. It is a superset of ASCII and Python's codec maps every
# byte value to a character, so decoding a table never raises.
ENCODING = "cp437"


def parse(blob):
    """Decode a string-table file into a list of strings (index = position).

    Args:
        blob: raw file contents as ``bytes``.

    Returns:
        A list where element ``i`` is string index ``i``. Empty records are
        kept as ``""`` so later indices do not shift. Only LF is treated as a
        terminator; a CR, if present, stays part of the string it precedes.
    """
    lines = blob.decode(ENCODING).split("\n")
    # A file ending in LF yields one extra empty element after the split (an
    # empty file yields exactly one); that is the terminator, not a slot.
    if lines and lines[-1] == "":
        lines.pop()
    return lines


def _find_anomalies(blob):
    """Return human-readable notes about unexpected bytes in a table file."""
    anomalies = []
    if not blob.endswith(b"\n"):
        anomalies.append("no trailing LF")
    if b"\r" in blob:
        anomalies.append("contains CR")
    if any(b >= 0x80 for b in blob):
        anomalies.append("non-ASCII byte(s)")
    return anomalies


def scan(pizza):
    """Yield ``(path, strings, anomalies)`` for each ``T/*.E`` file.

    Args:
        pizza: path of the game data directory containing the ``T`` folder.

    Yields:
        Tuples of the file path, the parsed string list (see ``parse``) and a
        list of anomaly descriptions (empty when the file looks clean). Files
        are visited in sorted path order so output is reproducible.
    """
    for path in sorted(glob.glob(os.path.join(pizza, "T", "*.E"))):
        # Context manager so the handle is closed promptly instead of being
        # left for the garbage collector.
        with open(path, "rb") as f:
            blob = f.read()
        yield path, parse(blob), _find_anomalies(blob)


def main(argv):
    """Extract every string table to JSON and print a per-file summary.

    Args:
        argv: command-line vector; ``argv[1]`` is the game data directory
            (default ``"PIZZA"``) and ``argv[2]`` the artifacts directory
            (default ``"artifacts"``).

    Writes ``<artifacts>/text/NAME.json`` for each ``T/NAME.E`` with the shape
    ``{"file": ..., "strings": [...]}``.
    """
    pizza = argv[1] if len(argv) > 1 else "PIZZA"
    artifacts = argv[2] if len(argv) > 2 else "artifacts"
    out_dir = os.path.join(artifacts, "text")
    os.makedirs(out_dir, exist_ok=True)
    total = 0
    for path, strings, anomalies in scan(pizza):
        base = os.path.basename(path)
        name = os.path.splitext(base)[0]
        out_path = os.path.join(out_dir, name + ".json")
        with open(out_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps any decoded cp437 characters readable.
            json.dump({"file": base, "strings": strings},
                      f, ensure_ascii=False, indent=1)
        total += 1
        flag = (" <- " + ", ".join(anomalies)) if anomalies else ""
        print("%-14s %4d strings%s" % (name, len(strings), flag))
    if total == 0:
        # A wrong PIZZA_DIR would otherwise look like a successful empty run.
        print("text: no T/*.E files found under %r" % pizza, file=sys.stderr)
    print("text: wrote %d JSON tables to %s" % (total, out_dir))


if __name__ == "__main__":
    main(sys.argv)