mud/scripts/split_grimm.py

201 lines
5.9 KiB
Python

#!/usr/bin/env -S uv run --script
# /// script
# dependencies = []
# ///
"""
Split Project Gutenberg #5314 (Complete Brothers Grimm) into individual tale files.
Usage: uv run scripts/split_grimm.py <source_file>
Creates:
- 200 numbered tales: 001_the_frog_king_or_iron_henry.txt, etc.
- Tale 151a (variant): 151a_the_twelve_idle_servants.txt
- 10 legends: legend_01_st_joseph_in_the_forest.txt, etc.
"""
import re
import sys
from pathlib import Path
def slugify(title: str) -> str:
    """Return a lowercase, underscore-separated filename slug for *title*.

    Every character that is not a word character, whitespace, or a hyphen is
    removed; each run of hyphens and/or whitespace then collapses to a single
    underscore, and leading/trailing underscores are stripped.
    """
    without_punctuation = re.sub(r"[^\w\s-]", "", title.lower())
    return re.sub(r"[-\s]+", "_", without_punctuation).strip("_")
def main():
    """Split the source file named in ``sys.argv[1]`` into one file per tale.

    Scans the text line by line, recognising three kinds of headings:
    ``"<n> <title>"`` (numbered tale), ``"<n>* <title>"`` (variant, saved as
    ``<n>a``) and ``"Legend <n> <title>"``. Everything between two headings is
    the body of the first one and is handed to ``write_tale``. Output goes to
    ``content/library/grimm`` under the parent of this script's directory.

    Prints a usage message and exits with status 1 when no source file is
    given.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <source_file>")
        sys.exit(1)

    source = Path(sys.argv[1])
    output_dir = Path(__file__).resolve().parent.parent / "content/library/grimm"

    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which is not UTF-8 everywhere.
    with source.open(encoding="utf-8") as f:
        lines = f.readlines()

    # Skip Gutenberg header and table of contents (tales start at line 258).
    body_start = 257

    tale_re = re.compile(r"^(\d+) (.+)$")
    variant_re = re.compile(r"^(\d+)\* (.+)$")
    legend_re = re.compile(r"^Legend (\d+) (.+)$")

    tales_written = 0
    legends_written = 0
    # Number the next genuine tale heading must carry; a numbered line with
    # any other value is body text that merely starts with digits.
    next_tale_number = 1

    current_tale = None
    current_title = None
    current_number = None
    is_legend = False

    def flush():
        """Write the pending story, if any, and update the counters."""
        nonlocal tales_written, legends_written, current_tale
        if current_tale is None:
            return
        write_tale(output_dir, current_number, current_title, current_tale, is_legend)
        if is_legend:
            legends_written += 1
        elif isinstance(current_number, int):
            # Variants carry a string number ("151a") and are not counted.
            tales_written += 1
        current_tale = None

    def start(number, title, legend):
        """Finish the pending story and begin collecting a new one."""
        nonlocal current_number, current_title, current_tale, is_legend
        flush()
        current_number = number
        current_title = title
        is_legend = legend
        current_tale = []

    for line in lines[body_start:]:
        stripped = line.strip()

        # Section title separating the tales from the legends: it closes the
        # last tale and is not itself part of any story.
        if "Children" in stripped and "Legends" in stripped:
            flush()
            continue

        # Gutenberg trailer: nothing after this belongs to a story.
        if stripped.startswith("START: FULL LICENSE") or (
            stripped.startswith("***") and "END OF" in stripped
        ):
            break

        variant_match = variant_re.match(stripped)
        if variant_match:
            # e.g. "151* The Twelve Idle Servants" -> 151a
            num, title = variant_match.groups()
            start(f"{int(num)}a", title, False)
            continue

        tale_match = tale_re.match(stripped)
        # Validate the sequence number BEFORE flushing: otherwise a body line
        # that happens to start with a number would prematurely write the
        # current tale and throw the numbering off for every later heading.
        if tale_match and int(tale_match.group(1)) == next_tale_number:
            start(next_tale_number, tale_match.group(2), False)
            next_tale_number += 1
            continue

        legend_match = legend_re.match(stripped)
        if legend_match:
            num, title = legend_match.groups()
            start(int(num), title, True)
            continue

        # Ordinary body line (including out-of-sequence numbered lines).
        # Blank lines before the first non-blank line of a story are skipped.
        if current_tale is not None and (current_tale or stripped):
            current_tale.append(line)

    # Covers both the trailer `break` above and a source that simply ends
    # without a trailer.
    flush()

    print(f"Written {tales_written} tales and {legends_written} legends")
def write_tale(
    output_dir: Path,
    number: int | str,
    title: str,
    content: list[str],
    is_legend: bool,
):
    """Write a single tale to disk.

    The file name is derived from *number* and the slugified *title*:
    ``legend_NN_<slug>.txt`` for legends, ``<number>_<slug>.txt`` when
    *number* is a string (a variant such as ``"151a"``), and
    ``NNN_<slug>.txt`` otherwise. The file holds the title, one blank line,
    then *content* with leading and trailing blank lines removed.

    *output_dir* is created if it does not exist, and *content* is left
    unmodified.
    """
    slug = slugify(title)
    if is_legend:
        filename = f"legend_{number:02d}_{slug}.txt"
    elif isinstance(number, str):
        # Variant like "151a"
        filename = f"{number}_{slug}.txt"
    else:
        filename = f"{number:03d}_{slug}.txt"

    # Trim blank lines by index rather than popping from the caller's list:
    # no side effect on the argument and no repeated O(n) pop(0).
    first, last = 0, len(content)
    while first < last and not content[first].strip():
        first += 1
    while last > first and not content[last - 1].strip():
        last -= 1
    body = content[first:last]

    output_dir.mkdir(parents=True, exist_ok=True)
    filepath = output_dir / filename
    # Explicit UTF-8 so the output does not depend on the platform's locale
    # encoding.
    with filepath.open("w", encoding="utf-8") as f:
        f.write(title + "\n")
        f.write("\n")
        f.writelines(body)
    print(f"Wrote {filepath.name}")
# Run the splitter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()