From 947dd739121d24408c7cd2bb358ceee338f47f77 Mon Sep 17 00:00:00 2001 From: Jared Miller Date: Fri, 13 Feb 2026 10:58:06 -0500 Subject: [PATCH] Add Grimm fairy tales splitter script --- scripts/split_grimm.py | 201 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 scripts/split_grimm.py diff --git a/scripts/split_grimm.py b/scripts/split_grimm.py new file mode 100644 index 0000000..bb646be --- /dev/null +++ b/scripts/split_grimm.py @@ -0,0 +1,201 @@ +#!/usr/bin/env -S uv run --script +# /// script +# dependencies = [] +# /// + +""" +Split Project Gutenberg #5314 (Complete Brothers Grimm) into individual tale files. + +Usage: uv run scripts/split_grimm.py + +Creates: +- 200 numbered tales: 001_the_frog_king_or_iron_henry.txt, etc. +- Tale 151a (variant): 151a_the_twelve_idle_servants.txt +- 10 legends: legend_01_st_joseph_in_the_forest.txt, etc. +""" + +import re +import sys +from pathlib import Path + + +def slugify(title: str) -> str: + """Convert title to filename slug.""" + slug = title.lower() + slug = re.sub(r"[^\w\s-]", "", slug) + slug = re.sub(r"[-\s]+", "_", slug) + return slug.strip("_") + + +def main(): + if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} ") + sys.exit(1) + source = Path(sys.argv[1]) + output_dir = Path(__file__).resolve().parent.parent / "content/library/grimm" + + with source.open() as f: + lines = f.readlines() + + tales_written = 0 + legends_written = 0 + current_tale = None + current_title = None + current_number = None + is_legend = False + in_content = False + + for i, line in enumerate(lines): + line_stripped = line.strip() + + # Skip Gutenberg header and table of contents (tales start at line 258) + if i < 257: + continue + + if "Children" in line_stripped and "Legends" in line_stripped: + if current_tale is not None: + assert current_number is not None + assert current_title is not None + write_tale( + output_dir, + current_number, + current_title, + current_tale, + is_legend, + ) + if is_legend: + legends_written += 1 + elif isinstance(current_number, int): + tales_written += 1 + current_tale = None + continue + + if line_stripped.startswith("START: FULL LICENSE") or ( + line_stripped.startswith("***") and "END OF" in line_stripped + ): + if current_tale is not None: + assert current_number is not None + assert current_title is not None + write_tale( + output_dir, + current_number, + current_title, + current_tale, + is_legend, + ) + if is_legend: + legends_written += 1 + elif isinstance(current_number, int): + tales_written += 1 + current_tale = None + break + + tale_match = re.match(r"^(\d+) (.+)$", line_stripped) + variant_match = re.match(r"^(\d+)\* (.+)$", line_stripped) + legend_match = re.match(r"^Legend (\d+) (.+)$", line_stripped) + + if tale_match or variant_match: + if current_tale is not None: + assert current_number is not None + assert current_title is not None + write_tale( + output_dir, + current_number, + current_title, + current_tale, + is_legend, + ) + if is_legend: + legends_written += 1 + elif isinstance(current_number, int): + tales_written += 1 + + if variant_match: + # e.g. "151* The Twelve Idle Servants" -> 151a + num, title = variant_match.groups() + current_number = f"{int(num)}a" + current_title = title + current_tale = [] + is_legend = False + in_content = False + else: + assert tale_match is not None + num, title = tale_match.groups() + expected = tales_written + 1 + if int(num) != expected: + continue + + current_number = int(num) + current_title = title + current_tale = [] + is_legend = False + in_content = False + + elif legend_match: + if current_tale is not None: + assert current_number is not None + assert current_title is not None + write_tale( + output_dir, + current_number, + current_title, + current_tale, + is_legend, + ) + if is_legend: + legends_written += 1 + elif isinstance(current_number, int): + tales_written += 1 + + num, title = legend_match.groups() + current_number = int(num) + current_title = title + current_tale = [] + is_legend = True + in_content = False + + elif current_tale is not None: + if not in_content and line_stripped: + in_content = True + + if in_content: + current_tale.append(line) + + print(f"Written {tales_written} tales and {legends_written} legends") + + +def write_tale( + output_dir: Path, + number: int | str, + title: str, + content: list[str], + is_legend: bool, +): + """Write a single tale to disk.""" + slug = slugify(title) + + if is_legend: + filename = f"legend_{number:02d}_{slug}.txt" + elif isinstance(number, str): + # Variant like "151a" + filename = f"{number}_{slug}.txt" + else: + filename = f"{number:03d}_{slug}.txt" + + filepath = output_dir / filename + + while content and not content[0].strip(): + content.pop(0) + while content and not content[-1].strip(): + content.pop() + + with filepath.open("w") as f: + f.write(title + "\n") + f.write("\n") + f.writelines(content) + + print(f"Wrote {filepath.name}") + + +if __name__ == "__main__": + main()