#!/usr/bin/env -S uv run --script # /// script # dependencies = [] # /// """ Split Project Gutenberg #5314 (Complete Brothers Grimm) into individual tale files. Usage: uv run scripts/split_grimm.py Creates: - 200 numbered tales: 001_the_frog_king_or_iron_henry.txt, etc. - Tale 151a (variant): 151a_the_twelve_idle_servants.txt - 10 legends: legend_01_st_joseph_in_the_forest.txt, etc. """ import re import sys from pathlib import Path def slugify(title: str) -> str: """Convert title to filename slug.""" slug = title.lower() slug = re.sub(r"[^\w\s-]", "", slug) slug = re.sub(r"[-\s]+", "_", slug) return slug.strip("_") def main(): if len(sys.argv) < 2: print(f"Usage: {sys.argv[0]} ") sys.exit(1) source = Path(sys.argv[1]) output_dir = Path(__file__).resolve().parent.parent / "content/library/grimm" with source.open() as f: lines = f.readlines() tales_written = 0 legends_written = 0 current_tale = None current_title = None current_number = None is_legend = False in_content = False for i, line in enumerate(lines): line_stripped = line.strip() # Skip Gutenberg header and table of contents (tales start at line 258) if i < 257: continue if "Children" in line_stripped and "Legends" in line_stripped: if current_tale is not None: assert current_number is not None assert current_title is not None write_tale( output_dir, current_number, current_title, current_tale, is_legend, ) if is_legend: legends_written += 1 elif isinstance(current_number, int): tales_written += 1 current_tale = None continue if line_stripped.startswith("START: FULL LICENSE") or ( line_stripped.startswith("***") and "END OF" in line_stripped ): if current_tale is not None: assert current_number is not None assert current_title is not None write_tale( output_dir, current_number, current_title, current_tale, is_legend, ) if is_legend: legends_written += 1 elif isinstance(current_number, int): tales_written += 1 current_tale = None break tale_match = re.match(r"^(\d+) (.+)$", line_stripped) variant_match = re.match(r"^(\d+)\* (.+)$", line_stripped) legend_match = re.match(r"^Legend (\d+) (.+)$", line_stripped) if tale_match or variant_match: if current_tale is not None: assert current_number is not None assert current_title is not None write_tale( output_dir, current_number, current_title, current_tale, is_legend, ) if is_legend: legends_written += 1 elif isinstance(current_number, int): tales_written += 1 if variant_match: # e.g. "151* The Twelve Idle Servants" -> 151a num, title = variant_match.groups() current_number = f"{int(num)}a" current_title = title current_tale = [] is_legend = False in_content = False else: assert tale_match is not None num, title = tale_match.groups() expected = tales_written + 1 if int(num) != expected: continue current_number = int(num) current_title = title current_tale = [] is_legend = False in_content = False elif legend_match: if current_tale is not None: assert current_number is not None assert current_title is not None write_tale( output_dir, current_number, current_title, current_tale, is_legend, ) if is_legend: legends_written += 1 elif isinstance(current_number, int): tales_written += 1 num, title = legend_match.groups() current_number = int(num) current_title = title current_tale = [] is_legend = True in_content = False elif current_tale is not None: if not in_content and line_stripped: in_content = True if in_content: current_tale.append(line) print(f"Written {tales_written} tales and {legends_written} legends") def write_tale( output_dir: Path, number: int | str, title: str, content: list[str], is_legend: bool, ): """Write a single tale to disk.""" slug = slugify(title) if is_legend: filename = f"legend_{number:02d}_{slug}.txt" elif isinstance(number, str): # Variant like "151a" filename = f"{number}_{slug}.txt" else: filename = f"{number:03d}_{slug}.txt" filepath = output_dir / filename while content and not content[0].strip(): content.pop(0) while content and not content[-1].strip(): content.pop() with filepath.open("w") as f: f.write(title + "\n") f.write("\n") f.writelines(content) print(f"Wrote {filepath.name}") if __name__ == "__main__": main()