Add Grimm fairy tales splitter script
This commit is contained in:
parent
65fe92b4e0
commit
947dd73912
1 changed files with 201 additions and 0 deletions
201
scripts/split_grimm.py
Normal file
201
scripts/split_grimm.py
Normal file
|
|
@ -0,0 +1,201 @@
|
||||||
|
#!/usr/bin/env -S uv run --script
|
||||||
|
# /// script
|
||||||
|
# dependencies = []
|
||||||
|
# ///
|
||||||
|
|
||||||
|
"""
|
||||||
|
Split Project Gutenberg #5314 (Complete Brothers Grimm) into individual tale files.
|
||||||
|
|
||||||
|
Usage: uv run scripts/split_grimm.py <source_file>
|
||||||
|
|
||||||
|
Creates:
|
||||||
|
- 200 numbered tales: 001_the_frog_king_or_iron_henry.txt, etc.
|
||||||
|
- Tale 151a (variant): 151a_the_twelve_idle_servants.txt
|
||||||
|
- 10 legends: legend_01_st_joseph_in_the_forest.txt, etc.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def slugify(title: str) -> str:
    """Turn a tale title into a lowercase, underscore-separated filename slug.

    Punctuation (anything that is not a word character, whitespace or a
    hyphen) is removed, every run of hyphens/whitespace collapses into a
    single underscore, and underscores at either end are trimmed.
    """
    without_punctuation = re.sub(r"[^\w\s-]", "", title.lower())
    return re.sub(r"[-\s]+", "_", without_punctuation).strip("_")
|
||||||
|
|
||||||
|
|
||||||
|
# Header patterns, matched against a whitespace-stripped line.
_TALE_HEADER = re.compile(r"^(\d+) (.+)$")
_VARIANT_HEADER = re.compile(r"^(\d+)\* (.+)$")
_LEGEND_HEADER = re.compile(r"^Legend (\d+) (.+)$")

# 0-based index of the first line that may belong to a tale; everything
# before it is the Gutenberg header and the table of contents.
_FIRST_TALE_INDEX = 257


def _is_end_marker(stripped: str) -> bool:
    """Return True when the line marks the end of the book text."""
    return stripped.startswith("START: FULL LICENSE") or (
        stripped.startswith("***") and "END OF" in stripped
    )


def _parse_header(stripped: str, last_tale_number: int):
    """Return (number, title, is_legend) if the line opens a new tale, else None.

    A plain "<n> <title>" line only counts as a header when <n> is the next
    number in sequence; any other line that merely starts with digits is
    ordinary tale text and must not interrupt the tale being collected.
    """
    variant = _VARIANT_HEADER.match(stripped)
    if variant:
        # e.g. "151* The Twelve Idle Servants" -> 151a
        return f"{int(variant.group(1))}a", variant.group(2), False

    tale = _TALE_HEADER.match(stripped)
    if tale:
        if int(tale.group(1)) == last_tale_number + 1:
            return int(tale.group(1)), tale.group(2), False
        return None

    legend = _LEGEND_HEADER.match(stripped)
    if legend:
        return int(legend.group(1)), legend.group(2), True

    return None


def _split_tales(lines: list[str]):
    """Yield (number, title, content, is_legend) for every tale in lines.

    number is an int for numbered tales and legends, or a string such as
    "151a" for a variant. content is the list of raw lines that follow the
    header, starting at the first non-blank line.
    """
    number = None
    title = None
    content = None  # None while no tale is being collected
    is_legend = False
    in_content = False
    last_tale_number = 0  # highest numbered (non-legend) tale accepted so far

    for index, line in enumerate(lines):
        if index < _FIRST_TALE_INDEX:
            continue

        stripped = line.strip()

        # Section divider before the legends: close the current tale and
        # ignore everything until the next header.
        if "Children" in stripped and "Legends" in stripped:
            if content is not None:
                yield number, title, content, is_legend
                content = None
            continue

        if _is_end_marker(stripped):
            break

        header = _parse_header(stripped, last_tale_number)
        if header is not None:
            # The header is validated *before* the previous tale is flushed,
            # so a rejected candidate can never flush or double-count a tale.
            if content is not None:
                yield number, title, content, is_legend
            number, title, is_legend = header
            content = []
            in_content = False
            if isinstance(number, int) and not is_legend:
                last_tale_number = number
            continue

        if content is None:
            continue

        # Skip the blank lines between a header and the first line of text.
        if not in_content and stripped:
            in_content = True
        if in_content:
            content.append(line)

    # Flush whatever is still open, whether the loop stopped at an end
    # marker or simply ran out of lines.
    if content is not None:
        yield number, title, content, is_legend


def main():
    """Split the source file named on the command line into per-tale files.

    Reads the file given as the first command-line argument, writes each
    tale through write_tale() into content/library/grimm (relative to the
    directory two levels above this script), and prints a summary.
    Variants such as 151a are written but not included in the tale count.
    Exits with status 1 and a usage message when no argument is given.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <source_file>")
        sys.exit(1)

    source = Path(sys.argv[1])
    output_dir = Path(__file__).resolve().parent.parent / "content/library/grimm"

    # Read as UTF-8 explicitly instead of relying on the platform default.
    with source.open(encoding="utf-8") as f:
        lines = f.readlines()

    tales_written = 0
    legends_written = 0

    for number, title, content, is_legend in _split_tales(lines):
        write_tale(output_dir, number, title, content, is_legend)
        if is_legend:
            legends_written += 1
        elif isinstance(number, int):
            tales_written += 1

    print(f"Written {tales_written} tales and {legends_written} legends")
|
||||||
|
|
||||||
|
|
||||||
|
def write_tale(
    output_dir: Path,
    number: int | str,
    title: str,
    content: list[str],
    is_legend: bool,
):
    """Write a single tale to disk.

    Args:
        output_dir: Directory the file is written to; created if missing.
        number: Tale or legend number as an int, or a variant label such
            as "151a" as a string.
        title: Tale title; written as the first line and slugified for the
            filename.
        content: Raw text lines of the tale. Blank lines at the start and
            end are left out of the file; the list itself is not modified.
        is_legend: When true, the file is named legend_NN_<slug>.txt.
    """
    slug = slugify(title)

    if is_legend:
        filename = f"legend_{number:02d}_{slug}.txt"
    elif isinstance(number, str):
        # Variant like "151a"
        filename = f"{number}_{slug}.txt"
    else:
        filename = f"{number:03d}_{slug}.txt"

    filepath = output_dir / filename

    # Find the non-blank span by index so the caller's list stays untouched
    # and no O(n) pop(0) calls are needed.
    first = 0
    last = len(content)
    while first < last and not content[first].strip():
        first += 1
    while last > first and not content[last - 1].strip():
        last -= 1

    # A fresh checkout may not have the target directory yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8 so the output does not depend on the platform default.
    with filepath.open("w", encoding="utf-8") as f:
        f.write(title + "\n")
        f.write("\n")
        f.writelines(content[first:last])

    print(f"Wrote {filepath.name}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the splitter only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in a new issue