Add Grimm fairy tales splitter script
This commit is contained in:
parent
65fe92b4e0
commit
947dd73912
1 changed file with 201 additions and 0 deletions
201
scripts/split_grimm.py
Normal file
201
scripts/split_grimm.py
Normal file
|
|
@ -0,0 +1,201 @@
|
|||
#!/usr/bin/env -S uv run --script
|
||||
# /// script
|
||||
# dependencies = []
|
||||
# ///
|
||||
|
||||
"""
|
||||
Split Project Gutenberg #5314 (Complete Brothers Grimm) into individual tale files.
|
||||
|
||||
Usage: uv run scripts/split_grimm.py <source_file>
|
||||
|
||||
Creates:
|
||||
- 200 numbered tales: 001_the_frog_king_or_iron_henry.txt, etc.
|
||||
- Tale 151a (variant): 151a_the_twelve_idle_servants.txt
|
||||
- 10 legends: legend_01_st_joseph_in_the_forest.txt, etc.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def slugify(title: str) -> str:
    """Return a lowercase, underscore-separated filename slug for ``title``.

    Characters other than word characters, whitespace and hyphens are
    removed, every run of hyphens/whitespace collapses into one underscore,
    and leading/trailing underscores are stripped.
    """
    kept = re.sub(r"[^\w\s-]", "", title.lower())
    return re.sub(r"[-\s]+", "_", kept).strip("_")
|
||||
|
||||
|
||||
def _collect_tales(
    lines: list[str],
) -> list[tuple[int | str, str, list[str], bool]]:
    """Parse the source lines into ``(number, title, content, is_legend)`` records."""
    # Index of the first line after the Gutenberg header and table of
    # contents (tales start at line 258 of the source file).
    first_content_index = 257

    tale_re = re.compile(r"^(\d+) (.+)$")
    variant_re = re.compile(r"^(\d+)\* (.+)$")
    legend_re = re.compile(r"^Legend (\d+) (.+)$")

    records = []
    current = None  # record being read: (number, title, content, is_legend)
    next_number = 1  # number the next regular tale header must carry
    in_content = False  # becomes True at the first non-blank body line

    for line in lines[first_content_index:]:
        stripped = line.strip()

        # Section heading for the legends: closes the open tale and is
        # itself never part of any tale's text.
        if "Children" in stripped and "Legends" in stripped:
            if current is not None:
                records.append(current)
                current = None
            continue

        # Gutenberg footer: nothing after this belongs to a tale.
        if stripped.startswith("START: FULL LICENSE") or (
            stripped.startswith("***") and "END OF" in stripped
        ):
            break

        header = None
        if match := variant_re.match(stripped):
            # e.g. "151* The Twelve Idle Servants" -> 151a
            header = (f"{int(match[1])}a", match[2], False)
        elif match := legend_re.match(stripped):
            header = (int(match[1]), match[2], True)
        elif (match := tale_re.match(stripped)) and int(match[1]) == next_number:
            # Only the next number in sequence is a real header. The check
            # happens BEFORE the open tale is closed, so a body line such as
            # "12 brothers ..." neither truncates the tale nor disturbs the
            # numbering; it falls through and is kept as content.
            header = (next_number, match[2], False)
            next_number += 1

        if header is not None:
            if current is not None:
                records.append(current)
            number, title, is_legend = header
            current = (number, title, [], is_legend)
            in_content = False
            continue

        if current is None:
            continue
        if stripped:
            in_content = True
        if in_content:  # blank lines before the first text line are dropped
            current[2].append(line)

    # Also covers a source that ends without the license/END marker.
    if current is not None:
        records.append(current)
    return records


def main():
    """Split the source file named on the command line into one file per tale.

    Reads ``sys.argv[1]``, writes every tale, variant and legend found into
    ``content/library/grimm`` (relative to the directory above this script's
    directory) via ``write_tale``, and prints a summary of how many numbered
    tales and legends were written. Prints a usage message and exits with
    status 1 when no source file is given.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <source_file>")
        sys.exit(1)

    source = Path(sys.argv[1])
    output_dir = Path(__file__).resolve().parent.parent / "content/library/grimm"
    # A fresh checkout may not contain the output directory yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    with source.open() as f:
        lines = f.readlines()

    tales_written = 0
    legends_written = 0
    for number, title, content, is_legend in _collect_tales(lines):
        write_tale(output_dir, number, title, content, is_legend)
        if is_legend:
            legends_written += 1
        elif isinstance(number, int):
            # Variants such as "151a" carry a str number and are not counted.
            tales_written += 1

    print(f"Written {tales_written} tales and {legends_written} legends")
|
||||
|
||||
|
||||
def write_tale(
    output_dir: Path,
    number: int | str,
    title: str,
    content: list[str],
    is_legend: bool,
):
    """Write a single tale to disk.

    The file name is built from ``number`` and the slugified ``title``:
    ``legend_NN_<slug>.txt`` for legends, ``<number>_<slug>.txt`` for string
    numbers (variants like "151a"), and ``NNN_<slug>.txt`` otherwise. The
    file holds the title, one blank line, then ``content`` with leading and
    trailing blank lines removed.

    Args:
        output_dir: Directory to write into; created if it does not exist.
        number: Tale number, or a string such as "151a" for a variant.
        title: Tale title, used for the first line and for the file name.
        content: Lines of the tale, each keeping its line ending. The list
            is not modified.
        is_legend: Whether the tale belongs to the legends section.
    """
    slug = slugify(title)

    if is_legend:
        filename = f"legend_{number:02d}_{slug}.txt"
    elif isinstance(number, str):
        # Variant like "151a"
        filename = f"{number}_{slug}.txt"
    else:
        filename = f"{number:03d}_{slug}.txt"

    # Locate the non-blank span by index instead of popping, so the caller's
    # list is left untouched and no repeated O(n) pop(0) calls are made.
    start = 0
    end = len(content)
    while start < end and not content[start].strip():
        start += 1
    while end > start and not content[end - 1].strip():
        end -= 1

    output_dir.mkdir(parents=True, exist_ok=True)
    filepath = output_dir / filename

    with filepath.open("w") as f:
        f.write(title + "\n")
        f.write("\n")
        f.writelines(content[start:end])

    print(f"Wrote {filepath.name}")
|
||||
|
||||
|
||||
# Run the splitter only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue