Add bulk book import script
This commit is contained in:
parent
a0360f221c
commit
355795a991
2 changed files with 355 additions and 0 deletions
152
scripts/import_books.py
Executable file
152
scripts/import_books.py
Executable file
|
|
@ -0,0 +1,152 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Import .txt story files to TOML thing templates for readable books."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def parse_txt_file(path: Path) -> tuple[str, str]:
    """
    Parse a .txt file with title on line 1, blank line, then content.

    Args:
        path: Path to the .txt story file.

    Returns:
        (title, text) tuple; ``text`` is everything after the blank line
        joined with newlines, or "" when there is no content.

    Raises:
        ValueError: If the file has fewer than two lines, or if the
            second line is not blank.
    """
    # Read explicitly as UTF-8 so parsing doesn't depend on the
    # platform's default locale encoding.
    lines = path.read_text(encoding="utf-8").splitlines()

    if len(lines) < 2:
        raise ValueError(f"File too short: {path}")

    title = lines[0]

    # len(lines) >= 2 is guaranteed by the check above, so indexing
    # line 1 directly is safe (the old `len(lines) > 1 and` guard was
    # redundant).
    if lines[1] != "":
        raise ValueError(f"Expected blank line after title in {path}")

    # Join all lines after the blank line.
    text = "\n".join(lines[2:]) if len(lines) > 2 else ""

    return title, text
|
|
||||||
|
def generate_slug(filename: str) -> str:
    """Return *filename* with any trailing ``.txt`` extension removed."""
    slug = filename.removesuffix(".txt")
    return slug
|
|
||||||
|
def extract_alias_words(title: str) -> list[str]:
    """Extract meaningful words from title, lowercased."""
    # Articles, prepositions, and conjunctions make poor aliases.
    ignore = {"the", "a", "an", "in", "on", "or", "and", "of", "to", "our"}

    # Lowercase, drop commas, split on whitespace; hyphenated words and
    # apostrophes are kept intact.
    normalized = title.lower().replace(",", "")

    meaningful = []
    for word in normalized.split():
        if word in ignore:
            continue
        meaningful.append(word)
    return meaningful
|
|
||||||
|
def generate_aliases(title: str) -> list[str]:
    """Generate aliases from title: the full de-stopworded phrase first,
    then each individual meaningful word, with duplicates removed."""
    words = extract_alias_words(title)

    candidates: list[str] = []

    # Full title without articles, as a single phrase.
    phrase = " ".join(words)
    if phrase:
        candidates.append(phrase)

    # Each meaningful word on its own.
    candidates.extend(words)

    # dict.fromkeys preserves first-seen order while dropping repeats.
    return list(dict.fromkeys(candidates))
|
||||||
|
def txt_to_toml(title: str, text: str) -> str:
|
||||||
|
"""
|
||||||
|
Generate TOML string for a thing template.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
title: Book title (becomes name field)
|
||||||
|
text: Story content (becomes readable_text field)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
TOML-formatted string
|
||||||
|
"""
|
||||||
|
aliases = generate_aliases(title)
|
||||||
|
|
||||||
|
# Build aliases list for TOML
|
||||||
|
aliases_str = ", ".join(f'"{a}"' for a in aliases)
|
||||||
|
|
||||||
|
# Escape any triple quotes in the text
|
||||||
|
escaped_text = text.replace('"""', r"\"\"\"")
|
||||||
|
|
||||||
|
toml = f'''name = "{title}"
|
||||||
|
description = "a leather-bound story book"
|
||||||
|
portable = true
|
||||||
|
aliases = [{aliases_str}]
|
||||||
|
readable_text = """
|
||||||
|
{escaped_text}"""
|
||||||
|
'''
|
||||||
|
|
||||||
|
return toml
|
||||||
|
|
||||||
|
|
||||||
|
def import_books(input_dir: Path, output_dir: Path) -> dict[str, str]:
    """
    Import all .txt files from input_dir to .toml files in output_dir.

    Args:
        input_dir: Directory scanned (non-recursively) for ``*.txt``.
        output_dir: Destination for ``.toml`` files; created if missing.

    Returns:
        Dict mapping slug -> title for all imported books

    Raises:
        ValueError: Propagated from parse_txt_file for a malformed
            input; files processed earlier in sorted order have already
            been written when this happens.
    """
    # Accept plain strings as well as Path objects.
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)

    output_dir.mkdir(parents=True, exist_ok=True)

    imported: dict[str, str] = {}

    # Sorted so the processing order is deterministic across platforms.
    for txt_path in sorted(input_dir.glob("*.txt")):
        slug = generate_slug(txt_path.name)
        toml_path = output_dir / f"{slug}.toml"

        title, text = parse_txt_file(txt_path)
        toml_content = txt_to_toml(title, text)

        # Write explicitly as UTF-8 so output doesn't depend on the
        # platform's default locale encoding.
        toml_path.write_text(toml_content, encoding="utf-8")
        imported[slug] = title

    return imported
|
|
||||||
|
def main():
    """Main entry point for standalone script."""
    import sys

    # Expect exactly two positional arguments after the script name.
    args = sys.argv[1:]
    if len(args) != 2:
        print("Usage: import_books.py INPUT_DIR OUTPUT_DIR")
        sys.exit(1)

    input_dir, output_dir = Path(args[0]), Path(args[1])

    if not input_dir.is_dir():
        print(f"Error: {input_dir} is not a directory")
        sys.exit(1)

    print(f"Importing books from {input_dir} to {output_dir}...")
    imported = import_books(input_dir, output_dir)

    # Summarize what was written.
    print(f"\nImported {len(imported)} books:")
    for slug, title in imported.items():
        print(f" {slug}.toml <- {title}")
|
||||||
|
# Entry-point guard: run main() only when executed as a script,
# not when imported (e.g. by the test suite).
if __name__ == "__main__":
    main()
||||||
203
tests/test_import_books.py
Normal file
203
tests/test_import_books.py
Normal file
|
|
@ -0,0 +1,203 @@
|
||||||
|
"""Tests for bulk book import script."""
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
import tomllib
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from scripts.import_books import (
|
||||||
|
extract_alias_words,
|
||||||
|
generate_aliases,
|
||||||
|
generate_slug,
|
||||||
|
parse_txt_file,
|
||||||
|
txt_to_toml,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_txt_file_with_title_and_content():
    """Parse a basic .txt file with title and content."""
    content = "The Frog King\n\nOnce upon a time...\nThere was a princess."
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
        f.write(content)
        f.flush()

    try:
        title, text = parse_txt_file(Path(f.name))

        assert title == "The Frog King"
        assert text == "Once upon a time...\nThere was a princess."
    finally:
        # delete=False leaves the file behind; remove it explicitly so
        # repeated runs don't litter the temp directory.
        Path(f.name).unlink()
|
|
||||||
|
def test_parse_txt_file_empty_content():
    """Parse file with only title and blank line."""
    content = "Title Only\n\n"
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
        f.write(content)
        f.flush()

    try:
        title, text = parse_txt_file(Path(f.name))

        assert title == "Title Only"
        assert text == ""
    finally:
        # delete=False leaves the file behind; remove it explicitly.
        Path(f.name).unlink()
|
|
||||||
|
def test_parse_txt_file_no_blank_line():
    """Parse file where second line is not blank."""
    content = "Title\nImmediate content"
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
        f.write(content)
        f.flush()

    try:
        with pytest.raises(ValueError, match="Expected blank line"):
            parse_txt_file(Path(f.name))
    finally:
        # delete=False leaves the file behind; remove it explicitly.
        Path(f.name).unlink()
|
||||||
|
def test_generate_slug_from_filename():
    """Convert filename to slug for TOML output."""
    expected_slugs = {
        "001_the_frog_king_or_iron_henry.txt": "001_the_frog_king_or_iron_henry",
        "002_cat_and_mouse.txt": "002_cat_and_mouse",
        "simple.txt": "simple",
    }
    for filename, slug in expected_slugs.items():
        assert generate_slug(filename) == slug
|
||||||
|
def test_extract_alias_words():
    """Extract meaningful words from title for aliases."""
    expectations = [
        ("The Frog King", ["frog", "king"]),
        ("Cat and Mouse in Partnership", ["cat", "mouse", "partnership"]),
        # Hyphenated words survive; the comma is stripped.
        ("The Frog-King, or Iron Henry", ["frog-king", "iron", "henry"]),
        # Apostrophes are kept inside words.
        ("Our Lady's Child", ["lady's", "child"]),
    ]
    for title, expected in expectations:
        assert extract_alias_words(title) == expected
|
||||||
|
def test_generate_aliases():
    """Generate aliases from title."""
    # Basic title: full phrase plus each meaningful word.
    aliases = generate_aliases("The Frog King")
    for expected in ("frog king", "frog", "king"):
        assert expected in aliases

    # With punctuation - gets full phrase plus individual words.
    aliases = generate_aliases("The Frog-King, or Iron Henry")
    for expected in ("frog-king iron henry", "frog-king", "iron", "henry"):
        assert expected in aliases

    # A single-word title yields exactly that one alias.
    assert generate_aliases("Single") == ["single"]
|
||||||
|
def test_txt_to_toml_basic():
    """Generate valid TOML from title and text."""
    toml_str = txt_to_toml("The Frog King", "Once upon a time...")

    # Round-tripping through tomllib proves the output is valid TOML.
    data = tomllib.loads(toml_str)

    assert data["name"] == "The Frog King"
    assert data["description"] == "a leather-bound story book"
    assert data["portable"] is True
    assert "frog king" in data["aliases"]
    assert data["readable_text"] == "Once upon a time..."
|
|
||||||
|
def test_txt_to_toml_multiline_text():
    """Generate TOML with multiline readable_text."""
    story = "Line 1\nLine 2\nLine 3"

    # The parsed value must round-trip the newlines exactly.
    data = tomllib.loads(txt_to_toml("Test Story", story))
    assert data["readable_text"] == story
|
|
||||||
|
def test_txt_to_toml_empty_text():
    """Generate TOML with empty readable_text."""
    # An empty story must still produce valid TOML with an empty field.
    data = tomllib.loads(txt_to_toml("Empty Story", ""))
    assert data["readable_text"] == ""
|
|
||||||
|
def test_full_pipeline_single_file(tmp_path):
    """Test complete pipeline from .txt to .toml."""
    from scripts.import_books import import_books

    # Input directory containing a single story file.
    src = tmp_path / "input"
    src.mkdir()
    (src / "001_the_frog_king.txt").write_text(
        "The Frog King\n\nOnce upon a time..."
    )

    dst = tmp_path / "output"
    dst.mkdir()

    import_books(src, dst)

    # The matching .toml file should exist and parse cleanly.
    result = dst / "001_the_frog_king.toml"
    assert result.exists()

    with open(result, "rb") as fh:
        data = tomllib.load(fh)

    assert data["name"] == "The Frog King"
    assert data["readable_text"] == "Once upon a time..."
|
||||||
|
def test_full_pipeline_multiple_files(tmp_path):
    """Test pipeline with multiple files."""
    from scripts.import_books import import_books

    src = tmp_path / "input"
    src.mkdir()

    # Three stories, created from a mapping to keep the fixture compact.
    stories = {
        "001_story_one.txt": "Story One\n\nText one",
        "002_story_two.txt": "Story Two\n\nText two",
        "003_story_three.txt": "Story Three\n\nText three",
    }
    for filename, body in stories.items():
        (src / filename).write_text(body)

    dst = tmp_path / "output"
    dst.mkdir()

    import_books(src, dst)

    # Every .txt input should have produced a matching .toml output.
    for filename in stories:
        assert (dst / filename.replace(".txt", ".toml")).exists()
|
|
||||||
|
def test_full_pipeline_skips_non_txt(tmp_path):
    """Pipeline should only process .txt files."""
    from scripts.import_books import import_books

    src = tmp_path / "input"
    src.mkdir()

    # One real story alongside two files of other types.
    (src / "story.txt").write_text("Title\n\nContent")
    (src / "README.md").write_text("# Not a story")
    (src / "data.json").write_text("{}")

    dst = tmp_path / "output"
    dst.mkdir()

    import_books(src, dst)

    # Only the .txt input is converted; other extensions are ignored.
    assert (dst / "story.toml").exists()
    assert not (dst / "README.toml").exists()
    assert not (dst / "data.toml").exists()
Loading…
Reference in a new issue