Add bulk book import script

2026-02-14 12:34:34 -05:00 · 2026-02-14 12:34:34 -05:00 · 355795a991
commit 355795a991
parent a0360f221c
2 changed files with 355 additions and 0 deletions
--- a/scripts/import_books.py
+++ b/scripts/import_books.py
@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+"""Import .txt story files to TOML thing templates for readable books."""
+
+from pathlib import Path
+
+
+def parse_txt_file(path: Path) -> tuple[str, str]:
+    """
+    Parse a .txt file with title on line 1, blank line, then content.
+
+    The file is decoded as UTF-8; a leading byte-order mark, if present, is
+    dropped so it does not end up inside the title.
+
+    Args:
+        path: Location of the story file to read.
+
+    Returns:
+        (title, text) tuple; text is "" when nothing follows the blank line.
+
+    Raises:
+        ValueError: If the file has fewer than two lines, or its second line
+            contains anything other than whitespace.
+    """
+    # Decode explicitly instead of relying on the platform's locale encoding.
+    lines = path.read_text(encoding="utf-8-sig").splitlines()
+
+    if len(lines) < 2:
+        raise ValueError(f"File too short: {path}")
+
+    title, separator, *body = lines
+
+    # A separator holding only spaces or tabs still looks blank in an editor.
+    if separator.strip():
+        raise ValueError(f"Expected blank line after title in {path}")
+
+    return title, "\n".join(body)
+
+
+def generate_slug(filename: str) -> str:
+    """Return the slug for filename: the name minus one trailing ".txt"."""
+    extension = ".txt"
+    if filename.endswith(extension):
+        return filename[: -len(extension)]
+    return filename
+
+
+def extract_alias_words(title: str) -> list[str]:
+    """
+    Extract meaningful words from title, lowercased.
+
+    Commas are removed, the title is split on whitespace, and punctuation is
+    trimmed from both ends of every word, so 'Bird:' and '"Golden"' become
+    'bird' and 'golden'. Hyphens and apostrophes inside a word are kept
+    ('frog-king', "lady's"). Stopwords and words made up only of punctuation
+    are dropped.
+
+    Args:
+        title: Book title in any letter case.
+
+    Returns:
+        Remaining words in their original order; may be empty.
+    """
+    import string
+
+    # Common articles, conjunctions and prepositions that make poor aliases.
+    stopwords = {"the", "a", "an", "in", "on", "or", "and", "of", "to", "our"}
+
+    # Trim only the edges so hyphenated and possessive words stay intact.
+    words = (
+        word.strip(string.punctuation)
+        for word in title.lower().replace(",", "").split()
+    )
+
+    return [word for word in words if word and word not in stopwords]
+
+
+def generate_aliases(title: str) -> list[str]:
+    """
+    Build the alias list for a book title.
+
+    The first candidate is every meaningful word of the title joined by
+    single spaces, followed by each meaningful word on its own. Repeated
+    candidates are dropped, keeping the first occurrence.
+    """
+    meaningful = extract_alias_words(title)
+    phrase = " ".join(meaningful)
+
+    # An empty phrase means the title had no meaningful words at all.
+    candidates = [phrase, *meaningful] if phrase else list(meaningful)
+
+    # dict keys are unique and keep insertion order, which removes
+    # duplicates without reordering.
+    return list(dict.fromkeys(candidates))
+
+
+def _escape_toml_basic(value: str, *, multiline: bool = False) -> str:
+    """Escape value for a TOML basic string, or a multi-line basic string."""
+    pieces = []
+    for char in value:
+        if char == "\\":
+            pieces.append("\\\\")
+        elif char == '"':
+            # Escaping every quote also rules out an accidental closing
+            # delimiter inside a multi-line string.
+            pieces.append('\\"')
+        elif char == "\t" or (multiline and char == "\n"):
+            # Tabs may always appear raw; newlines only in multi-line strings.
+            pieces.append(char)
+        elif char < " " or char == "\x7f":
+            # Every other control character has to be written as \uXXXX.
+            pieces.append(f"\\u{ord(char):04X}")
+        else:
+            pieces.append(char)
+    return "".join(pieces)
+
+
+def txt_to_toml(title: str, text: str) -> str:
+    """
+    Generate TOML string for a thing template.
+
+    Backslashes, double quotes and control characters in the title, the
+    aliases and the text are escaped, so arbitrary story content still
+    yields a parseable document that loads back to the same strings.
+
+    Args:
+        title: Book title (becomes name field)
+        text: Story content (becomes readable_text field)
+
+    Returns:
+        TOML-formatted string
+    """
+    aliases_str = ", ".join(
+        f'"{_escape_toml_basic(alias)}"' for alias in generate_aliases(title)
+    )
+    name = _escape_toml_basic(title)
+    body = _escape_toml_basic(text, multiline=True)
+
+    # TOML parsers trim the newline right after the opening delimiter, so
+    # text that itself starts with a newline is preserved.
+    toml = f'''name = "{name}"
+description = "a leather-bound story book"
+portable = true
+aliases = [{aliases_str}]
+readable_text = """
+{body}"""
+'''
+
+    return toml
+
+
+def import_books(input_dir: Path, output_dir: Path) -> dict[str, str]:
+    """
+    Import all .txt files from input_dir to .toml files in output_dir.
+
+    Files are processed in sorted path order. Each output is named after its
+    source file with the .txt extension replaced by .toml, and is written as
+    UTF-8, the encoding TOML documents are required to use. Directory
+    entries that match *.txt but are not regular files are skipped.
+
+    Args:
+        input_dir: Directory scanned (non-recursively) for *.txt files.
+        output_dir: Directory receiving the .toml files; created if missing.
+
+    Returns:
+        Dict mapping slug -> title for all imported books
+    """
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    imported = {}
+
+    for txt_path in sorted(input_dir.glob("*.txt")):
+        # A directory named like "notes.txt" also matches the pattern.
+        if not txt_path.is_file():
+            continue
+
+        slug = generate_slug(txt_path.name)
+        title, text = parse_txt_file(txt_path)
+
+        # Never fall back to the locale's default encoding for TOML output.
+        toml_path = output_dir / f"{slug}.toml"
+        toml_path.write_text(txt_to_toml(title, text), encoding="utf-8")
+        imported[slug] = title
+
+    return imported
+
+
+def main():
+    """
+    Main entry point for standalone script.
+
+    Expects exactly two command-line arguments, INPUT_DIR and OUTPUT_DIR.
+    Usage and error messages go to stderr and the process exits with
+    status 1; progress and the summary of imported books go to stdout.
+    """
+    import sys
+
+    if len(sys.argv) != 3:
+        print("Usage: import_books.py INPUT_DIR OUTPUT_DIR", file=sys.stderr)
+        sys.exit(1)
+
+    input_dir = Path(sys.argv[1])
+    output_dir = Path(sys.argv[2])
+
+    if not input_dir.is_dir():
+        print(f"Error: {input_dir} is not a directory", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Importing books from {input_dir} to {output_dir}...")
+    try:
+        imported = import_books(input_dir, output_dir)
+    except (OSError, ValueError) as exc:
+        # Malformed or unreadable story files are user errors, so report
+        # them briefly instead of dumping a traceback.
+        print(f"Error: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"\nImported {len(imported)} books:")
+    for slug, title in imported.items():
+        print(f"  {slug}.toml <- {title}")
+
+
+# Run the importer only when this file is executed as a script; importing
+# the module (as the test suite does) must not trigger main().
+if __name__ == "__main__":
+    main()
--- a/tests/test_import_books.py
+++ b/tests/test_import_books.py
@ -0,0 +1,203 @@
+"""Tests for bulk book import script."""
+
+import tempfile
+import tomllib
+from pathlib import Path
+
+import pytest
+from scripts.import_books import (
+    extract_alias_words,
+    generate_aliases,
+    generate_slug,
+    parse_txt_file,
+    txt_to_toml,
+)
+
+
+def test_parse_txt_file_with_title_and_content(tmp_path):
+    """Parse a basic .txt file with title and content."""
+    # pytest owns and prunes tmp_path, so no stray temporary file is left
+    # on disk the way NamedTemporaryFile(delete=False) would leave one.
+    story = tmp_path / "story.txt"
+    content = "The Frog King\n\nOnce upon a time...\nThere was a princess."
+    story.write_text(content)
+
+    title, text = parse_txt_file(story)
+
+    assert title == "The Frog King"
+    assert text == "Once upon a time...\nThere was a princess."
+
+
+def test_parse_txt_file_empty_content(tmp_path):
+    """Parse file with only title and blank line."""
+    # pytest owns and prunes tmp_path, so no stray temporary file is left
+    # on disk the way NamedTemporaryFile(delete=False) would leave one.
+    story = tmp_path / "story.txt"
+    story.write_text("Title Only\n\n")
+
+    title, text = parse_txt_file(story)
+
+    assert title == "Title Only"
+    assert text == ""
+
+
+def test_parse_txt_file_no_blank_line(tmp_path):
+    """Parse file where second line is not blank."""
+    # pytest owns and prunes tmp_path, so no stray temporary file is left
+    # on disk the way NamedTemporaryFile(delete=False) would leave one.
+    story = tmp_path / "story.txt"
+    story.write_text("Title\nImmediate content")
+
+    with pytest.raises(ValueError, match="Expected blank line"):
+        parse_txt_file(story)
+
+
+def test_generate_slug_from_filename():
+    """Slug is the file name with its .txt extension removed."""
+    expected_slugs = {
+        "001_the_frog_king_or_iron_henry.txt": "001_the_frog_king_or_iron_henry",
+        "002_cat_and_mouse.txt": "002_cat_and_mouse",
+        "simple.txt": "simple",
+    }
+
+    for filename, slug in expected_slugs.items():
+        assert generate_slug(filename) == slug
+
+
+def test_extract_alias_words():
+    """Stopwords and commas are dropped; hyphens and apostrophes survive."""
+    cases = [
+        ("The Frog King", ["frog", "king"]),
+        ("Cat and Mouse in Partnership", ["cat", "mouse", "partnership"]),
+        ("The Frog-King, or Iron Henry", ["frog-king", "iron", "henry"]),
+        ("Our Lady's Child", ["lady's", "child"]),
+    ]
+
+    for title, expected in cases:
+        assert extract_alias_words(title) == expected
+
+
+def test_generate_aliases():
+    """Aliases hold the stopword-free phrase and each of its words."""
+    # Plain title: the joined phrase plus every individual word.
+    plain = set(generate_aliases("The Frog King"))
+    assert {"frog king", "frog", "king"} <= plain
+
+    # Punctuated title: the comma goes, the hyphenated word stays whole.
+    punctuated = set(generate_aliases("The Frog-King, or Iron Henry"))
+    assert {"frog-king iron henry", "frog-king", "iron", "henry"} <= punctuated
+
+    # One-word title: phrase and word coincide, so only one alias remains.
+    assert generate_aliases("Single") == ["single"]
+
+
+def test_txt_to_toml_basic():
+    """Output parses as TOML and carries every expected field."""
+    # Loading through tomllib doubles as the well-formedness check.
+    parsed = tomllib.loads(txt_to_toml("The Frog King", "Once upon a time..."))
+
+    assert parsed["name"] == "The Frog King"
+    assert parsed["description"] == "a leather-bound story book"
+    assert parsed["portable"] is True
+    assert "frog king" in parsed["aliases"]
+    assert parsed["readable_text"] == "Once upon a time..."
+
+
+def test_txt_to_toml_multiline_text():
+    """Line breaks inside the story text survive the TOML round trip."""
+    story = "\n".join(["Line 1", "Line 2", "Line 3"])
+
+    parsed = tomllib.loads(txt_to_toml("Test Story", story))
+
+    assert parsed["readable_text"] == "Line 1\nLine 2\nLine 3"
+
+
+def test_txt_to_toml_empty_text():
+    """An empty story yields an empty readable_text value."""
+    parsed = tomllib.loads(txt_to_toml("Empty Story", ""))
+
+    assert parsed["readable_text"] == ""
+
+
+def test_full_pipeline_single_file(tmp_path):
+    """One .txt input becomes one parseable .toml output."""
+    from scripts.import_books import import_books
+
+    # Lay out separate input and output directories under tmp_path.
+    source_dir, target_dir = tmp_path / "input", tmp_path / "output"
+    source_dir.mkdir()
+    target_dir.mkdir()
+    (source_dir / "001_the_frog_king.txt").write_text(
+        "The Frog King\n\nOnce upon a time..."
+    )
+
+    import_books(source_dir, target_dir)
+
+    # The output keeps the input's base name and gains a .toml extension.
+    result = target_dir / "001_the_frog_king.toml"
+    assert result.exists()
+
+    with result.open("rb") as handle:
+        parsed = tomllib.load(handle)
+
+    assert parsed["name"] == "The Frog King"
+    assert parsed["readable_text"] == "Once upon a time..."
+
+
+def test_full_pipeline_multiple_files(tmp_path):
+    """Every .txt file in the input directory gets its own .toml file."""
+    from scripts.import_books import import_books
+
+    source_dir = tmp_path / "input"
+    source_dir.mkdir()
+    target_dir = tmp_path / "output"
+    target_dir.mkdir()
+
+    # Base name -> file content for each story to import.
+    stories = {
+        "001_story_one": "Story One\n\nText one",
+        "002_story_two": "Story Two\n\nText two",
+        "003_story_three": "Story Three\n\nText three",
+    }
+    for stem, content in stories.items():
+        (source_dir / f"{stem}.txt").write_text(content)
+
+    import_books(source_dir, target_dir)
+
+    for stem in stories:
+        assert (target_dir / f"{stem}.toml").exists()
+
+
+def test_full_pipeline_skips_non_txt(tmp_path):
+    """Files without a .txt extension produce no output."""
+    from scripts.import_books import import_books
+
+    source_dir = tmp_path / "input"
+    source_dir.mkdir()
+    target_dir = tmp_path / "output"
+    target_dir.mkdir()
+
+    fixtures = [
+        ("story.txt", "Title\n\nContent"),
+        ("README.md", "# Not a story"),
+        ("data.json", "{}"),
+    ]
+    for filename, content in fixtures:
+        (source_dir / filename).write_text(content)
+
+    import_books(source_dir, target_dir)
+
+    # Only the .txt input may show up among the generated files.
+    produced = {entry.name for entry in target_dir.iterdir()}
+    assert "story.toml" in produced
+    assert "README.toml" not in produced
+    assert "data.toml" not in produced