From 355795a9915a6e9c9ed06d6ad1bd7c94f03e20bc Mon Sep 17 00:00:00 2001 From: Jared Miller Date: Sat, 14 Feb 2026 12:34:34 -0500 Subject: [PATCH] Add bulk book import script --- scripts/import_books.py | 152 +++++++++++++++++++++++++++ tests/test_import_books.py | 203 +++++++++++++++++++++++++++++++++++++ 2 files changed, 355 insertions(+) create mode 100755 scripts/import_books.py create mode 100644 tests/test_import_books.py diff --git a/scripts/import_books.py b/scripts/import_books.py new file mode 100755 index 0000000..758ea5a --- /dev/null +++ b/scripts/import_books.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +"""Import .txt story files to TOML thing templates for readable books.""" + +from pathlib import Path + + +def parse_txt_file(path: Path) -> tuple[str, str]: + """ + Parse a .txt file with title on line 1, blank line, then content. + + Returns: + (title, text) tuple + """ + lines = path.read_text().splitlines() + + if len(lines) < 2: + raise ValueError(f"File too short: {path}") + + title = lines[0] + + if len(lines) > 1 and lines[1] != "": + raise ValueError(f"Expected blank line after title in {path}") + + # Join all lines after the blank line + text = "\n".join(lines[2:]) if len(lines) > 2 else "" + + return title, text + + +def generate_slug(filename: str) -> str: + """Convert filename to slug (remove .txt extension).""" + return filename.removesuffix(".txt") + + +def extract_alias_words(title: str) -> list[str]: + """Extract meaningful words from title, lowercased.""" + # Remove common articles and prepositions, keep hyphenated words + stopwords = {"the", "a", "an", "in", "on", "or", "and", "of", "to", "our"} + + # Split on spaces but preserve hyphens and apostrophes + words = title.lower().replace(",", "").split() + + return [w for w in words if w not in stopwords] + + +def generate_aliases(title: str) -> list[str]: + """Generate aliases from title.""" + words = extract_alias_words(title) + + aliases = [] + + # Full title without articles + full = 
" ".join(words) + if full: + aliases.append(full) + + # Individual meaningful words + aliases.extend(words) + + # Remove duplicates while preserving order + seen = set() + unique_aliases = [] + for alias in aliases: + if alias not in seen: + seen.add(alias) + unique_aliases.append(alias) + + return unique_aliases + + +def txt_to_toml(title: str, text: str) -> str: + """ + Generate TOML string for a thing template. + + Args: + title: Book title (becomes name field) + text: Story content (becomes readable_text field) + + Returns: + TOML-formatted string + """ + aliases = generate_aliases(title) + + # Build aliases list for TOML + aliases_str = ", ".join(f'"{a}"' for a in aliases) + + # Escape any triple quotes in the text + escaped_text = text.replace('"""', r"\"\"\"") + + toml = f'''name = "{title}" +description = "a leather-bound story book" +portable = true +aliases = [{aliases_str}] +readable_text = """ +{escaped_text}""" +''' + + return toml + + +def import_books(input_dir: Path, output_dir: Path) -> dict[str, str]: + """ + Import all .txt files from input_dir to .toml files in output_dir. 
+ + Returns: + Dict mapping slug -> title for all imported books + """ + input_dir = Path(input_dir) + output_dir = Path(output_dir) + + output_dir.mkdir(parents=True, exist_ok=True) + + imported = {} + + for txt_path in sorted(input_dir.glob("*.txt")): + slug = generate_slug(txt_path.name) + toml_path = output_dir / f"{slug}.toml" + + title, text = parse_txt_file(txt_path) + toml_content = txt_to_toml(title, text) + + toml_path.write_text(toml_content) + imported[slug] = title + + return imported + + +def main(): + """Main entry point for standalone script.""" + import sys + + if len(sys.argv) != 3: + print("Usage: import_books.py INPUT_DIR OUTPUT_DIR") + sys.exit(1) + + input_dir = Path(sys.argv[1]) + output_dir = Path(sys.argv[2]) + + if not input_dir.is_dir(): + print(f"Error: {input_dir} is not a directory") + sys.exit(1) + + print(f"Importing books from {input_dir} to {output_dir}...") + imported = import_books(input_dir, output_dir) + + print(f"\nImported {len(imported)} books:") + for slug, title in imported.items(): + print(f" {slug}.toml <- {title}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_import_books.py b/tests/test_import_books.py new file mode 100644 index 0000000..5329070 --- /dev/null +++ b/tests/test_import_books.py @@ -0,0 +1,203 @@ +"""Tests for bulk book import script.""" + +import tempfile +import tomllib +from pathlib import Path + +import pytest +from scripts.import_books import ( + extract_alias_words, + generate_aliases, + generate_slug, + parse_txt_file, + txt_to_toml, +) + + +def test_parse_txt_file_with_title_and_content(): + """Parse a basic .txt file with title and content.""" + content = "The Frog King\n\nOnce upon a time...\nThere was a princess." + with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f: + f.write(content) + f.flush() + + title, text = parse_txt_file(Path(f.name)) + + assert title == "The Frog King" + assert text == "Once upon a time...\nThere was a princess." 


def test_parse_txt_file_empty_content():
    """Parse file with only title and blank line."""
    content = "Title Only\n\n"
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
        f.write(content)
        f.flush()

    try:
        title, text = parse_txt_file(Path(f.name))
    finally:
        # delete=False requires an explicit unlink or the temp file leaks.
        Path(f.name).unlink()

    assert title == "Title Only"
    assert text == ""


def test_parse_txt_file_no_blank_line():
    """Parse file where second line is not blank."""
    content = "Title\nImmediate content"
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
        f.write(content)
        f.flush()

    try:
        with pytest.raises(ValueError, match="Expected blank line"):
            parse_txt_file(Path(f.name))
    finally:
        # delete=False requires an explicit unlink or the temp file leaks.
        Path(f.name).unlink()


def test_generate_slug_from_filename():
    """Convert filename to slug for TOML output."""
    result = generate_slug("001_the_frog_king_or_iron_henry.txt")
    assert result == "001_the_frog_king_or_iron_henry"
    assert generate_slug("002_cat_and_mouse.txt") == "002_cat_and_mouse"
    assert generate_slug("simple.txt") == "simple"


def test_extract_alias_words():
    """Extract meaningful words from title for aliases."""
    assert extract_alias_words("The Frog King") == ["frog", "king"]
    result = extract_alias_words("Cat and Mouse in Partnership")
    assert result == ["cat", "mouse", "partnership"]
    result = extract_alias_words("The Frog-King, or Iron Henry")
    assert result == ["frog-king", "iron", "henry"]
    assert extract_alias_words("Our Lady's Child") == ["lady's", "child"]


def test_generate_aliases():
    """Generate aliases from title."""
    # Basic title
    aliases = generate_aliases("The Frog King")
    assert "frog king" in aliases
    assert "frog" in aliases
    assert "king" in aliases

    # With punctuation - gets full phrase plus individual words
    aliases = generate_aliases("The Frog-King, or Iron Henry")
    assert "frog-king iron henry" in aliases
    assert "frog-king" in aliases
    assert "iron" in aliases
    assert "henry" in aliases

    # Single word title should not generate meaningless aliases
    aliases = generate_aliases("Single")
    assert aliases == ["single"]


def test_txt_to_toml_basic():
    """Generate valid TOML from title and text."""
    title = "The Frog King"
    text = "Once upon a time..."

    toml_str = txt_to_toml(title, text)

    # Parse the generated TOML to verify it's valid
    data = tomllib.loads(toml_str)

    assert data["name"] == "The Frog King"
    assert data["description"] == "a leather-bound story book"
    assert data["portable"] is True
    assert "frog king" in data["aliases"]
    assert data["readable_text"] == "Once upon a time..."


def test_txt_to_toml_multiline_text():
    """Generate TOML with multiline readable_text."""
    title = "Test Story"
    text = "Line 1\nLine 2\nLine 3"

    toml_str = txt_to_toml(title, text)
    data = tomllib.loads(toml_str)

    assert data["readable_text"] == "Line 1\nLine 2\nLine 3"


def test_txt_to_toml_empty_text():
    """Generate TOML with empty readable_text."""
    title = "Empty Story"
    text = ""

    toml_str = txt_to_toml(title, text)
    data = tomllib.loads(toml_str)

    assert data["readable_text"] == ""


def test_full_pipeline_single_file(tmp_path):
    """Test complete pipeline from .txt to .toml."""
    from scripts.import_books import import_books

    # Create input directory with one file
    input_dir = tmp_path / "input"
    input_dir.mkdir()

    txt_file = input_dir / "001_the_frog_king.txt"
    txt_file.write_text("The Frog King\n\nOnce upon a time...")

    # Create output directory
    output_dir = tmp_path / "output"
    output_dir.mkdir()

    # Run import
    import_books(input_dir, output_dir)

    # Verify output file was created
    toml_file = output_dir / "001_the_frog_king.toml"
    assert toml_file.exists()

    # Verify contents
    with open(toml_file, "rb") as f:
        data = tomllib.load(f)

    assert data["name"] == "The Frog King"
    assert data["readable_text"] == "Once upon a time..."


def test_full_pipeline_multiple_files(tmp_path):
    """Test pipeline with multiple files."""
    from scripts.import_books import import_books

    src = tmp_path / "input"
    src.mkdir()

    # Fixture table: slug -> (title, body) for each story file.
    stories = {
        "001_story_one": ("Story One", "Text one"),
        "002_story_two": ("Story Two", "Text two"),
        "003_story_three": ("Story Three", "Text three"),
    }
    for slug, (title, body) in stories.items():
        (src / f"{slug}.txt").write_text(f"{title}\n\n{body}")

    dst = tmp_path / "output"
    dst.mkdir()

    import_books(src, dst)

    # Every input story must have produced a matching .toml file.
    for slug in stories:
        assert (dst / f"{slug}.toml").exists()


def test_full_pipeline_skips_non_txt(tmp_path):
    """Pipeline should only process .txt files."""
    from scripts.import_books import import_books

    src = tmp_path / "input"
    src.mkdir()

    (src / "story.txt").write_text("Title\n\nContent")
    (src / "README.md").write_text("# Not a story")
    (src / "data.json").write_text("{}")

    dst = tmp_path / "output"
    dst.mkdir()

    import_books(src, dst)

    # Only the .txt file should generate output
    assert (dst / "story.toml").exists()
    assert not (dst / "README.toml").exists()
    assert not (dst / "data.toml").exists()