Add initial generation

commit ac8c2a490c
4 changed files with 203 additions and 0 deletions
.gitignore (vendored, new file)
@@ -0,0 +1,4 @@
# docker volumes are external, nothing to ignore really
# but good practice to have this file
*.log
.env
README.md (new file)
@@ -0,0 +1,86 @@
# local-llm-stack

run llms locally on cpu. slow but complete.

## what's in the box

```
ollama (port 11434) - runs the models
        ↓
open-webui (port 3001) - chat interface + RAG
        ↓
chroma (port 8007) - vector database for document retrieval
```

## quickstart

```bash
just up              # start everything
just pull tinyllama  # download a small model (~600MB)
just open            # open the web ui at localhost:3001
```

## the stack explained

**ollama** - the inference engine. downloads models, loads them into memory, generates tokens. uses llama.cpp under the hood, which is well optimized for cpu.

**open-webui** - web interface for chatting. also handles:

- document upload (pdf, txt, etc)
- embedding documents into vectors
- RAG (retrieval-augmented generation)
- conversation history

**chroma** - the vector database. when you upload docs:

1. open-webui chunks the text
2. an embedding model converts the chunks to vectors
3. the vectors are stored in chroma
4. when you ask a question, similar chunks are retrieved
5. the retrieved chunks are injected into the prompt as context

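step 5 is plain string assembly: the retrieved chunks get pasted into the prompt ahead of the question. a minimal sketch against ollama's `/api/generate` endpoint - the `CONTEXT` value here is a hypothetical chunk standing in for what chroma would actually return:

```shell
# hypothetical retrieved chunk - stands in for what chroma returns
CONTEXT="The warranty period is 24 months from purchase."
QUESTION="How long is the warranty?"

# step 5: inject the chunk into the prompt as context
BODY=$(printf '{"model":"tinyllama","prompt":"Context: %s\\n\\nQuestion: %s","stream":false}' \
  "$CONTEXT" "$QUESTION")
echo "$BODY"

# send it once the stack is up:
# curl -s http://localhost:11434/api/generate -d "$BODY" | jq -r .response
```

open-webui does the same thing with a longer prompt template; this just shows the shape of the request.
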
## models for cpu

| model | params | ram needed | speed |
|-------|--------|------------|-------|
| qwen2:0.5b | 0.5B | ~1GB | fast |
| tinyllama | 1.1B | ~2GB | fast |
| gemma2:2b | 2B | ~3GB | ok |
| phi3:mini | 3.8B | ~4GB | slow |

```bash
just pull qwen2:0.5b
just pull tinyllama
just recommend  # see all options
```

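the "ram needed" column counts against *available* memory, not total. a quick linux-only check of what's actually free before pulling a bigger model:

```shell
# free memory in GB, from the kernel's own estimate in /proc/meminfo
avail_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo)
echo "available: $((avail_kb / 1024 / 1024)) GB"
```
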
## useful commands

```bash
just up      # start
just down    # stop
just logs    # watch all logs
just models  # list downloaded models
just stats   # cpu/mem usage
just nuke    # delete everything, including data
```

## testing rag

1. open http://localhost:3001
2. click workspace (top left) > documents
3. upload a pdf or txt file
4. start a chat, click the + button, and attach the document
5. ask questions about it

## what this isn't

this is inference, not training. we're not teaching the models anything - just running models that others trained. the "learning" in machine learning happened elsewhere, on gpu clusters; we're just using the results.

## hardware notes

tested on an intel i5-6500t (no gpu). expect:

- ~2-5 tokens/sec with tinyllama
- ~1-2 tokens/sec with phi3:mini
- a slow first response (model loading)
- faster subsequent responses (the model stays in ram)

more ram = bigger models. 16gb should handle 7B models (slowly).
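to put numbers on "slow": ollama's `/api/generate` response reports `eval_count` (tokens generated) and `eval_duration` (nanoseconds), so tokens/sec falls out directly. a sketch with sample numbers - swap in a real response once the stack is up:

```shell
# sample response fields - a real one comes from e.g.:
#   curl -s http://localhost:11434/api/generate \
#     -d '{"model":"tinyllama","prompt":"hi","stream":false}'
RESP='{"eval_count":64,"eval_duration":21300000000}'

# tokens per second = eval_count / (eval_duration in seconds)
echo "$RESP" | awk -F'[:,}]' '{printf "%.1f tok/s\n", $2 / ($4 / 1e9)}'
# -> 3.0 tok/s
```
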
compose.yml (new file)
@@ -0,0 +1,54 @@
services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    volumes:
      - ollama_data:/root/.ollama
    ports:
      - "11434:11434"
    restart: unless-stopped
    # no gpu, cpu only - it'll be slow but it works
    # add this if you ever get a gpu:
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: all
    #           capabilities: [gpu]

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    volumes:
      - open_webui_data:/app/backend/data
    ports:
      - "3001:8080"
    environment:
      - OLLAMA_BASE_URL=http://ollama:11434
      - WEBUI_AUTH=false
      # use external chroma for RAG
      - CHROMA_HTTP_HOST=chroma
      - CHROMA_HTTP_PORT=8000
      - RAG_EMBEDDING_MODEL=all-MiniLM-L6-v2
    depends_on:
      - ollama
      - chroma
    restart: unless-stopped

  chroma:
    image: chromadb/chroma:latest
    container_name: chroma
    volumes:
      - chroma_data:/chroma/chroma
    ports:
      - "8007:8000"
    environment:
      - IS_PERSISTENT=TRUE
      - ANONYMIZED_TELEMETRY=FALSE
    restart: unless-stopped

volumes:
  ollama_data:
  open_webui_data:
  chroma_data:
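one gap worth knowing about: `depends_on` in the list form above only waits for containers to *start*, not to be ready, so open-webui can race ollama on first boot. a healthcheck closes that gap - a sketch to merge into the file above, assuming `ollama list` exits non-zero until the server inside the container answers:

```yaml
services:
  ollama:
    healthcheck:
      test: ["CMD", "ollama", "list"]   # succeeds once the api responds
      interval: 10s
      retries: 5
  open-webui:
    depends_on:
      ollama:
        condition: service_healthy
```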
justfile (new file)
@@ -0,0 +1,59 @@
# local-llm-stack commands

# start everything
up:
    docker compose up -d

# stop everything
down:
    docker compose down

# view logs (all services)
logs:
    docker compose logs -f

# view logs for a specific service
log service:
    docker compose logs -f {{service}}

# pull a model (default: tinyllama - small and fast on cpu)
pull model="tinyllama":
    docker exec ollama ollama pull {{model}}

# list downloaded models
models:
    docker exec ollama ollama list

# run a quick test prompt
test model="tinyllama":
    docker exec ollama ollama run {{model}} "Say hello in exactly 5 words"

# open the web ui
open:
    xdg-open http://localhost:3001

# check status of all services
status:
    docker compose ps

# restart a service
restart service:
    docker compose restart {{service}}

# nuke everything (volumes too)
nuke:
    docker compose down -v

# show resource usage
stats:
    docker stats ollama open-webui chroma --no-stream

# recommended small models for cpu
recommend:
    @echo "models that won't melt your cpu:"
    @echo "  tinyllama  - 1.1B params, very fast"
    @echo "  phi3:mini  - 3.8B params, smart for size"
    @echo "  qwen2:0.5b - 0.5B params, tiny"
    @echo "  gemma2:2b  - 2B params, decent"
    @echo ""
    @echo "pull with: just pull tinyllama"
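a handy extra recipe to graft on (hypothetical - not part of the commit above): ask a one-off question from the terminal without opening the web ui, in the same `docker exec` style as the `test` recipe:

```
# ask a one-off question (usage: just ask tinyllama "why is the sky blue?")
ask model prompt:
    docker exec ollama ollama run {{model}} "{{prompt}}"
```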