Add initial generation:

This commit is contained in:
Jared Miller 2026-01-28 14:35:34 -05:00
commit ac8c2a490c
Signed by: shmup
GPG key ID: 22B5C6D66A38B06C
4 changed files with 203 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,4 @@
# docker volumes are external, nothing to ignore really
# but good practice to have this file
*.log
.env

README.md Normal file

@@ -0,0 +1,86 @@
# local-llm-stack
run llms locally on cpu. slow but complete.
## what's in the box
```
ollama (port 11434) - runs the models
open-webui (port 3001) - chat interface + RAG
chroma (port 8007) - vector database for document retrieval
```
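once the stack is up, a quick loop can confirm all three published ports answer (the `--max-time` keeps curl from hanging on a dead service; this is just a reachability probe, not a health check):

```shell
# probe each published port; 000 means nothing answered on that port
for port in 11434 3001 8007; do
  code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 "http://localhost:${port}")
  echo "port ${port}: ${code}"
done
```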
## quickstart
```bash
just up # start everything
just pull tinyllama # download a small model (~600MB)
just open # open web ui at localhost:3001
```
## the stack explained
**ollama** - inference engine. downloads models, loads them into memory, generates tokens. uses llama.cpp under the hood, which is heavily optimized for cpu inference.
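besides serving the web ui, ollama exposes an http api on port 11434. a minimal sketch of a non-streaming generation call (assumes the stack is up and tinyllama is already pulled; `jq` is only used here to print the answer text):

```shell
# non-streaming request to ollama's /api/generate endpoint
payload='{"model": "tinyllama", "prompt": "why is the sky blue?", "stream": false}'
curl -s http://localhost:11434/api/generate -d "$payload" | jq -r .response
```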
**open-webui** - web interface for chatting. also handles:
- document upload (pdf, txt, etc)
- embedding documents into vectors
- RAG (retrieval-augmented generation)
- conversation history
**chroma** - vector database. when you upload docs:
1. open-webui chunks the text
2. embedding model converts chunks to vectors
3. vectors stored in chroma
4. when you ask a question, the most similar chunks are retrieved
5. those chunks are injected into the prompt as context
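step 5 is plain string assembly. a minimal sketch of the idea (the template wording below is illustrative, not open-webui's actual prompt format):

```shell
# sketch of step 5: glue a retrieved chunk into the prompt as context
question="what does the doc say about refunds?"
chunk="Refunds are issued within 14 days of a written request."
prompt="Use the following context to answer the question.

Context:
${chunk}

Question: ${question}"
printf '%s\n' "$prompt"
```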
## models for cpu
| model | params | ram needed | speed |
|-------|--------|------------|-------|
| qwen2:0.5b | 0.5B | ~1GB | fast |
| tinyllama | 1.1B | ~2GB | fast |
| gemma2:2b | 2B | ~3GB | ok |
| phi3:mini | 3.8B | ~4GB | slow |
```bash
just pull qwen2:0.5b
just pull tinyllama
just recommend # see all options
```
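before pulling a bigger model, it's worth checking how much ram is actually free. a linux-only sketch reading `/proc/meminfo` (MemAvailable is reported in kB):

```shell
# MemAvailable is in kB; 1048576 kB = 1 GB
awk '/MemAvailable/ { printf "available: %.1f GB\n", $2/1048576 }' /proc/meminfo
```

compare the result against the "ram needed" column above before pulling.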
## useful commands
```bash
just up # start
just down # stop
just logs # watch all logs
just models # list downloaded models
just stats # cpu/mem usage
just nuke # delete everything including data
```
## testing rag
1. open http://localhost:3001
2. click workspace (top left) > documents
3. upload a pdf or txt file
4. start a chat, click the + button, attach the document
5. ask questions about it
## what this isn't
this is inference, not training. we're not training anything - just running models that others trained. the "learning" in machine learning happened elsewhere on gpu clusters. we're just using the results.
## hardware notes
tested on intel i5-6500t (no gpu). expect:
- ~2-5 tokens/sec with tinyllama
- ~1-2 tokens/sec with phi3:mini
- first response slow (model loading)
- subsequent responses faster (model stays in ram)
more ram = can run bigger models. 16gb should handle 7b models (slowly).
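those rates translate directly into wait time - output tokens divided by tokens per second:

```shell
# rough wall-clock estimate for one reply
tokens=150   # a short paragraph of output
tps=2        # tinyllama-ish rate on this cpu
echo "~$(( tokens / tps ))s for a ${tokens}-token reply at ${tps} tok/s"
```

so even "fast" models mean minute-scale waits for long answers on this hardware.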

compose.yml Normal file

@@ -0,0 +1,54 @@
services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    volumes:
      - ollama_data:/root/.ollama
    ports:
      - "11434:11434"
    restart: unless-stopped
    # no gpu, cpu only - it'll be slow but it works
    # add this if you ever get a gpu:
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: all
    #           capabilities: [gpu]

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    volumes:
      - open_webui_data:/app/backend/data
    ports:
      - "3001:8080"
    environment:
      - OLLAMA_BASE_URL=http://ollama:11434
      - WEBUI_AUTH=false
      # use external chroma for RAG
      - CHROMA_HTTP_HOST=chroma
      - CHROMA_HTTP_PORT=8000
      - RAG_EMBEDDING_MODEL=all-MiniLM-L6-v2
    depends_on:
      - ollama
      - chroma
    restart: unless-stopped

  chroma:
    image: chromadb/chroma:latest
    container_name: chroma
    volumes:
      - chroma_data:/chroma/chroma
    ports:
      - "8007:8000"
    environment:
      - IS_PERSISTENT=TRUE
      - ANONYMIZED_TELEMETRY=FALSE
    restart: unless-stopped

volumes:
  ollama_data:
  open_webui_data:
  chroma_data:

justfile Normal file

@@ -0,0 +1,59 @@
# local-llm-stack commands

# start everything
up:
    docker compose up -d

# stop everything
down:
    docker compose down

# view logs (all services)
logs:
    docker compose logs -f

# view logs for specific service
log service:
    docker compose logs -f {{service}}

# pull a model (default: tinyllama - small and fast on cpu)
pull model="tinyllama":
    docker exec ollama ollama pull {{model}}

# list downloaded models
models:
    docker exec ollama ollama list

# run a quick test prompt
test model="tinyllama":
    docker exec ollama ollama run {{model}} "Say hello in exactly 5 words"

# open the web ui
open:
    xdg-open http://localhost:3001

# check status of all services
status:
    docker compose ps

# restart a service
restart service:
    docker compose restart {{service}}

# nuke everything (volumes too)
nuke:
    docker compose down -v

# show resource usage
stats:
    docker stats ollama open-webui chroma --no-stream

# recommended small models for cpu
recommend:
    @echo "models that won't melt your cpu:"
    @echo "  tinyllama  - 1.1B params, very fast"
    @echo "  phi3:mini  - 3.8B params, smart for size"
    @echo "  qwen2:0.5b - 0.5B params, tiny"
    @echo "  gemma2:2b  - 2B params, decent"
    @echo ""
    @echo "pull with: just pull tinyllama"