commit ac8c2a490cf56e0e87c38cfc5e87e64663f90bce
Author: Jared Miller
Date:   Wed Jan 28 14:35:34 2026 -0500

    Add initial generation:

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bc834bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+# docker volumes are external, nothing to ignore really
+# but good practice to have this file
+*.log
+.env
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e2bb803
--- /dev/null
+++ b/README.md
@@ -0,0 +1,86 @@
+# local-llm-stack
+
+run llms locally on cpu. slow but complete.
+
+## what's in the box
+
+```
+ollama (port 11434) - runs the models
+    ↓
+open-webui (port 3001) - chat interface + RAG
+    ↓
+chroma (port 8007) - vector database for document retrieval
+```
+
+## quickstart
+
+```bash
+just up              # start everything
+just pull tinyllama  # download a small model (~600MB)
+just open            # open web ui at localhost:3001
+```
+
+## the stack explained
+
+**ollama** - inference engine. downloads models, loads them into memory, generates tokens. uses llama.cpp under the hood which is optimized for cpu.
+
+**open-webui** - web interface for chatting. also handles:
+- document upload (pdf, txt, etc)
+- embedding documents into vectors
+- RAG (retrieval-augmented generation)
+- conversation history
+
+**chroma** - vector database. when you upload docs:
+1. open-webui chunks the text
+2. embedding model converts chunks to vectors
+3. vectors stored in chroma
+4. when you ask a question, similar chunks retrieved
+5. chunks injected into prompt as context
+
+## models for cpu
+
+| model | params | ram needed | speed |
+|-------|--------|------------|-------|
+| qwen2:0.5b | 0.5B | ~1GB | fast |
+| tinyllama | 1.1B | ~2GB | fast |
+| gemma2:2b | 2B | ~3GB | ok |
+| phi3:mini | 3.8B | ~4GB | slow |
+
+```bash
+just pull qwen2:0.5b
+just pull tinyllama
+just recommend       # see all options
+```
+
+## useful commands
+
+```bash
+just up      # start
+just down    # stop
+just logs    # watch all logs
+just models  # list downloaded models
+just stats   # cpu/mem usage
+just nuke    # delete everything including data
+```
+
+## testing rag
+
+1. open http://localhost:3001
+2. click workspace (top left) > documents
+3. upload a pdf or txt file
+4. start a chat, click the + button, attach the document
+5. ask questions about it
+
+## what this isn't
+
+this is inference, not ML. we're not training anything - just running models that others trained. the "learning" in machine learning happened elsewhere on gpu clusters. we're just using the results.
+
+## hardware notes
+
+tested on intel i5-6500t (no gpu). expect:
+- ~2-5 tokens/sec with tinyllama
+- ~1-2 tokens/sec with phi3:mini
+- first response slow (model loading)
+- subsequent responses faster (model stays in ram)
+
+more ram = can run bigger models. 16gb should handle 7b models (slowly).
diff --git a/compose.yml b/compose.yml
new file mode 100644
index 0000000..357eee1
--- /dev/null
+++ b/compose.yml
@@ -0,0 +1,54 @@
+services:
+  ollama:
+    image: ollama/ollama:latest
+    container_name: ollama
+    volumes:
+      - ollama_data:/root/.ollama
+    ports:
+      - "11434:11434"
+    restart: unless-stopped
+    # no gpu, cpu only - it'll be slow but it works
+    # add this if you ever get a gpu:
+    # deploy:
+    #   resources:
+    #     reservations:
+    #       devices:
+    #         - driver: nvidia
+    #           count: all
+    #           capabilities: [gpu]
+
+  open-webui:
+    image: ghcr.io/open-webui/open-webui:main
+    container_name: open-webui
+    volumes:
+      - open_webui_data:/app/backend/data
+    ports:
+      - "3001:8080"
+    environment:
+      - OLLAMA_BASE_URL=http://ollama:11434
+      - WEBUI_AUTH=false
+      # use external chroma for RAG
+      - CHROMA_HTTP_HOST=chroma
+      - CHROMA_HTTP_PORT=8000
+      - RAG_EMBEDDING_MODEL=all-MiniLM-L6-v2
+    depends_on:
+      - ollama
+      - chroma
+    restart: unless-stopped
+
+  chroma:
+    image: chromadb/chroma:latest
+    container_name: chroma
+    volumes:
+      - chroma_data:/chroma/chroma
+    ports:
+      - "8007:8000"
+    environment:
+      - IS_PERSISTENT=TRUE
+      - ANONYMIZED_TELEMETRY=FALSE
+    restart: unless-stopped
+
+volumes:
+  ollama_data:
+  open_webui_data:
+  chroma_data:
diff --git a/justfile b/justfile
new file mode 100644
index 0000000..e0e16b2
--- /dev/null
+++ b/justfile
@@ -0,0 +1,59 @@
+# local-llm-stack commands
+
+# start everything
+up:
+    docker compose up -d
+
+# stop everything
+down:
+    docker compose down
+
+# view logs (all services)
+logs:
+    docker compose logs -f
+
+# view logs for specific service
+log service:
+    docker compose logs -f {{service}}
+
+# pull a model (default: tinyllama - small and fast on cpu)
+pull model="tinyllama":
+    docker exec ollama ollama pull {{model}}
+
+# list downloaded models
+models:
+    docker exec ollama ollama list
+
+# run a quick test prompt
+test model="tinyllama":
+    docker exec ollama ollama run {{model}} "Say hello in exactly 5 words"
+
+# open the web ui
+open:
+    xdg-open http://localhost:3001
+
+# check status of all services
+status:
+    docker compose ps
+
+# restart a service
+restart service:
+    docker compose restart {{service}}
+
+# nuke everything (volumes too)
+nuke:
+    docker compose down -v
+
+# show resource usage
+stats:
+    docker stats ollama open-webui chroma --no-stream
+
+# recommended small models for cpu
+recommend:
+    @echo "models that won't melt your cpu:"
+    @echo "  tinyllama  - 1.1B params, very fast"
+    @echo "  phi3:mini  - 3.8B params, smart for size"
+    @echo "  qwen2:0.5b - 0.5B params, tiny"
+    @echo "  gemma2:2b  - 2B params, decent"
+    @echo ""
+    @echo "pull with: just pull tinyllama"