Modify forgejo release to target our own runner

Add glossary and rops doc
Add hd530 notes with point sprite experience
2025-12-20 12:47:54 -05:00 · 2025-12-19 07:50:32 -05:00 · 2025-12-17 21:22:02 -05:00 · 2025-12-17 21:01:21 -05:00 · 2025-12-17 20:58:31 -05:00 · 2025-12-17 14:13:51 -05:00
35 changed files with 2550 additions and 75 deletions
--- a/.forgejo/workflows/release.yml
+++ b/.forgejo/workflows/release.yml
@ -1,12 +1,14 @@
 name: release

 on:
-  release:
-    types: [published]
+  push:
+    tags:
+      - '*'

 jobs:
  build:
-    runs-on: codeberg-small
+    runs-on: ubuntu-latest
+    container: catthehacker/ubuntu:act-latest

    steps:
      - uses: actions/checkout@v4
@ -35,16 +37,32 @@ jobs:

      - name: Upload to release
        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          FORGEJO_TOKEN: ${{ secrets.FORGEJO_TOKEN }}
        run: |
-          RELEASE_ID="${{ github.event.release.id }}"
-          API_URL="${{ github.api_url }}/repos/${{ github.repository }}/releases/${RELEASE_ID}/assets"
+          TAG="${{ github.ref_name }}"
+          API_BASE="${{ github.server_url }}/api/v1"
+          REPO="${{ github.repository }}"
+
+          # Look up an existing release for this tag. A missing release makes
+          # `curl -f` exit non-zero (HTTP 404); `|| true` keeps that from
+          # aborting the step when the shell runs with `-e -o pipefail`, so
+          # the create branch below can still run.
+          RELEASE_ID=$(curl -sf \
+            -H "Authorization: token ${FORGEJO_TOKEN}" \
+            "${API_BASE}/repos/${REPO}/releases/tags/${TAG}" | jq -r '.id // empty' || true)
+
+          if [ -z "$RELEASE_ID" ]; then
+            echo "Creating release for ${TAG}..."
+            # Build the JSON body with jq so the tag is escaped correctly
+            # instead of being spliced into a hand-written JSON string.
+            RELEASE_ID=$(jq -n --arg tag "${TAG}" '{tag_name: $tag, name: $tag}' | curl -sf \
+              -H "Authorization: token ${FORGEJO_TOKEN}" \
+              -H "Content-Type: application/json" \
+              -d @- \
+              "${API_BASE}/repos/${REPO}/releases" | jq -r '.id // empty' || true)
+          fi
+
+          # Fail fast with a clear message rather than uploading to
+          # ".../releases//assets" when no release id could be obtained.
+          if [ -z "$RELEASE_ID" ]; then
+            echo "Failed to find or create release for ${TAG}" >&2
+            exit 1
+          fi
+
+          echo "Release ID: ${RELEASE_ID}"

          for file in lofivor-linux-x86_64 lofivor-windows-x86_64.exe; do
            echo "Uploading $file..."
-            curl -X POST \
-              -H "Authorization: token ${GITHUB_TOKEN}" \
-              -H "Content-Type: application/octet-stream" \
-              --data-binary @"$file" \
-              "${API_URL}?name=${file}"
+            curl -sf \
+              -H "Authorization: token ${FORGEJO_TOKEN}" \
+              -F "attachment=@${file}" \
+              "${API_BASE}/repos/${REPO}/releases/${RELEASE_ID}/assets?name=${file}"
          done
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -10,9 +10,14 @@ jobs:
      matrix:
        include:
          - os: ubuntu-latest
-            artifact: sandbox-linux-x86_64
+            target: native
+            artifact: lofivor-linux-x86_64
+          - os: ubuntu-latest
+            target: x86_64-windows-gnu
+            artifact: lofivor-windows-x86_64.exe
          - os: macos-latest
-            artifact: sandbox-macos-aarch64
+            target: native
+            artifact: lofivor-macos-aarch64

    runs-on: ${{ matrix.os }}

@ -26,12 +31,24 @@ jobs:
          version: 0.15.2

      - name: Install X11 dependencies (Linux)
-        if: matrix.os == 'ubuntu-latest'
+        if: matrix.os == 'ubuntu-latest' && matrix.target == 'native'
        run: sudo apt-get update && sudo apt-get install -y libx11-dev libxcursor-dev libxrandr-dev libxinerama-dev libxi-dev libxext-dev libxfixes-dev libgl1-mesa-dev

-      - run: zig build -Doptimize=ReleaseFast
+      - name: Build native
+        if: matrix.target == 'native'
+        run: zig build -Doptimize=ReleaseFast

-      - run: mv zig-out/bin/sandbox ${{ matrix.artifact }}
+      - name: Build cross-compile
+        if: matrix.target != 'native'
+        run: zig build -Dtarget=${{ matrix.target }} -Doptimize=ReleaseFast
+
+      - name: Rename artifact (Unix)
+        if: "!contains(matrix.artifact, '.exe')"
+        run: mv zig-out/bin/sandbox ${{ matrix.artifact }}
+
+      - name: Rename artifact (Windows)
+        if: contains(matrix.artifact, '.exe')
+        run: mv zig-out/bin/sandbox.exe ${{ matrix.artifact }}

      - uses: actions/upload-artifact@v4
        with:
--- a/OPTIMIZATIONS.md
+++ b/OPTIMIZATIONS.md
@ -82,8 +82,8 @@ these target the rendering bottleneck since update loop is already fast.

 | technique              | description                                                          | expected gain                   |
 | ---------------------- | -------------------------------------------------------------------- | ------------------------------- |
-| ~~SSBO instance data~~ | ~~pack (x, y, color) = 12 bytes instead of 64-byte matrices~~        | **done** - see optimization 5   |
-| compute shader updates | move entity positions to GPU entirely, avoid CPU→GPU sync            | significant                     |
+| SSBO instance data     | pack (x, y, color) = 12 bytes instead of 64-byte matrices            | done - see optimization 5       |
+| compute shader updates | move entity positions to GPU entirely, avoid CPU→GPU sync            | done - see optimization 6       |
 | OpenGL vs Vulkan       | test raylib's Vulkan backend                                         | unknown                         |
 | discrete GPU testing   | test on dedicated GPU where instancing/SSBO shine                    | significant (different hw)      |

@ -126,6 +126,33 @@ currently not the bottleneck - update stays <1ms at 100k. these become relevant
 | entity pools          | pre-allocated, reusable entity slots  | reduces allocation overhead |
 | component packing     | minimize struct padding               | better cache utilization    |

+#### estimated gains summary
+
+| Optimization           | Expected Gain | Why                                               |
+|------------------------|---------------|---------------------------------------------------|
+| SIMD updates           | 0%            | Update already on GPU                             |
+| Multithreaded update   | 0%            | Update already on GPU                             |
+| Cache-friendly layouts | 0%            | CPU doesn't iterate entities                      |
+| Fixed-point math       | 0% or worse   | GPUs are optimized for float                      |
+| SoA vs AoS             | ~5%           | Only helps data upload, not bottleneck            |
+| Frustum culling        | 5-15%         | Most entities converge to center anyway           |
+| LOD rendering          | 20-40%        | Real gains - fewer fragments for distant entities |
+| Temporal techniques    | ~50%          | But with visual artifacts (flickering)            |
+
+Realistic total if you did everything: ~30-50% improvement
+
+That'd take you from ~1.4M @ 38fps to maybe ~1.8-2M @ 38fps, or ~1.4M @ 50-55fps.
+
+What would actually move the needle:
+- GPU-side frustum culling in compute shader (cull before render, not after)
+- Point sprites instead of quads for distant entities (4 vertices → 1) - note: measured 2.4x slower on HD 530, see `docs/hd530_optimization_guide.md`
+- Indirect draw calls (GPU decides what to render, CPU never touches entity data)
+
+Your real bottleneck is fill rate and vertex throughput on HD 530 integrated
+graphics. The CPU side is already essentially free.
+
+
+
 ---

 ## testing methodology
--- a/README.txt
+++ b/README.txt
@ -4,6 +4,8 @@ lofivor
 sandbox stress test for measuring entity rendering performance on weak hardware.
 written in zig with raylib.

+(lofivor aka lofi-survivor)
+
 build & run
 -----------

@ -12,8 +14,8 @@ build & run
 controls
 --------

-    +/-      add/remove 1000 entities
-    shift    hold for 10x (10000 entities)
+    +/-      add/remove 10k entities
+    shift    hold for 50k
    space    pause/resume
    r        reset

--- a/TODO.md
+++ b/TODO.md
@ -59,7 +59,7 @@ further options (if needed):
 - [x] increase raylib batch buffer (currently 8192 vertices = 2048 quads)
 - [x] GPU instancing (single draw call for all entities)
 - [x] SSBO instance data (12 bytes vs 64-byte matrices)
- [ ] compute shader entity updates (if raylib supports)
+- [x] compute shader entity updates (raylib supports via rlgl)
 - [ ] compare OpenGL vs Vulkan backend

 findings (i5-6500T / HD 530):
@ -68,14 +68,18 @@ findings (i5-6500T / HD 530):
 - instancing doesn't help on integrated graphics (shared RAM, no PCIe savings)
 - bottleneck is memory bandwidth, not draw call overhead
 - rlgl batching is already near-optimal for this hardware
+- compute shaders: update time ~5ms → ~0ms at 150k entities (CPU freed entirely)

-## future optimization concepts
+## future optimization concepts (GPU-focused)

- [ ] SIMD entity updates (AVX2/SSE)
- [ ] struct-of-arrays vs array-of-structs benchmark
- [ ] multithreaded update loop (thread pool)
- [ ] cache-friendly memory layouts
- [ ] LOD rendering (skip distant entities or reduce detail)
- [ ] frustum culling (only render visible)
- [ ] temporal techniques (update subset per frame)
- [ ] fixed-point vs floating-point math
+- [ ] GPU-side frustum culling in compute shader
+- [ ] point sprites for distant/small entities (4 verts → 1)
+- [ ] indirect draw calls (glDrawArraysIndirect)
+
+## future optimization concepts (CPU - not currently bottleneck)
+
+- [ ] SIMD / SoA / multithreading (if game logic makes CPU hot again)
+
+## other ideas that aren't about optimization
+
+- [ ] scanline shader
--- a/build.zig
+++ b/build.zig
@ -4,6 +4,9 @@ pub fn build(b: *std.Build) void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});

+    // tracy profiling (run with -Dtracy=true)
+    const enable_tracy = b.option(bool, "tracy", "Enable Tracy profiler") orelse false;
+
    const raylib_dep = b.dependency("raylib_zig", .{
        .target = target,
        .optimize = optimize,
@ -24,6 +27,16 @@ pub fn build(b: *std.Build) void {
    sandbox_exe.root_module.addImport("raylib", raylib_dep.module("raylib"));
    sandbox_exe.linkLibrary(raylib_dep.artifact("raylib"));

+    // tracy integration (optional)
+    const ztracy = b.dependency("ztracy", .{
+        .enable_ztracy = enable_tracy,
+        .on_demand = true, // allow connecting after app starts
+    });
+    sandbox_exe.root_module.addImport("ztracy", ztracy.module("root"));
+    if (enable_tracy) {
+        sandbox_exe.linkLibrary(ztracy.artifact("tracy"));
+    }
+
    b.installArtifact(sandbox_exe);

    const sandbox_run_cmd = b.addRunArtifact(sandbox_exe);
--- a/build.zig.zon
+++ b/build.zig.zon
@ -7,6 +7,10 @@
            .url = "git+https://github.com/raylib-zig/raylib-zig#a4d18b2d1cf8fdddec68b5b084535fca0475f466",
            .hash = "raylib_zig-5.6.0-dev-KE8REL5MBQAf3p497t52Xw9P7ojndIkVOWPXnLiLLw2P",
        },
+        .ztracy = .{
+            .url = "git+https://github.com/zig-gamedev/ztracy?ref=main#e7b401dea9ce006f8b236e3a2ca1a9f3d5c3e896",
+            .hash = "ztracy-0.14.0-dev-zHJSq78GGQC904aYvBPn6OOvRVOq_opAwDfeHZdvQyej",
+        },
    },
    .paths = .{
        "build.zig",
--- a/docs/GLOSSARY.txt
+++ b/docs/GLOSSARY.txt
@ -0,0 +1,292 @@
+lofivor glossary
+================
+
+terms that come up when optimizing graphics.
+
+
+clock cycle
+-----------
+
+one "tick" of the processor's internal clock.
+
+a CPU or GPU is driven by a clock signal (derived from a crystal oscillator)
+that ticks at a steady rate. each tick = one cycle. the processor does some work each cycle.
+
+  1 GHz = 1 billion cycles per second
+  1 MHz = 1 million cycles per second
+
+so a 1 GHz processor has 1 billion opportunities to do work per second.
+
+"one operation per cycle" is idealized. real work often takes multiple
+cycles (memory access: 100+ cycles, division: 10-20 cycles, add: 1 cycle).
+
+your HD 530 runs at ~950 MHz, so roughly 950 million cycles per second.
+at 60fps, that's about 15.8 million cycles per frame.
+
+
+fill rate
+---------
+
+pixels written per second. measured in megapixels/s or gigapixels/s.
+
+  fill rate = ROPs * clock speed * pixels per clock
+
+your HD 530: 3 ROPs * 950 MHz * 1 = 2.85 GPixels/s theoretical max.
+
+
+overdraw
+--------
+
+drawing the same pixel multiple times per frame.
+
+if two entities overlap, the back one gets drawn, then the front one
+overwrites it. the back one's work was wasted.
+
+  overdraw ratio = total pixels drawn / screen pixels
+
+1080p = 2.07M pixels. if you draw 20M pixels, overdraw = ~10x.
+
+
+bandwidth
+---------
+
+data transfer rate. measured in bytes/second (GB/s, MB/s).
+
+memory bandwidth = how fast data moves between processor and RAM.
+
+your HD 530 shares DDR4 with the CPU: ~30 GB/s total.
+a discrete GPU has dedicated VRAM: 200-900 GB/s.
+
+
+latency
+-------
+
+time delay. measured in nanoseconds (ns) or cycles.
+
+memory latency = time to fetch data from RAM.
+  - L1 cache: ~4 cycles
+  - L2 cache: ~12 cycles
+  - L3 cache: ~40 cycles
+  - main RAM: ~200 cycles
+
+this is why cache matters. a cache miss = 50x slower than a hit.
+
+
+throughput vs latency
+---------------------
+
+latency = how long ONE thing takes.
+throughput = how many things per second.
+
+a pipeline can have high latency but high throughput.
+
+example: a car wash takes 10 minutes (latency).
+but if cars enter every 1 minute, throughput is 60 cars/hour.
+
+GPUs hide latency with throughput. one thread waits for memory?
+switch to another thread. thousands of threads keep the GPU busy.
+
+
+draw call
+---------
+
+one command from CPU to GPU: "draw this batch of geometry."
+
+each draw call has overhead:
+  - CPU prepares command buffer
+  - driver validates state
+  - GPU switches context
+
+1 draw call for 1M triangles: fast.
+1M draw calls for 1M triangles: slow.
+
+lofivor uses 1 draw call for all entities (instanced rendering).
+
+
+instancing
+----------
+
+drawing many copies of the same geometry in one draw call.
+
+instead of: draw triangle, draw triangle, draw triangle...
+you say: draw this triangle 1 million times, here are the positions.
+
+the GPU handles the replication. massively more efficient.
+
+
+shader
+------
+
+a small program that runs on the GPU.
+
+the name is historical - early shaders calculated shading/lighting.
+but today: a shader is just software running on GPU hardware.
+it doesn't have to do with shading at all.
+
+more precisely: a shader turns one piece of data into another piece of data.
+  - vertex shader: positions → screen coordinates
+  - fragment shader: fragments → pixel colors
+  - compute shader: data → data (anything)
+
+GPUs are massively parallel, so shaders run on thousands of inputs at once.
+single-core CPU speed has mostly plateaued while GPUs keep getting faster.
+modern engines like UE5 increasingly use shaders for work that used to be CPU-only.
+
+
+SSBO (shader storage buffer object)
+-----------------------------------
+
+a block of GPU memory that shaders can read/write.
+
+unlike uniforms (small, read-only), SSBOs can be large and writable.
+lofivor stores all entity data in an SSBO: positions, velocities, colors.
+
+
+compute shader
+--------------
+
+a shader that does general computation, not rendering.
+
+runs on GPU cores but doesn't output pixels. just processes data.
+lofivor uses compute shaders to update entity positions.
+
+because compute exists, shaders can be anything: physics, AI, sorting,
+image processing. the GPU is a general-purpose parallel processor.
+
+
+fragment / pixel shader
+-----------------------
+
+program that runs once per pixel (actually per "fragment").
+
+determines the final color of each pixel. this is where:
+  - texture sampling happens
+  - lighting calculations happen
+  - the expensive math lives
+
+lofivor's fragment shader: sample texture, multiply by color. trivial.
+AAA game fragment shader: 500+ instructions. expensive.
+
+
+vertex shader
+-------------
+
+program that runs once per vertex.
+
+transforms 3D positions to screen positions. lofivor's vertex shader
+reads from SSBO and positions the quad corners.
+
+
+ROP (render output unit)
+------------------------
+
+final stage of GPU pipeline. writes pixels to framebuffer.
+
+handles: depth test, stencil test, blending, antialiasing.
+your bottleneck on HD 530. see docs/rops.txt.
+
+
+TMU (texture mapping unit)
+--------------------------
+
+samples textures. reads pixel colors from texture memory.
+
+your HD 530 has 24 TMUs. they're fast (22.8 GTexels/s).
+texture sampling is cheap relative to ROPs on this hardware.
+
+
+EU (execution unit)
+-------------------
+
+intel's term for shader cores.
+
+your HD 530 has 24 EUs, each with 8 ALUs = 192 ALUs total.
+these run your vertex, fragment, and compute shaders.
+
+
+ALU (arithmetic logic unit)
+---------------------------
+
+does math. add, multiply, compare, bitwise operations.
+
+one ALU can do one operation per cycle (simple ops).
+complex ops (sqrt, sin, cos) take multiple cycles.
+
+
+framebuffer
+-----------
+
+the image being rendered. lives in GPU memory.
+
+at 1080p with 32-bit color: 1920 * 1080 * 4 = 8.3 MB.
+double-buffered (front + back): 16.6 MB.
+
+
+vsync
+-----
+
+synchronizing frame presentation with monitor refresh.
+
+without vsync: tearing (half old frame, half new frame).
+with vsync: smooth, but if you miss the refresh deadline (16.7ms at 60 Hz), you wait for the next refresh.
+
+
+frame budget
+------------
+
+time available per frame.
+
+  60 fps = 16.67 ms per frame
+  30 fps = 33.33 ms per frame
+
+everything (CPU + GPU) must complete within budget or frames drop.
+
+
+pipeline stall
+--------------
+
+GPU waiting for something. bad for performance.
+
+causes:
+  - waiting for memory (cache miss)
+  - waiting for previous stage to finish
+  - synchronization points (barriers)
+  - `discard` in fragment shader (breaks early-z)
+
+
+early-z
+-------
+
+optimization: test depth BEFORE running fragment shader.
+
+if pixel will be occluded, skip the expensive shader work.
+`discard` breaks this because the GPU can't know whether a fragment will be
+kept (and its depth written) until the shader runs.
+
+
+LOD (level of detail)
+---------------------
+
+using simpler geometry/textures for distant objects.
+
+far away = fewer pixels = less detail needed.
+saves vertices, texture bandwidth, and fill rate.
+
+
+frustum culling
+---------------
+
+don't draw what's outside the camera view.
+
+the "frustum" is the visible region - a pyramid with its tip cut off.
+anything outside = wasted work. cull it before sending to GPU.
+
+
+spatial partitioning
+--------------------
+
+organizing entities by position for fast queries.
+
+types: grid, quadtree, octree, BVH.
+
+"which entities are near point X?" goes from O(n) to roughly O(log n) for trees, or O(1) for a uniform grid.
+essential for collision detection at scale.
--- a/docs/hd530_optimization_guide.md
+++ b/docs/hd530_optimization_guide.md
@ -0,0 +1,119 @@
+# intel hd 530 optimization guide for lofivor
+
+based on hardware specs and empirical testing.
+
+## hardware constraints
+
+from `intel_hd_graphics_530.txt`:
+
+| resource   | value               | implication                                 |
+| ---------- | -------             | -------------                               |
+| ROPs       | 3                   | fill rate limited - this is our ceiling     |
+| TMUs       | 24                  | texture sampling is relatively fast         |
+| memory     | shared DDR4 ~30GB/s | bandwidth is precious, no VRAM              |
+| pixel rate | 2.85 GPixel/s       | max theoretical throughput                  |
+| EUs        | 24 (192 ALUs)       | decent compute, weak vs discrete            |
+| L3 cache   | 768 KB              | small, cache misses hurt                    |
+
+the bottleneck is ROPs (fill rate), not vertices or compute.
+
+## what works (proven)
+
+### SSBO instance data
+- 16 bytes per entity vs 64 bytes (matrices)
+- minimizes bandwidth on shared memory bus
+- result: ~5x improvement over instancing
+
+### compute shader updates
+- GPU does position/velocity updates
+- no CPU→GPU sync per frame
+- result: update time essentially free
+
+### texture sampling
+- 22.8 GTexel/s is fast relative to other units
+- pre-baked circle texture beats procedural math
+- result: 2x faster than procedural fragment shader
+
+### instanced triangles/quads
+- most optimized driver path
+- intel mesa heavily optimizes this
+- result: baseline, hard to beat
+
+## what doesn't work (proven)
+
+### point sprites
+- theoretically 6x fewer vertices
+- reality: 2.4x SLOWER on this hardware
+- triangle rasterizer is more optimized
+- see `docs/point_sprites_experiment.md`
+
+### procedural fragment shaders
+- `length()`, `smoothstep()`, `discard` are expensive
+- EUs are weaker than discrete GPUs
+- `discard` breaks early-z optimization
+- result: 3.7x slower than texture sampling
+
+### complex fragment math
+- only 24 EUs, each running 8 ALUs
+- transcendentals (sqrt, sin, cos) are 4x slower than FMAD
+- avoid in hot path
+
+## what to try next (theoretical)
+
+### likely to help
+
+| technique                            | why it should work                      | expected gain            |
+| -----------                          | -------------------                     | ---------------          |
+| frustum culling (GPU)                | reduce fill rate, which is bottleneck   | 10-30% depending on view |
+| smaller points when zoomed out (LOD) | fewer pixels per entity = less ROP work | 20-40%                   |
+| early-z / depth pre-pass             | skip fragment work for occluded pixels  | moderate                 |
+
+### unlikely to help
+
+| technique                | why it won't help                         |
+| -----------              | ------------------                        |
+| more vertex optimization | already fill rate bound, not vertex bound |
+| SIMD on CPU              | updates already on GPU                    |
+| multithreading           | CPU isn't the bottleneck                  |
+| different vertex layouts | negligible vs fill rate                   |
+
+### uncertain (need to test)
+
+| technique           | notes                                                 |
+| -----------         | -------                                               |
+| vulkan backend      | might have less driver overhead, or might not matter  |
+| indirect draw calls | GPU decides what to render, but we're not CPU bound   |
+| fp16 in shaders     | HD 530 has 2:1 fp16 ratio, might help fragment shader |
+
+## key insights
+
+1. fill rate is king - with only 3 ROPs, everything comes down to how many
+   pixels we're writing. optimizations that don't reduce pixel count won't
+   help.
+
+2. shared memory hurts - no dedicated VRAM means CPU and GPU compete for
+   bandwidth. keep data transfers minimal.
+
+3. driver optimization matters - the "common path" (triangles) is more
+   optimized than alternatives (points). don't be clever.
+
+4. texture sampling is cheap - 22.8 GTexel/s is fast. prefer texture
+   lookups over ALU math in fragment shaders.
+
+5. avoid discard - breaks early-z, causes pipeline stalls. alpha blending
+   is faster than discard.
+
+## current ceiling
+
+~950k entities @ 57fps (SSBO + compute + quads)
+
+to go higher, we need to reduce fill rate:
+- cull offscreen entities
+- reduce entity size when zoomed out
+- or accept lower fps at higher counts
+
+## references
+
+- intel gen9 compute architecture whitepaper
+- empirical benchmarks in `benchmark_current_i56500t.log`
+- point sprites experiment in `docs/point_sprites_experiment.md`
--- a/docs/hysteria.md
+++ b/docs/hysteria.md
@ -0,0 +1,31 @@
+# hysteresis in lofivor
+
+## the problem without it
+
+say your target is 8.33ms. your frame times naturally jitter: 8.2, 8.4, 8.3, 8.5, 8.2...
+
+without hysteresis, every time it crosses 8.33ms you'd log "crossed threshold!" - potentially dozens of times per second. the log becomes useless noise.
+
+## how the code works
+
+from `sandbox_main.zig` lines 74-89:
+
+```
+was_above=false → need frame_ms > 10.33 (target + 2.0 margin) to flip to true
+was_above=true  → need frame_ms < 8.33 (target) to flip back to false
+```
+
+this creates a "dead zone" between 8.33 and 10.33ms where no state change happens.
+
+## the magnet analogy
+
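+the term comes from magnetism: a magnetized material keeps its polarity until an opposing field gets strong enough to flip it. the `was_above_target` boolean is like the magnet's current polarity. the frame time "pushing" past thresholds is like the magnetic field. the key insight: **the threshold you need to cross depends on which side you're currently on.**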
+
+if you're in "good" state, you need a significant spike (>10.33ms) before you flip to "bad". if you're in "bad" state, you only need to drop below 8.33ms to recover. this asymmetry is the hysteresis.
+
+## real-world examples
+
+- thermostat: heat on at 68°F, off at 72°F (prevents rapid on/off cycling)
+- schmitt trigger in electronics: same concept, prevents noise from causing oscillation
+
+the `THRESHOLD_MARGIN` of 2.0ms is the "width" of the hysteresis band - bigger = more stable but less responsive.
--- a/docs/plans/2025-12-16-zoom-pan-design.md
+++ b/docs/plans/2025-12-16-zoom-pan-design.md
@ -0,0 +1,54 @@
+# Zoom/Pan Camera Design
+
+A viewport camera for zooming into and panning around the simulation without affecting entity behavior.
+
+## Core Behavior
+
+### Zoom
+- Scroll wheel zooms toward mouse cursor position
+- Range: 1x (default floor) to 10x (ceiling)
+- Instant response, no animation
+- Esc or Space resets to 1x and clears pan offset
+
+### Pan
+- Any mouse button (left/middle/right) + drag pans the viewport
+- Only available when zoom > 1x
+- Bounded to simulation area - cannot pan into empty space
+
+### UI
+- Display current zoom level in existing panel under render info (e.g., `zoom: 2.3x`)
+
+## Implementation Approach
+
+### State
+New camera state in `sandbox_main.zig`:
+```zig
+var zoom: f32 = 1.0;
+var pan: @Vector(2, f32) = .{ 0, 0 };
+```
+
+### Shader Changes
+Modify `entity.vert` to accept `zoom` and `pan` uniforms:
+- Apply pan offset before converting to NDC
+- Scale by zoom factor
+- Scale quad size by zoom so entities appear larger
+
+### Input Handling
+- `getMouseWheelMove()` adjusts zoom (clamped 1.0–10.0)
+- Zoom-toward-cursor: adjust pan to keep point under cursor stationary
+- Mouse drag (any button) adjusts pan with bounds checking
+- Esc/Space resets zoom to 1.0 and pan to (0, 0)
+
+### Zoom-Toward-Cursor Math
+When zooming from `oldZoom` to `newZoom` with cursor at `mousePos`:
+```
+worldMousePos = (mousePos / oldZoom) + pan
+newPan = worldMousePos - (mousePos / newZoom)
+```
+
+### Pan Bounds
+Constrain pan so viewport stays within simulation area:
+```
+maxPan = simulationSize - (screenSize / zoom)
+pan = clamp(pan, 0, maxPan)
+```
--- a/docs/plans/2025-12-16-zoom-pan-plan.md
+++ b/docs/plans/2025-12-16-zoom-pan-plan.md
@ -0,0 +1,440 @@
+# Zoom/Pan Camera Implementation Plan
+
+> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
+
+**Goal:** Add viewport zoom (scroll wheel toward cursor) and pan (any mouse drag when zoomed) to observe the simulation up close.
+
+**Architecture:** Camera state (zoom, pan) lives in sandbox_main.zig. Passed to shader as uniforms. All rendering paths use the same camera state, but only SSBO path gets shader-based zoom (others would need separate work).
+
+**Tech Stack:** Zig, raylib, GLSL 430
+
+---
+
+### Task 1: Add camera state and shader uniforms
+
+**Files:**
+- Modify: `src/sandbox_main.zig:266` (add state after `var paused`)
+- Modify: `src/ssbo_renderer.zig:20-21` (add uniform locations to struct)
+- Modify: `src/ssbo_renderer.zig:54-62` (get uniform locations in init)
+- Modify: `src/ssbo_renderer.zig:154-156` (pass uniforms in render)
+
+**Step 1: Add camera state to sandbox_main.zig**
+
+After line 266 (`var paused = false;`), add:
+
+```zig
+// camera state for zoom/pan
+var zoom: f32 = 1.0;
+var pan = @Vector(2, f32){ 0, 0 };
+```
+
+**Step 2: Add uniform locations to SsboRenderer struct**
+
+In `src/ssbo_renderer.zig`, add to struct fields after line 21 (`circle_texture_loc`):
+
+```zig
+zoom_loc: i32,
+pan_loc: i32,
+```
+
+**Step 3: Get uniform locations in init**
+
+After line 55 (`const circle_texture_loc = ...`), add:
+
+```zig
+const zoom_loc = rl.gl.rlGetLocationUniform(shader_id, "zoom");
+const pan_loc = rl.gl.rlGetLocationUniform(shader_id, "pan");
+```
+
+**Step 4: Add fields to return struct**
+
+In the return statement (around line 112), add:
+
+```zig
+.zoom_loc = zoom_loc,
+.pan_loc = pan_loc,
+```
+
+**Step 5: Pass uniforms in render method**
+
+Change render signature to accept zoom/pan:
+
+```zig
+pub fn render(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32)) void {
+```
+
+After line 156 (setting screenSize uniform), add:
+
+```zig
+// set zoom uniform
+rl.gl.rlSetUniform(self.zoom_loc, &zoom, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
+
+// set pan uniform
+const pan_arr = [2]f32{ pan[0], pan[1] };
+rl.gl.rlSetUniform(self.pan_loc, &pan_arr, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
+```
+
+**Step 6: Update render call in sandbox_main.zig**
+
+Change line 336 from:
+
+```zig
+ssbo_renderer.?.render(&entities);
+```
+
+To:
+
+```zig
+ssbo_renderer.?.render(&entities, zoom, pan);
+```
+
+**Step 7: Build and verify compiles**
+
+Run: `zig build`
+
+Expected: Compiles with no errors (shader won't use uniforms yet, but that's fine)
+
+---
+
+### Task 2: Update vertex shader for zoom/pan
+
+**Files:**
+- Modify: `src/shaders/entity.vert`
+
+**Step 1: Add uniforms**
+
+After line 19 (`uniform vec2 screenSize;`), add:
+
+```glsl
+uniform float zoom;
+uniform vec2 pan;
+```
+
+**Step 2: Update NDC calculation**
+
+Replace lines 29-31:
+
+```glsl
+// convert entity position to NDC
+// entity coords are in screen pixels, convert to [-1, 1]
+float ndcX = (e.x / screenSize.x) * 2.0 - 1.0;
+float ndcY = (e.y / screenSize.y) * 2.0 - 1.0;
+```
+
+With:
+
+```glsl
+// apply pan offset and zoom to convert to NDC
+// pan is in screen pixels, zoom scales the view
+float ndcX = ((e.x - pan.x) * zoom / screenSize.x) * 2.0 - 1.0;
+float ndcY = ((e.y - pan.y) * zoom / screenSize.y) * 2.0 - 1.0;
+```
+
+**Step 3: Scale quad size by zoom**
+
+Replace line 34:
+
+```glsl
+float quadSizeNdc = 16.0 / screenSize.x;
+```
+
+With:
+
+```glsl
+float quadSizeNdc = (16.0 * zoom) / screenSize.x;
+```
+
+**Step 4: Build and test**
+
+Run: `zig build && ./zig-out/bin/lofivor`
+
+Expected: Renders exactly as before (zoom=1.0, pan=0,0 should be identical to old behavior)
+
+---
+
+### Task 3: Add zoom input handling
+
+**Files:**
+- Modify: `src/sandbox_main.zig` (handleInput function and main loop)
+
+**Step 1: Add zoom constants**
+
+After line 32 (BENCH_EXIT_SUSTAIN), add:
+
+```zig
+// zoom settings
+const ZOOM_MIN: f32 = 1.0;
+const ZOOM_MAX: f32 = 10.0;
+const ZOOM_SPEED: f32 = 0.1; // fractional zoom change per scroll tick (factor = 1 + ZOOM_SPEED)
+```
+
+**Step 2: Create handleCamera function**
+
+After the `handleInput` function (around line 458), add:
+
+```zig
+fn handleCamera(zoom: *f32, pan: *@Vector(2, f32)) void {
+    const wheel = rl.getMouseWheelMove();
+
+    if (wheel != 0) {
+        const mouse_pos = rl.getMousePosition();
+        const old_zoom = zoom.*;
+
+        // scroll up multiplies zoom by (1 + ZOOM_SPEED), scroll down divides by it; then clamp
+        const zoom_factor = if (wheel > 0) (1.0 + ZOOM_SPEED) else (1.0 / (1.0 + ZOOM_SPEED));
+        var new_zoom = old_zoom * zoom_factor;
+        new_zoom = std.math.clamp(new_zoom, ZOOM_MIN, ZOOM_MAX);
+
+        if (new_zoom != old_zoom) {
+            // zoom toward mouse cursor:
+            // keep the world point under the cursor stationary
+            // world_pos = (screen_pos / old_zoom) + old_pan
+            // new_pan = world_pos - (screen_pos / new_zoom)
+            const world_x = (mouse_pos.x / old_zoom) + pan.*[0];
+            const world_y = (mouse_pos.y / old_zoom) + pan.*[1];
+            pan.*[0] = world_x - (mouse_pos.x / new_zoom);
+            pan.*[1] = world_y - (mouse_pos.y / new_zoom);
+            zoom.* = new_zoom;
+
+            // clamp pan to bounds
+            clampPan(pan, zoom.*);
+        }
+    }
+
+    // reset on Esc (the Space reset is added to this function in Step 4)
+    if (rl.isKeyPressed(.escape)) {
+        zoom.* = 1.0;
+        pan.* = @Vector(2, f32){ 0, 0 };
+    }
+}
+
+fn clampPan(pan: *@Vector(2, f32), zoom: f32) void {
+    // when zoomed in, limit pan so viewport stays in simulation bounds
+    // visible area = screen_size / zoom
+    // max pan = world_size - visible_area
+    const screen_w: f32 = @floatFromInt(SCREEN_WIDTH);
+    const screen_h: f32 = @floatFromInt(SCREEN_HEIGHT);
+    const visible_w = screen_w / zoom;
+    const visible_h = screen_h / zoom;
+
+    const max_pan_x = @max(0, screen_w - visible_w);
+    const max_pan_y = @max(0, screen_h - visible_h);
+
+    pan.*[0] = std.math.clamp(pan.*[0], 0, max_pan_x);
+    pan.*[1] = std.math.clamp(pan.*[1], 0, max_pan_y);
+}
+```
+
+**Step 3: Call handleCamera in main loop**
+
+In the main loop, after the `handleInput` call (line 318), add:
+
+```zig
+handleCamera(&zoom, &pan);
+```
+
+**Step 4: Also reset zoom when Space is pressed**
+
+In `handleInput`, modify the space key handler (around line 450):
+
+```zig
+// pause: space (also resets zoom in handleCamera context)
+if (rl.isKeyPressed(.space)) {
+    paused.* = !paused.*;
+}
+```
+
+However, `handleInput` doesn't have access to zoom/pan, so the reset cannot live there as written. We need to either:
+- Pass zoom/pan to handleInput
+- Handle space reset in handleCamera
+
+Let's handle it in handleCamera. Add after the escape check:
+
+```zig
+// Space also resets zoom (pause is handled separately in handleInput)
+if (rl.isKeyPressed(.space)) {
+    zoom.* = 1.0;
+    pan.* = @Vector(2, f32){ 0, 0 };
+}
+```
+
+**Step 5: Build and test zoom**
+
+Run: `zig build && ./zig-out/bin/lofivor`
+
+Test:
+1. Scroll up - entities should get bigger (zoom in toward cursor)
+2. Scroll down - entities get smaller (but not below 1x)
+3. Press Esc or Space - resets to default view
+
+---
+
+### Task 4: Add pan input handling
+
+**Files:**
+- Modify: `src/sandbox_main.zig` (handleCamera function)
+
+**Step 1: Add pan logic to handleCamera**
+
+Add this after the zoom handling, before the reset checks:
+
+```zig
+// pan with any mouse button drag (only when zoomed in)
+if (zoom.* > 1.0) {
+    const any_button = rl.isMouseButtonDown(.left) or
+                       rl.isMouseButtonDown(.right) or
+                       rl.isMouseButtonDown(.middle);
+    if (any_button) {
+        const delta = rl.getMouseDelta();
+        // pan in opposite direction of drag (drag right = view moves left = pan decreases)
+        pan.*[0] -= delta.x / zoom.*;
+        pan.*[1] -= delta.y / zoom.*;
+        clampPan(pan, zoom.*);
+    }
+}
+```
+
+**Step 2: Build and test pan**
+
+Run: `zig build && ./zig-out/bin/lofivor`
+
+Test:
+1. Scroll to zoom in past 1x
+2. Click and drag with any mouse button - viewport should pan
+3. Try to pan past edges - should be bounded
+4. At 1x zoom, dragging should do nothing
+
+---
+
+### Task 5: Add zoom display to UI
+
+**Files:**
+- Modify: `src/ui.zig:34` (drawMetrics signature)
+- Modify: `src/ui.zig:50` (increase metrics box height for the zoom line)
+- Modify: `src/ui.zig:71-72` (add zoom line after render)
+- Modify: `src/sandbox_main.zig:387` (pass zoom to drawMetrics)
+
+**Step 1: Update drawMetrics signature**
+
+Change line 34:
+
+```zig
+pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, font: rl.Font) void {
+```
+
+To:
+
+```zig
+pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, zoom: f32, font: rl.Font) void {
+```
+
+**Step 2: Increase box height for zoom line**
+
+Change line 50:
+
+```zig
+const bg_height: i32 = if (paused) 130 else 100;
+```
+
+To:
+
+```zig
+const bg_height: i32 = if (paused) 150 else 120;
+```
+
+**Step 3: Add zoom display after render line**
+
+After line 72 (render_text draw), add:
+
+```zig
+y += line_height;
+
+// zoom level
+const zoom_text = std.fmt.bufPrintZ(&buf, "zoom:     {d:.1}x", .{zoom}) catch "?";
+rl.drawTextEx(font, zoom_text, .{ .x = padding, .y = y }, font_size, 0, if (zoom > 1.0) highlight_color else text_color);
+```
+
+**Step 4: Update call in sandbox_main.zig**
+
+Change line 387:
+
+```zig
+ui.drawMetrics(&entities, update_time_us, render_time_us, paused, ui_font);
+```
+
+To:
+
+```zig
+ui.drawMetrics(&entities, update_time_us, render_time_us, paused, zoom, ui_font);
+```
+
+**Step 5: Build and test UI**
+
+Run: `zig build && ./zig-out/bin/lofivor`
+
+Test:
+1. UI should show "zoom: 1.0x" in white
+2. Scroll to zoom - should update and turn yellow when > 1x
+3. Reset with Esc - back to white 1.0x
+
+---
+
+### Task 6: Update controls legend
+
+**Files:**
+- Modify: `src/ui.zig:120-139` (drawControls function)
+
+**Step 1: Update controls list and box height**
+
+Change line 121:
+
+```zig
+const ctrl_box_height: i32 = @intFromFloat(small_line_height * 5 + box_padding * 2);
+```
+
+To:
+
+```zig
+const ctrl_box_height: i32 = @intFromFloat(small_line_height * 7 + box_padding * 2);
+```
+
+Change the controls array (lines 127-133):
+
+```zig
+const controls = [_][]const u8{
+    "+/-: 10k entities",
+    "shift +/-: 50k",
+    "scroll: zoom",
+    "drag: pan (zoomed)",
+    "space: pause/reset",
+    "esc: reset zoom",
+    "tab: toggle ui",
+};
+```
+
+**Step 2: Build and final test**
+
+Run: `zig build && ./zig-out/bin/lofivor`
+
+Full test:
+1. Scroll wheel zooms toward cursor (1x-10x)
+2. Any mouse drag pans when zoomed > 1x
+3. Pan is bounded to simulation area
+4. Esc resets zoom/pan
+5. Space toggles pause AND resets zoom/pan
+6. UI shows zoom level (yellow when zoomed)
+7. Controls legend shows new controls
+
+---
+
+### Task 7: Commit
+
+```bash
+git add src/sandbox_main.zig src/ssbo_renderer.zig src/shaders/entity.vert src/ui.zig
+git commit -m "feat: add zoom/pan camera
+
+- scroll wheel zooms toward cursor (1x-10x range)
+- any mouse button drag pans when zoomed
+- pan bounded to simulation area
+- esc/space resets to default view
+- zoom level shown in metrics panel"
+```
--- a/docs/plans/2025-12-17-compute-shader-updates.md
+++ b/docs/plans/2025-12-17-compute-shader-updates.md
@ -0,0 +1,170 @@
+# compute shader entity updates
+
+move entity position math to GPU, eliminate CPU→GPU sync per frame.
+
+## context
+
+current bottleneck: per-frame `rlUpdateShaderBuffer()` uploads all entity data from CPU to GPU. at 950k entities that's ~11MB/frame with the current 12-byte layout. targeting 10M entities with the 16-byte layout below would be 160MB/frame.
+
+solution: keep entity data on GPU entirely. compute shader updates positions, vertex shader renders. CPU just dispatches.
+
+## data structures
+
+**GpuEntity (16 bytes, std430):**
+```glsl
+struct Entity {
+    float x;        // world position
+    float y;
+    int packedVel;  // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
+    uint color;     // 0xRRGGBB
+};
+```
+
+**zig side:**
+```zig
+const GpuEntity = extern struct {
+    x: f32,
+    y: f32,
+    packed_vel: i32,
+    color: u32,
+};
+
+fn packVelocity(vx: f32, vy: f32) i32 {
+    const vx_fixed: i16 = @intFromFloat(vx * 256.0);
+    const vy_fixed: i16 = @intFromFloat(vy * 256.0);
+    return (@as(i32, vx_fixed) << 16) | (@as(i32, vy_fixed) & 0xFFFF);
+}
+```
+
+## compute shader
+
+`src/shaders/entity_update.comp`:
+```glsl
+#version 430
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer Entities {
+    Entity entities[];
+};
+
+uniform uint entityCount;
+uniform uint frameNumber;
+uniform vec2 screenSize;
+uniform vec2 center;
+uniform float respawnRadius;
+
+void main() {
+    uint id = gl_GlobalInvocationID.x;
+    if (id >= entityCount) return;
+
+    Entity e = entities[id];
+
+    // unpack velocity
+    float vx = float(e.packedVel >> 16) / 256.0;
+    float vy = float((e.packedVel << 16) >> 16) / 256.0;
+
+    // update position
+    e.x += vx;
+    e.y += vy;
+
+    // respawn check
+    float dx = e.x - center.x;
+    float dy = e.y - center.y;
+    if (dx*dx + dy*dy < respawnRadius * respawnRadius) {
+        // GPU RNG
+        uint seed = id * 1103515245u + frameNumber * 12345u;
+        seed = seed * 747796405u + 2891336453u;
+
+        uint edge = seed & 3u;
+        float t = float((seed >> 2) & 0xFFFFu) / 65535.0;
+
+        // spawn on edge with velocity toward center
+        // (full edge logic in implementation)
+    }
+
+    entities[id] = e;
+}
+```
+
+## integration
+
+raylib doesn't wrap compute shaders. use raw GL calls via `compute.zig`:
+
+```zig
+pub fn dispatch(entity_count: u32, frame: u32) void {
+    gl.glUseProgram(program);
+    gl.glUniform1ui(entity_count_loc, entity_count);
+    gl.glUniform1ui(frame_loc, frame);
+    // ... other uniforms
+
+    const groups = (entity_count + 255) / 256;
+    gl.glDispatchCompute(groups, 1, 1);
+    gl.glMemoryBarrier(gl.GL_SHADER_STORAGE_BARRIER_BIT);
+}
+```
+
+## frame flow
+
+**before:**
+```
+CPU: update positions (5ms at 950k)
+CPU: copy to gpu_buffer
+CPU→GPU: rlUpdateShaderBuffer() ← bottleneck
+GPU: render
+```
+
+**after:**
+```
+GPU: compute dispatch (~0ms CPU time)
+GPU: memory barrier
+GPU: render
+```
+
+## implementation steps
+
+each step is a commit point if desired.
+
+### step 1: GpuEntity struct expansion
+- modify `GpuEntity` in sandbox.zig: add `packed_vel` field
+- add `packVelocity()` helper
+- update ssbo_renderer to handle 16-byte stride
+- verify existing rendering still works
+
+### step 2: compute shader infrastructure
+- create `src/compute.zig` with GL bindings
+- create `src/shaders/entity_update.comp` (position update only, no respawn yet)
+- load and compile compute shader in sandbox_main.zig
+- dispatch before render, verify positions update
+
+### step 3: respawn logic
+- add GPU RNG to compute shader
+- implement edge spawning + velocity calculation
+- remove CPU update loop from sandbox.zig
+
+### step 4: cleanup ✓
+- `--compute` is now default, `--cpu` flag for fallback/comparison
+- justfile updated: `just bench` (compute), `just bench-cpu` (comparison)
+- verbose debug output reduced
+
+## files changed
+
+**new:**
+- `src/shaders/entity_update.comp`
+- `src/compute.zig`
+
+**modified:**
+- `src/sandbox.zig` — GpuEntity struct, packVelocity(), remove CPU update
+- `src/ssbo_renderer.zig` — remove per-frame upload
+- `src/sandbox_main.zig` — init compute, dispatch in frame loop
+
+## risks
+
+1. **driver quirks** — intel HD 530 compute support is fine but older, may hit edge cases
+2. **debugging** — GPU code harder to debug, start with small counts
+3. **fallback** — keep the `--cpu` flag to A/B test compute against the existing CPU update path
+
+## expected results
+
+- CPU update time: ~5ms → ~0ms
+- no per-frame buffer upload
+- target: 1M+ entities, pushing toward 10M ceiling
--- a/docs/point_sprites_experiment.md
+++ b/docs/point_sprites_experiment.md
@ -0,0 +1,89 @@
+# point sprites experiment
+
+branch: `point-sprites` (point-sprites work)
+date: 2025-12
+hardware: intel hd 530 (skylake gt2, i5-6500T)
+
+## hypothesis
+
+point sprites should be faster than quads because:
+- 1 vertex per entity instead of 6 (quad = 2 triangles)
+- less vertex throughput
+- `gl_PointCoord` provides texture coords automatically
+
+## implementation
+
+### vertex shader changes
+- removed quad vertex attributes (position, texcoord)
+- use `gl_PointSize = 16.0 * zoom` for size control
+- position calculated from SSBO data only
+
+### fragment shader changes
+- use `gl_PointCoord` instead of vertex texcoord
+- sample circle texture for alpha
+
+### renderer changes
+- load `glEnable` and `glDrawArraysInstanced` via `rlGetProcAddress`
+- enable `GL_PROGRAM_POINT_SIZE`
+- draw with `glDrawArraysInstanced(GL_POINTS, 0, 1, count)`
+- removed VBO (no vertex data needed)
+
+## results
+
+### attempt 1: procedural circle in fragment shader
+
+```glsl
+vec2 coord = gl_PointCoord - vec2(0.5);
+float dist = length(coord);
+float alpha = 1.0 - smoothstep(0.4, 0.5, dist);
+if (alpha < 0.01) discard;
+```
+
+**benchmark @ 350k entities:**
+- point sprites: 23ms render, 43fps
+- quads (main): 6.2ms render, 151fps
+- **result: 3.7x SLOWER**
+
+**why:** `discard` breaks early-z optimization, `length()` and `smoothstep()` are ALU-heavy, intel integrated GPUs are weak at fragment shader math.
+
+### attempt 2: texture sampling
+
+```glsl
+float alpha = texture(circleTexture, gl_PointCoord).r;
+finalColor = vec4(fragColor, alpha);
+```
+
+**benchmark @ 450k entities:**
+- point sprites: 19.1ms render, 52fps
+- quads (main): 8.0ms render, 122fps
+- **result: 2.4x SLOWER**
+
+better than procedural, but still significantly slower than quads.
+
+## analysis
+
+the theoretical advantage (1/6 vertices) doesn't translate to real performance because:
+
+1. **triangle path is more optimized** - intel's driver heavily optimizes the standard triangle rasterization path. point sprites use a less-traveled code path.
+
+2. **fill rate is the bottleneck** - HD 530 has only 3 ROPs. we're bound by how fast we can write pixels, not by vertex count. reducing vertices from 6 to 1 doesn't help when fill rate is the constraint.
+
+3. **point size overhead** - each point requires computing `gl_PointSize` and setting up the point sprite rasterization, which may have per-vertex overhead.
+
+4. **texture cache behavior** - `gl_PointCoord` may have worse cache locality than explicit vertex texcoords.
+
+## conclusion
+
+**point sprites are a regression on intel hd 530.**
+
+the optimization makes theoretical sense but fails in practice on this hardware. the quad/triangle path is simply more optimized in intel's mesa driver.
+
+**keep this branch for testing on discrete GPUs** where point sprites might actually help (nvidia/amd have different optimization priorities).
+
+## lessons learned
+
+1. always benchmark, don't assume
+2. "fewer vertices" doesn't always mean faster
+3. integrated GPU optimization is different from discrete
+4. the most optimized path is usually the most common path (triangles)
+5. fill rate matters more than vertex count at high entity counts
--- a/docs/rops.txt
+++ b/docs/rops.txt
@ -0,0 +1,201 @@
+rops: render output units
+=========================
+
+what they are, where they came from, and what yours can do.
+
+
+what is a rop?
+--------------
+
+ROP = Render Output Unit (originally "Raster Operations Pipeline")
+
+it's the final stage of the GPU pipeline. after all the fancy shader
+math is done, the ROP is the unit that actually writes pixels to memory.
+
+think of it as the bottleneck between "calculated" and "visible."
+
+a ROP does:
+  - depth testing (is this pixel in front of what's already there?)
+  - stencil testing (mask operations)
+  - blending (alpha, additive, etc)
+  - anti-aliasing resolve
+  - writing the final color to the framebuffer
+
+one ROP can write one pixel per clock cycle (roughly).
+
+
+the first rop
+-------------
+
+the term comes from the IBM 8514/A (1987), which had dedicated hardware
+for "raster operations" - bitwise operations on pixels (AND, OR, XOR).
+this was revolutionary because before this, the CPU did all pixel math.
+
+but the modern ROP as we know it emerged with:
+
+  NVIDIA NV1 (1995)
+    one of the first chips with dedicated pixel output hardware
+    could do ~1 million textured pixels/second
+
+  3dfx Voodoo (1996)
+    the card that defined the modern GPU pipeline
+    had 1 TMU + 1 pixel pipeline (essentially 1 ROP)
+    could push 45 million pixels/second
+    that ONE pipeline ran Quake at 640x480
+
+  NVIDIA GeForce 256 (1999)
+    "the first GPU" - named itself with that term
+    4 pixel pipelines = 4 ROPs
+    480 million pixels/second
+
+so the original consumer 3D cards had... 1 ROP. and they ran Quake.
+
+
+what one rop can do
+-------------------
+
+let's do the math.
+
+one ROP at 100 MHz (3dfx Voodoo era):
+  100 million cycles/second
+  ~1 pixel per cycle
+  = 100 megapixels/second
+
+at 640x480 @ 60fps:
+  640 * 480 * 60 = 18.4 megapixels/second needed
+
+so ONE ROP at 100MHz could handle 640x480 with ~5x headroom for overdraw.
+
+at 1024x768 @ 60fps:
+  1024 * 768 * 60 = 47 megapixels/second
+
+now you're at 2x overdraw max. still playable, but tight.
+
+
+one modern rop
+--------------
+
+a single modern ROP runs at ~1-2 GHz and can do more per cycle:
+  - multiple color outputs (MRT)
+  - 64-bit or 128-bit color formats
+  - compressed writes
+
+rough estimate for one ROP at 1.5 GHz:
+  ~1.5 billion pixels/second base throughput
+
+at 1920x1080 @ 60fps:
+  1920 * 1080 * 60 = 124 megapixels/second
+
+one ROP could handle 1080p with 12x overdraw headroom.
+
+at 4K @ 60fps:
+  3840 * 2160 * 60 = 497 megapixels/second
+
+one ROP could handle 4K with 3x overdraw. tight, but possible.
+
+
+your three rops (intel hd 530)
+------------------------------
+
+HD 530 specs:
+  - 3 ROPs
+  - ~950 MHz boost clock
+  - theoretical: 2.85 GPixels/second
+
+let's break that down:
+
+at 1080p @ 60fps (124 MP/s needed):
+  2850 / 124 = 23x overdraw budget
+
+that's actually generous! you could draw each pixel 23 times.
+
+so why does lofivor struggle at 1M entities?
+
+because 1M entities at 4x4 pixels = 16M pixels minimum.
+but with overlap? let's say average 10x overdraw:
+  160M pixels/frame
+  at 60fps = 9.6 billion pixels/second
+
+your ceiling is 2.85 billion.
+
+so you're 3.4x over budget. that's why you top out around 300k-400k
+before frame drops (which matches empirical testing).
+
+
+the real constraint
+-------------------
+
+ROPs don't work in isolation. they're limited by:
+
+  1. MEMORY BANDWIDTH
+     each pixel write = memory access
+     HD 530 shares DDR4 with CPU (~30 GB/s)
+     at 32-bit color: 30GB/s / 4 bytes = 7.5 billion pixels/second max
+     but you're competing with CPU, texture reads, etc.
+     realistic: maybe 2-3 billion pixels for framebuffer writes
+
+  2. TEXTURE SAMPLING
+     if fragment shader samples textures, TMUs must keep up
+     HD 530 has 24 TMUs, so this isn't the bottleneck
+
+  3. SHADER EXECUTION
+     ROPs wait for fragments to be shaded
+     if shaders are slow, ROPs starve
+     lofivor's shaders are trivial, so this isn't the bottleneck
+
+for lofivor specifically: your 3 ROPs are THE ceiling.
+
+
+what could you do with more rops?
+---------------------------------
+
+comparison:
+
+  Intel HD 530:     3 ROPs,  2.85 GPixels/s
+  GTX 1060:        48 ROPs,  72 GPixels/s
+  RTX 3080:        96 ROPs, 164 GPixels/s
+  RTX 4090:       176 ROPs, 443 GPixels/s
+
+with a GTX 1060 (25x your fill rate):
+  lofivor could probably hit 5-10 million entities
+
+with an RTX 4090 (155x your fill rate):
+  tens of millions, limited by other factors
+
+
+perspective: what 3 rops means historically
+-------------------------------------------
+
+your HD 530 has roughly the fill rate of:
+  - GeForce 4 Ti 4600 (2002): 4 ROPs, 1.2 GPixels/s
+  - Radeon 9700 Pro (2002): 8 ROPs, 2.6 GPixels/s
+
+you're running hardware that, in raw pixel output, matches GPUs from
+20+ years ago. but with modern features (compute shaders, SSBO, etc).
+
+this is why lofivor is interesting: you're achieving 700k+ entities
+on fill-rate-equivalent hardware that originally ran games with
+maybe 10,000 triangles on screen.
+
+the difference is technique. those 2002 games did complex per-pixel
+lighting, shadows, multiple texture passes. lofivor does one texture
+sample and one blend. same fill rate, 100x the entities.
+
+
+the lesson
+----------
+
+ROPs are simple: they write pixels.
+
+the number you have determines your pixel budget.
+everything else (shaders, vertices, CPU logic) only matters if
+the ROPs aren't your bottleneck.
+
+with 3 ROPs, you have roughly 2.85 billion pixels/second.
+spend them wisely:
+  - cull what's offscreen (don't spend pixels on invisible things)
+  - shrink distant objects (LOD saves pixels)
+  - reduce overlap (spatial organization)
+  - keep shaders simple (don't starve the ROPs)
+
+your 3 ROPs can do remarkable things. Quake ran on 1.
--- a/docs/why-millions-is-hard.txt
+++ b/docs/why-millions-is-hard.txt
@ -0,0 +1,316 @@
+why rendering millions of entities is hard
+=========================================
+
+and what "hard" actually means, from first principles.
+
+
+the simple answer
+-----------------
+
+every frame, your computer does work. work takes time. you have 16.7
+milliseconds to do all the work before the next frame (at 60fps).
+
+if the work takes longer than 16.7ms, you miss the deadline. frames drop.
+the game stutters.
+
+10 million entities means 10 million units of work. whether that fits in
+16.7ms depends on how much work each unit is.
+
+
+what is "work" anyway?
+----------------------
+
+let's trace what happens when you draw one entity:
+
+  1. CPU: "here's an entity at position (340, 512), color cyan"
+  2. that data travels over a bus to the GPU
+  3. GPU: receives the data, stores it in memory
+  4. GPU: runs a vertex shader (figures out where on screen)
+  5. GPU: runs a fragment shader (figures out what color each pixel is)
+  6. GPU: writes pixels to the framebuffer
+  7. framebuffer gets sent to your monitor
+
+each step has a speed limit. the slowest step is your bottleneck.
+
+
+the bottlenecks, explained simply
+---------------------------------
+
+MEMORY BANDWIDTH
+  how fast data can move around. measured in GB/s.
+
+  think of it like a highway. you can have a fast car (processor), but
+  if the highway is jammed, you're stuck in traffic.
+
+  an integrated GPU (like Intel HD 530) shares the highway with the CPU.
+  a discrete GPU (like an RTX card) has its own private highway.
+
+  this is why lofivor's SSBO optimization helped so much: shrinking
+  entity data from 64 bytes to 12 bytes means 5x less traffic.
+
+DRAW CALLS
+  every time you say "GPU, draw this thing", there's overhead.
+  the CPU and GPU have to synchronize, state gets set up, etc.
+
+  1 draw call for 1 million entities: fast
+  1 million draw calls for 1 million entities: slow
+
+  this is why batching matters. not the drawing itself, but the
+  *coordination* of drawing.
+
+FILL RATE
+  how many pixels the GPU can color per second.
+
+  a 4x4 pixel entity = 16 pixels
+  1 million entities = 16 million pixels minimum
+
+  but your screen is only ~2 million pixels (1920x1080). so entities
+  overlap. "overdraw" means coloring the same pixel multiple times.
+
+  10 million overlapping entities might touch each pixel 50+ times.
+  that's 100 million pixel operations.
+
+SHADER COMPLEXITY
+  the GPU runs a tiny program for each vertex and each pixel.
+
+  simple: "put it here, color it this" = fast
+  complex: "calculate lighting from 8 sources, sample 4 textures,
+           apply normal mapping, do fresnel..." = slow
+
+  lofivor's shaders are trivial. AAA game shaders are not.
+
+CPU-GPU SYNCHRONIZATION
+  the CPU and GPU work in parallel, but sometimes they have to wait
+  for each other.
+
+  if the CPU needs to read GPU results, it stalls.
+  if the GPU needs new data and the CPU is busy, it stalls.
+
+  good code keeps them both busy without waiting.
+
+
+why "real games" hit CPU walls
+------------------------------
+
+rendering is just putting colors on pixels. that's the GPU's job.
+
+but games aren't just rendering. they're also:
+
+  - COLLISION DETECTION
+    does entity A overlap entity B?
+
+    naive approach: check every pair
+    1,000 entities = 500,000 checks (n squared / 2)
+    10,000 entities = 50,000,000 checks
+    1,000,000 entities = 500,000,000,000 checks
+
+    that's 500 billion. per frame. not happening.
+
+    smart approach: spatial partitioning (grids, quadtrees)
+    only check nearby entities. but still, at millions of entities,
+    even "nearby" is a lot.
+
+  - AI / BEHAVIOR
+    each entity decides what to do.
+
+    simple: move toward player. cheap.
+    complex: pathfind around obstacles, consider threats, coordinate
+             with allies, remember state. expensive.
+
+    lofivor entities just drift in a direction. no decisions.
+    a real game enemy makes decisions every frame.
+
+  - PHYSICS
+    entities push each other, bounce, have mass and friction.
+    every interaction is math. lots of entities = lots of math.
+
+  - GAME LOGIC
+    damage calculations, spawning, leveling, cooldowns, buffs...
+    all of this runs on the CPU, every frame.
+
+so: lofivor can render 700k entities because they don't DO anything.
+a game with 700k entities that think, collide, and interact would
+need god-tier optimization or would simply not run.
+
+
+what makes AAA games slow on old hardware?
+------------------------------------------
+
+it's not entity count. most AAA games have maybe hundreds of
+"entities" on screen. it's everything else:
+
+  TEXTURE RESOLUTION
+    a 4K texture (4096x4096) is ~16.7 million pixels: 67 MB of
+    uncompressed RGBA data. per texture.
+    one character might have 10+ textures (diffuse, normal, specular,
+    roughness, ambient occlusion...).
+
+    old hardware: less VRAM, slower texture sampling.
+
+  SHADER COMPLEXITY
+    modern materials simulate light physics. subsurface scattering,
+    global illumination, ray-traced reflections.
+
+    each pixel might do hundreds of math operations.
+
+  POST-PROCESSING
+    bloom, motion blur, depth of field, ambient occlusion, anti-aliasing.
+    full-screen passes that touch every pixel multiple times.
+
+  MESH COMPLEXITY
+    a character might be 100,000 triangles.
+    10 characters = 1 million triangles.
+    each triangle goes through the vertex shader.
+
+  SHADOWS
+    render the scene again from the light's perspective.
+    for each light. every frame.
+
+AAA games are doing 100x more work per pixel than lofivor.
+lofivor is doing 100x more pixels than AAA games.
+
+different problems.
+
+
+the "abuse" vs "respect" distinction
+------------------------------------
+
+abuse: making the hardware do unnecessary work.
+respect: achieving your goal with minimal waste.
+
+examples of abuse (that lofivor fixed):
+
+  - sending 64 bytes (a full matrix) when you need 12 bytes (x, y, color)
+  - one draw call per entity when you could batch
+  - calculating transforms on CPU when GPU could do it
+  - clearing the screen twice
+  - uploading the same data every frame
+
+examples of abuse in the wild:
+
+  - electron apps using a whole browser to show a chat window
+  - games that re-render static UI every frame
+  - loading 4K textures for objects that appear 20 pixels tall
+  - running AI pathfinding for off-screen entities
+
+the hardware has limits. respecting them means fitting your game
+within those limits through smart decisions. abusing them means
+throwing cycles at problems you created yourself.
+
+
+so can you do 1 million entities with juice on old hardware?
+------------------------------------------------------------
+
+yes, with the right decisions.
+
+what "juice" typically means:
+  - screen shake (free, just offset the camera)
+  - particle effects (separate system, heavily optimized)
+  - flash/hit feedback (change a color value)
+  - sound (different system entirely)
+
+particles are special: they're designed for millions of tiny things.
+they don't collide, don't think, often don't even persist (spawn,
+drift, fade, die). GPU particle systems are essentially what lofivor
+became: minimal data, instanced rendering.
+
+what would kill you at 1 million:
+  - per-entity collision
+  - per-entity AI
+  - per-entity sprite variety (texture switches)
+  - per-entity complex shaders
+
+what you could do:
+  - 1 million particles (visual only, no logic)
+  - 10,000 enemies with collision/AI + 990,000 particles
+  - 100,000 enemies with simple behavior + spatial hash collision
+
+the secret: most of what looks like "millions of things" in games
+is actually a small number of meaningful entities + a large number
+of dumb particles.
+
+
+the laws of physics (sort of)
+-----------------------------
+
+there are hard limits:
+
+  MEMORY BUS BANDWIDTH
+    a DDR4 system might move 25 GB/s.
+    1 million entities at 12 bytes each = 12 MB.
+    at 60fps = 720 MB/s just for entity data.
+    that's only 3% of bandwidth. plenty of room.
+
+    but a naive approach (64 bytes, plus overhead) could be
+    10x worse. suddenly you're at 30%.
+
+  CLOCK CYCLES
+    a 3GHz CPU does 3 billion operations per second.
+    at 60fps, that's 50 million operations per frame.
+    1 million entities = 50 operations each.
+
+    50 operations is: a few multiplies, some loads/stores, a branch.
+    that's barely enough for "move in a direction".
+    pathfinding? AI? collision? not a chance.
+
+  PARALLELISM
+    GPUs have thousands of cores but they're simple.
+    CPUs have few cores but they're smart.
+
+    entity rendering: perfectly parallel (GPU wins)
+    entity decision-making: often sequential (CPU bound)
+
+so yes, physics constrains us. but "physics" here means:
+  - how fast electrons move through silicon
+  - how much data fits on a wire
+  - how many transistors fit on a chip
+
+within those limits, there's room. lots of room, if you're clever.
+lofivor went from 5k to 700k by being clever, not by breaking physics.
+
+
+the actual lesson
+-----------------
+
+the limit isn't really "the hardware can't do it."
+
+the limit is "the hardware can't do it THE WAY YOU'RE DOING IT."
+
+every optimization in lofivor was finding a different way:
+  - don't draw circles, blit textures
+  - don't call functions, submit vertices directly
+  - don't send matrices, send packed structs
+  - don't update on CPU, use compute shaders
+
+the hardware was always capable of 700k. the code wasn't asking right.
+
+this is true at every level. that old laptop struggling with 10k
+entities in some game? probably not the laptop's fault. probably
+the game is doing something wasteful that doesn't need to be.
+
+"runs poorly on old hardware" often means "we didn't try to make
+it run on old hardware" not "it's impossible on old hardware."
+
+
+closing thought
+---------------
+
+10 million is a lot. but 1 million? 2 million?
+
+with discipline: yes.
+with decisions that respect the hardware: yes.
+with awareness of what's actually expensive: yes.
+
+the knowledge of what's expensive is the key.
+
+most developers don't have it. they use high-level abstractions
+that hide the cost. they've never seen a frame budget or a
+bandwidth calculation.
+
+lofivor is a learning tool. the journey from 5k to 700k teaches
+where the costs are. once you see them, you can't unsee them.
+
+you start asking: "what is this actually doing? what does it cost?
+is there a cheaper way?"
+
+that's the skill. not the specific techniques—those change with
+hardware. the skill is asking the questions.
--- a/journal.txt
+++ b/journal.txt
@ -206,3 +206,38 @@ total improvement from baseline:
 - SSBO: 60fps @ ~700k entities
 - ~140x improvement overall!

+---
+
+optimization 6: compute shader updates
+--------------------------------------
+technique: move entity position + respawn logic from CPU to GPU compute shader
+code: compute.zig, shaders/entity_update.comp, ssbo_renderer.zig
+version: 0.7.0
+
+struct GpuEntity {
+    x: f32,        // 4 bytes
+    y: f32,        // 4 bytes
+    packed_vel: i32, // 4 bytes (vx/vy in fixed-point 8.8)
+    color: u32,    // 4 bytes
+};                 // = 16 bytes total (was 12)
+
+changes:
+- entity_update.comp: position update, center check, edge respawn, velocity calc
+- GPU RNG: PCG-style PRNG seeded with entity id + frame number
+- ssbo_renderer: renderComputeMode() only uploads NEW entities (when count grows)
+- CPU update loop skipped entirely when compute enabled
+
+benchmark results (i5-6500T / HD 530):
+- update time: ~5ms → ~0ms at 150k entities
+- render time unchanged (GPU-bound as before)
+- total frame time improvement at high entity counts
+
+analysis: CPU was doing ~150k position updates + distance checks + respawn logic
+per frame. now GPU does it in parallel via 256-thread workgroups. CPU only uploads
+new entities when user adds them, not per-frame. memory barrier ensures compute
+writes visible to vertex shader before draw.
+
+flags:
+- --compute: GPU compute updates (now default)
+- --cpu: fallback to CPU update path for comparison
+
--- a/16
+++ b/16
@ -42,11 +42,20 @@ check:
 test:
    zig build test

-# auto-benchmark (ramps entities until performance degrades, works on linux/windows)
+# run sandbox (GPU compute is default)
+sandbox:
+    zig build -Doptimize=ReleaseFast run
+
+# auto-benchmark (ramps entities until performance degrades)
 bench:
    zig build -Doptimize=ReleaseFast run -- --bench
    cat benchmark.log

+# benchmark with CPU update path (for comparison)
+bench-cpu:
+    zig build -Doptimize=ReleaseFast run -- --bench --cpu
+    cat benchmark.log
+
 # software-rendered benchmark (for CI/headless servers)
 [linux]
 bench-sw:
@ -58,3 +67,8 @@ bench-sw:
 bench-sw:
    @echo "bench-sw: windows doesn't have xvfb equivalent"
    @echo "use 'just bench' if you have a GPU, or run in WSL/linux CI"
+
+[linux]
+profile port="9876":
+  # start Tracy: tracy-profiler -a 127.0.0.1 -p {{port}}
+  zig build -Dtracy=true -Doptimize=ReleaseFast && TRACY_PORT={{port}} ./zig-out/bin/sandbox
--- a/releases/0.1.0-unoptimized.txt
+++ b/releases/0.1.0-unoptimized.txt
@ -0,0 +1,8 @@
+the baseline: one draw call per entity, pure and simple
+
+- individual rl.drawCircle() calls in a loop
+- ~5k entities at 60fps before frame times tank
+- linear scaling: 10k = ~43ms, 20k = ~77ms
+- render-bound (update loop stays under 1ms even at 30k)
+- each circle is its own GPU draw call
+- the starting point for optimization experiments
--- a/releases/0.2.0-texture_blitting.txt
+++ b/releases/0.2.0-texture_blitting.txt
@ -0,0 +1,8 @@
+pre-render once, blit many: 10x improvement
+
+- render circle to 16x16 texture at startup
+- drawTexture() per entity instead of drawCircle()
+- raylib batches same-texture draws internally
+- ~50k entities at 60fps
+- simple change, big win
+- still one function call per entity, but GPU work is batched
--- a/releases/0.3.0-quad_batching.txt
+++ b/releases/0.3.0-quad_batching.txt
@ -0,0 +1,9 @@
+bypass the wrapper, go straight to rlgl: 2x more
+
+- skip drawTexture(), submit vertices directly via rl.gl
+- manually build quads: rlTexCoord2f + rlVertex2f per corner
+- rlBegin/rlEnd wraps the whole entity loop
+- ~100k entities at 60fps
+- eliminates per-call function overhead
+- vertices go straight to GPU buffer
+- 20x improvement over baseline
--- a/releases/0.3.1-batch_buffer.txt
+++ b/releases/0.3.1-batch_buffer.txt
@ -0,0 +1,11 @@
+bigger buffer, fewer flushes: squeezing out more headroom
+
+- increased raylib batch buffer from 8192 to 32768 vertices
+- ~140k entities at 60fps on i5-6500T
+- ~40% improvement over default buffer
+- fewer GPU flushes per frame
+- also added: release workflows for github and forgejo
+- added OPTIMIZATIONS.md documenting the journey
+- added README, UI panel with FPS display
+- heap allocated entity array to support 1 million entities
+- per-entity RGB colors
--- a/releases/0.4.0-gpu_instancing.txt
+++ b/releases/0.4.0-gpu_instancing.txt
@ -0,0 +1,13 @@
+gpu instancing: a disappointing discovery
+
+- drawMeshInstanced() with per-entity transform matrices
+- ~150k entities at 60fps - barely better than rlgl batching
+- negligible improvement on integrated graphics
+- why it didn't help:
+  - integrated GPU shares system RAM (no PCIe transfer savings)
+  - 64-byte matrix per entity vs ~80 bytes for rlgl vertices
+  - bottleneck is memory bandwidth, not draw call overhead
+  - rlgl batching already minimizes draw calls effectively
+- orthographic camera setup for 2D-like rendering
+- heap-allocated transforms buffer (64MB too big for stack)
+- lesson learned: not all "advanced" techniques are wins
--- a/releases/0.5.0-ssbo_instancing.txt
+++ b/releases/0.5.0-ssbo_instancing.txt
@ -0,0 +1,17 @@
+ssbo breakthrough: 5x gain by shrinking the data
+
+- pack entity data (x, y, color) into 12-byte struct
+- upload via shader storage buffer object (SSBO)
+- ~700k entities at 60fps (i5-6500T / HD 530)
+- ~950k entities at ~57fps
+- 5x improvement over previous best
+- 140x total from baseline
+- why it works:
+  - 12 bytes vs 64 bytes (matrices) = 5.3x less bandwidth
+  - 12 bytes vs 80 bytes (rlgl vertices) = 6.7x less bandwidth
+  - no CPU-side matrix calculations
+  - GPU does NDC conversion and color unpacking
+- custom vertex/fragment shaders
+- single rlDrawVertexArrayInstanced() call for all entities
+- shaders embedded at build time
+- removed FPS cap, added optional vsync arg
--- a/releases/0.5.1-windows_build.txt
+++ b/releases/0.5.1-windows_build.txt
@ -0,0 +1,5 @@
+cross-platform release: adding windows to the party
+
+- updated github release workflow
+- builds for both linux and windows now
+- no code changes, just CI/CD work
--- a/releases/0.6.0-zoom_zoom.txt
+++ b/releases/0.6.0-zoom_zoom.txt
@ -0,0 +1,10 @@
+zoom and pan: making millions of entities explorable
+
+- mouse wheel zoom
+- click and drag panning
+- orthographic camera transforms
+- memory panel showing entity buffer sizes
+- background draws immediately (no flicker)
+- tab key toggles UI panels
+- explained "lofivor" name in README (lo-fi survivor)
+- shader updated for zoom/pan transforms
--- a/releases/0.6.1-q_to_quit.txt
+++ b/releases/0.6.1-q_to_quit.txt
@ -0,0 +1,5 @@
+quick exit: zoom out then quit
+
+- q key first zooms out, second press quits
+- nice way to see the full entity field before closing
+- minor UI text fix
--- a/releases/0.7.0-compute_shader.txt
+++ b/releases/0.7.0-compute_shader.txt
@ -0,0 +1,11 @@
+compute shader: moving physics to the GPU
+
+- entity position updates now run on GPU via compute shader
+- GPU-based RNG for entity velocity randomization
+- full simulation loop stays on GPU, no CPU roundtrip
+- new compute.zig module for shader management
+- GpuEntity struct with position, velocity, and color
+- tracy profiling integration
+- FPS display turns green (good) or red (bad)
+- added design docs for zoom/pan and compute shader work
+- cross-platform alignment fixes for shader data
--- a/src/compute.zig
+++ b/src/compute.zig
@ -0,0 +1,111 @@
+// compute shader module for GPU entity updates
+// wraps raw GL calls that raylib doesn't expose directly
+
+const std = @import("std");
+const rl = @import("raylib");
+const sandbox = @import("sandbox.zig");
+
+const comp_source = @embedFile("shaders/entity_update.comp");
+
+// GL constants not exposed by raylib-zig
+const GL_SHADER_STORAGE_BARRIER_BIT: u32 = 0x00002000;
+
+// function pointer type for glMemoryBarrier
+const GlMemoryBarrierFn = *const fn (barriers: u32) callconv(.c) void;
+
+/// Wrapper around the compute program built from shaders/entity_update.comp.
+/// Stores the linked program id, the uniform locations looked up at init,
+/// and the dynamically loaded glMemoryBarrier entry point.
+pub const ComputeShader = struct {
+    program_id: u32,
+    entity_count_loc: i32,
+    frame_number_loc: i32,
+    screen_size_loc: i32,
+    center_loc: i32,
+    respawn_radius_loc: i32,
+    entity_speed_loc: i32,
+    glMemoryBarrier: GlMemoryBarrierFn,
+
+    /// Compiles the embedded compute shader, links it into a program and
+    /// looks up its uniform locations. Prints a message and returns null if
+    /// compilation or linking reports failure (id 0).
+    pub fn init() ?ComputeShader {
+        // load glMemoryBarrier dynamically
+        // NOTE(review): the address is cast without a null check - confirm
+        // rlGetProcAddress cannot return null on the drivers we target
+        const barrier_ptr = rl.gl.rlGetProcAddress("glMemoryBarrier");
+        const glMemoryBarrier: GlMemoryBarrierFn = @ptrCast(@alignCast(barrier_ptr));
+
+        // compile compute shader
+        const shader_id = rl.gl.rlCompileShader(comp_source, rl.gl.rl_compute_shader);
+        if (shader_id == 0) {
+            std.debug.print("compute: failed to compile compute shader\n", .{});
+            return null;
+        }
+
+        // link compute program
+        const program_id = rl.gl.rlLoadComputeShaderProgram(shader_id);
+        if (program_id == 0) {
+            std.debug.print("compute: failed to link compute program\n", .{});
+            return null;
+        }
+
+        // get uniform locations (names must match the uniforms declared in
+        // entity_update.comp; the returned locations are not validated here)
+        const entity_count_loc = rl.gl.rlGetLocationUniform(program_id, "entityCount");
+        const frame_number_loc = rl.gl.rlGetLocationUniform(program_id, "frameNumber");
+        const screen_size_loc = rl.gl.rlGetLocationUniform(program_id, "screenSize");
+        const center_loc = rl.gl.rlGetLocationUniform(program_id, "center");
+        const respawn_radius_loc = rl.gl.rlGetLocationUniform(program_id, "respawnRadius");
+        const entity_speed_loc = rl.gl.rlGetLocationUniform(program_id, "entitySpeed");
+
+        std.debug.print("compute: shader loaded\n", .{});
+
+        return .{
+            .program_id = program_id,
+            .entity_count_loc = entity_count_loc,
+            .frame_number_loc = frame_number_loc,
+            .screen_size_loc = screen_size_loc,
+            .center_loc = center_loc,
+            .respawn_radius_loc = respawn_radius_loc,
+            .entity_speed_loc = entity_speed_loc,
+            .glMemoryBarrier = glMemoryBarrier,
+        };
+    }
+
+    /// Unloads the compute program created by init().
+    pub fn deinit(self: *ComputeShader) void {
+        rl.gl.rlUnloadShaderProgram(self.program_id);
+    }
+
+    /// Runs one update pass of the compute shader over the first
+    /// `entity_count` entries of the shader buffer `ssbo_id`.
+    /// `frame_number` is passed to the shader's frameNumber uniform, which
+    /// the shader mixes into its RNG seed. Returns immediately when
+    /// `entity_count` is 0.
+    pub fn dispatch(self: *ComputeShader, ssbo_id: u32, entity_count: u32, frame_number: u32) void {
+        if (entity_count == 0) return;
+
+        // screen dimensions come from sandbox.zig; the last two values are
+        // literals here, labelled with the sandbox names they stand for
+        // NOTE(review): confirm they still match the CPU update path
+        const screen_w: f32 = @floatFromInt(sandbox.SCREEN_WIDTH);
+        const screen_h: f32 = @floatFromInt(sandbox.SCREEN_HEIGHT);
+        const center_x: f32 = screen_w / 2.0;
+        const center_y: f32 = screen_h / 2.0;
+        const respawn_radius: f32 = 10.0; // RESPAWN_THRESHOLD
+        const entity_speed: f32 = 2.0; // ENTITY_SPEED
+
+        // bind compute shader
+        rl.gl.rlEnableShader(self.program_id);
+
+        // set uniforms
+        rl.gl.rlSetUniform(self.entity_count_loc, &entity_count, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_uint), 1);
+        rl.gl.rlSetUniform(self.frame_number_loc, &frame_number, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_uint), 1);
+
+        const screen_size = [2]f32{ screen_w, screen_h };
+        rl.gl.rlSetUniform(self.screen_size_loc, &screen_size, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
+
+        const center = [2]f32{ center_x, center_y };
+        rl.gl.rlSetUniform(self.center_loc, &center, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
+
+        rl.gl.rlSetUniform(self.respawn_radius_loc, &respawn_radius, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
+        rl.gl.rlSetUniform(self.entity_speed_loc, &entity_speed, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
+
+        // bind SSBO to binding point 0
+        rl.gl.rlBindShaderBuffer(ssbo_id, 0);
+
+        // dispatch compute workgroups: ceil(entity_count / 256)
+        // (256 is the local_size_x declared in entity_update.comp)
+        const groups = (entity_count + 255) / 256;
+        rl.gl.rlComputeShaderDispatch(groups, 1, 1);
+
+        // memory barrier - ensure compute writes are visible to vertex shader
+        self.glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
+        // unbind
+        rl.gl.rlBindShaderBuffer(0, 0);
+    }
+};
--- a/src/sandbox.zig
+++ b/src/sandbox.zig
@ -18,7 +18,7 @@ pub const Entity = struct {
    color: u32,
 };

-pub const MAX_ENTITIES: usize = 1_000_000;
+pub const MAX_ENTITIES: usize = 10_000_000;

 pub const Entities = struct {
    items: []Entity,
@ -287,34 +287,69 @@ test "update respawns entity at edge when reaching center" {
    try std.testing.expect(on_left or on_right or on_top or on_bottom);
 }

-// GPU entity for SSBO rendering (position + color only, no velocity)
+// GPU entity for SSBO rendering (16 bytes, matches compute shader layout)
 pub const GpuEntity = extern struct {
    x: f32,
    y: f32,
+    packed_vel: i32, // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
    color: u32,
 };

+// encode a velocity as two signed 8.8 fixed-point halves of one i32:
+// vx lands in the high 16 bits, vy in the low 16 bits
+pub fn packVelocity(vx: f32, vy: f32) i32 {
+    const scale: f32 = 256.0; // 8 fractional bits
+    const hi: i16 = @intFromFloat(std.math.clamp(vx * scale, -32768.0, 32767.0));
+    const lo: i16 = @intFromFloat(std.math.clamp(vy * scale, -32768.0, 32767.0));
+    const hi_bits = @as(i32, hi) << 16;
+    const lo_bits = @as(i32, lo) & 0xFFFF; // mask off the sign extension
+    return hi_bits | lo_bits;
+}
+
 test "GpuEntity struct has correct size for SSBO" {
-    // SSBO layout: x(4) + y(4) + color(4) = 12 bytes
-    try std.testing.expectEqual(@as(usize, 12), @sizeOf(GpuEntity));
+    // SSBO layout: x(4) + y(4) + packed_vel(4) + color(4) = 16 bytes
+    try std.testing.expectEqual(@as(usize, 16), @sizeOf(GpuEntity));
 }

 test "GpuEntity can be created from Entity" {
    const entity = Entity{
        .x = 100.0,
        .y = 200.0,
-        .vx = 1.5, // ignored for GPU
-        .vy = -0.5, // ignored for GPU
+        .vx = 1.5,
+        .vy = -0.5,
        .color = 0x00FFFF,
    };

    const gpu_entity = GpuEntity{
        .x = entity.x,
        .y = entity.y,
+        .packed_vel = packVelocity(entity.vx, entity.vy),
        .color = entity.color,
    };

    try std.testing.expectEqual(@as(f32, 100.0), gpu_entity.x);
    try std.testing.expectEqual(@as(f32, 200.0), gpu_entity.y);
    try std.testing.expectEqual(@as(u32, 0x00FFFF), gpu_entity.color);
+
+    // unpack and verify velocity (should round-trip within precision)
+    const vx_unpacked = @as(f32, @floatFromInt(@as(i16, @truncate(gpu_entity.packed_vel >> 16)))) / 256.0;
+    const vy_unpacked = @as(f32, @floatFromInt(@as(i16, @truncate(gpu_entity.packed_vel)))) / 256.0;
+    try std.testing.expectApproxEqAbs(@as(f32, 1.5), vx_unpacked, 0.004);
+    try std.testing.expectApproxEqAbs(@as(f32, -0.5), vy_unpacked, 0.004);
+}
+
+test "packVelocity round-trips correctly" {
+    // (vx, vy) inputs: one all-positive pair, one all-negative pair
+    const cases = [_][2]f32{
+        .{ 2.0, 1.5 },
+        .{ -1.0, -2.5 },
+    };
+
+    for (cases) |c| {
+        const word = packVelocity(c[0], c[1]);
+        // high half is vx, low half is vy; each is a signed 8.8 value
+        const vx = @as(f32, @floatFromInt(@as(i16, @truncate(word >> 16)))) / 256.0;
+        const vy = @as(f32, @floatFromInt(@as(i16, @truncate(word)))) / 256.0;
+        try std.testing.expectApproxEqAbs(c[0], vx, 0.004);
+        try std.testing.expectApproxEqAbs(c[1], vy, 0.004);
+    }
+
+    // zero velocity must encode to an all-zero word
+    try std.testing.expectEqual(@as(i32, 0), packVelocity(0.0, 0.0));
 }
--- a/src/sandbox_main.zig
+++ b/src/sandbox_main.zig
@ -3,9 +3,11 @@

 const std = @import("std");
 const rl = @import("raylib");
+const ztracy = @import("ztracy");
 const sandbox = @import("sandbox.zig");
 const ui = @import("ui.zig");
 const SsboRenderer = @import("ssbo_renderer.zig").SsboRenderer;
+const ComputeShader = @import("compute.zig").ComputeShader;

 const SCREEN_WIDTH = sandbox.SCREEN_WIDTH;
 const SCREEN_HEIGHT = sandbox.SCREEN_HEIGHT;
@ -20,7 +22,7 @@ const TEXTURE_SIZE: i32 = 16; // must be >= 2 * radius
 const MESH_SIZE: f32 = @floatFromInt(TEXTURE_SIZE); // match texture size

 // logging thresholds
-const TARGET_FRAME_MS: f32 = 16.7; // 60fps
+const TARGET_FRAME_MS: f32 = 8.33; // 120fps
 const THRESHOLD_MARGIN: f32 = 2.0; // hysteresis margin to avoid bounce
 const JUMP_THRESHOLD_MS: f32 = 5.0; // log if frame time jumps by this much
 const HEARTBEAT_INTERVAL: f32 = 10.0; // seconds between periodic logs
@ -31,6 +33,11 @@ const BENCH_RAMP_AMOUNT: usize = 50_000; // entities added per ramp
 const BENCH_EXIT_THRESHOLD_MS: f32 = 25.0; // exit when frame time exceeds this
 const BENCH_EXIT_SUSTAIN: f32 = 1.0; // must stay above threshold for this long

+// zoom settings
+const ZOOM_MIN: f32 = 1.0;
+const ZOOM_MAX: f32 = 10.0;
+const ZOOM_SPEED: f32 = 0.1; // multiplier per scroll tick
+
 const BenchmarkLogger = struct {
    file: ?std.fs.File,
    last_logged_frame_ms: f32,
@ -156,6 +163,8 @@ pub fn main() !void {
    var bench_mode = false;
    var use_instancing = false;
    var use_ssbo = true;
+    var use_vsync = false;
+    var use_compute = true; // GPU compute is now default
    var args = try std.process.argsWithAllocator(std.heap.page_allocator);
    defer args.deinit();
    _ = args.skip(); // skip program name
@ -167,12 +176,23 @@ pub fn main() !void {
            use_ssbo = false; // legacy GPU instancing path
        } else if (std.mem.eql(u8, arg, "--legacy")) {
            use_ssbo = false; // legacy rlgl batched path
+        } else if (std.mem.eql(u8, arg, "--vsync")) {
+            use_vsync = true;
+        } else if (std.mem.eql(u8, arg, "--cpu")) {
+            use_compute = false; // fallback to CPU update path
        }
    }

+    if (use_vsync) {
+        rl.setConfigFlags(.{ .vsync_hint = true });
+    }
    rl.initWindow(@intCast(SCREEN_WIDTH), @intCast(SCREEN_HEIGHT), "lofivor sandbox");
    defer rl.closeWindow();
-    rl.setTargetFPS(60);
+
+    // show background immediately (avoid black screen during init)
+    rl.beginDrawing();
+    rl.clearBackground(BG_COLOR);
+    rl.endDrawing();

    // use larger batch buffer: 16384 elements vs default 8192
    // fewer flushes = less driver overhead per frame
@ -241,6 +261,26 @@ pub fn main() !void {
        if (ssbo_renderer) |*r| r.deinit();
    }

+    // compute shader setup (on by default; --cpu disables it, and it needs SSBO mode)
+    var compute_shader: ?ComputeShader = null;
+
+    if (use_compute) {
+        if (!use_ssbo) {
+            std.debug.print("--compute requires SSBO mode (default), ignoring\n", .{});
+        } else {
+            compute_shader = ComputeShader.init();
+            if (compute_shader == null) {
+                std.debug.print("failed to initialize compute shader, falling back to CPU\n", .{});
+            } else {
+                std.debug.print("compute shader mode enabled\n", .{});
+            }
+        }
+    }
+
+    defer {
+        if (compute_shader) |*c| c.deinit();
+    }
+
    // load UI font (embedded)
    const font_data = @embedFile("verdanab.ttf");
    const ui_font = rl.loadFontFromMemory(".ttf", font_data, 32, null) catch {
@ -254,6 +294,11 @@ pub fn main() !void {
    var rng = prng.random();

    var paused = false;
+
+    // camera state for zoom/pan
+    var zoom: f32 = 1.0;
+    var pan = @Vector(2, f32){ 0, 0 };
+
    var logger = BenchmarkLogger.init();
    defer logger.deinit();

@ -261,6 +306,7 @@ pub fn main() !void {
    var update_time_us: i64 = 0;
    var render_time_us: i64 = 0;
    var elapsed: f32 = 0;
+    var frame_number: u32 = 0;

    // auto-benchmark state
    var last_ramp_time: f32 = 0;
@ -306,24 +352,47 @@ pub fn main() !void {
        } else {
            // manual controls
            handleInput(&entities, &rng, &paused);
+            if (handleCamera(&zoom, &pan)) break;
        }

        // update
        if (!paused) {
+            const tracy_update = ztracy.ZoneN(@src(), "update");
+            defer tracy_update.End();
            const update_start = std.time.microTimestamp();
+
+            if (compute_shader == null) {
+                // CPU update path (positions + respawn)
                sandbox.update(&entities, &rng);
+            }
+            // GPU compute path handles update in render section before draw
+
            update_time_us = std.time.microTimestamp() - update_start;
        }

        // render
+        const tracy_render = ztracy.ZoneN(@src(), "render");
+        defer tracy_render.End();
        const render_start = std.time.microTimestamp();

        rl.beginDrawing();
        rl.clearBackground(BG_COLOR);

        if (use_ssbo) {
-            // SSBO instanced rendering path (12 bytes per entity)
-            ssbo_renderer.?.render(&entities);
+            // dispatch compute shader before render (if enabled)
+            if (compute_shader) |*cs| {
+                if (!paused) {
+                    const tracy_compute = ztracy.ZoneN(@src(), "compute_dispatch");
+                    defer tracy_compute.End();
+                    cs.dispatch(ssbo_renderer.?.ssbo_id, @intCast(entities.count), frame_number);
+                    frame_number +%= 1;
+                }
+                // GPU compute mode - only upload new entities, positions updated on GPU
+                ssbo_renderer.?.renderComputeMode(&entities, zoom, pan);
+            } else {
+                // CPU mode - upload entity data to GPU
+                ssbo_renderer.?.render(&entities, zoom, pan);
+            }
        } else if (use_instancing) {
            // GPU instancing path (64 bytes per entity)
            const xforms = transforms.?;
@ -374,7 +443,8 @@ pub fn main() !void {

        // metrics overlay (skip in bench mode for cleaner headless run)
        if (!bench_mode) {
-            ui.drawMetrics(&entities, update_time_us, render_time_us, paused, ui_font);
+            ui.drawMetrics(&entities, update_time_us, render_time_us, paused, zoom, ui_font);
+            ui.drawMemory(entities.count, ui_font);
        }

        rl.endDrawing();
@ -385,6 +455,9 @@ pub fn main() !void {
        const update_ms = @as(f32, @floatFromInt(update_time_us)) / 1000.0;
        const render_ms = @as(f32, @floatFromInt(render_time_us)) / 1000.0;
        logger.log(elapsed, entities.count, frame_ms, update_ms, render_ms);
+
+        // tracy frame mark
+        ztracy.FrameMark();
    }
 }

@ -397,7 +470,7 @@ var sub_timer: f32 = 0;
 fn handleInput(entities: *sandbox.Entities, rng: *std.Random, paused: *bool) void {
    const dt = rl.getFrameTime();
    const shift = rl.isKeyDown(.left_shift) or rl.isKeyDown(.right_shift);
-    const add_count: usize = if (shift) 10000 else 1000;
+    const add_count: usize = if (shift) 50_000 else 10_000;

    const add_held = rl.isKeyDown(.equal) or rl.isKeyDown(.kp_add);
    const sub_held = rl.isKeyDown(.minus) or rl.isKeyDown(.kp_subtract);
@ -439,4 +512,86 @@ fn handleInput(entities: *sandbox.Entities, rng: *std.Random, paused: *bool) voi
    if (rl.isKeyPressed(.space)) {
        paused.* = !paused.*;
    }
+
+    // toggle ui: tab
+    if (rl.isKeyPressed(.tab)) {
+        ui.show_ui = !ui.show_ui;
+    }
+}
+
+// Applies one frame of camera input to zoom/pan (both updated in place):
+// mouse wheel zooms around the cursor, dragging with any mouse button pans
+// while zoomed in, Enter resets the view, and q resets the view when zoomed
+// in. Returns true only when q is pressed while not zoomed in (quit request).
+fn handleCamera(zoom: *f32, pan: *@Vector(2, f32)) bool {
+    const no_pan = @Vector(2, f32){ 0, 0 };
+
+    const scroll = rl.getMouseWheelMove();
+    if (scroll != 0) {
+        const cursor = rl.getMousePosition();
+        const prev_zoom = zoom.*;
+
+        // one scroll event multiplies or divides zoom by (1 + ZOOM_SPEED)
+        const ratio = if (scroll > 0) (1.0 + ZOOM_SPEED) else (1.0 / (1.0 + ZOOM_SPEED));
+        const next_zoom = std.math.clamp(prev_zoom * ratio, ZOOM_MIN, ZOOM_MAX);
+
+        if (next_zoom != prev_zoom) {
+            // point under the cursor before the change: cursor / zoom + pan.
+            // pick the new pan so that same point is still under the cursor.
+            const anchor_x = (cursor.x / prev_zoom) + pan.*[0];
+            const anchor_y = (cursor.y / prev_zoom) + pan.*[1];
+            pan.*[0] = anchor_x - (cursor.x / next_zoom);
+            pan.*[1] = anchor_y - (cursor.y / next_zoom);
+            zoom.* = next_zoom;
+
+            clampPan(pan, next_zoom);
+        }
+    }
+
+    // dragging only pans while zoomed in; any of the three buttons works
+    if (zoom.* > 1.0 and
+        (rl.isMouseButtonDown(.left) or
+            rl.isMouseButtonDown(.right) or
+            rl.isMouseButtonDown(.middle)))
+    {
+        const drag = rl.getMouseDelta();
+        // mouse movement is divided by zoom; x is subtracted, y is added
+        pan.*[0] -= drag.x / zoom.*;
+        pan.*[1] += drag.y / zoom.*;
+        clampPan(pan, zoom.*);
+    }
+
+    // Enter (main or keypad) restores the default view
+    if (rl.isKeyPressed(.enter) or rl.isKeyPressed(.kp_enter)) {
+        zoom.* = 1.0;
+        pan.* = no_pan;
+    }
+
+    // q: first press while zoomed in resets the view, otherwise asks to quit
+    if (!rl.isKeyPressed(.q)) return false;
+    if (!(zoom.* > 1.0)) return true;
+
+    zoom.* = 1.0;
+    pan.* = no_pan;
+    return false;
+}
+
+// Keeps the zoomed viewport inside the simulation area.
+// At a given zoom the viewport spans (screen size / zoom), so each pan
+// component is limited to [0, screen size - screen size / zoom].
+fn clampPan(pan: *@Vector(2, f32), zoom: f32) void {
+    const world_w: f32 = @floatFromInt(SCREEN_WIDTH);
+    const world_h: f32 = @floatFromInt(SCREEN_HEIGHT);
+
+    // the upper limit never goes below zero, even if zoom is under 1
+    const limit_x = @max(0, world_w - (world_w / zoom));
+    const limit_y = @max(0, world_h - (world_h / zoom));
+
+    pan.*[0] = std.math.clamp(pan.*[0], 0, limit_x);
+    pan.*[1] = std.math.clamp(pan.*[1], 0, limit_y);
 }
--- a/src/shaders/entity.vert
+++ b/src/shaders/entity.vert
@ -4,10 +4,11 @@
 layout(location = 0) in vec2 position;
 layout(location = 1) in vec2 texCoord;

-// entity data from SSBO
+// entity data from SSBO (16 bytes, matches compute shader layout)
 struct Entity {
    float x;
    float y;
+    int packedVel;  // vx high 16 bits, vy low 16 bits (fixed-point 8.8), unused in vertex shader
    uint color;
 };

@ -17,6 +18,8 @@ layout(std430, binding = 0) readonly buffer EntityData {

 // screen size for NDC conversion
 uniform vec2 screenSize;
+uniform float zoom;
+uniform vec2 pan;

 out vec2 fragTexCoord;
 out vec3 fragColor;
@ -25,13 +28,13 @@ void main() {
    // get entity data from SSBO
    Entity e = entities[gl_InstanceID];

-    // convert entity position to NDC
-    // entity coords are in screen pixels, convert to [-1, 1]
-    float ndcX = (e.x / screenSize.x) * 2.0 - 1.0;
-    float ndcY = (e.y / screenSize.y) * 2.0 - 1.0;
+    // apply pan offset and zoom to convert to NDC
+    // pan is in screen pixels, zoom scales the view
+    float ndcX = ((e.x - pan.x) * zoom / screenSize.x) * 2.0 - 1.0;
+    float ndcY = ((e.y - pan.y) * zoom / screenSize.y) * 2.0 - 1.0;

-    // quad size in NDC (16 pixels)
-    float quadSizeNdc = 16.0 / screenSize.x;
+    // quad size in NDC (16 pixels, scaled by zoom)
+    float quadSizeNdc = (16.0 * zoom) / screenSize.x;

    // offset by quad corner position
    gl_Position = vec4(ndcX + position.x * quadSizeNdc,
--- a/src/shaders/entity_update.comp
+++ b/src/shaders/entity_update.comp
@ -0,0 +1,97 @@
+#version 430
+
+layout(local_size_x = 256) in;
+
+// one simulated entity: four 4-byte fields
+struct Entity {
+    float x;
+    float y;
+    int packedVel;  // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
+    uint color;     // written by main() as (r << 16) | (g << 8) | b
+};
+
+// entity storage, read and written in place by main()
+layout(std430, binding = 0) buffer Entities {
+    Entity entities[];
+};
+
+uniform uint entityCount;     // invocations with id >= entityCount exit early
+uniform uint frameNumber;     // mixed into the per-entity RNG seed on respawn
+uniform vec2 screenSize;      // extent of the edges entities respawn on
+uniform vec2 center;          // point entities respawn around and head toward
+uniform float respawnRadius;  // distance from center that triggers a respawn
+uniform float entitySpeed;    // length of the velocity assigned on respawn
+
+// PCG-style GPU RNG - advances state and returns 32 pseudo-random bits
+uint pcg(inout uint state) {
+    state = state * 747796405u + 2891336453u;
+    uint word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
+    return (word >> 22u) ^ word;
+}
+
+// uniform float in [0, 1). only the top 24 bits are used so the uint -> float
+// conversion is exact; converting all 32 bits can round up to 2^32 and make
+// the quotient exactly 1.0
+float randFloat(inout uint state) {
+    return float(pcg(state) >> 8u) / 16777216.0;
+}
+
+// encode (vx, vy) as signed 8.8 fixed-point: vx in the high 16 bits, vy in the low 16
+int packVelocity(float vx, float vy) {
+    const float FIXED_SCALE = 256.0;  // 8 fractional bits
+    int hi = int(clamp(vx * FIXED_SCALE, -32768.0, 32767.0));
+    int lo = int(clamp(vy * FIXED_SCALE, -32768.0, 32767.0));
+    return (hi << 16) | (lo & 0xFFFF);
+}
+
+// one invocation per entity: move it by its velocity, and if it has entered
+// the respawn circle around center, restart it on a random screen edge with
+// a fresh velocity toward center and a fresh random color
+void main() {
+    uint idx = gl_GlobalInvocationID.x;
+    if (idx >= entityCount) return;
+
+    Entity ent = entities[idx];
+
+    // decode the 8.8 fixed-point velocity; the shift pair sign-extends the low half
+    int vxFixed = ent.packedVel >> 16;
+    int vyFixed = (ent.packedVel << 16) >> 16;
+
+    // advance position by one step
+    ent.x += float(vxFixed) / 256.0;
+    ent.y += float(vyFixed) / 256.0;
+
+    // squared-distance test against the respawn circle
+    float offX = ent.x - center.x;
+    float offY = ent.y - center.y;
+    if (offX*offX + offY*offY < respawnRadius * respawnRadius) {
+        // seed the RNG from entity id and frame number
+        uint seed = idx * 1103515245u + frameNumber * 12345u + 1u;
+
+        // first draw picks the edge, second draw picks the position along it
+        uint side = pcg(seed) & 3u;
+        float along = randFloat(seed);
+
+        switch (side) {
+            case 0u:  // top
+                ent.x = along * screenSize.x;
+                ent.y = 0.0;
+                break;
+            case 1u:  // bottom
+                ent.x = along * screenSize.x;
+                ent.y = screenSize.y;
+                break;
+            case 2u:  // left
+                ent.x = 0.0;
+                ent.y = along * screenSize.y;
+                break;
+            default:  // right
+                ent.x = screenSize.x;
+                ent.y = along * screenSize.y;
+                break;
+        }
+
+        // aim at center with length entitySpeed
+        float toCx = center.x - ent.x;
+        float toCy = center.y - ent.y;
+        float len = sqrt(toCx*toCx + toCy*toCy);
+        ent.packedVel = packVelocity((toCx / len) * entitySpeed, (toCy / len) * entitySpeed);
+
+        // three further draws, in r, g, b order, give the new color
+        uint red = pcg(seed) & 0xFFu;
+        uint green = pcg(seed) & 0xFFu;
+        uint blue = pcg(seed) & 0xFFu;
+        ent.color = (red << 16u) | (green << 8u) | blue;
+    }
+
+    entities[idx] = ent;
+}
--- a/src/ssbo_renderer.zig
+++ b/src/ssbo_renderer.zig
@ -3,6 +3,7 @@

 const std = @import("std");
 const rl = @import("raylib");
+const ztracy = @import("ztracy");
 const sandbox = @import("sandbox.zig");

 const SCREEN_WIDTH = sandbox.SCREEN_WIDTH;
@ -19,8 +20,11 @@ pub const SsboRenderer = struct {
    ssbo_id: u32,
    screen_size_loc: i32,
    circle_texture_loc: i32,
+    zoom_loc: i32,
+    pan_loc: i32,
    circle_texture_id: u32,
    gpu_buffer: []sandbox.GpuEntity,
+    last_entity_count: usize, // track count to detect when entities are added

    const QUAD_SIZE: f32 = 16.0;

@ -53,6 +57,8 @@ pub const SsboRenderer = struct {
        // get uniform locations
        const screen_size_loc = rl.gl.rlGetLocationUniform(shader_id, "screenSize");
        const circle_texture_loc = rl.gl.rlGetLocationUniform(shader_id, "circleTexture");
+        const zoom_loc = rl.gl.rlGetLocationUniform(shader_id, "zoom");
+        const pan_loc = rl.gl.rlGetLocationUniform(shader_id, "pan");

        if (screen_size_loc < 0) {
            std.debug.print("ssbo: warning - screenSize uniform not found\n", .{});
@ -94,7 +100,7 @@ pub const SsboRenderer = struct {
        rl.gl.rlSetVertexAttribute(1, 2, rl.gl.rl_float, false, 4 * @sizeOf(f32), 2 * @sizeOf(f32));
        rl.gl.rlEnableVertexAttribute(1);

-        // create SSBO for entity data (12 bytes per entity, 1M entities = 12MB)
+        // create SSBO for entity data (16 bytes per entity; sized for MAX_ENTITIES, 10M entities = 160MB)
        const ssbo_size: u32 = @intCast(sandbox.MAX_ENTITIES * @sizeOf(sandbox.GpuEntity));
        const ssbo_id = rl.gl.rlLoadShaderBuffer(ssbo_size, null, rl.gl.rl_dynamic_draw);
        if (ssbo_id == 0) {
@ -116,8 +122,11 @@ pub const SsboRenderer = struct {
            .ssbo_id = ssbo_id,
            .screen_size_loc = screen_size_loc,
            .circle_texture_loc = circle_texture_loc,
+            .zoom_loc = zoom_loc,
+            .pan_loc = pan_loc,
            .circle_texture_id = circle_texture.id,
            .gpu_buffer = gpu_buffer,
+            .last_entity_count = 0,
        };
    }

@ -129,25 +138,80 @@ pub const SsboRenderer = struct {
        std.heap.page_allocator.free(self.gpu_buffer);
    }

-    pub fn render(self: *SsboRenderer, entities: *const sandbox.Entities) void {
+    /// Draws every entity after re-uploading all of them.
+    ///
+    /// Forwards to `renderInternal` with `skip_upload = false`, so the first
+    /// `entities.count` items are packed into `gpu_buffer` and sent to the SSBO
+    /// on each call. `zoom` and `pan` are passed through and end up in the
+    /// shader uniforms of the same names.
+    pub fn render(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32)) void {
+        self.renderInternal(entities, zoom, pan, false);
+    }
+
+    /// Draws all entities, uploading only those appended since the last call.
+    ///
+    /// Entities at indices `last_entity_count..entities.count` are packed into
+    /// `gpu_buffer` and written to the SSBO at the matching byte offset; SSBO
+    /// data for lower indices is left as it is.
+    /// NOTE(review): presumably a compute pass owns that existing data on the
+    /// GPU - confirm against the caller.
+    ///
+    /// When the count shrinks, only `last_entity_count` is lowered, so the next
+    /// growth uploads from the new, smaller count. When the count is zero,
+    /// `last_entity_count` is reset to zero and nothing is drawn.
+    pub fn renderComputeMode(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32)) void {
-        if (entities.count == 0) return;
+        if (entities.count == 0) {
+            // forget what was uploaded: without this, entities added after the
+            // list was emptied could compare below the stale count, be treated
+            // as a removal, and never reach the SSBO
+            self.last_entity_count = 0;
+            return;
+        }

        // flush raylib's internal render batch before our custom GL calls
        rl.gl.rlDrawRenderBatchActive();

-        // copy entity data to GPU buffer (position + color only)
-        for (entities.items[0..entities.count], 0..) |entity, i| {
+        // upload NEW entities when count increases (entities added on CPU)
+        if (entities.count > self.last_entity_count) {
+            const zone = ztracy.ZoneN(@src(), "ssbo_upload_new");
+            defer zone.End();
+
+            // copy new entities to GPU buffer
+            for (entities.items[self.last_entity_count..entities.count], self.last_entity_count..) |entity, i| {
                self.gpu_buffer[i] = .{
                    .x = entity.x,
                    .y = entity.y,
+                    .packed_vel = sandbox.packVelocity(entity.vx, entity.vy),
                    .color = entity.color,
                };
            }

+            // upload only the new portion to SSBO
+            const offset: u32 = @intCast(self.last_entity_count * @sizeOf(sandbox.GpuEntity));
+            const new_count = entities.count - self.last_entity_count;
+            const data_size: u32 = @intCast(new_count * @sizeOf(sandbox.GpuEntity));
+            rl.gl.rlUpdateShaderBuffer(self.ssbo_id, &self.gpu_buffer[self.last_entity_count], data_size, offset);
+
+            self.last_entity_count = entities.count;
+        } else if (entities.count < self.last_entity_count) {
+            // entities were removed, update count
+            self.last_entity_count = entities.count;
+        }
+
+        self.drawInstanced(entities.count, zoom, pan);
+    }
+
+    /// Shared implementation behind `render`.
+    ///
+    /// Unless `skip_upload` is true, packs the first `entities.count` entities
+    /// into `gpu_buffer` and uploads that whole range to the SSBO from offset 0,
+    /// then issues the instanced draw. With `skip_upload` set, only the draw
+    /// runs and the SSBO contents are used as they are.
+    /// Returns without drawing when there are no entities. Does not read or
+    /// update `last_entity_count`.
+    fn renderInternal(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32), skip_upload: bool) void {
+        if (entities.count == 0) return;
+
+        // flush raylib's internal render batch before our custom GL calls
+        rl.gl.rlDrawRenderBatchActive();
+
+        if (!skip_upload) {
+            // copy entity data to GPU buffer (position + packed velocity + color)
+            // (bare block scopes the zone so `defer zone.End()` fires right after the copy)
+            {
+                const zone = ztracy.ZoneN(@src(), "ssbo_copy");
+                defer zone.End();
+                for (entities.items[0..entities.count], 0..) |entity, i| {
+                    self.gpu_buffer[i] = .{
+                        .x = entity.x,
+                        .y = entity.y,
+                        .packed_vel = sandbox.packVelocity(entity.vx, entity.vy),
+                        .color = entity.color,
+                    };
+                }
+            }
+
            // upload to SSBO
+            // (separate zone, so the copy and the upload are reported individually)
+            {
+                const zone = ztracy.ZoneN(@src(), "ssbo_upload");
+                defer zone.End();
                const data_size: u32 = @intCast(entities.count * @sizeOf(sandbox.GpuEntity));
                rl.gl.rlUpdateShaderBuffer(self.ssbo_id, self.gpu_buffer.ptr, data_size, 0);
+            }
+        }

+        self.drawInstanced(entities.count, zoom, pan);
+    }
+
+    fn drawInstanced(self: *SsboRenderer, entity_count: usize, zoom: f32, pan: @Vector(2, f32)) void {
        // bind shader
        rl.gl.rlEnableShader(self.shader_id);

@ -155,6 +219,13 @@ pub const SsboRenderer = struct {
        const screen_size = [2]f32{ @floatFromInt(SCREEN_WIDTH), @floatFromInt(SCREEN_HEIGHT) };
        rl.gl.rlSetUniform(self.screen_size_loc, &screen_size, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);

+        // set zoom uniform
+        rl.gl.rlSetUniform(self.zoom_loc, &zoom, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
+
+        // set pan uniform
+        const pan_arr = [2]f32{ pan[0], pan[1] };
+        rl.gl.rlSetUniform(self.pan_loc, &pan_arr, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
+
        // bind texture
        rl.gl.rlActiveTextureSlot(0);
        rl.gl.rlEnableTexture(self.circle_texture_id);
@ -170,9 +241,13 @@ pub const SsboRenderer = struct {
        rl.gl.rlSetBlendMode(@intFromEnum(rl.gl.rlBlendMode.rl_blend_alpha));

        // bind VAO and draw
+        {
+            const zone = ztracy.ZoneN(@src(), "ssbo_draw");
+            defer zone.End();
            _ = rl.gl.rlEnableVertexArray(self.vao_id);
            rl.gl.rlEnableVertexBuffer(self.vbo_id);
-        rl.gl.rlDrawVertexArrayInstanced(0, 6, @intCast(entities.count));
+            rl.gl.rlDrawVertexArrayInstanced(0, 6, @intCast(entity_count));
+        }

        // cleanup - restore raylib's expected state
        rl.gl.rlDisableVertexArray();
--- a/src/ui.zig
+++ b/src/ui.zig
@ -19,13 +19,23 @@ pub const box_padding: f32 = 8;
 pub const text_color = rl.Color.white;
 pub const dim_text_color = rl.Color.gray;
 pub const highlight_color = rl.Color.yellow;
+pub const fps_good_color = rl.Color.green;
+pub const fps_bad_color = rl.Color.red;
 pub const box_bg = rl.Color{ .r = 0, .g = 0, .b = 0, .a = 200 };

+// =============================================================================
+// state
+// =============================================================================
+
+pub var show_ui: bool = true;
+
 // =============================================================================
 // drawing functions
 // =============================================================================

-pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, font: rl.Font) void {
+pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, zoom: f32, font: rl.Font) void {
+    if (!show_ui) return;
+
    var buf: [256]u8 = undefined;

    // fps box (above metrics)
@ -33,13 +43,16 @@ pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us:
    rl.drawRectangle(5, 5, 180, fps_box_height, box_bg);
    const frame_ms = rl.getFrameTime() * 1000.0;
    const fps = if (frame_ms > 0) 1000.0 / frame_ms else 0;
-    const fps_text = std.fmt.bufPrintZ(&buf, "FPS: {d:.0}", .{fps}) catch "?";
-    rl.drawTextEx(font, fps_text, .{ .x = padding, .y = padding }, font_size, 0, text_color);
+    rl.drawTextEx(font, "FPS: ", .{ .x = padding, .y = padding }, font_size, 0, text_color);
+    const fps_text = std.fmt.bufPrintZ(&buf, "{d:.0}", .{fps}) catch "?";
+    const fps_color = if (fps >= 60.0) fps_good_color else fps_bad_color;
+    const label_width = rl.measureTextEx(font, "FPS: ", font_size, 0).x;
+    rl.drawTextEx(font, fps_text, .{ .x = padding + label_width, .y = padding }, font_size, 0, fps_color);

    // metrics box (below fps)
    const metrics_y: i32 = 5 + fps_box_height + 5;
    var y: f32 = @as(f32, @floatFromInt(metrics_y)) + box_padding;
-    const bg_height: i32 = if (paused) 130 else 100;
+    const bg_height: i32 = if (paused) 150 else 120;
    rl.drawRectangle(5, metrics_y, 180, bg_height, box_bg);

    // entity count
@ -64,6 +77,11 @@ pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us:
    rl.drawTextEx(font, render_text, .{ .x = padding, .y = y }, font_size, 0, text_color);
    y += line_height;

+    // zoom level
+    const zoom_text = std.fmt.bufPrintZ(&buf, "zoom:     {d:.1}x", .{zoom}) catch "?";
+    rl.drawTextEx(font, zoom_text, .{ .x = padding, .y = y }, font_size, 0, if (zoom > 1.0) highlight_color else text_color);
+    y += line_height;
+
    // paused indicator
    if (paused) {
        y += line_height;
@ -74,18 +92,56 @@ pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us:
    drawControls(font, metrics_y + bg_height);
 }

+/// Draws a three-line memory panel anchored to the top-right of the screen.
+///
+/// Shows the CPU-side size of `entity_count` entities, the GPU-side size of
+/// the same number of `GpuEntity` records, and their sum, each in MB with one
+/// decimal. The figures are computed from `entity_count * @sizeOf(...)`; they
+/// are not measured allocation sizes.
+/// Returns immediately when `show_ui` is false.
+pub fn drawMemory(entity_count: usize, font: rl.Font) void {
+    if (!show_ui) return;
+
+    // scratch buffer reused for every line; each string is drawn before the
+    // next bufPrintZ overwrites it
+    var buf: [256]u8 = undefined;
+
+    const box_width: i32 = 160;
+    const box_height: i32 = @intFromFloat(line_height * 3 + box_padding * 2);
+    // 5 px margin from the right and top screen edges
+    const box_x: i32 = @as(i32, @intCast(sandbox.SCREEN_WIDTH)) - box_width - 5;
+    const box_y: i32 = 5;
+
+    rl.drawRectangle(box_x, box_y, box_width, box_height, box_bg);
+
+    // text cursor: starts inside the box, advanced by line_height per row
+    var y: f32 = @as(f32, @floatFromInt(box_y)) + box_padding;
+    const x: f32 = @floatFromInt(box_x + @as(i32, @intFromFloat(box_padding)));
+
+    // entity memory (CPU side)
+    const entity_bytes = entity_count * @sizeOf(sandbox.Entity);
+    const entity_mb = @as(f32, @floatFromInt(entity_bytes)) / (1024.0 * 1024.0);
+    const entity_text = std.fmt.bufPrintZ(&buf, "cpu:  {d:.1} MB", .{entity_mb}) catch "?";
+    rl.drawTextEx(font, entity_text, .{ .x = x, .y = y }, font_size, 0, text_color);
+    y += line_height;
+
+    // GPU buffer memory (SSBO)
+    const gpu_bytes = entity_count * @sizeOf(sandbox.GpuEntity);
+    const gpu_mb = @as(f32, @floatFromInt(gpu_bytes)) / (1024.0 * 1024.0);
+    const gpu_text = std.fmt.bufPrintZ(&buf, "gpu:  {d:.1} MB", .{gpu_mb}) catch "?";
+    rl.drawTextEx(font, gpu_text, .{ .x = x, .y = y }, font_size, 0, text_color);
+    y += line_height;
+
+    // total
+    const total_mb = entity_mb + gpu_mb;
+    const total_text = std.fmt.bufPrintZ(&buf, "total: {d:.1} MB", .{total_mb}) catch "?";
+    rl.drawTextEx(font, total_text, .{ .x = x, .y = y }, font_size, 0, dim_text_color);
+}
+
 fn drawControls(font: rl.Font, metrics_bottom: i32) void {
-    const ctrl_box_height: i32 = @intFromFloat(small_line_height * 4 + box_padding * 2);
+    const ctrl_box_height: i32 = @intFromFloat(small_line_height * 7 + box_padding * 2);
    const ctrl_box_y: i32 = metrics_bottom + 5;
    rl.drawRectangle(5, ctrl_box_y, 175, ctrl_box_height, box_bg);

    var y: f32 = @as(f32, @floatFromInt(ctrl_box_y)) + box_padding;

    const controls = [_][]const u8{
-        "+/-: 1000 entities",
-        "shift +/-: 10000",
-        "space: pause",
-        "r: reset",
+        "+/-: 10k entities",
+        "shift +/-: 50k",
+        "scroll: zoom",
+        "drag: pan (zoomed)",
+        "space: pause, r: reset",
+        "q: zoom out / quit",
+        "tab: toggle ui",
    };

    for (controls) |text| {
Author	SHA1	Message	Date
Jared Miller	02fd358611	Modify forgejo release to target our own runner Some checks failed release / build (push) Failing after 1m19s Details	2025-12-20 12:47:54 -05:00
Jared Miller	5b890b18e4	Add glossary and rops doc	2025-12-19 07:50:32 -05:00
Jared Miller	55b0d7fab7	Add hd530 notes with point sprite experience	2025-12-17 21:22:02 -05:00
Jared Miller	a842800ede	Add release notes	2025-12-17 21:01:21 -05:00
Jared Miller	0568204cb7	Clean up todo and optimizations doc	2025-12-17 20:58:31 -05:00
Jared Miller	516b4af458	Mark off the computer shader todo	2025-12-17 14:13:51 -05:00
Jared Miller	6dcafc8f3c	Add doc "why rendering millions of entities is hard"	2025-12-17 14:02:41 -05:00
Jared Miller	9f3495b882	Add alignCast for cross platform strictness Some checks failed release / build (release) Has been cancelled Details	2025-12-17 11:56:06 -05:00
Jared Miller	90bb30b6c6	Cleanup compute shader implementation	2025-12-17 11:48:45 -05:00
Jared Miller	9e8226de32	Add GPU RNG to computer shader	2025-12-17 10:02:09 -05:00
Jared Miller	62d010bdc0	Add computer shader infrastructure	2025-12-17 09:50:35 -05:00
Jared Miller	45c37bfcd2	Add shader plans	2025-12-17 09:50:22 -05:00
Jared Miller	5fd82000cf	Add GpuEntity struct expansion	2025-12-17 09:43:11 -05:00
Jared Miller	c30b9c0ed0	Add a doc on hysteria	2025-12-16 17:37:51 -05:00
Jared Miller	ebe28e5669	Add tracy profiling	2025-12-16 17:27:55 -05:00
Jared Miller	7b43b5726e	Add zoom plans	2025-12-16 14:35:24 -05:00
Jared Miller	d0dcb701f8	Color FPS green or red	2025-12-16 13:58:56 -05:00
Jared Miller	e1d5dc136e	Bind q to zoom-out and quit	2025-12-16 13:03:52 -05:00
Jared Miller	3e2e39100a	Add zoom and panning via mouse	2025-12-16 12:56:54 -05:00
Jared Miller	26383ed79e	Add memory panel	2025-12-16 11:28:02 -05:00
Jared Miller	1782bc8db7	Draw background right away	2025-12-16 11:16:38 -05:00
Jared Miller	3f9e33feaf	Add panel toggling via tab	2025-12-16 11:14:34 -05:00
Jared Miller	d8bc9ac927	Explain lofivor name in README	2025-12-16 10:50:35 -05:00
Jared Miller	9e48f2cc2d	Add windows build to release	2025-12-16 10:31:25 -05:00
Jared Miller	123322494e	Remove FPS cap with optional vsync arg	2025-12-16 10:31:25 -05:00
Jared Miller	2deaa66a78	Set entity commands to 10k and 50k increments	2025-12-16 10:31:25 -05:00