From 45c37bfcd29b74dc2081860d2fa326aa552d6ae7 Mon Sep 17 00:00:00 2001
From: Jared Miller <jared@smell.flowers>
Date: Wed, 17 Dec 2025 09:50:22 -0500
Subject: [PATCH] Add shader plans

---
 .../2025-12-17-compute-shader-updates.md      | 170 ++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100644 docs/plans/2025-12-17-compute-shader-updates.md

diff --git a/docs/plans/2025-12-17-compute-shader-updates.md b/docs/plans/2025-12-17-compute-shader-updates.md
new file mode 100644
index 0000000..d1aa937
--- /dev/null
+++ b/docs/plans/2025-12-17-compute-shader-updates.md
@@ -0,0 +1,170 @@
+# compute shader entity updates
+
+move entity position math to GPU, eliminate CPU→GPU sync per frame.
+
+## context
+
+current bottleneck: per-frame `rlUpdateShaderBuffer()` uploads all entity data from CPU to GPU. at 950k entities that's 19MB/frame. targeting 10M entities would be 160MB/frame.
+
+solution: keep entity data on GPU entirely. compute shader updates positions, vertex shader renders. CPU just dispatches.
+
+## data structures
+
+**GpuEntity (16 bytes, std430):**
+```glsl
+struct Entity {
+    float x;        // world position
+    float y;
+    int packedVel;  // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
+    uint color;     // 0xRRGGBB
+};
+```
+
+**zig side:**
+```zig
+const GpuEntity = extern struct {
+    x: f32,
+    y: f32,
+    packed_vel: i32,
+    color: u32,
+};
+
+fn packVelocity(vx: f32, vy: f32) i32 {
+    const vx_fixed: i16 = @intFromFloat(vx * 256.0);
+    const vy_fixed: i16 = @intFromFloat(vy * 256.0);
+    return (@as(i32, vx_fixed) << 16) | (@as(i32, vy_fixed) & 0xFFFF);
+}
+```
+
+## compute shader
+
+`src/shaders/entity_update.comp`:
+```glsl
+#version 430
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer Entities {
+    Entity entities[];
+};
+
+uniform uint entityCount;
+uniform uint frameNumber;
+uniform vec2 screenSize;
+uniform vec2 center;
+uniform float respawnRadius;
+
+void main() {
+    uint id = gl_GlobalInvocationID.x;
+    if (id >= entityCount) return;
+
+    Entity e = entities[id];
+
+    // unpack velocity
+    float vx = float(e.packedVel >> 16) / 256.0;
+    float vy = float((e.packedVel << 16) >> 16) / 256.0;
+
+    // update position
+    e.x += vx;
+    e.y += vy;
+
+    // respawn check
+    float dx = e.x - center.x;
+    float dy = e.y - center.y;
+    if (dx*dx + dy*dy < respawnRadius * respawnRadius) {
+        // GPU RNG
+        uint seed = id * 1103515245u + frameNumber * 12345u;
+        seed = seed * 747796405u + 2891336453u;
+
+        uint edge = seed & 3u;
+        float t = float((seed >> 2) & 0xFFFFu) / 65535.0;
+
+        // spawn on edge with velocity toward center
+        // (full edge logic in implementation)
+    }
+
+    entities[id] = e;
+}
+```
+
+## integration
+
+raylib doesn't wrap compute shaders. use raw GL calls via `compute.zig`:
+
+```zig
+pub fn dispatch(entity_count: u32, frame: u32) void {
+    gl.glUseProgram(program);
+    gl.glUniform1ui(entity_count_loc, entity_count);
+    gl.glUniform1ui(frame_loc, frame);
+    // ... other uniforms
+
+    const groups = (entity_count + 255) / 256;
+    gl.glDispatchCompute(groups, 1, 1);
+    gl.glMemoryBarrier(gl.GL_SHADER_STORAGE_BARRIER_BIT);
+}
+```
+
+## frame flow
+
+**before:**
+```
+CPU: update positions (5ms at 950k)
+CPU: copy to gpu_buffer
+CPU→GPU: rlUpdateShaderBuffer() ← bottleneck
+GPU: render
+```
+
+**after:**
+```
+GPU: compute dispatch (~0ms CPU time)
+GPU: memory barrier
+GPU: render
+```
+
+## implementation steps
+
+each step is a commit point if desired.
+
+### step 1: GpuEntity struct expansion
+- modify `GpuEntity` in sandbox.zig: add `packed_vel` field
+- add `packVelocity()` helper
+- update ssbo_renderer to handle 16-byte stride
+- verify existing rendering still works
+
+### step 2: compute shader infrastructure
+- create `src/compute.zig` with GL bindings
+- create `src/shaders/entity_update.comp` (position update only, no respawn yet)
+- load and compile compute shader in sandbox_main.zig
+- dispatch before render, verify positions update
+
+### step 3: respawn logic
+- add GPU RNG to compute shader
+- implement edge spawning + velocity calculation
+- remove CPU update loop from sandbox.zig
+
+### step 4: cleanup
+- remove dead code (cpu update, per-frame upload)
+- add `--compute` flag to toggle (keep old path for comparison)
+- benchmark and document results
+
+## files changed
+
+**new:**
+- `src/shaders/entity_update.comp`
+- `src/compute.zig`
+
+**modified:**
+- `src/sandbox.zig` — GpuEntity struct, packVelocity(), remove CPU update
+- `src/ssbo_renderer.zig` — remove per-frame upload
+- `src/sandbox_main.zig` — init compute, dispatch in frame loop
+
+## risks
+
+1. **driver quirks** — intel HD 530 compute support is fine but older, may hit edge cases
+2. **debugging** — GPU code harder to debug, start with small counts
+3. **fallback** — keep `--compute` flag to A/B test against existing SSBO path
+
+## expected results
+
+- CPU update time: ~5ms → ~0ms
+- no per-frame buffer upload
+- target: 1M+ entities, pushing toward 10M ceiling