diff --git a/docs/plans/2025-12-17-compute-shader-updates.md b/docs/plans/2025-12-17-compute-shader-updates.md index d1aa937..ec0adbe 100644 --- a/docs/plans/2025-12-17-compute-shader-updates.md +++ b/docs/plans/2025-12-17-compute-shader-updates.md @@ -141,10 +141,10 @@ each step is a commit point if desired. - implement edge spawning + velocity calculation - remove CPU update loop from sandbox.zig -### step 4: cleanup -- remove dead code (cpu update, per-frame upload) -- add `--compute` flag to toggle (keep old path for comparison) -- benchmark and document results +### step 4: cleanup ✓ +- GPU compute path is now the default (the `--compute` flag was removed); `--cpu` flag for fallback/comparison +- justfile updated: `just bench` (compute), `just bench-cpu` (comparison) +- verbose debug output reduced ## files changed diff --git a/journal.txt b/journal.txt index 34c1234..98a61e3 100644 --- a/journal.txt +++ b/journal.txt @@ -206,3 +206,38 @@ total improvement from baseline: - SSBO: 60fps @ ~700k entities - ~140x improvement overall! 
+--- + +optimization 6: compute shader updates +-------------------------------------- +technique: move entity position + respawn logic from CPU to GPU compute shader +code: compute.zig, shaders/entity_update.comp, ssbo_renderer.zig +version: 0.7.0 + +struct GpuEntity { + x: f32, // 4 bytes + y: f32, // 4 bytes + packed_vel: i32, // 4 bytes (vx/vy in fixed-point 8.8) + color: u32, // 4 bytes +}; // = 16 bytes total (was 12) + +changes: +- entity_update.comp: position update, center check, edge respawn, velocity calc +- GPU RNG: PCG-style PRNG seeded with entity id + frame number +- ssbo_renderer: renderComputeMode() only uploads NEW entities (when count grows) +- CPU update loop skipped entirely when compute enabled + +benchmark results (i5-6500T / HD 530): +- update time: ~5ms → ~0ms at 150k entities +- render time unchanged (GPU-bound as before) +- total frame time improvement at high entity counts + +analysis: CPU was doing ~150k position updates + distance checks + respawn logic +per frame. now GPU does it in parallel via 256-thread workgroups. CPU only uploads +new entities when user adds them, not per-frame. memory barrier ensures compute +writes are visible to vertex shader before draw. 
+ +flags: +- (default): GPU compute updates; the old --compute flag is no longer parsed +- --cpu: fallback to CPU update path for comparison + diff --git a/justfile b/justfile index bd30a32..a0701f1 100644 --- a/justfile +++ b/justfile @@ -42,11 +42,20 @@ check: test: zig build test +# run sandbox (GPU compute is default) +sandbox: + zig build -Doptimize=ReleaseFast run + # auto-benchmark (ramps entities until performance degrades) bench: zig build -Doptimize=ReleaseFast run -- --bench cat benchmark.log +# benchmark with CPU update path (for comparison) +bench-cpu: + zig build -Doptimize=ReleaseFast run -- --bench --cpu + cat benchmark.log + # software-rendered benchmark (for CI/headless servers) [linux] bench-sw: diff --git a/src/compute.zig b/src/compute.zig index 4f28741..5365057 100644 --- a/src/compute.zig +++ b/src/compute.zig @@ -50,15 +50,7 @@ pub const ComputeShader = struct { const respawn_radius_loc = rl.gl.rlGetLocationUniform(program_id, "respawnRadius"); const entity_speed_loc = rl.gl.rlGetLocationUniform(program_id, "entitySpeed"); - std.debug.print("compute: shader loaded (program_id={}, uniforms: count={}, frame={}, screen={}, center={}, radius={}, speed={})\n", .{ - program_id, - entity_count_loc, - frame_number_loc, - screen_size_loc, - center_loc, - respawn_radius_loc, - entity_speed_loc, - }); + std.debug.print("compute: shader loaded\n", .{}); return .{ .program_id = program_id, diff --git a/src/sandbox_main.zig b/src/sandbox_main.zig index cbb8e38..27cf27c 100644 --- a/src/sandbox_main.zig +++ b/src/sandbox_main.zig @@ -164,7 +164,7 @@ pub fn main() !void { var use_instancing = false; var use_ssbo = true; var use_vsync = false; - var use_compute = false; + var use_compute = true; // GPU compute is now default var args = try std.process.argsWithAllocator(std.heap.page_allocator); defer args.deinit(); _ = args.skip(); // skip program name @@ -178,8 +178,8 @@ pub fn main() !void { use_ssbo = false; // legacy rlgl batched path } else if (std.mem.eql(u8, arg, "--vsync")) { 
use_vsync = true; - } else if (std.mem.eql(u8, arg, "--compute")) { - use_compute = true; + } else if (std.mem.eql(u8, arg, "--cpu")) { + use_compute = false; // fallback to CPU update path } }