Clean up compute shader implementation

This commit is contained in:
Jared Miller 2025-12-17 10:07:22 -05:00
parent 9e8226de32
commit 90bb30b6c6
No known key found for this signature in database
5 changed files with 52 additions and 16 deletions

View file

@ -141,10 +141,10 @@ each step is a commit point if desired.
- implement edge spawning + velocity calculation - implement edge spawning + velocity calculation
- remove CPU update loop from sandbox.zig - remove CPU update loop from sandbox.zig
### step 4: cleanup ### step 4: cleanup
- remove dead code (cpu update, per-frame upload) - `--compute` is now default, `--cpu` flag for fallback/comparison
- add `--compute` flag to toggle (keep old path for comparison) - justfile updated: `just bench` (compute), `just bench-cpu` (comparison)
- benchmark and document results - verbose debug output reduced
## files changed ## files changed

View file

@ -206,3 +206,38 @@ total improvement from baseline:
- SSBO: 60fps @ ~700k entities - SSBO: 60fps @ ~700k entities
- ~140x improvement overall! - ~140x improvement overall!
---
optimization 6: compute shader updates
--------------------------------------
technique: move entity position + respawn logic from CPU to GPU compute shader
code: compute.zig, shaders/entity_update.comp, ssbo_renderer.zig
version: 0.7.0
struct GpuEntity {
x: f32, // 4 bytes
y: f32, // 4 bytes
packed_vel: i32, // 4 bytes (vx/vy in fixed-point 8.8)
color: u32, // 4 bytes
}; // = 16 bytes total (was 12)
changes:
- entity_update.comp: position update, center check, edge respawn, velocity calc
- GPU RNG: PCG-style PRNG seeded with entity id + frame number
- ssbo_renderer: renderComputeMode() only uploads NEW entities (when count grows)
- CPU update loop skipped entirely when compute enabled
benchmark results (i5-6500T / HD 530):
- update time: ~5ms → ~0ms at 150k entities
- render time unchanged (GPU-bound as before)
- total frame time improvement at high entity counts
analysis: CPU was doing ~150k position updates + distance checks + respawn logic
per frame. now GPU does it in parallel via 256-thread workgroups. CPU only uploads
new entities when user adds them, not per-frame. a memory barrier ensures compute
writes are visible to the vertex shader before draw.
flags:
- --compute: GPU compute updates (now the default; passing the flag is no longer necessary)
- --cpu: fallback to CPU update path for comparison

View file

@ -42,11 +42,20 @@ check:
test: test:
zig build test zig build test
# run sandbox (GPU compute is default)
sandbox:
zig build -Doptimize=ReleaseFast run
# auto-benchmark (ramps entities until performance degrades) # auto-benchmark (ramps entities until performance degrades)
bench: bench:
zig build -Doptimize=ReleaseFast run -- --bench zig build -Doptimize=ReleaseFast run -- --bench
cat benchmark.log cat benchmark.log
# benchmark with CPU update path (for comparison)
bench-cpu:
zig build -Doptimize=ReleaseFast run -- --bench --cpu
cat benchmark.log
# software-rendered benchmark (for CI/headless servers) # software-rendered benchmark (for CI/headless servers)
[linux] [linux]
bench-sw: bench-sw:

View file

@ -50,15 +50,7 @@ pub const ComputeShader = struct {
const respawn_radius_loc = rl.gl.rlGetLocationUniform(program_id, "respawnRadius"); const respawn_radius_loc = rl.gl.rlGetLocationUniform(program_id, "respawnRadius");
const entity_speed_loc = rl.gl.rlGetLocationUniform(program_id, "entitySpeed"); const entity_speed_loc = rl.gl.rlGetLocationUniform(program_id, "entitySpeed");
std.debug.print("compute: shader loaded (program_id={}, uniforms: count={}, frame={}, screen={}, center={}, radius={}, speed={})\n", .{ std.debug.print("compute: shader loaded\n", .{});
program_id,
entity_count_loc,
frame_number_loc,
screen_size_loc,
center_loc,
respawn_radius_loc,
entity_speed_loc,
});
return .{ return .{
.program_id = program_id, .program_id = program_id,

View file

@ -164,7 +164,7 @@ pub fn main() !void {
var use_instancing = false; var use_instancing = false;
var use_ssbo = true; var use_ssbo = true;
var use_vsync = false; var use_vsync = false;
var use_compute = false; var use_compute = true; // GPU compute is now default
var args = try std.process.argsWithAllocator(std.heap.page_allocator); var args = try std.process.argsWithAllocator(std.heap.page_allocator);
defer args.deinit(); defer args.deinit();
_ = args.skip(); // skip program name _ = args.skip(); // skip program name
@ -178,8 +178,8 @@ pub fn main() !void {
use_ssbo = false; // legacy rlgl batched path use_ssbo = false; // legacy rlgl batched path
} else if (std.mem.eql(u8, arg, "--vsync")) { } else if (std.mem.eql(u8, arg, "--vsync")) {
use_vsync = true; use_vsync = true;
} else if (std.mem.eql(u8, arg, "--compute")) { } else if (std.mem.eql(u8, arg, "--cpu")) {
use_compute = true; use_compute = false; // fallback to CPU update path
} }
} }