Cleanup compute shader implementation
This commit is contained in:
parent
9e8226de32
commit
90bb30b6c6
5 changed files with 52 additions and 16 deletions
|
|
@ -141,10 +141,10 @@ each step is a commit point if desired.
|
|||
- implement edge spawning + velocity calculation
|
||||
- remove CPU update loop from sandbox.zig
|
||||
|
||||
### step 4: cleanup
|
||||
- remove dead code (cpu update, per-frame upload)
|
||||
- add `--compute` flag to toggle (keep old path for comparison)
|
||||
- benchmark and document results
|
||||
### step 4: cleanup ✓
|
||||
- `--compute` is now default, `--cpu` flag for fallback/comparison
|
||||
- justfile updated: `just bench` (compute), `just bench-cpu` (comparison)
|
||||
- verbose debug output reduced
|
||||
|
||||
## files changed
|
||||
|
||||
|
|
|
|||
35
journal.txt
35
journal.txt
|
|
@ -206,3 +206,38 @@ total improvement from baseline:
|
|||
- SSBO: 60fps @ ~700k entities
|
||||
- ~140x improvement overall!
|
||||
|
||||
---
|
||||
|
||||
optimization 6: compute shader updates
|
||||
--------------------------------------
|
||||
technique: move entity position + respawn logic from CPU to GPU compute shader
|
||||
code: compute.zig, shaders/entity_update.comp, ssbo_renderer.zig
|
||||
version: 0.7.0
|
||||
|
||||
struct GpuEntity {
|
||||
x: f32, // 4 bytes
|
||||
y: f32, // 4 bytes
|
||||
packed_vel: i32, // 4 bytes (vx/vy in fixed-point 8.8)
|
||||
color: u32, // 4 bytes
|
||||
}; // = 16 bytes total (was 12)
|
||||
|
||||
changes:
|
||||
- entity_update.comp: position update, center check, edge respawn, velocity calc
|
||||
- GPU RNG: PCG-style PRNG seeded with entity id + frame number
|
||||
- ssbo_renderer: renderComputeMode() only uploads NEW entities (when count grows)
|
||||
- CPU update loop skipped entirely when compute enabled
|
||||
|
||||
benchmark results (i5-6500T / HD 530):
|
||||
- update time: ~5ms → ~0ms at 150k entities
|
||||
- render time unchanged (GPU-bound as before)
|
||||
- total frame time improves at high entity counts (CPU update cost removed)
|
||||
|
||||
analysis: CPU was doing ~150k position updates + distance checks + respawn logic
|
||||
per frame. now GPU does it in parallel via 256-thread workgroups. CPU only uploads
|
||||
new entities when user adds them, not per-frame. memory barrier ensures compute
|
||||
writes are visible to vertex shader before draw.
|
||||
|
||||
flags:
|
||||
- --compute: GPU compute updates (now default)
|
||||
- --cpu: fallback to CPU update path for comparison
|
||||
|
||||
|
|
|
|||
9
justfile
9
justfile
|
|
@ -42,11 +42,20 @@ check:
|
|||
test:
|
||||
zig build test
|
||||
|
||||
# run sandbox (GPU compute is default)
|
||||
sandbox:
|
||||
zig build -Doptimize=ReleaseFast run
|
||||
|
||||
# auto-benchmark (ramps entities until performance degrades)
|
||||
bench:
|
||||
zig build -Doptimize=ReleaseFast run -- --bench
|
||||
cat benchmark.log
|
||||
|
||||
# benchmark with CPU update path (for comparison)
|
||||
bench-cpu:
|
||||
zig build -Doptimize=ReleaseFast run -- --bench --cpu
|
||||
cat benchmark.log
|
||||
|
||||
# software-rendered benchmark (for CI/headless servers)
|
||||
[linux]
|
||||
bench-sw:
|
||||
|
|
|
|||
|
|
@ -50,15 +50,7 @@ pub const ComputeShader = struct {
|
|||
const respawn_radius_loc = rl.gl.rlGetLocationUniform(program_id, "respawnRadius");
|
||||
const entity_speed_loc = rl.gl.rlGetLocationUniform(program_id, "entitySpeed");
|
||||
|
||||
std.debug.print("compute: shader loaded (program_id={}, uniforms: count={}, frame={}, screen={}, center={}, radius={}, speed={})\n", .{
|
||||
program_id,
|
||||
entity_count_loc,
|
||||
frame_number_loc,
|
||||
screen_size_loc,
|
||||
center_loc,
|
||||
respawn_radius_loc,
|
||||
entity_speed_loc,
|
||||
});
|
||||
std.debug.print("compute: shader loaded\n", .{});
|
||||
|
||||
return .{
|
||||
.program_id = program_id,
|
||||
|
|
|
|||
|
|
@ -164,7 +164,7 @@ pub fn main() !void {
|
|||
var use_instancing = false;
|
||||
var use_ssbo = true;
|
||||
var use_vsync = false;
|
||||
var use_compute = false;
|
||||
var use_compute = true; // GPU compute is now default
|
||||
var args = try std.process.argsWithAllocator(std.heap.page_allocator);
|
||||
defer args.deinit();
|
||||
_ = args.skip(); // skip program name
|
||||
|
|
@ -178,8 +178,8 @@ pub fn main() !void {
|
|||
use_ssbo = false; // legacy rlgl batched path
|
||||
} else if (std.mem.eql(u8, arg, "--vsync")) {
|
||||
use_vsync = true;
|
||||
} else if (std.mem.eql(u8, arg, "--compute")) {
|
||||
use_compute = true;
|
||||
} else if (std.mem.eql(u8, arg, "--cpu")) {
|
||||
use_compute = false; // fallback to CPU update path
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue