Cleanup compute shader implementation
This commit is contained in:
parent
9e8226de32
commit
90bb30b6c6
5 changed files with 52 additions and 16 deletions
|
|
@ -141,10 +141,10 @@ each step is a commit point if desired.
|
||||||
- implement edge spawning + velocity calculation
|
- implement edge spawning + velocity calculation
|
||||||
- remove CPU update loop from sandbox.zig
|
- remove CPU update loop from sandbox.zig
|
||||||
|
|
||||||
### step 4: cleanup
|
### step 4: cleanup ✓
|
||||||
- remove dead code (cpu update, per-frame upload)
|
- `--compute` is now default, `--cpu` flag for fallback/comparison
|
||||||
- add `--compute` flag to toggle (keep old path for comparison)
|
- justfile updated: `just bench` (compute), `just bench-cpu` (comparison)
|
||||||
- benchmark and document results
|
- verbose debug output reduced
|
||||||
|
|
||||||
## files changed
|
## files changed
|
||||||
|
|
||||||
|
|
|
||||||
35
journal.txt
35
journal.txt
|
|
@ -206,3 +206,38 @@ total improvement from baseline:
|
||||||
- SSBO: 60fps @ ~700k entities
|
- SSBO: 60fps @ ~700k entities
|
||||||
- ~140x improvement overall!
|
- ~140x improvement overall!
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
optimization 6: compute shader updates
|
||||||
|
--------------------------------------
|
||||||
|
technique: move entity position + respawn logic from CPU to GPU compute shader
|
||||||
|
code: compute.zig, shaders/entity_update.comp, ssbo_renderer.zig
|
||||||
|
version: 0.7.0
|
||||||
|
|
||||||
|
struct GpuEntity {
|
||||||
|
x: f32, // 4 bytes
|
||||||
|
y: f32, // 4 bytes
|
||||||
|
packed_vel: i32, // 4 bytes (vx/vy in fixed-point 8.8)
|
||||||
|
color: u32, // 4 bytes
|
||||||
|
}; // = 16 bytes total (was 12)
|
||||||
|
|
||||||
|
changes:
|
||||||
|
- entity_update.comp: position update, center check, edge respawn, velocity calc
|
||||||
|
- GPU RNG: PCG-style PRNG seeded with entity id + frame number
|
||||||
|
- ssbo_renderer: renderComputeMode() only uploads NEW entities (when count grows)
|
||||||
|
- CPU update loop skipped entirely when compute enabled
|
||||||
|
|
||||||
|
benchmark results (i5-6500T / HD 530):
|
||||||
|
- update time: ~5ms → ~0ms at 150k entities
|
||||||
|
- render time unchanged (GPU-bound as before)
|
||||||
|
- total frame time improves at high entity counts
|
||||||
|
|
||||||
|
analysis: CPU was doing ~150k position updates + distance checks + respawn logic
|
||||||
|
per frame. now GPU does it in parallel via 256-thread workgroups. CPU only uploads
|
||||||
|
new entities when user adds them, not per-frame. memory barrier ensures compute
|
||||||
|
writes are visible to the vertex shader before draw.
|
||||||
|
|
||||||
|
flags:
|
||||||
|
- --compute: GPU compute updates (now default)
|
||||||
|
- --cpu: fallback to CPU update path for comparison
|
||||||
|
|
||||||
|
|
|
||||||
9
justfile
9
justfile
|
|
@ -42,11 +42,20 @@ check:
|
||||||
test:
|
test:
|
||||||
zig build test
|
zig build test
|
||||||
|
|
||||||
|
# run sandbox (GPU compute is default)
|
||||||
|
sandbox:
|
||||||
|
zig build -Doptimize=ReleaseFast run
|
||||||
|
|
||||||
# auto-benchmark (ramps entities until performance degrades)
|
# auto-benchmark (ramps entities until performance degrades)
|
||||||
bench:
|
bench:
|
||||||
zig build -Doptimize=ReleaseFast run -- --bench
|
zig build -Doptimize=ReleaseFast run -- --bench
|
||||||
cat benchmark.log
|
cat benchmark.log
|
||||||
|
|
||||||
|
# benchmark with CPU update path (for comparison)
|
||||||
|
bench-cpu:
|
||||||
|
zig build -Doptimize=ReleaseFast run -- --bench --cpu
|
||||||
|
cat benchmark.log
|
||||||
|
|
||||||
# software-rendered benchmark (for CI/headless servers)
|
# software-rendered benchmark (for CI/headless servers)
|
||||||
[linux]
|
[linux]
|
||||||
bench-sw:
|
bench-sw:
|
||||||
|
|
|
||||||
|
|
@ -50,15 +50,7 @@ pub const ComputeShader = struct {
|
||||||
const respawn_radius_loc = rl.gl.rlGetLocationUniform(program_id, "respawnRadius");
|
const respawn_radius_loc = rl.gl.rlGetLocationUniform(program_id, "respawnRadius");
|
||||||
const entity_speed_loc = rl.gl.rlGetLocationUniform(program_id, "entitySpeed");
|
const entity_speed_loc = rl.gl.rlGetLocationUniform(program_id, "entitySpeed");
|
||||||
|
|
||||||
std.debug.print("compute: shader loaded (program_id={}, uniforms: count={}, frame={}, screen={}, center={}, radius={}, speed={})\n", .{
|
std.debug.print("compute: shader loaded\n", .{});
|
||||||
program_id,
|
|
||||||
entity_count_loc,
|
|
||||||
frame_number_loc,
|
|
||||||
screen_size_loc,
|
|
||||||
center_loc,
|
|
||||||
respawn_radius_loc,
|
|
||||||
entity_speed_loc,
|
|
||||||
});
|
|
||||||
|
|
||||||
return .{
|
return .{
|
||||||
.program_id = program_id,
|
.program_id = program_id,
|
||||||
|
|
|
||||||
|
|
@ -164,7 +164,7 @@ pub fn main() !void {
|
||||||
var use_instancing = false;
|
var use_instancing = false;
|
||||||
var use_ssbo = true;
|
var use_ssbo = true;
|
||||||
var use_vsync = false;
|
var use_vsync = false;
|
||||||
var use_compute = false;
|
var use_compute = true; // GPU compute is now default
|
||||||
var args = try std.process.argsWithAllocator(std.heap.page_allocator);
|
var args = try std.process.argsWithAllocator(std.heap.page_allocator);
|
||||||
defer args.deinit();
|
defer args.deinit();
|
||||||
_ = args.skip(); // skip program name
|
_ = args.skip(); // skip program name
|
||||||
|
|
@ -178,8 +178,8 @@ pub fn main() !void {
|
||||||
use_ssbo = false; // legacy rlgl batched path
|
use_ssbo = false; // legacy rlgl batched path
|
||||||
} else if (std.mem.eql(u8, arg, "--vsync")) {
|
} else if (std.mem.eql(u8, arg, "--vsync")) {
|
||||||
use_vsync = true;
|
use_vsync = true;
|
||||||
} else if (std.mem.eql(u8, arg, "--compute")) {
|
} else if (std.mem.eql(u8, arg, "--cpu")) {
|
||||||
use_compute = true;
|
use_compute = false; // fallback to CPU update path
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue