diff --git a/TODO.md b/TODO.md index 4960b57..41e6de7 100644 --- a/TODO.md +++ b/TODO.md @@ -30,13 +30,17 @@ findings (AMD Radeon test): based on phase 2 results: - [x] batch rendering via texture blitting (10x improvement) +- [x] rlgl quad batching (2x improvement on top) - [x] ~~if cpu-bound: SIMD, struct-of-arrays, multithreading~~ (not needed) - [x] re-test after each change findings: - texture blitting: pre-render circle to texture, drawTexture() per entity +- rlgl batching: submit vertices directly via rl.gl, bypass drawTexture overhead - baseline: 60fps @ ~5k entities -- optimized: 60fps @ ~50k entities, 30fps @ 100k entities +- after texture blitting: 60fps @ ~50k entities +- after rlgl batching: 60fps @ ~100k entities +- total: ~20x improvement from baseline - see journal.txt for detailed benchmarks ## phase 4: add collision diff --git a/benchmarks/benchmark2.log b/benchmarks/benchmark2.log new file mode 100644 index 0000000..9f63224 --- /dev/null +++ b/benchmarks/benchmark2.log @@ -0,0 +1,17 @@ +# lofivor sandbox benchmark +# time entities frame_ms update_ms render_ms note +[0.1s] entities=0 frame=103.6ms update=0.0ms render=14.1ms [!60fps] +[0.1s] entities=0 frame=16.7ms update=0.0ms render=99.5ms [+60fps] +[0.2s] entities=0 frame=104.5ms update=0.0ms render=15.8ms [!60fps] +[0.2s] entities=0 frame=16.7ms update=0.0ms render=16.8ms [+60fps] +[10.0s] entities=4000 frame=16.7ms update=0.0ms render=16.6ms +[20.0s] entities=10000 frame=16.7ms update=0.1ms render=16.7ms +[21.5s] entities=11000 frame=19.7ms update=0.1ms render=16.6ms [!60fps] +[21.5s] entities=11000 frame=16.7ms update=0.1ms render=16.5ms [+60fps] +[21.5s] entities=11000 frame=27.1ms update=0.0ms render=16.6ms [!60fps] +[21.5s] entities=11000 frame=16.7ms update=0.1ms render=16.7ms [+60fps] +[30.0s] entities=23000 frame=16.7ms update=0.2ms render=16.5ms +[38.3s] entities=52000 frame=18.8ms update=0.3ms render=16.6ms [!60fps] +[38.4s] entities=53000 frame=16.7ms update=0.2ms render=16.4ms 
[+60fps] +[38.9s] entities=55000 frame=21.0ms update=0.3ms render=17.1ms [!60fps] +[40.0s] entities=59000 frame=20.6ms update=0.3ms render=18.4ms diff --git a/benchmarks/benchmark3.log b/benchmarks/benchmark3.log new file mode 100644 index 0000000..27658b7 --- /dev/null +++ b/benchmarks/benchmark3.log @@ -0,0 +1,15 @@ +# lofivor sandbox benchmark +# time entities frame_ms update_ms render_ms note +[0.1s] entities=0 frame=78.7ms update=0.0ms render=12.2ms [!60fps] +[0.1s] entities=0 frame=16.7ms update=0.0ms render=15.0ms [+60fps] +[10.0s] entities=40000 frame=16.7ms update=0.3ms render=16.5ms +[13.5s] entities=100000 frame=19.5ms update=0.5ms render=16.6ms [!60fps] +[13.5s] entities=100000 frame=16.7ms update=0.4ms render=16.4ms [+60fps] +[15.0s] entities=100000 frame=18.8ms update=0.9ms render=16.7ms [!60fps] +[15.0s] entities=100000 frame=16.7ms update=0.5ms render=16.2ms [+60fps] +[15.3s] entities=100000 frame=19.2ms update=0.7ms render=18.8ms [!60fps] +[15.6s] entities=100000 frame=16.7ms update=0.5ms render=16.2ms [+60fps] +[16.1s] entities=100000 frame=18.8ms update=0.5ms render=17.8ms [!60fps] +[16.2s] entities=100000 frame=16.7ms update=0.4ms render=16.3ms [+60fps] +[17.2s] entities=100000 frame=18.7ms update=0.5ms render=17.7ms [!60fps] +[18.4s] entities=100000 frame=16.7ms update=0.3ms render=16.3ms [+60fps] diff --git a/benchmarks/benchmark_original.log b/benchmarks/benchmark_original.log new file mode 100644 index 0000000..0c2e7ce --- /dev/null +++ b/benchmarks/benchmark_original.log @@ -0,0 +1,24 @@ +# lofivor sandbox benchmark +# time entities frame_ms update_ms render_ms note +[0.1s] entities=0 frame=83.0ms update=0.0ms render=12.4ms [!60fps] +[0.1s] entities=0 frame=16.7ms update=0.0ms render=12.4ms [+60fps] +[10.0s] entities=4000 frame=16.7ms update=0.0ms render=16.9ms +[12.7s] entities=5000 frame=19.9ms update=0.0ms render=19.4ms [!60fps] +[14.8s] entities=6000 frame=26.4ms update=0.0ms render=23.5ms [jump] +[18.0s] entities=7000 frame=32.4ms 
update=0.0ms render=27.7ms [jump] +[20.0s] entities=8000 frame=30.9ms update=0.0ms render=30.8ms +[20.3s] entities=8000 frame=36.1ms update=0.0ms render=29.9ms [jump] +[20.4s] entities=8000 frame=52.6ms update=0.0ms render=30.8ms [jump] +[30.0s] entities=11000 frame=43.5ms update=0.1ms render=43.4ms +[30.6s] entities=12000 frame=54.5ms update=0.1ms render=45.3ms [jump] +[38.5s] entities=15000 frame=60.6ms update=0.1ms render=59.2ms [jump] +[40.0s] entities=15000 frame=57.1ms update=0.1ms render=57.5ms +[40.4s] entities=16000 frame=62.7ms update=0.1ms render=62.8ms [jump] +[42.6s] entities=17000 frame=72.9ms update=0.1ms render=65.5ms [jump] +[48.4s] entities=20000 frame=79.1ms update=0.1ms render=75.2ms [jump] +[50.1s] entities=20000 frame=77.0ms update=0.1ms render=75.0ms +[50.8s] entities=21000 frame=83.1ms update=0.1ms render=80.2ms [jump] +[52.9s] entities=22000 frame=92.9ms update=0.1ms render=86.6ms [jump] +[57.3s] entities=24000 frame=98.5ms update=0.1ms render=100.2ms [jump] +[60.1s] entities=25000 frame=97.4ms update=0.1ms render=95.6ms +[62.6s] entities=26000 frame=107.6ms update=0.1ms render=100.2ms [jump] diff --git a/journal.txt b/journal.txt index f2825f3..942114c 100644 --- a/journal.txt +++ b/journal.txt @@ -53,7 +53,37 @@ remains negligible (<0.6ms even at 100k). 
--- -optimization 2: [pending] +optimization 2: rlgl quad batching +----------------------------------- +technique: bypass drawTexture(), submit vertices directly via rlgl +code: sandbox_main.zig:175-197 +- rl.gl.rlSetTexture() once +- rl.gl.rlBegin(rl_quads) +- loop: rlTexCoord2f + rlVertex2f for 4 vertices per entity +- rl.gl.rlEnd() + +benchmark3.log results: +- 40k entities: 16.7ms (vsync-locked) +- 100k entities: 16.7-19.5ms (~51-60fps) + +comparison to optimization 1: +- texture blitting: 100k @ 33-37ms (~30fps) +- rlgl batching: 100k @ 16.7-19.5ms (~51-60fps) +- ~2x improvement + +total improvement from baseline: +- baseline: 60fps @ ~5k entities +- final: 60fps @ ~100k entities +- ~20x improvement overall + +analysis: drawTexture() has per-call overhead (type conversions, batch state +checks). rlgl writes vertices straight into raylib's internal batch buffer, +which (default 8192 quads on desktop GL) auto-flushes when full, so 100k +entities = ~13 draw calls without the overhead of 100k drawTexture calls. 
+ +--- + +optimization 3: [pending] ------------------------- technique: results: diff --git a/src/sandbox_main.zig b/src/sandbox_main.zig index 14e3849..7152ef1 100644 --- a/src/sandbox_main.zig +++ b/src/sandbox_main.zig @@ -168,17 +168,34 @@ pub fn main() !void { rl.beginDrawing(); rl.clearBackground(BG_COLOR); - // draw entities using pre-rendered circle texture - const half_size = @as(f32, @floatFromInt(TEXTURE_SIZE)) / 2.0; + // draw entities using rlgl quad batching + const size = @as(f32, @floatFromInt(TEXTURE_SIZE)); + const half = size / 2.0; + + rl.gl.rlSetTexture(circle_texture.id); + rl.gl.rlBegin(rl.gl.rl_quads); + rl.gl.rlColor4ub(255, 255, 255, 255); // white tint + for (entities.items[0..entities.count]) |entity| { - rl.drawTexture( - circle_texture, - @intFromFloat(entity.x - half_size), - @intFromFloat(entity.y - half_size), - rl.Color.white, // tint (white = use original colors) - ); + const x1 = entity.x - half; + const y1 = entity.y - half; + const x2 = entity.x + half; + const y2 = entity.y + half; + + // quad vertices: bottom-left, bottom-right, top-right, top-left + rl.gl.rlTexCoord2f(0, 0); + rl.gl.rlVertex2f(x1, y2); + rl.gl.rlTexCoord2f(1, 0); + rl.gl.rlVertex2f(x2, y2); + rl.gl.rlTexCoord2f(1, 1); + rl.gl.rlVertex2f(x2, y1); + rl.gl.rlTexCoord2f(0, 1); + rl.gl.rlVertex2f(x1, y1); } + rl.gl.rlEnd(); + rl.gl.rlSetTexture(0); + // metrics overlay drawMetrics(&entities, update_time_us, render_time_us, paused);