Add rlgl quad batching optimization

This commit is contained in:
Jared Tyler Miller 2025-12-14 23:29:50 -05:00 committed by Jared Miller
parent c010746897
commit 04673ef31f
6 changed files with 117 additions and 10 deletions

View file

@ -30,13 +30,17 @@ findings (AMD Radeon test):
based on phase 2 results:
- [x] batch rendering via texture blitting (10x improvement)
- [x] rlgl quad batching (2x improvement on top)
- [x] ~~if cpu-bound: SIMD, struct-of-arrays, multithreading~~ (not needed)
- [x] re-test after each change
findings:
- texture blitting: pre-render circle to texture, drawTexture() per entity
- rlgl batching: submit vertices directly via rl.gl, bypass drawTexture overhead
- baseline: 60fps @ ~5k entities
- optimized: 60fps @ ~50k entities, 30fps @ 100k entities
- after texture blitting: 60fps @ ~50k entities
- after rlgl batching: 60fps @ ~100k entities
- total: ~20x improvement from baseline
- see journal.txt for detailed benchmarks
## phase 4: add collision

17
benchmarks/benchmark2.log Normal file
View file

@ -0,0 +1,17 @@
# lofivor sandbox benchmark
# time entities frame_ms update_ms render_ms note
[0.1s] entities=0 frame=103.6ms update=0.0ms render=14.1ms [!60fps]
[0.1s] entities=0 frame=16.7ms update=0.0ms render=99.5ms [+60fps]
[0.2s] entities=0 frame=104.5ms update=0.0ms render=15.8ms [!60fps]
[0.2s] entities=0 frame=16.7ms update=0.0ms render=16.8ms [+60fps]
[10.0s] entities=4000 frame=16.7ms update=0.0ms render=16.6ms
[20.0s] entities=10000 frame=16.7ms update=0.1ms render=16.7ms
[21.5s] entities=11000 frame=19.7ms update=0.1ms render=16.6ms [!60fps]
[21.5s] entities=11000 frame=16.7ms update=0.1ms render=16.5ms [+60fps]
[21.5s] entities=11000 frame=27.1ms update=0.0ms render=16.6ms [!60fps]
[21.5s] entities=11000 frame=16.7ms update=0.1ms render=16.7ms [+60fps]
[30.0s] entities=23000 frame=16.7ms update=0.2ms render=16.5ms
[38.3s] entities=52000 frame=18.8ms update=0.3ms render=16.6ms [!60fps]
[38.4s] entities=53000 frame=16.7ms update=0.2ms render=16.4ms [+60fps]
[38.9s] entities=55000 frame=21.0ms update=0.3ms render=17.1ms [!60fps]
[40.0s] entities=59000 frame=20.6ms update=0.3ms render=18.4ms

15
benchmarks/benchmark3.log Normal file
View file

@ -0,0 +1,15 @@
# lofivor sandbox benchmark
# time entities frame_ms update_ms render_ms note
[0.1s] entities=0 frame=78.7ms update=0.0ms render=12.2ms [!60fps]
[0.1s] entities=0 frame=16.7ms update=0.0ms render=15.0ms [+60fps]
[10.0s] entities=40000 frame=16.7ms update=0.3ms render=16.5ms
[13.5s] entities=100000 frame=19.5ms update=0.5ms render=16.6ms [!60fps]
[13.5s] entities=100000 frame=16.7ms update=0.4ms render=16.4ms [+60fps]
[15.0s] entities=100000 frame=18.8ms update=0.9ms render=16.7ms [!60fps]
[15.0s] entities=100000 frame=16.7ms update=0.5ms render=16.2ms [+60fps]
[15.3s] entities=100000 frame=19.2ms update=0.7ms render=18.8ms [!60fps]
[15.6s] entities=100000 frame=16.7ms update=0.5ms render=16.2ms [+60fps]
[16.1s] entities=100000 frame=18.8ms update=0.5ms render=17.8ms [!60fps]
[16.2s] entities=100000 frame=16.7ms update=0.4ms render=16.3ms [+60fps]
[17.2s] entities=100000 frame=18.7ms update=0.5ms render=17.7ms [!60fps]
[18.4s] entities=100000 frame=16.7ms update=0.3ms render=16.3ms [+60fps]

View file

@ -0,0 +1,24 @@
# lofivor sandbox benchmark
# time entities frame_ms update_ms render_ms note
[0.1s] entities=0 frame=83.0ms update=0.0ms render=12.4ms [!60fps]
[0.1s] entities=0 frame=16.7ms update=0.0ms render=12.4ms [+60fps]
[10.0s] entities=4000 frame=16.7ms update=0.0ms render=16.9ms
[12.7s] entities=5000 frame=19.9ms update=0.0ms render=19.4ms [!60fps]
[14.8s] entities=6000 frame=26.4ms update=0.0ms render=23.5ms [jump]
[18.0s] entities=7000 frame=32.4ms update=0.0ms render=27.7ms [jump]
[20.0s] entities=8000 frame=30.9ms update=0.0ms render=30.8ms
[20.3s] entities=8000 frame=36.1ms update=0.0ms render=29.9ms [jump]
[20.4s] entities=8000 frame=52.6ms update=0.0ms render=30.8ms [jump]
[30.0s] entities=11000 frame=43.5ms update=0.1ms render=43.4ms
[30.6s] entities=12000 frame=54.5ms update=0.1ms render=45.3ms [jump]
[38.5s] entities=15000 frame=60.6ms update=0.1ms render=59.2ms [jump]
[40.0s] entities=15000 frame=57.1ms update=0.1ms render=57.5ms
[40.4s] entities=16000 frame=62.7ms update=0.1ms render=62.8ms [jump]
[42.6s] entities=17000 frame=72.9ms update=0.1ms render=65.5ms [jump]
[48.4s] entities=20000 frame=79.1ms update=0.1ms render=75.2ms [jump]
[50.1s] entities=20000 frame=77.0ms update=0.1ms render=75.0ms
[50.8s] entities=21000 frame=83.1ms update=0.1ms render=80.2ms [jump]
[52.9s] entities=22000 frame=92.9ms update=0.1ms render=86.6ms [jump]
[57.3s] entities=24000 frame=98.5ms update=0.1ms render=100.2ms [jump]
[60.1s] entities=25000 frame=97.4ms update=0.1ms render=95.6ms
[62.6s] entities=26000 frame=107.6ms update=0.1ms render=100.2ms [jump]

View file

@ -53,7 +53,37 @@ remains negligible (<0.6ms even at 100k).
---
optimization 2: [pending]
optimization 2: rlgl quad batching
-----------------------------------
technique: bypass drawTexture(), submit vertices directly via rlgl
code: sandbox_main.zig:175-197
- rl.gl.rlSetTexture() once
- rl.gl.rlBegin(rl_quads)
- loop: rlTexCoord2f + rlVertex2f for 4 vertices per entity
- rl.gl.rlEnd()
benchmark3.log results:
- 40k entities: 16.7ms (vsync-locked)
- 100k entities: 16.7-19.5ms (~51-60fps)
comparison to optimization 1:
- texture blitting: 100k @ 33-37ms (~30fps)
- rlgl batching: 100k @ 16.7-19.5ms (~51-60fps)
- ~2x improvement
total improvement from baseline:
- baseline: 60fps @ ~5k entities
- final: 60fps @ ~100k entities
- ~20x improvement overall
analysis: drawTexture() has per-call overhead (type conversions, batch state
checks). rlgl writes vertices directly into raylib's internal batch buffer,
which auto-flushes when full. with the desktop default of 8192 quads per batch
(RL_DEFAULT_BATCH_BUFFER_ELEMENTS; 2048 on GLES2), 100k entities = ~13 draw
calls (~49 on GLES2) vs 100k drawTexture calls with their overhead.
---
optimization 3: [pending]
-------------------------
technique:
results:

View file

@ -168,17 +168,34 @@ pub fn main() !void {
rl.beginDrawing();
rl.clearBackground(BG_COLOR);
// draw entities using pre-rendered circle texture
const half_size = @as(f32, @floatFromInt(TEXTURE_SIZE)) / 2.0;
// draw entities using rlgl quad batching
const size = @as(f32, @floatFromInt(TEXTURE_SIZE));
const half = size / 2.0;
rl.gl.rlSetTexture(circle_texture.id);
rl.gl.rlBegin(rl.gl.rl_quads);
rl.gl.rlColor4ub(255, 255, 255, 255); // white tint
for (entities.items[0..entities.count]) |entity| {
rl.drawTexture(
circle_texture,
@intFromFloat(entity.x - half_size),
@intFromFloat(entity.y - half_size),
rl.Color.white, // tint (white = use original colors)
);
const x1 = entity.x - half;
const y1 = entity.y - half;
const x2 = entity.x + half;
const y2 = entity.y + half;
// quad vertices: bottom-left, bottom-right, top-right, top-left
rl.gl.rlTexCoord2f(0, 0);
rl.gl.rlVertex2f(x1, y2);
rl.gl.rlTexCoord2f(1, 0);
rl.gl.rlVertex2f(x2, y2);
rl.gl.rlTexCoord2f(1, 1);
rl.gl.rlVertex2f(x2, y1);
rl.gl.rlTexCoord2f(0, 1);
rl.gl.rlVertex2f(x1, y1);
}
rl.gl.rlEnd();
rl.gl.rlSetTexture(0);
// metrics overlay
drawMetrics(&entities, update_time_us, render_time_us, paused);