Compare commits
23 commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 02fd358611 | |||
| 5b890b18e4 | |||
| 55b0d7fab7 | |||
| a842800ede | |||
| 0568204cb7 | |||
| 516b4af458 | |||
| 6dcafc8f3c | |||
| 9f3495b882 | |||
| 90bb30b6c6 | |||
| 9e8226de32 | |||
| 62d010bdc0 | |||
| 45c37bfcd2 | |||
| 5fd82000cf | |||
| c30b9c0ed0 | |||
| ebe28e5669 | |||
| 7b43b5726e | |||
| d0dcb701f8 | |||
| e1d5dc136e | |||
| 3e2e39100a | |||
| 26383ed79e | |||
| 1782bc8db7 | |||
| 3f9e33feaf | |||
| d8bc9ac927 |
34 changed files with 2515 additions and 62 deletions
|
|
@ -1,12 +1,14 @@
|
||||||
name: release
|
name: release
|
||||||
|
|
||||||
on:
|
on:
|
||||||
release:
|
push:
|
||||||
types: [published]
|
tags:
|
||||||
|
- '*'
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build:
|
build:
|
||||||
runs-on: codeberg-small
|
runs-on: ubuntu-latest
|
||||||
|
container: catthehacker/ubuntu:act-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
|
@ -35,16 +37,32 @@ jobs:
|
||||||
|
|
||||||
- name: Upload to release
|
- name: Upload to release
|
||||||
env:
|
env:
|
||||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
FORGEJO_TOKEN: ${{ secrets.FORGEJO_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
RELEASE_ID="${{ github.event.release.id }}"
|
TAG="${{ github.ref_name }}"
|
||||||
API_URL="${{ github.api_url }}/repos/${{ github.repository }}/releases/${RELEASE_ID}/assets"
|
API_BASE="${{ github.server_url }}/api/v1"
|
||||||
|
REPO="${{ github.repository }}"
|
||||||
|
|
||||||
|
# check if release exists
|
||||||
|
RELEASE_ID=$(curl -sf \
|
||||||
|
-H "Authorization: token ${FORGEJO_TOKEN}" \
|
||||||
|
"${API_BASE}/repos/${REPO}/releases/tags/${TAG}" | jq -r '.id // empty')
|
||||||
|
|
||||||
|
if [ -z "$RELEASE_ID" ]; then
|
||||||
|
echo "Creating release for ${TAG}..."
|
||||||
|
RELEASE_ID=$(curl -sf \
|
||||||
|
-H "Authorization: token ${FORGEJO_TOKEN}" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"tag_name":"'"${TAG}"'","name":"'"${TAG}"'"}' \
|
||||||
|
"${API_BASE}/repos/${REPO}/releases" | jq -r '.id')
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Release ID: ${RELEASE_ID}"
|
||||||
|
|
||||||
for file in lofivor-linux-x86_64 lofivor-windows-x86_64.exe; do
|
for file in lofivor-linux-x86_64 lofivor-windows-x86_64.exe; do
|
||||||
echo "Uploading $file..."
|
echo "Uploading $file..."
|
||||||
curl -X POST \
|
curl -sf \
|
||||||
-H "Authorization: token ${GITHUB_TOKEN}" \
|
-H "Authorization: token ${FORGEJO_TOKEN}" \
|
||||||
-H "Content-Type: application/octet-stream" \
|
-F "attachment=@${file}" \
|
||||||
--data-binary @"$file" \
|
"${API_BASE}/repos/${REPO}/releases/${RELEASE_ID}/assets?name=${file}"
|
||||||
"${API_URL}?name=${file}"
|
|
||||||
done
|
done
|
||||||
|
|
|
||||||
|
|
@ -82,8 +82,8 @@ these target the rendering bottleneck since update loop is already fast.
|
||||||
|
|
||||||
| technique | description | expected gain |
|
| technique | description | expected gain |
|
||||||
| ---------------------- | -------------------------------------------------------------------- | ------------------------------- |
|
| ---------------------- | -------------------------------------------------------------------- | ------------------------------- |
|
||||||
| ~~SSBO instance data~~ | ~~pack (x, y, color) = 12 bytes instead of 64-byte matrices~~ | **done** - see optimization 5 |
|
| SSBO instance data | pack (x, y, color) = 12 bytes instead of 64-byte matrices | done - see optimization 5 |
|
||||||
| compute shader updates | move entity positions to GPU entirely, avoid CPU→GPU sync | significant |
|
| compute shader updates | move entity positions to GPU entirely, avoid CPU→GPU sync | done - see optimization 6 |
|
||||||
| OpenGL vs Vulkan | test raylib's Vulkan backend | unknown |
|
| OpenGL vs Vulkan | test raylib's Vulkan backend | unknown |
|
||||||
| discrete GPU testing | test on dedicated GPU where instancing/SSBO shine | significant (different hw) |
|
| discrete GPU testing | test on dedicated GPU where instancing/SSBO shine | significant (different hw) |
|
||||||
|
|
||||||
|
|
@ -126,6 +126,33 @@ currently not the bottleneck - update stays <1ms at 100k. these become relevant
|
||||||
| entity pools | pre-allocated, reusable entity slots | reduces allocation overhead |
|
| entity pools | pre-allocated, reusable entity slots | reduces allocation overhead |
|
||||||
| component packing | minimize struct padding | better cache utilization |
|
| component packing | minimize struct padding | better cache utilization |
|
||||||
|
|
||||||
|
#### estimated gains summary
|
||||||
|
|
||||||
|
| Optimization | Expected Gain | Why |
|
||||||
|
|------------------------|---------------|---------------------------------------------------|
|
||||||
|
| SIMD updates | 0% | Update already on GPU |
|
||||||
|
| Multithreaded update | 0% | Update already on GPU |
|
||||||
|
| Cache-friendly layouts | 0% | CPU doesn't iterate entities |
|
||||||
|
| Fixed-point math | 0% or worse | GPUs are optimized for float |
|
||||||
|
| SoA vs AoS | ~5% | Only helps data upload, not bottleneck |
|
||||||
|
| Frustum culling | 5-15% | Most entities converge to center anyway |
|
||||||
|
| LOD rendering | 20-40% | Real gains - fewer fragments for distant entities |
|
||||||
|
| Temporal techniques | ~50% | But with visual artifacts (flickering) |
|
||||||
|
|
||||||
|
Realistic total if you did everything: ~30-50% improvement
|
||||||
|
|
||||||
|
That'd take you from ~1.4M @ 38fps to maybe ~1.8-2M @ 38fps, or ~1.4M @ 50-55fps.
|
||||||
|
|
||||||
|
What would actually move the needle:
|
||||||
|
- GPU-side frustum culling in compute shader (cull before render, not after)
|
||||||
|
- Point sprites instead of quads for distant entities (4 vertices → 1)
|
||||||
|
- Indirect draw calls (GPU decides what to render, CPU never touches entity data)
|
||||||
|
|
||||||
|
Your real bottleneck is fill rate and vertex throughput on HD 530 integrated
|
||||||
|
graphics. The CPU side is already essentially free.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## testing methodology
|
## testing methodology
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,8 @@ lofivor
|
||||||
sandbox stress test for measuring entity rendering performance on weak hardware.
|
sandbox stress test for measuring entity rendering performance on weak hardware.
|
||||||
written in zig with raylib.
|
written in zig with raylib.
|
||||||
|
|
||||||
|
(lofivor aka lofi-survivor)
|
||||||
|
|
||||||
build & run
|
build & run
|
||||||
-----------
|
-----------
|
||||||
|
|
||||||
|
|
|
||||||
24
TODO.md
24
TODO.md
|
|
@ -59,7 +59,7 @@ further options (if needed):
|
||||||
- [x] increase raylib batch buffer (currently 8192 vertices = 2048 quads)
|
- [x] increase raylib batch buffer (currently 8192 vertices = 2048 quads)
|
||||||
- [x] GPU instancing (single draw call for all entities)
|
- [x] GPU instancing (single draw call for all entities)
|
||||||
- [x] SSBO instance data (12 bytes vs 64-byte matrices)
|
- [x] SSBO instance data (12 bytes vs 64-byte matrices)
|
||||||
- [ ] compute shader entity updates (if raylib supports)
|
- [x] compute shader entity updates (raylib supports via rlgl)
|
||||||
- [ ] compare OpenGL vs Vulkan backend
|
- [ ] compare OpenGL vs Vulkan backend
|
||||||
|
|
||||||
findings (i5-6500T / HD 530):
|
findings (i5-6500T / HD 530):
|
||||||
|
|
@ -68,14 +68,18 @@ findings (i5-6500T / HD 530):
|
||||||
- instancing doesn't help on integrated graphics (shared RAM, no PCIe savings)
|
- instancing doesn't help on integrated graphics (shared RAM, no PCIe savings)
|
||||||
- bottleneck is memory bandwidth, not draw call overhead
|
- bottleneck is memory bandwidth, not draw call overhead
|
||||||
- rlgl batching is already near-optimal for this hardware
|
- rlgl batching is already near-optimal for this hardware
|
||||||
|
- compute shaders: update time ~5ms → ~0ms at 150k entities (CPU freed entirely)
|
||||||
|
|
||||||
## future optimization concepts
|
## future optimization concepts (GPU-focused)
|
||||||
|
|
||||||
- [ ] SIMD entity updates (AVX2/SSE)
|
- [ ] GPU-side frustum culling in compute shader
|
||||||
- [ ] struct-of-arrays vs array-of-structs benchmark
|
- [ ] point sprites for distant/small entities (4 verts → 1)
|
||||||
- [ ] multithreaded update loop (thread pool)
|
- [ ] indirect draw calls (glDrawArraysIndirect)
|
||||||
- [ ] cache-friendly memory layouts
|
|
||||||
- [ ] LOD rendering (skip distant entities or reduce detail)
|
## future optimization concepts (CPU - not currently bottleneck)
|
||||||
- [ ] frustum culling (only render visible)
|
|
||||||
- [ ] temporal techniques (update subset per frame)
|
- [ ] SIMD / SoA / multithreading (if game logic makes CPU hot again)
|
||||||
- [ ] fixed-point vs floating-point math
|
|
||||||
|
## other ideas that aren't about optimization
|
||||||
|
|
||||||
|
- [ ] scanline shader
|
||||||
|
|
|
||||||
13
build.zig
13
build.zig
|
|
@ -4,6 +4,9 @@ pub fn build(b: *std.Build) void {
|
||||||
const target = b.standardTargetOptions(.{});
|
const target = b.standardTargetOptions(.{});
|
||||||
const optimize = b.standardOptimizeOption(.{});
|
const optimize = b.standardOptimizeOption(.{});
|
||||||
|
|
||||||
|
// tracy profiling (run with -Dtracy=true)
|
||||||
|
const enable_tracy = b.option(bool, "tracy", "Enable Tracy profiler") orelse false;
|
||||||
|
|
||||||
const raylib_dep = b.dependency("raylib_zig", .{
|
const raylib_dep = b.dependency("raylib_zig", .{
|
||||||
.target = target,
|
.target = target,
|
||||||
.optimize = optimize,
|
.optimize = optimize,
|
||||||
|
|
@ -24,6 +27,16 @@ pub fn build(b: *std.Build) void {
|
||||||
sandbox_exe.root_module.addImport("raylib", raylib_dep.module("raylib"));
|
sandbox_exe.root_module.addImport("raylib", raylib_dep.module("raylib"));
|
||||||
sandbox_exe.linkLibrary(raylib_dep.artifact("raylib"));
|
sandbox_exe.linkLibrary(raylib_dep.artifact("raylib"));
|
||||||
|
|
||||||
|
// tracy integration (optional)
|
||||||
|
const ztracy = b.dependency("ztracy", .{
|
||||||
|
.enable_ztracy = enable_tracy,
|
||||||
|
.on_demand = true, // allow connecting after app starts
|
||||||
|
});
|
||||||
|
sandbox_exe.root_module.addImport("ztracy", ztracy.module("root"));
|
||||||
|
if (enable_tracy) {
|
||||||
|
sandbox_exe.linkLibrary(ztracy.artifact("tracy"));
|
||||||
|
}
|
||||||
|
|
||||||
b.installArtifact(sandbox_exe);
|
b.installArtifact(sandbox_exe);
|
||||||
|
|
||||||
const sandbox_run_cmd = b.addRunArtifact(sandbox_exe);
|
const sandbox_run_cmd = b.addRunArtifact(sandbox_exe);
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,10 @@
|
||||||
.url = "git+https://github.com/raylib-zig/raylib-zig#a4d18b2d1cf8fdddec68b5b084535fca0475f466",
|
.url = "git+https://github.com/raylib-zig/raylib-zig#a4d18b2d1cf8fdddec68b5b084535fca0475f466",
|
||||||
.hash = "raylib_zig-5.6.0-dev-KE8REL5MBQAf3p497t52Xw9P7ojndIkVOWPXnLiLLw2P",
|
.hash = "raylib_zig-5.6.0-dev-KE8REL5MBQAf3p497t52Xw9P7ojndIkVOWPXnLiLLw2P",
|
||||||
},
|
},
|
||||||
|
.ztracy = .{
|
||||||
|
.url = "git+https://github.com/zig-gamedev/ztracy?ref=main#e7b401dea9ce006f8b236e3a2ca1a9f3d5c3e896",
|
||||||
|
.hash = "ztracy-0.14.0-dev-zHJSq78GGQC904aYvBPn6OOvRVOq_opAwDfeHZdvQyej",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
.paths = .{
|
.paths = .{
|
||||||
"build.zig",
|
"build.zig",
|
||||||
|
|
|
||||||
292
docs/GLOSSARY.txt
Normal file
292
docs/GLOSSARY.txt
Normal file
|
|
@ -0,0 +1,292 @@
|
||||||
|
lofivor glossary
|
||||||
|
================
|
||||||
|
|
||||||
|
terms that come up when optimizing graphics.
|
||||||
|
|
||||||
|
|
||||||
|
clock cycle
|
||||||
|
-----------
|
||||||
|
|
||||||
|
one "tick" of the processor's internal clock.
|
||||||
|
|
||||||
|
a CPU or GPU has a crystal oscillator that vibrates at a fixed rate.
|
||||||
|
each vibration = one cycle. the processor does some work each cycle.
|
||||||
|
|
||||||
|
1 GHz = 1 billion cycles per second
|
||||||
|
1 MHz = 1 million cycles per second
|
||||||
|
|
||||||
|
so a 1 GHz processor has 1 billion opportunities to do work per second.
|
||||||
|
|
||||||
|
"one operation per cycle" is idealized. real work often takes multiple
|
||||||
|
cycles (memory access: 100+ cycles, division: 10-20 cycles, add: 1 cycle).
|
||||||
|
|
||||||
|
your HD 530 runs at ~950 MHz, so roughly 950 million cycles per second.
|
||||||
|
at 60fps, that's about 15.8 million cycles per frame.
|
||||||
|
|
||||||
|
|
||||||
|
fill rate
|
||||||
|
---------
|
||||||
|
|
||||||
|
pixels written per second. measured in megapixels/s or gigapixels/s.
|
||||||
|
|
||||||
|
fill rate = ROPs * clock speed * pixels per clock
|
||||||
|
|
||||||
|
your HD 530: 3 ROPs * 950 MHz * 1 = 2.85 GPixels/s theoretical max.
|
||||||
|
|
||||||
|
|
||||||
|
overdraw
|
||||||
|
--------
|
||||||
|
|
||||||
|
drawing the same pixel multiple times per frame.
|
||||||
|
|
||||||
|
if two entities overlap, the back one gets drawn, then the front one
|
||||||
|
overwrites it. the back one's work was wasted.
|
||||||
|
|
||||||
|
overdraw ratio = total pixels drawn / screen pixels
|
||||||
|
|
||||||
|
1080p = 2.07M pixels. if you draw 20M pixels, overdraw = ~10x.
|
||||||
|
|
||||||
|
|
||||||
|
bandwidth
|
||||||
|
---------
|
||||||
|
|
||||||
|
data transfer rate. measured in bytes/second (GB/s, MB/s).
|
||||||
|
|
||||||
|
memory bandwidth = how fast data moves between processor and RAM.
|
||||||
|
|
||||||
|
your HD 530 shares DDR4 with the CPU: ~30 GB/s total.
|
||||||
|
a discrete GPU has dedicated VRAM: 200-900 GB/s.
|
||||||
|
|
||||||
|
|
||||||
|
latency
|
||||||
|
-------
|
||||||
|
|
||||||
|
time delay. measured in nanoseconds (ns) or cycles.
|
||||||
|
|
||||||
|
memory latency = time to fetch data from RAM.
|
||||||
|
- L1 cache: ~4 cycles
|
||||||
|
- L2 cache: ~12 cycles
|
||||||
|
- L3 cache: ~40 cycles
|
||||||
|
- main RAM: ~200 cycles
|
||||||
|
|
||||||
|
this is why cache matters. a miss that goes all the way to main RAM (~200 cycles) is ~50x slower than an L1 hit (~4 cycles).
|
||||||
|
|
||||||
|
|
||||||
|
throughput vs latency
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
latency = how long ONE thing takes.
|
||||||
|
throughput = how many things per second.
|
||||||
|
|
||||||
|
a pipeline can have high latency but high throughput.
|
||||||
|
|
||||||
|
example: a car wash takes 10 minutes (latency).
|
||||||
|
but if cars enter every 1 minute, throughput is 60 cars/hour.
|
||||||
|
|
||||||
|
GPUs hide latency with throughput. one thread waits for memory?
|
||||||
|
switch to another thread. thousands of threads keep the GPU busy.
|
||||||
|
|
||||||
|
|
||||||
|
draw call
|
||||||
|
---------
|
||||||
|
|
||||||
|
one command from CPU to GPU: "draw this batch of geometry."
|
||||||
|
|
||||||
|
each draw call has overhead:
|
||||||
|
- CPU prepares command buffer
|
||||||
|
- driver validates state
|
||||||
|
- GPU switches context
|
||||||
|
|
||||||
|
1 draw call for 1M triangles: fast.
|
||||||
|
1M draw calls for 1M triangles: slow.
|
||||||
|
|
||||||
|
lofivor uses 1 draw call for all entities (instanced rendering).
|
||||||
|
|
||||||
|
|
||||||
|
instancing
|
||||||
|
----------
|
||||||
|
|
||||||
|
drawing many copies of the same geometry in one draw call.
|
||||||
|
|
||||||
|
instead of: draw triangle, draw triangle, draw triangle...
|
||||||
|
you say: draw this triangle 1 million times, here are the positions.
|
||||||
|
|
||||||
|
the GPU handles the replication. massively more efficient.
|
||||||
|
|
||||||
|
|
||||||
|
shader
|
||||||
|
------
|
||||||
|
|
||||||
|
a small program that runs on the GPU.
|
||||||
|
|
||||||
|
the name is historical - early shaders calculated shading/lighting.
|
||||||
|
but today: a shader is just software running on GPU hardware.
|
||||||
|
it doesn't have to do with shading at all.
|
||||||
|
|
||||||
|
more precisely: a shader turns one piece of data into another piece of data.
|
||||||
|
- vertex shader: positions → screen coordinates
|
||||||
|
- fragment shader: fragments → pixel colors
|
||||||
|
- compute shader: data → data (anything)
|
||||||
|
|
||||||
|
GPUs are massively parallel, so shaders run on thousands of inputs at once.
|
||||||
|
CPUs have stagnated; GPUs keep getting faster. modern engines like UE5
|
||||||
|
increasingly use shaders for work that used to be CPU-only.
|
||||||
|
|
||||||
|
|
||||||
|
SSBO (shader storage buffer object)
|
||||||
|
-----------------------------------
|
||||||
|
|
||||||
|
a block of GPU memory that shaders can read/write.
|
||||||
|
|
||||||
|
unlike uniforms (small, read-only), SSBOs can be large and writable.
|
||||||
|
lofivor stores all entity data in an SSBO: positions, velocities, colors.
|
||||||
|
|
||||||
|
|
||||||
|
compute shader
|
||||||
|
--------------
|
||||||
|
|
||||||
|
a shader that does general computation, not rendering.
|
||||||
|
|
||||||
|
runs on GPU cores but doesn't output pixels. just processes data.
|
||||||
|
lofivor uses compute shaders to update entity positions.
|
||||||
|
|
||||||
|
because compute exists, shaders can be anything: physics, AI, sorting,
|
||||||
|
image processing. the GPU is a general-purpose parallel processor.
|
||||||
|
|
||||||
|
|
||||||
|
fragment / pixel shader
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
program that runs once per pixel (actually per "fragment").
|
||||||
|
|
||||||
|
determines the final color of each pixel. this is where:
|
||||||
|
- texture sampling happens
|
||||||
|
- lighting calculations happen
|
||||||
|
- the expensive math lives
|
||||||
|
|
||||||
|
lofivor's fragment shader: sample texture, multiply by color. trivial.
|
||||||
|
AAA game fragment shader: 500+ instructions. expensive.
|
||||||
|
|
||||||
|
|
||||||
|
vertex shader
|
||||||
|
-------------
|
||||||
|
|
||||||
|
program that runs once per vertex.
|
||||||
|
|
||||||
|
transforms 3D positions to screen positions. lofivor's vertex shader
|
||||||
|
reads from SSBO and positions the quad corners.
|
||||||
|
|
||||||
|
|
||||||
|
ROP (render output unit)
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
final stage of GPU pipeline. writes pixels to framebuffer.
|
||||||
|
|
||||||
|
handles: depth test, stencil test, blending, antialiasing.
|
||||||
|
your bottleneck on HD 530. see docs/rops.txt.
|
||||||
|
|
||||||
|
|
||||||
|
TMU (texture mapping unit)
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
samples textures. reads pixel colors from texture memory.
|
||||||
|
|
||||||
|
your HD 530 has 24 TMUs. they're fast (22.8 GTexels/s).
|
||||||
|
texture sampling is cheap relative to ROPs on this hardware.
|
||||||
|
|
||||||
|
|
||||||
|
EU (execution unit)
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
intel's term for shader cores.
|
||||||
|
|
||||||
|
your HD 530 has 24 EUs, each with 8 ALUs = 192 ALUs total.
|
||||||
|
these run your vertex, fragment, and compute shaders.
|
||||||
|
|
||||||
|
|
||||||
|
ALU (arithmetic logic unit)
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
does math. add, multiply, compare, bitwise operations.
|
||||||
|
|
||||||
|
one ALU can do one operation per cycle (simple ops).
|
||||||
|
complex ops (sqrt, sin, cos) take multiple cycles.
|
||||||
|
|
||||||
|
|
||||||
|
framebuffer
|
||||||
|
-----------
|
||||||
|
|
||||||
|
the image being rendered. lives in GPU memory.
|
||||||
|
|
||||||
|
at 1080p with 32-bit color: 1920 * 1080 * 4 = 8.3 MB.
|
||||||
|
double-buffered (front + back): 16.6 MB.
|
||||||
|
|
||||||
|
|
||||||
|
vsync
|
||||||
|
-----
|
||||||
|
|
||||||
|
synchronizing frame presentation with monitor refresh.
|
||||||
|
|
||||||
|
without vsync: tearing (half old frame, half new frame).
|
||||||
|
with vsync: smooth, but if a frame misses the refresh deadline (16.7ms at 60 Hz), it waits for the next refresh.
|
||||||
|
|
||||||
|
|
||||||
|
frame budget
|
||||||
|
------------
|
||||||
|
|
||||||
|
time available per frame.
|
||||||
|
|
||||||
|
60 fps = 16.67 ms per frame
|
||||||
|
30 fps = 33.33 ms per frame
|
||||||
|
|
||||||
|
everything (CPU + GPU) must complete within budget or frames drop.
|
||||||
|
|
||||||
|
|
||||||
|
pipeline stall
|
||||||
|
--------------
|
||||||
|
|
||||||
|
GPU waiting for something. bad for performance.
|
||||||
|
|
||||||
|
causes:
|
||||||
|
- waiting for memory (cache miss)
|
||||||
|
- waiting for previous stage to finish
|
||||||
|
- synchronization points (barriers)
|
||||||
|
- `discard` in fragment shader (breaks early-z)
|
||||||
|
|
||||||
|
|
||||||
|
early-z
|
||||||
|
-------
|
||||||
|
|
||||||
|
optimization: test depth BEFORE running fragment shader.
|
||||||
|
|
||||||
|
if pixel will be occluded, skip the expensive shader work.
|
||||||
|
`discard` breaks this because the GPU can't know whether the fragment survives (and so whether to write its depth) until the shader runs.
|
||||||
|
|
||||||
|
|
||||||
|
LOD (level of detail)
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
using simpler geometry/textures for distant objects.
|
||||||
|
|
||||||
|
far away = fewer pixels = less detail needed.
|
||||||
|
saves vertices, texture bandwidth, and fill rate.
|
||||||
|
|
||||||
|
|
||||||
|
frustum culling
|
||||||
|
---------------
|
||||||
|
|
||||||
|
don't draw what's outside the camera view.
|
||||||
|
|
||||||
|
the "frustum" is the pyramid-shaped visible region.
|
||||||
|
anything outside = wasted work. cull it before sending to GPU.
|
||||||
|
|
||||||
|
|
||||||
|
spatial partitioning
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
organizing entities by position for fast queries.
|
||||||
|
|
||||||
|
types: grid, quadtree, octree, BVH.
|
||||||
|
|
||||||
|
"which entities are near point X?" goes from O(n) to O(log n) for the tree structures (roughly O(1) for a uniform grid).
|
||||||
|
essential for collision detection at scale.
|
||||||
119
docs/hd530_optimization_guide.md
Normal file
119
docs/hd530_optimization_guide.md
Normal file
|
|
@ -0,0 +1,119 @@
|
||||||
|
# intel hd 530 optimization guide for lofivor
|
||||||
|
|
||||||
|
based on hardware specs and empirical testing.
|
||||||
|
|
||||||
|
## hardware constraints
|
||||||
|
|
||||||
|
from `intel_hd_graphics_530.txt`:
|
||||||
|
|
||||||
|
| resource | value | implication |
|
||||||
|
| ---------- | ------- | ------------- |
|
||||||
|
| ROPs | 3 | fill rate limited - this is our ceiling |
|
||||||
|
| TMUs | 24 | texture sampling is relatively fast |
|
||||||
|
| memory | shared DDR4 ~30GB/s | bandwidth is precious, no VRAM |
|
||||||
|
| pixel rate | 2.85 GPixel/s | max theoretical throughput |
|
||||||
|
| EUs | 24 (192 ALUs) | decent compute, weak vs discrete |
|
||||||
|
| L3 cache | 768 KB | small, cache misses hurt |
|
||||||
|
|
||||||
|
the bottleneck is ROPs (fill rate), not vertices or compute.
|
||||||
|
|
||||||
|
## what works (proven)
|
||||||
|
|
||||||
|
### SSBO instance data
|
||||||
|
- 16 bytes per entity vs 64 bytes (matrices)
|
||||||
|
- minimizes bandwidth on shared memory bus
|
||||||
|
- result: ~5x improvement over instancing
|
||||||
|
|
||||||
|
### compute shader updates
|
||||||
|
- GPU does position/velocity updates
|
||||||
|
- no CPU→GPU sync per frame
|
||||||
|
- result: update time essentially free
|
||||||
|
|
||||||
|
### texture sampling
|
||||||
|
- 22.8 GTexel/s is fast relative to other units
|
||||||
|
- pre-baked circle texture beats procedural math
|
||||||
|
- result: 2x faster than procedural fragment shader
|
||||||
|
|
||||||
|
### instanced triangles/quads
|
||||||
|
- most optimized driver path
|
||||||
|
- intel mesa heavily optimizes this
|
||||||
|
- result: baseline, hard to beat
|
||||||
|
|
||||||
|
## what doesn't work (proven)
|
||||||
|
|
||||||
|
### point sprites
|
||||||
|
- theoretically 6x fewer vertices
|
||||||
|
- reality: 2.4x SLOWER on this hardware
|
||||||
|
- triangle rasterizer is more optimized
|
||||||
|
- see `docs/point_sprites_experiment.md`
|
||||||
|
|
||||||
|
### procedural fragment shaders
|
||||||
|
- `length()`, `smoothstep()`, `discard` are expensive
|
||||||
|
- EUs are weaker than discrete GPUs
|
||||||
|
- `discard` breaks early-z optimization
|
||||||
|
- result: 3.7x slower than texture sampling
|
||||||
|
|
||||||
|
### complex fragment math
|
||||||
|
- only 24 EUs, each running 8 ALUs
|
||||||
|
- transcendentals (sqrt, sin, cos) are 4x slower than FMAD
|
||||||
|
- avoid in hot path
|
||||||
|
|
||||||
|
## what to try next (theoretical)
|
||||||
|
|
||||||
|
### likely to help
|
||||||
|
|
||||||
|
| technique | why it should work | expected gain |
|
||||||
|
| ----------- | ------------------- | --------------- |
|
||||||
|
| frustum culling (GPU) | reduce fill rate, which is bottleneck | 10-30% depending on view |
|
||||||
|
| smaller points when zoomed out (LOD) | fewer pixels per entity = less ROP work | 20-40% |
|
||||||
|
| early-z / depth pre-pass | skip fragment work for occluded pixels | moderate |
|
||||||
|
|
||||||
|
### unlikely to help
|
||||||
|
|
||||||
|
| technique | why it won't help |
|
||||||
|
| ----------- | ------------------ |
|
||||||
|
| more vertex optimization | already fill rate bound, not vertex bound |
|
||||||
|
| SIMD on CPU | updates already on GPU |
|
||||||
|
| multithreading | CPU isn't the bottleneck |
|
||||||
|
| different vertex layouts | negligible vs fill rate |
|
||||||
|
|
||||||
|
### uncertain (need to test)
|
||||||
|
|
||||||
|
| technique | notes |
|
||||||
|
| ----------- | ------- |
|
||||||
|
| vulkan backend | might have less driver overhead, or might not matter |
|
||||||
|
| indirect draw calls | GPU decides what to render, but we're not CPU bound |
|
||||||
|
| fp16 in shaders | HD 530 has 2:1 fp16 ratio, might help fragment shader |
|
||||||
|
|
||||||
|
## key insights
|
||||||
|
|
||||||
|
1. fill rate is king - with only 3 ROPs, everything comes down to how many
|
||||||
|
pixels we're writing. optimizations that don't reduce pixel count won't
|
||||||
|
help.
|
||||||
|
|
||||||
|
2. shared memory hurts - no dedicated VRAM means CPU and GPU compete for
|
||||||
|
bandwidth. keep data transfers minimal.
|
||||||
|
|
||||||
|
3. driver optimization matters - the "common path" (triangles) is more
|
||||||
|
optimized than alternatives (points). don't be clever.
|
||||||
|
|
||||||
|
4. texture sampling is cheap - 22.8 GTexel/s is fast. prefer texture
|
||||||
|
lookups over ALU math in fragment shaders.
|
||||||
|
|
||||||
|
5. avoid discard - breaks early-z, causes pipeline stalls. alpha blending
|
||||||
|
is faster than discard.
|
||||||
|
|
||||||
|
## current ceiling
|
||||||
|
|
||||||
|
~950k entities @ 57fps (SSBO + compute + quads)
|
||||||
|
|
||||||
|
to go higher, we need to reduce fill rate:
|
||||||
|
- cull offscreen entities
|
||||||
|
- reduce entity size when zoomed out
|
||||||
|
- or accept lower fps at higher counts
|
||||||
|
|
||||||
|
## references
|
||||||
|
|
||||||
|
- intel gen9 compute architecture whitepaper
|
||||||
|
- empirical benchmarks in `benchmark_current_i56500t.log`
|
||||||
|
- point sprites experiment in `docs/point_sprites_experiment.md`
|
||||||
31
docs/hysteria.md
Normal file
31
docs/hysteria.md
Normal file
|
|
@ -0,0 +1,31 @@
|
||||||
|
# hysteresis in lofivor
|
||||||
|
|
||||||
|
## the problem without it
|
||||||
|
|
||||||
|
say your target is 8.33ms. your frame times naturally jitter: 8.2, 8.4, 8.3, 8.5, 8.2...
|
||||||
|
|
||||||
|
without hysteresis, every time it crosses 8.33ms you'd log "crossed threshold!" - potentially dozens of times per second. the log becomes useless noise.
|
||||||
|
|
||||||
|
## how the code works
|
||||||
|
|
||||||
|
from `sandbox_main.zig` lines 74-89:
|
||||||
|
|
||||||
|
```
|
||||||
|
was_above=false → need frame_ms > 10.33 (target + 2.0 margin) to flip to true
|
||||||
|
was_above=true → need frame_ms < 8.33 (target) to flip back to false
|
||||||
|
```
|
||||||
|
|
||||||
|
this creates a "dead zone" between 8.33 and 10.33ms where no state change happens.
|
||||||
|
|
||||||
|
## the magnet analogy
|
||||||
|
|
||||||
|
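the classic example of hysteresis is a magnet: a magnetized material keeps its polarity until an opposing field gets strong enough to flip it. the `was_above_target` boolean is like the magnet's current polarity. the frame time "pushing" past thresholds is like the magnetic field. the key insight: **the threshold you need to cross depends on which side you're currently on.**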
|
||||||
|
|
||||||
|
if you're in "good" state, you need a significant spike (>10.33ms) before you flip to "bad". if you're in "bad" state, you only need to drop below 8.33ms to recover. this asymmetry is the hysteresis.
|
||||||
|
|
||||||
|
## real-world examples
|
||||||
|
|
||||||
|
- thermostat: heat on at 68°F, off at 72°F (prevents rapid on/off cycling)
|
||||||
|
- schmitt trigger in electronics: same concept, prevents noise from causing oscillation
|
||||||
|
|
||||||
|
the `THRESHOLD_MARGIN` of 2.0ms is the "width" of the hysteresis band - bigger = more stable but less responsive.
|
||||||
54
docs/plans/2025-12-16-zoom-pan-design.md
Normal file
54
docs/plans/2025-12-16-zoom-pan-design.md
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
# Zoom/Pan Camera Design
|
||||||
|
|
||||||
|
A viewport camera for zooming into and panning around the simulation without affecting entity behavior.
|
||||||
|
|
||||||
|
## Core Behavior
|
||||||
|
|
||||||
|
### Zoom
|
||||||
|
- Scroll wheel zooms toward mouse cursor position
|
||||||
|
- Range: 1x (the default and the minimum) to 10x (the maximum)
|
||||||
|
- Instant response, no animation
|
||||||
|
- Esc or Space resets to 1x and clears pan offset
|
||||||
|
|
||||||
|
### Pan
|
||||||
|
- Any mouse button (left/middle/right) + drag pans the viewport
|
||||||
|
- Only available when zoom > 1x
|
||||||
|
- Bounded to simulation area - cannot pan into empty space
|
||||||
|
|
||||||
|
### UI
|
||||||
|
- Display current zoom level in existing panel under render info (e.g., `zoom: 2.3x`)
|
||||||
|
|
||||||
|
## Implementation Approach
|
||||||
|
|
||||||
|
### State
|
||||||
|
New camera state in `sandbox_main.zig`:
|
||||||
|
```zig
|
||||||
|
var zoom: f32 = 1.0;
|
||||||
|
var pan: @Vector(2, f32) = .{ 0, 0 };
|
||||||
|
```
|
||||||
|
|
||||||
|
### Shader Changes
|
||||||
|
Modify `entity.vert` to accept `zoom` and `pan` uniforms:
|
||||||
|
- Apply pan offset before converting to NDC
|
||||||
|
- Scale by zoom factor
|
||||||
|
- Scale quad size by zoom so entities appear larger
|
||||||
|
|
||||||
|
### Input Handling
|
||||||
|
- `getMouseWheelMove()` adjusts zoom (clamped 1.0–10.0)
|
||||||
|
- Zoom-toward-cursor: adjust pan to keep point under cursor stationary
|
||||||
|
- Mouse drag (any button) adjusts pan with bounds checking
|
||||||
|
- Esc/Space resets zoom to 1.0 and pan to (0, 0)
|
||||||
|
|
||||||
|
### Zoom-Toward-Cursor Math
|
||||||
|
When zooming from `oldZoom` to `newZoom` with cursor at `mousePos`:
|
||||||
|
```
|
||||||
|
worldMousePos = (mousePos / oldZoom) + pan
|
||||||
|
newPan = worldMousePos - (mousePos / newZoom)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pan Bounds
|
||||||
|
Constrain pan so viewport stays within simulation area:
|
||||||
|
```
|
||||||
|
maxPan = simulationSize - (screenSize / zoom)
|
||||||
|
pan = clamp(pan, 0, maxPan)
|
||||||
|
```
|
||||||
440
docs/plans/2025-12-16-zoom-pan-plan.md
Normal file
440
docs/plans/2025-12-16-zoom-pan-plan.md
Normal file
|
|
@ -0,0 +1,440 @@
|
||||||
|
# Zoom/Pan Camera Implementation Plan
|
||||||
|
|
||||||
|
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||||
|
|
||||||
|
**Goal:** Add viewport zoom (scroll wheel toward cursor) and pan (any mouse drag when zoomed) to observe the simulation up close.
|
||||||
|
|
||||||
|
**Architecture:** Camera state (zoom, pan) lives in sandbox_main.zig. Passed to shader as uniforms. All rendering paths use the same camera state, but only SSBO path gets shader-based zoom (others would need separate work).
|
||||||
|
|
||||||
|
**Tech Stack:** Zig, raylib, GLSL 430
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 1: Add camera state and shader uniforms
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `src/sandbox_main.zig:266` (add state after `var paused`)
|
||||||
|
- Modify: `src/ssbo_renderer.zig:20-21` (add uniform locations to struct)
|
||||||
|
- Modify: `src/ssbo_renderer.zig:54-62` (get uniform locations in init)
|
||||||
|
- Modify: `src/ssbo_renderer.zig:154-156` (pass uniforms in render)
|
||||||
|
|
||||||
|
**Step 1: Add camera state to sandbox_main.zig**
|
||||||
|
|
||||||
|
After line 266 (`var paused = false;`), add:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
// camera state for zoom/pan
|
||||||
|
var zoom: f32 = 1.0;
|
||||||
|
var pan = @Vector(2, f32){ 0, 0 };
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Add uniform locations to SsboRenderer struct**
|
||||||
|
|
||||||
|
In `src/ssbo_renderer.zig`, add to struct fields after line 21 (`circle_texture_loc`):
|
||||||
|
|
||||||
|
```zig
|
||||||
|
zoom_loc: i32,
|
||||||
|
pan_loc: i32,
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Get uniform locations in init**
|
||||||
|
|
||||||
|
After line 55 (`const circle_texture_loc = ...`), add:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
const zoom_loc = rl.gl.rlGetLocationUniform(shader_id, "zoom");
|
||||||
|
const pan_loc = rl.gl.rlGetLocationUniform(shader_id, "pan");
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Add fields to return struct**
|
||||||
|
|
||||||
|
In the return statement (around line 112), add:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
.zoom_loc = zoom_loc,
|
||||||
|
.pan_loc = pan_loc,
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 5: Pass uniforms in render method**
|
||||||
|
|
||||||
|
Change render signature to accept zoom/pan:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
pub fn render(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32)) void {
|
||||||
|
```
|
||||||
|
|
||||||
|
After line 156 (setting screenSize uniform), add:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
// set zoom uniform
|
||||||
|
rl.gl.rlSetUniform(self.zoom_loc, &zoom, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
|
||||||
|
|
||||||
|
// set pan uniform
|
||||||
|
const pan_arr = [2]f32{ pan[0], pan[1] };
|
||||||
|
rl.gl.rlSetUniform(self.pan_loc, &pan_arr, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 6: Update render call in sandbox_main.zig**
|
||||||
|
|
||||||
|
Change line 336 from:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
ssbo_renderer.?.render(&entities);
|
||||||
|
```
|
||||||
|
|
||||||
|
To:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
ssbo_renderer.?.render(&entities, zoom, pan);
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 7: Build and verify compiles**
|
||||||
|
|
||||||
|
Run: `zig build`
|
||||||
|
|
||||||
|
Expected: Compiles with no errors (shader won't use uniforms yet, but that's fine)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 2: Update vertex shader for zoom/pan
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `src/shaders/entity.vert`
|
||||||
|
|
||||||
|
**Step 1: Add uniforms**
|
||||||
|
|
||||||
|
After line 19 (`uniform vec2 screenSize;`), add:
|
||||||
|
|
||||||
|
```glsl
|
||||||
|
uniform float zoom;
|
||||||
|
uniform vec2 pan;
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Update NDC calculation**
|
||||||
|
|
||||||
|
Replace lines 29-31:
|
||||||
|
|
||||||
|
```glsl
|
||||||
|
// convert entity position to NDC
|
||||||
|
// entity coords are in screen pixels, convert to [-1, 1]
|
||||||
|
float ndcX = (e.x / screenSize.x) * 2.0 - 1.0;
|
||||||
|
float ndcY = (e.y / screenSize.y) * 2.0 - 1.0;
|
||||||
|
```
|
||||||
|
|
||||||
|
With:
|
||||||
|
|
||||||
|
```glsl
|
||||||
|
// apply pan offset and zoom to convert to NDC
|
||||||
|
// pan is in world units (equal to screen pixels at zoom 1.0), zoom scales the view
|
||||||
|
float ndcX = ((e.x - pan.x) * zoom / screenSize.x) * 2.0 - 1.0;
|
||||||
|
float ndcY = ((e.y - pan.y) * zoom / screenSize.y) * 2.0 - 1.0;
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Scale quad size by zoom**
|
||||||
|
|
||||||
|
Replace line 34:
|
||||||
|
|
||||||
|
```glsl
|
||||||
|
float quadSizeNdc = 16.0 / screenSize.x;
|
||||||
|
```
|
||||||
|
|
||||||
|
With:
|
||||||
|
|
||||||
|
```glsl
|
||||||
|
float quadSizeNdc = (16.0 * zoom) / screenSize.x;
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Build and test**
|
||||||
|
|
||||||
|
Run: `zig build && ./zig-out/bin/lofivor`
|
||||||
|
|
||||||
|
Expected: Renders exactly as before (zoom=1.0, pan=0,0 should be identical to old behavior)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 3: Add zoom input handling
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `src/sandbox_main.zig` (handleInput function and main loop)
|
||||||
|
|
||||||
|
**Step 1: Add zoom constants**
|
||||||
|
|
||||||
|
After line 32 (BENCH_EXIT_SUSTAIN), add:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
// zoom settings
|
||||||
|
const ZOOM_MIN: f32 = 1.0;
|
||||||
|
const ZOOM_MAX: f32 = 10.0;
|
||||||
|
const ZOOM_SPEED: f32 = 0.1; // fractional zoom change per scroll tick (factor = 1 + ZOOM_SPEED)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Create handleCamera function**
|
||||||
|
|
||||||
|
After the `handleInput` function (around line 458), add:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
fn handleCamera(zoom: *f32, pan: *@Vector(2, f32)) void {
|
||||||
|
const wheel = rl.getMouseWheelMove();
|
||||||
|
|
||||||
|
if (wheel != 0) {
|
||||||
|
const mouse_pos = rl.getMousePosition();
|
||||||
|
const old_zoom = zoom.*;
|
||||||
|
|
||||||
|
// calculate new zoom
|
||||||
|
const zoom_factor = if (wheel > 0) (1.0 + ZOOM_SPEED) else (1.0 / (1.0 + ZOOM_SPEED));
|
||||||
|
var new_zoom = old_zoom * zoom_factor;
|
||||||
|
new_zoom = std.math.clamp(new_zoom, ZOOM_MIN, ZOOM_MAX);
|
||||||
|
|
||||||
|
if (new_zoom != old_zoom) {
|
||||||
|
// zoom toward mouse cursor:
|
||||||
|
// keep the world point under the cursor stationary
|
||||||
|
// world_pos = (screen_pos / old_zoom) + old_pan
|
||||||
|
// new_pan = world_pos - (screen_pos / new_zoom)
|
||||||
|
const world_x = (mouse_pos.x / old_zoom) + pan.*[0];
|
||||||
|
const world_y = (mouse_pos.y / old_zoom) + pan.*[1];
|
||||||
|
pan.*[0] = world_x - (mouse_pos.x / new_zoom);
|
||||||
|
pan.*[1] = world_y - (mouse_pos.y / new_zoom);
|
||||||
|
zoom.* = new_zoom;
|
||||||
|
|
||||||
|
// clamp pan to bounds
|
||||||
|
clampPan(pan, zoom.*);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// reset on Esc (the Space reset is added in Step 4; Space also toggles pause in handleInput)
|
||||||
|
if (rl.isKeyPressed(.escape)) {
|
||||||
|
zoom.* = 1.0;
|
||||||
|
pan.* = @Vector(2, f32){ 0, 0 };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn clampPan(pan: *@Vector(2, f32), zoom: f32) void {
|
||||||
|
// when zoomed in, limit pan so viewport stays in simulation bounds
|
||||||
|
// visible area = screen_size / zoom
|
||||||
|
// max pan = world_size - visible_area
|
||||||
|
const screen_w: f32 = @floatFromInt(SCREEN_WIDTH);
|
||||||
|
const screen_h: f32 = @floatFromInt(SCREEN_HEIGHT);
|
||||||
|
const visible_w = screen_w / zoom;
|
||||||
|
const visible_h = screen_h / zoom;
|
||||||
|
|
||||||
|
const max_pan_x = @max(0, screen_w - visible_w);
|
||||||
|
const max_pan_y = @max(0, screen_h - visible_h);
|
||||||
|
|
||||||
|
pan.*[0] = std.math.clamp(pan.*[0], 0, max_pan_x);
|
||||||
|
pan.*[1] = std.math.clamp(pan.*[1], 0, max_pan_y);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Call handleCamera in main loop**
|
||||||
|
|
||||||
|
In the main loop, after the `handleInput` call (line 318), add:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
handleCamera(&zoom, &pan);
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Also reset zoom when Space is pressed**
|
||||||
|
|
||||||
|
In `handleInput`, modify the space key handler (around line 450):
|
||||||
|
|
||||||
|
```zig
|
||||||
|
// pause: space (also resets zoom in handleCamera context)
|
||||||
|
if (rl.isKeyPressed(.space)) {
|
||||||
|
paused.* = !paused.*;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Actually, handleInput doesn't have access to zoom/pan. We need to either:
|
||||||
|
- Pass zoom/pan to handleInput
|
||||||
|
- Handle space reset in handleCamera
|
||||||
|
|
||||||
|
Let's handle it in handleCamera. Add after the escape check:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
// Space also resets zoom (pause is handled separately in handleInput)
|
||||||
|
if (rl.isKeyPressed(.space)) {
|
||||||
|
zoom.* = 1.0;
|
||||||
|
pan.* = @Vector(2, f32){ 0, 0 };
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 5: Build and test zoom**
|
||||||
|
|
||||||
|
Run: `zig build && ./zig-out/bin/lofivor`
|
||||||
|
|
||||||
|
Test:
|
||||||
|
1. Scroll up - entities should get bigger (zoom in toward cursor)
|
||||||
|
2. Scroll down - entities get smaller (but not below 1x)
|
||||||
|
3. Press Esc or Space - resets to default view
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 4: Add pan input handling
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `src/sandbox_main.zig` (handleCamera function)
|
||||||
|
|
||||||
|
**Step 1: Add pan logic to handleCamera**
|
||||||
|
|
||||||
|
Add this after the zoom handling, before the reset checks:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
// pan with any mouse button drag (only when zoomed in)
|
||||||
|
if (zoom.* > 1.0) {
|
||||||
|
const any_button = rl.isMouseButtonDown(.left) or
|
||||||
|
rl.isMouseButtonDown(.right) or
|
||||||
|
rl.isMouseButtonDown(.middle);
|
||||||
|
if (any_button) {
|
||||||
|
const delta = rl.getMouseDelta();
|
||||||
|
// pan in opposite direction of drag (drag right = view moves left = pan decreases)
|
||||||
|
pan.*[0] -= delta.x / zoom.*;
|
||||||
|
pan.*[1] -= delta.y / zoom.*;
|
||||||
|
clampPan(pan, zoom.*);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Build and test pan**
|
||||||
|
|
||||||
|
Run: `zig build && ./zig-out/bin/lofivor`
|
||||||
|
|
||||||
|
Test:
|
||||||
|
1. Scroll to zoom in past 1x
|
||||||
|
2. Click and drag with any mouse button - viewport should pan
|
||||||
|
3. Try to pan past edges - should be bounded
|
||||||
|
4. At 1x zoom, dragging should do nothing
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 5: Add zoom display to UI
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `src/ui.zig:34` (drawMetrics signature)
|
||||||
|
- Modify: `src/ui.zig:71-72` (add zoom line after render)
|
||||||
|
- Modify: `src/sandbox_main.zig:387` (pass zoom to drawMetrics)
|
||||||
|
|
||||||
|
**Step 1: Update drawMetrics signature**
|
||||||
|
|
||||||
|
Change line 34:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, font: rl.Font) void {
|
||||||
|
```
|
||||||
|
|
||||||
|
To:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, zoom: f32, font: rl.Font) void {
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Increase box height for zoom line**
|
||||||
|
|
||||||
|
Change line 50:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
const bg_height: i32 = if (paused) 130 else 100;
|
||||||
|
```
|
||||||
|
|
||||||
|
To:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
const bg_height: i32 = if (paused) 150 else 120;
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Add zoom display after render line**
|
||||||
|
|
||||||
|
After line 72 (render_text draw), add:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
y += line_height;
|
||||||
|
|
||||||
|
// zoom level
|
||||||
|
const zoom_text = std.fmt.bufPrintZ(&buf, "zoom: {d:.1}x", .{zoom}) catch "?";
|
||||||
|
rl.drawTextEx(font, zoom_text, .{ .x = padding, .y = y }, font_size, 0, if (zoom > 1.0) highlight_color else text_color);
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Update call in sandbox_main.zig**
|
||||||
|
|
||||||
|
Change line 387:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
ui.drawMetrics(&entities, update_time_us, render_time_us, paused, ui_font);
|
||||||
|
```
|
||||||
|
|
||||||
|
To:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
ui.drawMetrics(&entities, update_time_us, render_time_us, paused, zoom, ui_font);
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 5: Build and test UI**
|
||||||
|
|
||||||
|
Run: `zig build && ./zig-out/bin/lofivor`
|
||||||
|
|
||||||
|
Test:
|
||||||
|
1. UI should show "zoom: 1.0x" in white
|
||||||
|
2. Scroll to zoom - should update and turn yellow when > 1x
|
||||||
|
3. Reset with Esc - back to white 1.0x
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 6: Update controls legend
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `src/ui.zig:120-139` (drawControls function)
|
||||||
|
|
||||||
|
**Step 1: Update controls list and box height**
|
||||||
|
|
||||||
|
Change line 121:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
const ctrl_box_height: i32 = @intFromFloat(small_line_height * 5 + box_padding * 2);
|
||||||
|
```
|
||||||
|
|
||||||
|
To:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
const ctrl_box_height: i32 = @intFromFloat(small_line_height * 7 + box_padding * 2);
|
||||||
|
```
|
||||||
|
|
||||||
|
Change the controls array (lines 127-133):
|
||||||
|
|
||||||
|
```zig
|
||||||
|
const controls = [_][]const u8{
|
||||||
|
"+/-: 10k entities",
|
||||||
|
"shift +/-: 50k",
|
||||||
|
"scroll: zoom",
|
||||||
|
"drag: pan (zoomed)",
|
||||||
|
"space: pause/reset",
|
||||||
|
"esc: reset zoom",
|
||||||
|
"tab: toggle ui",
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Build and final test**
|
||||||
|
|
||||||
|
Run: `zig build && ./zig-out/bin/lofivor`
|
||||||
|
|
||||||
|
Full test:
|
||||||
|
1. Scroll wheel zooms toward cursor (1x-10x)
|
||||||
|
2. Any mouse drag pans when zoomed > 1x
|
||||||
|
3. Pan is bounded to simulation area
|
||||||
|
4. Esc resets zoom/pan
|
||||||
|
5. Space toggles pause AND resets zoom/pan
|
||||||
|
6. UI shows zoom level (yellow when zoomed)
|
||||||
|
7. Controls legend shows new controls
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 7: Commit
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add src/sandbox_main.zig src/ssbo_renderer.zig src/shaders/entity.vert src/ui.zig
|
||||||
|
git commit -m "feat: add zoom/pan camera
|
||||||
|
|
||||||
|
- scroll wheel zooms toward cursor (1x-10x range)
|
||||||
|
- any mouse button drag pans when zoomed
|
||||||
|
- pan bounded to simulation area
|
||||||
|
- esc/space resets to default view
|
||||||
|
- zoom level shown in metrics panel"
|
||||||
|
```
|
||||||
170
docs/plans/2025-12-17-compute-shader-updates.md
Normal file
170
docs/plans/2025-12-17-compute-shader-updates.md
Normal file
|
|
@ -0,0 +1,170 @@
|
||||||
|
# compute shader entity updates
|
||||||
|
|
||||||
|
move entity position math to GPU, eliminate CPU→GPU sync per frame.
|
||||||
|
|
||||||
|
## context
|
||||||
|
|
||||||
|
current bottleneck: per-frame `rlUpdateShaderBuffer()` uploads all entity data from CPU to GPU. at 950k entities that's 19MB/frame. targeting 10M entities would be 160MB/frame.
|
||||||
|
|
||||||
|
solution: keep entity data on GPU entirely. compute shader updates positions, vertex shader renders. CPU just dispatches.
|
||||||
|
|
||||||
|
## data structures
|
||||||
|
|
||||||
|
**GpuEntity (16 bytes, std430):**
|
||||||
|
```glsl
|
||||||
|
struct Entity {
|
||||||
|
float x; // world position
|
||||||
|
float y;
|
||||||
|
int packedVel; // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
|
||||||
|
uint color; // 0xRRGGBB
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**zig side:**
|
||||||
|
```zig
|
||||||
|
const GpuEntity = extern struct {
|
||||||
|
x: f32,
|
||||||
|
y: f32,
|
||||||
|
packed_vel: i32,
|
||||||
|
color: u32,
|
||||||
|
};
|
||||||
|
|
||||||
|
fn packVelocity(vx: f32, vy: f32) i32 {
|
||||||
|
const vx_fixed: i16 = @intFromFloat(vx * 256.0);
|
||||||
|
const vy_fixed: i16 = @intFromFloat(vy * 256.0);
|
||||||
|
return (@as(i32, vx_fixed) << 16) | (@as(i32, vy_fixed) & 0xFFFF);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## compute shader
|
||||||
|
|
||||||
|
`src/shaders/entity_update.comp`:
|
||||||
|
```glsl
|
||||||
|
#version 430
|
||||||
|
layout(local_size_x = 256) in;
|
||||||
|
|
||||||
|
layout(std430, binding = 0) buffer Entities {
|
||||||
|
Entity entities[];
|
||||||
|
};
|
||||||
|
|
||||||
|
uniform uint entityCount;
|
||||||
|
uniform uint frameNumber;
|
||||||
|
uniform vec2 screenSize;
|
||||||
|
uniform vec2 center;
|
||||||
|
uniform float respawnRadius;
|
||||||
|
|
||||||
|
void main() {
|
||||||
|
uint id = gl_GlobalInvocationID.x;
|
||||||
|
if (id >= entityCount) return;
|
||||||
|
|
||||||
|
Entity e = entities[id];
|
||||||
|
|
||||||
|
// unpack velocity
|
||||||
|
float vx = float(e.packedVel >> 16) / 256.0;
|
||||||
|
float vy = float((e.packedVel << 16) >> 16) / 256.0;
|
||||||
|
|
||||||
|
// update position
|
||||||
|
e.x += vx;
|
||||||
|
e.y += vy;
|
||||||
|
|
||||||
|
// respawn check
|
||||||
|
float dx = e.x - center.x;
|
||||||
|
float dy = e.y - center.y;
|
||||||
|
if (dx*dx + dy*dy < respawnRadius * respawnRadius) {
|
||||||
|
// GPU RNG
|
||||||
|
uint seed = id * 1103515245u + frameNumber * 12345u;
|
||||||
|
seed = seed * 747796405u + 2891336453u;
|
||||||
|
|
||||||
|
uint edge = seed & 3u;
|
||||||
|
float t = float((seed >> 2) & 0xFFFFu) / 65535.0;
|
||||||
|
|
||||||
|
// spawn on edge with velocity toward center
|
||||||
|
// (full edge logic in implementation)
|
||||||
|
}
|
||||||
|
|
||||||
|
entities[id] = e;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## integration
|
||||||
|
|
||||||
|
raylib doesn't wrap compute shaders. use raw GL calls via `compute.zig`:
|
||||||
|
|
||||||
|
```zig
|
||||||
|
pub fn dispatch(entity_count: u32, frame: u32) void {
|
||||||
|
gl.glUseProgram(program);
|
||||||
|
gl.glUniform1ui(entity_count_loc, entity_count);
|
||||||
|
gl.glUniform1ui(frame_loc, frame);
|
||||||
|
// ... other uniforms
|
||||||
|
|
||||||
|
const groups = (entity_count + 255) / 256;
|
||||||
|
gl.glDispatchCompute(groups, 1, 1);
|
||||||
|
gl.glMemoryBarrier(gl.GL_SHADER_STORAGE_BARRIER_BIT);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## frame flow
|
||||||
|
|
||||||
|
**before:**
|
||||||
|
```
|
||||||
|
CPU: update positions (5ms at 950k)
|
||||||
|
CPU: copy to gpu_buffer
|
||||||
|
CPU→GPU: rlUpdateShaderBuffer() ← bottleneck
|
||||||
|
GPU: render
|
||||||
|
```
|
||||||
|
|
||||||
|
**after:**
|
||||||
|
```
|
||||||
|
GPU: compute dispatch (~0ms CPU time)
|
||||||
|
GPU: memory barrier
|
||||||
|
GPU: render
|
||||||
|
```
|
||||||
|
|
||||||
|
## implementation steps
|
||||||
|
|
||||||
|
each step is a commit point if desired.
|
||||||
|
|
||||||
|
### step 1: GpuEntity struct expansion
|
||||||
|
- modify `GpuEntity` in sandbox.zig: add `packed_vel` field
|
||||||
|
- add `packVelocity()` helper
|
||||||
|
- update ssbo_renderer to handle 16-byte stride
|
||||||
|
- verify existing rendering still works
|
||||||
|
|
||||||
|
### step 2: compute shader infrastructure
|
||||||
|
- create `src/compute.zig` with GL bindings
|
||||||
|
- create `src/shaders/entity_update.comp` (position update only, no respawn yet)
|
||||||
|
- load and compile compute shader in sandbox_main.zig
|
||||||
|
- dispatch before render, verify positions update
|
||||||
|
|
||||||
|
### step 3: respawn logic
|
||||||
|
- add GPU RNG to compute shader
|
||||||
|
- implement edge spawning + velocity calculation
|
||||||
|
- remove CPU update loop from sandbox.zig
|
||||||
|
|
||||||
|
### step 4: cleanup ✓
|
||||||
|
- `--compute` is now default, `--cpu` flag for fallback/comparison
|
||||||
|
- justfile updated: `just bench` (compute), `just bench-cpu` (comparison)
|
||||||
|
- verbose debug output reduced
|
||||||
|
|
||||||
|
## files changed
|
||||||
|
|
||||||
|
**new:**
|
||||||
|
- `src/shaders/entity_update.comp`
|
||||||
|
- `src/compute.zig`
|
||||||
|
|
||||||
|
**modified:**
|
||||||
|
- `src/sandbox.zig` — GpuEntity struct, packVelocity(), remove CPU update
|
||||||
|
- `src/ssbo_renderer.zig` — remove per-frame upload
|
||||||
|
- `src/sandbox_main.zig` — init compute, dispatch in frame loop
|
||||||
|
|
||||||
|
## risks
|
||||||
|
|
||||||
|
1. **driver quirks** — intel HD 530 compute support is fine but older, may hit edge cases
|
||||||
|
2. **debugging** — GPU code harder to debug, start with small counts
|
||||||
|
3. **fallback** — keep a flag to A/B test against the existing CPU-update SSBO path (`--cpu`, now that `--compute` is the default; see step 4)
|
||||||
|
|
||||||
|
## expected results
|
||||||
|
|
||||||
|
- CPU update time: ~5ms → ~0ms
|
||||||
|
- no per-frame buffer upload
|
||||||
|
- target: 1M+ entities, pushing toward 10M ceiling
|
||||||
89
docs/point_sprites_experiment.md
Normal file
89
docs/point_sprites_experiment.md
Normal file
|
|
@ -0,0 +1,89 @@
|
||||||
|
# point sprites experiment
|
||||||
|
|
||||||
|
branch: `point-sprites` (point-sprites work)
|
||||||
|
date: 2025-12
|
||||||
|
hardware: intel hd 530 (skylake gt2, i5-6500T)
|
||||||
|
|
||||||
|
## hypothesis
|
||||||
|
|
||||||
|
point sprites should be faster than quads because:
|
||||||
|
- 1 vertex per entity instead of 6 (quad = 2 triangles)
|
||||||
|
- less vertex throughput
|
||||||
|
- `gl_PointCoord` provides texture coords automatically
|
||||||
|
|
||||||
|
## implementation
|
||||||
|
|
||||||
|
### vertex shader changes
|
||||||
|
- removed quad vertex attributes (position, texcoord)
|
||||||
|
- use `gl_PointSize = 16.0 * zoom` for size control
|
||||||
|
- position calculated from SSBO data only
|
||||||
|
|
||||||
|
### fragment shader changes
|
||||||
|
- use `gl_PointCoord` instead of vertex texcoord
|
||||||
|
- sample circle texture for alpha
|
||||||
|
|
||||||
|
### renderer changes
|
||||||
|
- load `glEnable` and `glDrawArraysInstanced` via `rlGetProcAddress`
|
||||||
|
- enable `GL_PROGRAM_POINT_SIZE`
|
||||||
|
- draw with `glDrawArraysInstanced(GL_POINTS, 0, 1, count)`
|
||||||
|
- removed VBO (no vertex data needed)
|
||||||
|
|
||||||
|
## results
|
||||||
|
|
||||||
|
### attempt 1: procedural circle in fragment shader
|
||||||
|
|
||||||
|
```glsl
|
||||||
|
vec2 coord = gl_PointCoord - vec2(0.5);
|
||||||
|
float dist = length(coord);
|
||||||
|
float alpha = 1.0 - smoothstep(0.4, 0.5, dist);
|
||||||
|
if (alpha < 0.01) discard;
|
||||||
|
```
|
||||||
|
|
||||||
|
**benchmark @ 350k entities:**
|
||||||
|
- point sprites: 23ms render, 43fps
|
||||||
|
- quads (main): 6.2ms render, 151fps
|
||||||
|
- **result: 3.7x SLOWER**
|
||||||
|
|
||||||
|
**why:** `discard` breaks early-z optimization, `length()` and `smoothstep()` are ALU-heavy, intel integrated GPUs are weak at fragment shader math.
|
||||||
|
|
||||||
|
### attempt 2: texture sampling
|
||||||
|
|
||||||
|
```glsl
|
||||||
|
float alpha = texture(circleTexture, gl_PointCoord).r;
|
||||||
|
finalColor = vec4(fragColor, alpha);
|
||||||
|
```
|
||||||
|
|
||||||
|
**benchmark @ 450k entities:**
|
||||||
|
- point sprites: 19.1ms render, 52fps
|
||||||
|
- quads (main): 8.0ms render, 122fps
|
||||||
|
- **result: 2.4x SLOWER**
|
||||||
|
|
||||||
|
better than procedural, but still significantly slower than quads.
|
||||||
|
|
||||||
|
## analysis
|
||||||
|
|
||||||
|
the theoretical advantage (1/6 vertices) doesn't translate to real performance because:
|
||||||
|
|
||||||
|
1. **triangle path is more optimized** - intel's driver heavily optimizes the standard triangle rasterization path. point sprites use a less-traveled code path.
|
||||||
|
|
||||||
|
2. **fill rate is the bottleneck** - HD 530 has only 3 ROPs. we're bound by how fast we can write pixels, not by vertex count. reducing vertices from 6 to 1 doesn't help when fill rate is the constraint.
|
||||||
|
|
||||||
|
3. **point size overhead** - each point requires computing `gl_PointSize` and setting up the point sprite rasterization, which may have per-vertex overhead.
|
||||||
|
|
||||||
|
4. **texture cache behavior** - `gl_PointCoord` may have worse cache locality than explicit vertex texcoords.
|
||||||
|
|
||||||
|
## conclusion
|
||||||
|
|
||||||
|
**point sprites are a regression on intel hd 530.**
|
||||||
|
|
||||||
|
the optimization makes theoretical sense but fails in practice on this hardware. the quad/triangle path is simply more optimized in intel's mesa driver.
|
||||||
|
|
||||||
|
**keep this branch for testing on discrete GPUs** where point sprites might actually help (nvidia/amd have different optimization priorities).
|
||||||
|
|
||||||
|
## lessons learned
|
||||||
|
|
||||||
|
1. always benchmark, don't assume
|
||||||
|
2. "fewer vertices" doesn't always mean faster
|
||||||
|
3. integrated GPU optimization is different from discrete
|
||||||
|
4. the most optimized path is usually the most common path (triangles)
|
||||||
|
5. fill rate matters more than vertex count at high entity counts
|
||||||
201
docs/rops.txt
Normal file
201
docs/rops.txt
Normal file
|
|
@ -0,0 +1,201 @@
|
||||||
|
rops: render output units
|
||||||
|
=========================
|
||||||
|
|
||||||
|
what they are, where they came from, and what yours can do.
|
||||||
|
|
||||||
|
|
||||||
|
what is a rop?
|
||||||
|
--------------
|
||||||
|
|
||||||
|
ROP = Render Output Unit (originally "Raster Operations Pipeline")
|
||||||
|
|
||||||
|
it's the final stage of the GPU pipeline. after all the fancy shader
|
||||||
|
math is done, the ROP is the unit that actually writes pixels to memory.
|
||||||
|
|
||||||
|
think of it as the bottleneck between "calculated" and "visible."
|
||||||
|
|
||||||
|
a ROP does:
|
||||||
|
- depth testing (is this pixel in front of what's already there?)
|
||||||
|
- stencil testing (mask operations)
|
||||||
|
- blending (alpha, additive, etc)
|
||||||
|
- anti-aliasing resolve
|
||||||
|
- writing the final color to the framebuffer
|
||||||
|
|
||||||
|
one ROP can write one pixel per clock cycle (roughly).
|
||||||
|
|
||||||
|
|
||||||
|
the first rop
|
||||||
|
-------------
|
||||||
|
|
||||||
|
the term comes from the IBM 8514/A (1987), which had dedicated hardware
|
||||||
|
for "raster operations" - bitwise operations on pixels (AND, OR, XOR).
|
||||||
|
this was revolutionary because before this, the CPU did all pixel math.
|
||||||
|
|
||||||
|
but the modern ROP as we know it emerged with:
|
||||||
|
|
||||||
|
NVIDIA NV1 (1995)
|
||||||
|
one of the first chips with dedicated pixel output hardware
|
||||||
|
could do ~1 million textured pixels/second
|
||||||
|
|
||||||
|
3dfx Voodoo (1996)
|
||||||
|
the card that defined the modern GPU pipeline
|
||||||
|
had 1 TMU + 1 pixel pipeline (essentially 1 ROP)
|
||||||
|
could push 45 million pixels/second
|
||||||
|
that ONE pipeline ran Quake at 640x480
|
||||||
|
|
||||||
|
NVIDIA GeForce 256 (1999)
|
||||||
|
"the first GPU" - named itself with that term
|
||||||
|
4 pixel pipelines = 4 ROPs
|
||||||
|
480 million pixels/second
|
||||||
|
|
||||||
|
so the original consumer 3D cards had... 1 ROP. and they ran Quake.
|
||||||
|
|
||||||
|
|
||||||
|
what one rop can do
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
let's do the math.
|
||||||
|
|
||||||
|
one ROP at 100 MHz (3dfx Voodoo era):
|
||||||
|
100 million cycles/second
|
||||||
|
~1 pixel per cycle
|
||||||
|
= 100 megapixels/second
|
||||||
|
|
||||||
|
at 640x480 @ 60fps:
|
||||||
|
640 * 480 * 60 = 18.4 megapixels/second needed
|
||||||
|
|
||||||
|
so ONE ROP at 100MHz could handle 640x480 with ~5x headroom for overdraw.
|
||||||
|
|
||||||
|
at 1024x768 @ 60fps:
|
||||||
|
1024 * 768 * 60 = 47 megapixels/second
|
||||||
|
|
||||||
|
now you're at 2x overdraw max. still playable, but tight.
|
||||||
|
|
||||||
|
|
||||||
|
one modern rop
|
||||||
|
--------------
|
||||||
|
|
||||||
|
a single modern ROP runs at ~1-2 GHz and can do more per cycle:
|
||||||
|
- multiple color outputs (MRT)
|
||||||
|
- 64-bit or 128-bit color formats
|
||||||
|
- compressed writes
|
||||||
|
|
||||||
|
rough estimate for one ROP at 1.5 GHz:
|
||||||
|
~1.5 billion pixels/second base throughput
|
||||||
|
|
||||||
|
at 1920x1080 @ 60fps:
|
||||||
|
1920 * 1080 * 60 = 124 megapixels/second
|
||||||
|
|
||||||
|
one ROP could handle 1080p with 12x overdraw headroom.
|
||||||
|
|
||||||
|
at 4K @ 60fps:
|
||||||
|
3840 * 2160 * 60 = 497 megapixels/second
|
||||||
|
|
||||||
|
one ROP could handle 4K with 3x overdraw. tight, but possible.
|
||||||
|
|
||||||
|
|
||||||
|
your three rops (intel hd 530)
|
||||||
|
------------------------------
|
||||||
|
|
||||||
|
HD 530 specs:
|
||||||
|
- 3 ROPs
|
||||||
|
- ~950 MHz boost clock
|
||||||
|
- theoretical: 2.85 GPixels/second
|
||||||
|
|
||||||
|
let's break that down:
|
||||||
|
|
||||||
|
at 1080p @ 60fps (124 MP/s needed):
|
||||||
|
2850 / 124 = 23x overdraw budget
|
||||||
|
|
||||||
|
that's actually generous! you could draw each pixel 23 times.
|
||||||
|
|
||||||
|
so why does lofivor struggle at 1M entities?
|
||||||
|
|
||||||
|
because 1M entities at 4x4 pixels = 16M pixels minimum.
|
||||||
|
but with overlap? let's say average 10x overdraw:
|
||||||
|
160M pixels/frame
|
||||||
|
at 60fps = 9.6 billion pixels/second
|
||||||
|
|
||||||
|
your ceiling is 2.85 billion.
|
||||||
|
|
||||||
|
so you're 3.4x over budget. that's why you top out around 300k-400k
|
||||||
|
before frame drops (which matches empirical testing).
|
||||||
|
|
||||||
|
|
||||||
|
the real constraint
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
ROPs don't work in isolation. they're limited by:
|
||||||
|
|
||||||
|
1. MEMORY BANDWIDTH
|
||||||
|
each pixel write = memory access
|
||||||
|
HD 530 shares DDR4 with CPU (~30 GB/s)
|
||||||
|
at 32-bit color: 30GB/s / 4 bytes = 7.5 billion pixels/second max
|
||||||
|
but you're competing with CPU, texture reads, etc.
|
||||||
|
realistic: maybe 2-3 billion pixels for framebuffer writes
|
||||||
|
|
||||||
|
2. TEXTURE SAMPLING
|
||||||
|
if fragment shader samples textures, TMUs must keep up
|
||||||
|
HD 530 has 24 TMUs, so this isn't the bottleneck
|
||||||
|
|
||||||
|
3. SHADER EXECUTION
|
||||||
|
ROPs wait for fragments to be shaded
|
||||||
|
if shaders are slow, ROPs starve
|
||||||
|
lofivor's shaders are trivial, so this isn't the bottleneck
|
||||||
|
|
||||||
|
for lofivor specifically: your 3 ROPs are THE ceiling.
|
||||||
|
|
||||||
|
|
||||||
|
what could you do with more rops?
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
comparison:
|
||||||
|
|
||||||
|
Intel HD 530: 3 ROPs, 2.85 GPixels/s
|
||||||
|
GTX 1060: 48 ROPs, 72 GPixels/s
|
||||||
|
RTX 3080: 96 ROPs, 164 GPixels/s
|
||||||
|
RTX 4090: 176 ROPs, 443 GPixels/s
|
||||||
|
|
||||||
|
with a GTX 1060 (25x your fill rate):
|
||||||
|
lofivor could probably hit 5-10 million entities
|
||||||
|
|
||||||
|
with an RTX 4090 (155x your fill rate):
|
||||||
|
tens of millions, limited by other factors
|
||||||
|
|
||||||
|
|
||||||
|
perspective: what 3 rops means historically
|
||||||
|
-------------------------------------------
|
||||||
|
|
||||||
|
your HD 530 has roughly the fill rate of:
|
||||||
|
- GeForce 4 Ti 4600 (2002): 4 ROPs, 1.2 GPixels/s
|
||||||
|
- Radeon 9700 Pro (2002): 8 ROPs, 2.6 GPixels/s
|
||||||
|
|
||||||
|
you're running hardware that, in raw pixel output, matches GPUs from
|
||||||
|
20+ years ago. but with modern features (compute shaders, SSBO, etc).
|
||||||
|
|
||||||
|
this is why lofivor is interesting: you're achieving 700k+ entities
|
||||||
|
on fill-rate-equivalent hardware that originally ran games with
|
||||||
|
maybe 10,000 triangles on screen.
|
||||||
|
|
||||||
|
the difference is technique. those 2002 games did complex per-pixel
|
||||||
|
lighting, shadows, multiple texture passes. lofivor does one texture
|
||||||
|
sample and one blend. same fill rate, 100x the entities.
|
||||||
|
|
||||||
|
|
||||||
|
the lesson
|
||||||
|
----------
|
||||||
|
|
||||||
|
ROPs are simple: they write pixels.
|
||||||
|
|
||||||
|
the number you have determines your pixel budget.
|
||||||
|
everything else (shaders, vertices, CPU logic) only matters if
|
||||||
|
the ROPs aren't your bottleneck.
|
||||||
|
|
||||||
|
with 3 ROPs, you have roughly 2.85 billion pixels/second.
|
||||||
|
spend them wisely:
|
||||||
|
- cull what's offscreen (don't spend pixels on invisible things)
|
||||||
|
- shrink distant objects (LOD saves pixels)
|
||||||
|
- reduce overlap (spatial organization)
|
||||||
|
- keep shaders simple (don't starve the ROPs)
|
||||||
|
|
||||||
|
your 3 ROPs can do remarkable things. Quake ran on 1.
|
||||||
316
docs/why-millions-is-hard.txt
Normal file
316
docs/why-millions-is-hard.txt
Normal file
|
|
@ -0,0 +1,316 @@
|
||||||
|
why rendering millions of entities is hard
|
||||||
|
=========================================
|
||||||
|
|
||||||
|
and what "hard" actually means, from first principles.
|
||||||
|
|
||||||
|
|
||||||
|
the simple answer
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
every frame, your computer does work. work takes time. you have 16.7
|
||||||
|
milliseconds to do all the work before the next frame (at 60fps).
|
||||||
|
|
||||||
|
if the work takes longer than 16.7ms, you miss the deadline. frames drop.
|
||||||
|
the game stutters.
|
||||||
|
|
||||||
|
10 million entities means 10 million units of work. whether that fits in
|
||||||
|
16.7ms depends on how much work each unit is.
|
||||||
|
|
||||||
|
|
||||||
|
what is "work" anyway?
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
let's trace what happens when you draw one entity:
|
||||||
|
|
||||||
|
1. CPU: "here's an entity at position (340, 512), color cyan"
|
||||||
|
2. that data travels over a bus to the GPU
|
||||||
|
3. GPU: receives the data, stores it in memory
|
||||||
|
4. GPU: runs a vertex shader (figures out where on screen)
|
||||||
|
5. GPU: runs a fragment shader (figures out what color each pixel is)
|
||||||
|
6. GPU: writes pixels to the framebuffer
|
||||||
|
7. framebuffer gets sent to your monitor
|
||||||
|
|
||||||
|
each step has a speed limit. the slowest step is your bottleneck.
|
||||||
|
|
||||||
|
|
||||||
|
the bottlenecks, explained simply
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
MEMORY BANDWIDTH
|
||||||
|
how fast data can move around. measured in GB/s.
|
||||||
|
|
||||||
|
think of it like a highway. you can have a fast car (processor), but
|
||||||
|
if the highway is jammed, you're stuck in traffic.
|
||||||
|
|
||||||
|
an integrated GPU (like Intel HD 530) shares the highway with the CPU.
|
||||||
|
a discrete GPU (like an RTX card) has its own private highway.
|
||||||
|
|
||||||
|
this is why lofivor's SSBO optimization helped so much: shrinking
|
||||||
|
entity data from 64 bytes to 12 bytes means 5x less traffic.
|
||||||
|
|
||||||
|
DRAW CALLS
|
||||||
|
every time you say "GPU, draw this thing", there's overhead.
|
||||||
|
the CPU and GPU have to synchronize, state gets set up, etc.
|
||||||
|
|
||||||
|
1 draw call for 1 million entities: fast
|
||||||
|
1 million draw calls for 1 million entities: slow
|
||||||
|
|
||||||
|
this is why batching matters. not the drawing itself, but the
|
||||||
|
*coordination* of drawing.
|
||||||
|
|
||||||
|
FILL RATE
|
||||||
|
how many pixels the GPU can color per second.
|
||||||
|
|
||||||
|
a 4x4 pixel entity = 16 pixels
|
||||||
|
1 million entities = 16 million pixels minimum
|
||||||
|
|
||||||
|
but your screen is only ~2 million pixels (1920x1080). so entities
|
||||||
|
overlap. "overdraw" means coloring the same pixel multiple times.
|
||||||
|
|
||||||
|
10 million overlapping entities might touch each pixel 50+ times.
|
||||||
|
that's 100 million pixel operations.
|
||||||
|
|
||||||
|
SHADER COMPLEXITY
|
||||||
|
the GPU runs a tiny program for each vertex and each pixel.
|
||||||
|
|
||||||
|
simple: "put it here, color it this" = fast
|
||||||
|
complex: "calculate lighting from 8 sources, sample 4 textures,
|
||||||
|
apply normal mapping, do fresnel..." = slow
|
||||||
|
|
||||||
|
lofivor's shaders are trivial. AAA game shaders are not.
|
||||||
|
|
||||||
|
CPU-GPU SYNCHRONIZATION
|
||||||
|
the CPU and GPU work in parallel, but sometimes they have to wait
|
||||||
|
for each other.
|
||||||
|
|
||||||
|
if the CPU needs to read GPU results, it stalls.
|
||||||
|
if the GPU needs new data and the CPU is busy, it stalls.
|
||||||
|
|
||||||
|
good code keeps them both busy without waiting.
|
||||||
|
|
||||||
|
|
||||||
|
why "real games" hit CPU walls
|
||||||
|
------------------------------
|
||||||
|
|
||||||
|
rendering is just putting colors on pixels. that's the GPU's job.
|
||||||
|
|
||||||
|
but games aren't just rendering. they're also:
|
||||||
|
|
||||||
|
- COLLISION DETECTION
|
||||||
|
does entity A overlap entity B?
|
||||||
|
|
||||||
|
naive approach: check every pair
|
||||||
|
1,000 entities = 500,000 checks (n squared / 2)
|
||||||
|
10,000 entities = 50,000,000 checks
|
||||||
|
1,000,000 entities = 500,000,000,000 checks
|
||||||
|
|
||||||
|
that's 500 billion. per frame. not happening.
|
||||||
|
|
||||||
|
smart approach: spatial partitioning (grids, quadtrees)
|
||||||
|
only check nearby entities. but still, at millions of entities,
|
||||||
|
even "nearby" is a lot.
|
||||||
|
|
||||||
|
- AI / BEHAVIOR
|
||||||
|
each entity decides what to do.
|
||||||
|
|
||||||
|
simple: move toward player. cheap.
|
||||||
|
complex: pathfind around obstacles, consider threats, coordinate
|
||||||
|
with allies, remember state. expensive.
|
||||||
|
|
||||||
|
lofivor entities just drift in a direction. no decisions.
|
||||||
|
a real game enemy makes decisions every frame.
|
||||||
|
|
||||||
|
- PHYSICS
|
||||||
|
entities push each other, bounce, have mass and friction.
|
||||||
|
every interaction is math. lots of entities = lots of math.
|
||||||
|
|
||||||
|
- GAME LOGIC
|
||||||
|
damage calculations, spawning, leveling, cooldowns, buffs...
|
||||||
|
all of this runs on the CPU, every frame.
|
||||||
|
|
||||||
|
so: lofivor can render 700k entities because they don't DO anything.
|
||||||
|
a game with 700k entities that think, collide, and interact would
|
||||||
|
need god-tier optimization or would simply not run.
|
||||||
|
|
||||||
|
|
||||||
|
what makes AAA games slow on old hardware?
|
||||||
|
------------------------------------------
|
||||||
|
|
||||||
|
it's not entity count. most AAA games have maybe hundreds of
|
||||||
|
"entities" on screen. it's everything else:
|
||||||
|
|
||||||
|
TEXTURE RESOLUTION
|
||||||
|
a 4K texture (4096x4096) is ~16.7 million pixels, or ~67 MB of uncompressed RGBA data. per texture.
|
||||||
|
one character might have 10+ textures (diffuse, normal, specular,
|
||||||
|
roughness, ambient occlusion...).
|
||||||
|
|
||||||
|
old hardware: less VRAM, slower texture sampling.
|
||||||
|
|
||||||
|
SHADER COMPLEXITY
|
||||||
|
modern materials simulate light physics. subsurface scattering,
|
||||||
|
global illumination, ray-traced reflections.
|
||||||
|
|
||||||
|
each pixel might do hundreds of math operations.
|
||||||
|
|
||||||
|
POST-PROCESSING
|
||||||
|
bloom, motion blur, depth of field, ambient occlusion, anti-aliasing.
|
||||||
|
full-screen passes that touch every pixel multiple times.
|
||||||
|
|
||||||
|
MESH COMPLEXITY
|
||||||
|
a character might be 100,000 triangles.
|
||||||
|
10 characters = 1 million triangles.
|
||||||
|
each triangle goes through the vertex shader.
|
||||||
|
|
||||||
|
SHADOWS
|
||||||
|
render the scene again from the light's perspective.
|
||||||
|
for each light. every frame.
|
||||||
|
|
||||||
|
AAA games are doing 100x more work per pixel than lofivor.
|
||||||
|
lofivor is doing 100x more pixels than AAA games.
|
||||||
|
|
||||||
|
different problems.
|
||||||
|
|
||||||
|
|
||||||
|
the "abuse" vs "respect" distinction
|
||||||
|
------------------------------------
|
||||||
|
|
||||||
|
abuse: making the hardware do unnecessary work.
|
||||||
|
respect: achieving your goal with minimal waste.
|
||||||
|
|
||||||
|
examples of abuse (that lofivor fixed):
|
||||||
|
|
||||||
|
- sending 64 bytes (a full matrix) when you need 12 bytes (x, y, color)
|
||||||
|
- one draw call per entity when you could batch
|
||||||
|
- calculating transforms on CPU when GPU could do it
|
||||||
|
- clearing the screen twice
|
||||||
|
- uploading the same data every frame
|
||||||
|
|
||||||
|
examples of abuse in the wild:
|
||||||
|
|
||||||
|
- electron apps using a whole browser to show a chat window
|
||||||
|
- games that re-render static UI every frame
|
||||||
|
- loading 4K textures for objects that appear 20 pixels tall
|
||||||
|
- running AI pathfinding for off-screen entities
|
||||||
|
|
||||||
|
the hardware has limits. respecting them means fitting your game
|
||||||
|
within those limits through smart decisions. abusing them means
|
||||||
|
throwing cycles at problems you created yourself.
|
||||||
|
|
||||||
|
|
||||||
|
so can you do 1 million entities with juice on old hardware?
|
||||||
|
------------------------------------------------------------
|
||||||
|
|
||||||
|
yes, with the right decisions.
|
||||||
|
|
||||||
|
what "juice" typically means:
|
||||||
|
- screen shake (free, just offset the camera)
|
||||||
|
- particle effects (separate system, heavily optimized)
|
||||||
|
- flash/hit feedback (change a color value)
|
||||||
|
- sound (different system entirely)
|
||||||
|
|
||||||
|
particles are special: they're designed for millions of tiny things.
|
||||||
|
they don't collide, don't think, often don't even persist (spawn,
|
||||||
|
drift, fade, die). GPU particle systems are essentially what lofivor
|
||||||
|
became: minimal data, instanced rendering.
|
||||||
|
|
||||||
|
what would kill you at 1 million:
|
||||||
|
- per-entity collision
|
||||||
|
- per-entity AI
|
||||||
|
- per-entity sprite variety (texture switches)
|
||||||
|
- per-entity complex shaders
|
||||||
|
|
||||||
|
what you could do:
|
||||||
|
- 1 million particles (visual only, no logic)
|
||||||
|
- 10,000 enemies with collision/AI + 990,000 particles
|
||||||
|
- 100,000 enemies with simple behavior + spatial hash collision
|
||||||
|
|
||||||
|
the secret: most of what looks like "millions of things" in games
|
||||||
|
is actually a small number of meaningful entities + a large number
|
||||||
|
of dumb particles.
|
||||||
|
|
||||||
|
|
||||||
|
the laws of physics (sort of)
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
there are hard limits:
|
||||||
|
|
||||||
|
MEMORY BUS BANDWIDTH
|
||||||
|
a DDR4 system might move 25 GB/s.
|
||||||
|
1 million entities at 12 bytes each = 12 MB.
|
||||||
|
at 60fps = 720 MB/s just for entity data.
|
||||||
|
that's only 3% of bandwidth. plenty of room.
|
||||||
|
|
||||||
|
but a naive approach (64 bytes, plus overhead) could be
|
||||||
|
10x worse. suddenly you're at 30%.
|
||||||
|
|
||||||
|
CLOCK CYCLES
|
||||||
|
a 3GHz CPU does 3 billion operations per second.
|
||||||
|
at 60fps, that's 50 million operations per frame.
|
||||||
|
1 million entities = 50 operations each.
|
||||||
|
|
||||||
|
50 operations is: a few multiplies, some loads/stores, a branch.
|
||||||
|
that's barely enough for "move in a direction".
|
||||||
|
pathfinding? AI? collision? not a chance.
|
||||||
|
|
||||||
|
PARALLELISM
|
||||||
|
GPUs have thousands of cores but they're simple.
|
||||||
|
CPUs have few cores but they're smart.
|
||||||
|
|
||||||
|
entity rendering: perfectly parallel (GPU wins)
|
||||||
|
entity decision-making: often sequential (CPU bound)
|
||||||
|
|
||||||
|
so yes, physics constrains us. but "physics" here means:
|
||||||
|
- how fast electrons move through silicon
|
||||||
|
- how much data fits on a wire
|
||||||
|
- how many transistors fit on a chip
|
||||||
|
|
||||||
|
within those limits, there's room. lots of room, if you're clever.
|
||||||
|
lofivor went from 5k to 700k by being clever, not by breaking physics.
|
||||||
|
|
||||||
|
|
||||||
|
the actual lesson
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
the limit isn't really "the hardware can't do it."
|
||||||
|
|
||||||
|
the limit is "the hardware can't do it THE WAY YOU'RE DOING IT."
|
||||||
|
|
||||||
|
every optimization in lofivor was finding a different way:
|
||||||
|
- don't draw circles, blit textures
|
||||||
|
- don't call functions, submit vertices directly
|
||||||
|
- don't send matrices, send packed structs
|
||||||
|
- don't update on CPU, use compute shaders
|
||||||
|
|
||||||
|
the hardware was always capable of 700k. the code wasn't asking right.
|
||||||
|
|
||||||
|
this is true at every level. that old laptop struggling with 10k
|
||||||
|
entities in some game? probably not the laptop's fault. probably
|
||||||
|
the game is doing something wasteful that doesn't need to be.
|
||||||
|
|
||||||
|
"runs poorly on old hardware" often means "we didn't try to make
|
||||||
|
it run on old hardware" not "it's impossible on old hardware."
|
||||||
|
|
||||||
|
|
||||||
|
closing thought
|
||||||
|
---------------
|
||||||
|
|
||||||
|
10 million is a lot. but 1 million? 2 million?
|
||||||
|
|
||||||
|
with discipline: yes.
|
||||||
|
with decisions that respect the hardware: yes.
|
||||||
|
with awareness of what's actually expensive: yes.
|
||||||
|
|
||||||
|
the knowledge of what's expensive is the key.
|
||||||
|
|
||||||
|
most developers don't have it. they use high-level abstractions
|
||||||
|
that hide the cost. they've never seen a frame budget or a
|
||||||
|
bandwidth calculation.
|
||||||
|
|
||||||
|
lofivor is a learning tool. the journey from 5k to 700k teaches
|
||||||
|
where the costs are. once you see them, you can't unsee them.
|
||||||
|
|
||||||
|
you start asking: "what is this actually doing? what does it cost?
|
||||||
|
is there a cheaper way?"
|
||||||
|
|
||||||
|
that's the skill. not the specific techniques—those change with
|
||||||
|
hardware. the skill is asking the questions.
|
||||||
35
journal.txt
35
journal.txt
|
|
@ -206,3 +206,38 @@ total improvement from baseline:
|
||||||
- SSBO: 60fps @ ~700k entities
|
- SSBO: 60fps @ ~700k entities
|
||||||
- ~140x improvement overall!
|
- ~140x improvement overall!
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
optimization 6: compute shader updates
|
||||||
|
--------------------------------------
|
||||||
|
technique: move entity position + respawn logic from CPU to GPU compute shader
|
||||||
|
code: compute.zig, shaders/entity_update.comp, ssbo_renderer.zig
|
||||||
|
version: 0.7.0
|
||||||
|
|
||||||
|
struct GpuEntity {
|
||||||
|
x: f32, // 4 bytes
|
||||||
|
y: f32, // 4 bytes
|
||||||
|
packed_vel: i32, // 4 bytes (vx/vy in fixed-point 8.8)
|
||||||
|
color: u32, // 4 bytes
|
||||||
|
}; // = 16 bytes total (was 12)
|
||||||
|
|
||||||
|
changes:
|
||||||
|
- entity_update.comp: position update, center check, edge respawn, velocity calc
|
||||||
|
- GPU RNG: PCG-style PRNG seeded with entity id + frame number
|
||||||
|
- ssbo_renderer: renderComputeMode() only uploads NEW entities (when count grows)
|
||||||
|
- CPU update loop skipped entirely when compute enabled
|
||||||
|
|
||||||
|
benchmark results (i5-6500T / HD 530):
|
||||||
|
- update time: ~5ms → ~0ms at 150k entities
|
||||||
|
- render time unchanged (GPU-bound as before)
|
||||||
|
- total frame time improvement at high entity counts
|
||||||
|
|
||||||
|
analysis: CPU was doing ~150k position updates + distance checks + respawn logic
|
||||||
|
per frame. now GPU does it in parallel via 256-thread workgroups. CPU only uploads
|
||||||
|
new entities when user adds them, not per-frame. memory barrier ensures compute
|
||||||
|
writes visible to vertex shader before draw.
|
||||||
|
|
||||||
|
flags:
|
||||||
|
- --compute: GPU compute updates (now default)
|
||||||
|
- --cpu: fallback to CPU update path for comparison
|
||||||
|
|
||||||
|
|
|
||||||
16
justfile
16
justfile
|
|
@ -42,11 +42,20 @@ check:
|
||||||
test:
|
test:
|
||||||
zig build test
|
zig build test
|
||||||
|
|
||||||
# auto-benchmark (ramps entities until performance degrades, works on linux/windows)
|
# run sandbox (GPU compute is default)
|
||||||
|
sandbox:
|
||||||
|
zig build -Doptimize=ReleaseFast run
|
||||||
|
|
||||||
|
# auto-benchmark (ramps entities until performance degrades)
|
||||||
bench:
|
bench:
|
||||||
zig build -Doptimize=ReleaseFast run -- --bench
|
zig build -Doptimize=ReleaseFast run -- --bench
|
||||||
cat benchmark.log
|
cat benchmark.log
|
||||||
|
|
||||||
|
# benchmark with CPU update path (for comparison)
|
||||||
|
bench-cpu:
|
||||||
|
zig build -Doptimize=ReleaseFast run -- --bench --cpu
|
||||||
|
cat benchmark.log
|
||||||
|
|
||||||
# software-rendered benchmark (for CI/headless servers)
|
# software-rendered benchmark (for CI/headless servers)
|
||||||
[linux]
|
[linux]
|
||||||
bench-sw:
|
bench-sw:
|
||||||
|
|
@ -58,3 +67,8 @@ bench-sw:
|
||||||
bench-sw:
|
bench-sw:
|
||||||
@echo "bench-sw: windows doesn't have xvfb equivalent"
|
@echo "bench-sw: windows doesn't have xvfb equivalent"
|
||||||
@echo "use 'just bench' if you have a GPU, or run in WSL/linux CI"
|
@echo "use 'just bench' if you have a GPU, or run in WSL/linux CI"
|
||||||
|
|
||||||
|
[linux]
|
||||||
|
profile port="9876":
|
||||||
|
# start Tracy: tracy-profiler -a 127.0.0.1 -p {{port}}
|
||||||
|
zig build -Dtracy=true -Doptimize=ReleaseFast && TRACY_PORT={{port}} ./zig-out/bin/sandbox
|
||||||
|
|
|
||||||
8
releases/0.1.0-unoptimized.txt
Normal file
8
releases/0.1.0-unoptimized.txt
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
the baseline: one draw call per entity, pure and simple
|
||||||
|
|
||||||
|
- individual rl.drawCircle() calls in a loop
|
||||||
|
- ~5k entities at 60fps before frame times tank
|
||||||
|
- linear scaling: 10k = ~43ms, 20k = ~77ms
|
||||||
|
- render-bound (update loop stays under 1ms even at 30k)
|
||||||
|
- each circle is its own GPU draw call
|
||||||
|
- the starting point for optimization experiments
|
||||||
8
releases/0.2.0-texture_blitting.txt
Normal file
8
releases/0.2.0-texture_blitting.txt
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
pre-render once, blit many: 10x improvement
|
||||||
|
|
||||||
|
- render circle to 16x16 texture at startup
|
||||||
|
- drawTexture() per entity instead of drawCircle()
|
||||||
|
- raylib batches same-texture draws internally
|
||||||
|
- ~50k entities at 60fps
|
||||||
|
- simple change, big win
|
||||||
|
- still one function call per entity, but GPU work is batched
|
||||||
9
releases/0.3.0-quad_batching.txt
Normal file
9
releases/0.3.0-quad_batching.txt
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
bypass the wrapper, go straight to rlgl: 2x more
|
||||||
|
|
||||||
|
- skip drawTexture(), submit vertices directly via rl.gl
|
||||||
|
- manually build quads: rlTexCoord2f + rlVertex2f per corner
|
||||||
|
- rlBegin/rlEnd wraps the whole entity loop
|
||||||
|
- ~100k entities at 60fps
|
||||||
|
- eliminates per-call function overhead
|
||||||
|
- vertices go straight to GPU buffer
|
||||||
|
- 20x improvement over baseline
|
||||||
11
releases/0.3.1-batch_buffer.txt
Normal file
11
releases/0.3.1-batch_buffer.txt
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
bigger buffer, fewer flushes: squeezing out more headroom
|
||||||
|
|
||||||
|
- increased raylib batch buffer from 8192 to 32768 vertices
|
||||||
|
- ~140k entities at 60fps on i5-6500T
|
||||||
|
- ~40% improvement over default buffer
|
||||||
|
- fewer GPU flushes per frame
|
||||||
|
- also added: release workflows for github and forgejo
|
||||||
|
- added OPTIMIZATIONS.md documenting the journey
|
||||||
|
- added README, UI panel with FPS display
|
||||||
|
- heap allocated entity array to support 1 million entities
|
||||||
|
- per-entity RGB colors
|
||||||
13
releases/0.4.0-gpu_instancing.txt
Normal file
13
releases/0.4.0-gpu_instancing.txt
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
gpu instancing: a disappointing discovery
|
||||||
|
|
||||||
|
- drawMeshInstanced() with per-entity transform matrices
|
||||||
|
- ~150k entities at 60fps - barely better than rlgl batching
|
||||||
|
- negligible improvement on integrated graphics
|
||||||
|
- why it didn't help:
|
||||||
|
- integrated GPU shares system RAM (no PCIe transfer savings)
|
||||||
|
- 64-byte matrix per entity vs ~80 bytes for rlgl vertices
|
||||||
|
- bottleneck is memory bandwidth, not draw call overhead
|
||||||
|
- rlgl batching already minimizes draw calls effectively
|
||||||
|
- orthographic camera setup for 2D-like rendering
|
||||||
|
- heap-allocated transforms buffer (64MB too big for stack)
|
||||||
|
- lesson learned: not all "advanced" techniques are wins
|
||||||
17
releases/0.5.0-ssbo_instancing.txt
Normal file
17
releases/0.5.0-ssbo_instancing.txt
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
ssbo breakthrough: 5x gain by shrinking the data
|
||||||
|
|
||||||
|
- pack entity data (x, y, color) into 12-byte struct
|
||||||
|
- upload via shader storage buffer object (SSBO)
|
||||||
|
- ~700k entities at 60fps (i5-6500T / HD 530)
|
||||||
|
- ~950k entities at ~57fps
|
||||||
|
- 5x improvement over previous best
|
||||||
|
- 140x total from baseline
|
||||||
|
- why it works:
|
||||||
|
- 12 bytes vs 64 bytes (matrices) = 5.3x less bandwidth
|
||||||
|
- 12 bytes vs 80 bytes (rlgl vertices) = 6.7x less bandwidth
|
||||||
|
- no CPU-side matrix calculations
|
||||||
|
- GPU does NDC conversion and color unpacking
|
||||||
|
- custom vertex/fragment shaders
|
||||||
|
- single rlDrawVertexArrayInstanced() call for all entities
|
||||||
|
- shaders embedded at build time
|
||||||
|
- removed FPS cap, added optional vsync arg
|
||||||
5
releases/0.5.1-windows_build.txt
Normal file
5
releases/0.5.1-windows_build.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
cross-platform release: adding windows to the party
|
||||||
|
|
||||||
|
- updated github release workflow
|
||||||
|
- builds for both linux and windows now
|
||||||
|
- no code changes, just CI/CD work
|
||||||
10
releases/0.6.0-zoom_zoom.txt
Normal file
10
releases/0.6.0-zoom_zoom.txt
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
zoom and pan: making millions of entities explorable
|
||||||
|
|
||||||
|
- mouse wheel zoom
|
||||||
|
- click and drag panning
|
||||||
|
- orthographic camera transforms
|
||||||
|
- memory panel showing entity buffer sizes
|
||||||
|
- background draws immediately (no flicker)
|
||||||
|
- tab key toggles UI panels
|
||||||
|
- explained "lofivor" name in README (lo-fi survivor)
|
||||||
|
- shader updated for zoom/pan transforms
|
||||||
5
releases/0.6.1-q_to_quit.txt
Normal file
5
releases/0.6.1-q_to_quit.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
quick exit: zoom out then quit
|
||||||
|
|
||||||
|
- q key first zooms out, second press quits
|
||||||
|
- nice way to see the full entity field before closing
|
||||||
|
- minor UI text fix
|
||||||
11
releases/0.7.0-compute_shader.txt
Normal file
11
releases/0.7.0-compute_shader.txt
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
compute shader: moving physics to the GPU
|
||||||
|
|
||||||
|
- entity position updates now run on GPU via compute shader
|
||||||
|
- GPU-based RNG for entity velocity randomization
|
||||||
|
- full simulation loop stays on GPU, no CPU roundtrip
|
||||||
|
- new compute.zig module for shader management
|
||||||
|
- GpuEntity struct with position, velocity, and color
|
||||||
|
- tracy profiling integration
|
||||||
|
- FPS display turns green (good) or red (bad)
|
||||||
|
- added design docs for zoom/pan and compute shader work
|
||||||
|
- cross-platform alignment fixes for shader data
|
||||||
111
src/compute.zig
Normal file
111
src/compute.zig
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
// compute shader module for GPU entity updates
|
||||||
|
// wraps raw GL calls that raylib doesn't expose directly
|
||||||
|
|
||||||
|
const std = @import("std");
|
||||||
|
const rl = @import("raylib");
|
||||||
|
const sandbox = @import("sandbox.zig");
|
||||||
|
|
||||||
|
const comp_source = @embedFile("shaders/entity_update.comp");
|
||||||
|
|
||||||
|
// GL constants not exposed by raylib-zig
|
||||||
|
const GL_SHADER_STORAGE_BARRIER_BIT: u32 = 0x00002000;
|
||||||
|
|
||||||
|
// function pointer type for glMemoryBarrier
|
||||||
|
const GlMemoryBarrierFn = *const fn (barriers: u32) callconv(.c) void;
|
||||||
|
|
||||||
|
/// GPU compute pipeline for entity updates.
///
/// Holds the linked compute program built from the embedded
/// `shaders/entity_update.comp` source, the uniform locations looked up at
/// init time, and a dynamically resolved `glMemoryBarrier` entry point.
pub const ComputeShader = struct {
    // GL program handle returned by rlLoadComputeShaderProgram
    program_id: u32,
    // uniform locations, resolved once in init()
    entity_count_loc: i32,
    frame_number_loc: i32,
    screen_size_loc: i32,
    center_loc: i32,
    respawn_radius_loc: i32,
    entity_speed_loc: i32,
    // glMemoryBarrier, resolved at runtime through rlGetProcAddress
    glMemoryBarrier: GlMemoryBarrierFn,

    /// Resolves glMemoryBarrier, compiles and links the compute shader, and
    /// caches its uniform locations.
    ///
    /// Returns null (after printing a diagnostic) when glMemoryBarrier cannot
    /// be resolved, when the shader fails to compile, or when the program
    /// fails to link.
    pub fn init() ?ComputeShader {
        // load glMemoryBarrier dynamically
        const barrier_ptr = rl.gl.rlGetProcAddress("glMemoryBarrier");

        // A missing entry point must take the same "return null" path as the
        // other failures instead of being cast to a non-optional function
        // pointer. @intFromPtr accepts optional and non-optional pointers, so
        // this check compiles regardless of how the binding declares the
        // return type.
        if (@intFromPtr(barrier_ptr) == 0) {
            std.debug.print("compute: glMemoryBarrier not available\n", .{});
            return null;
        }
        const glMemoryBarrier: GlMemoryBarrierFn = @ptrCast(@alignCast(barrier_ptr));

        // compile compute shader
        const shader_id = rl.gl.rlCompileShader(comp_source, rl.gl.rl_compute_shader);
        if (shader_id == 0) {
            std.debug.print("compute: failed to compile compute shader\n", .{});
            return null;
        }

        // link compute program
        const program_id = rl.gl.rlLoadComputeShaderProgram(shader_id);
        if (program_id == 0) {
            std.debug.print("compute: failed to link compute program\n", .{});
            return null;
        }

        // get uniform locations
        const entity_count_loc = rl.gl.rlGetLocationUniform(program_id, "entityCount");
        const frame_number_loc = rl.gl.rlGetLocationUniform(program_id, "frameNumber");
        const screen_size_loc = rl.gl.rlGetLocationUniform(program_id, "screenSize");
        const center_loc = rl.gl.rlGetLocationUniform(program_id, "center");
        const respawn_radius_loc = rl.gl.rlGetLocationUniform(program_id, "respawnRadius");
        const entity_speed_loc = rl.gl.rlGetLocationUniform(program_id, "entitySpeed");

        std.debug.print("compute: shader loaded\n", .{});

        return .{
            .program_id = program_id,
            .entity_count_loc = entity_count_loc,
            .frame_number_loc = frame_number_loc,
            .screen_size_loc = screen_size_loc,
            .center_loc = center_loc,
            .respawn_radius_loc = respawn_radius_loc,
            .entity_speed_loc = entity_speed_loc,
            .glMemoryBarrier = glMemoryBarrier,
        };
    }

    /// Unloads the compute program.
    pub fn deinit(self: *ComputeShader) void {
        rl.gl.rlUnloadShaderProgram(self.program_id);
    }

    /// Runs one compute pass over the entity buffer.
    ///
    /// - `ssbo_id`: buffer bound to shader-storage binding point 0 for the pass.
    /// - `entity_count`: number of entities to process; 0 is a no-op.
    /// - `frame_number`: forwarded to the shader's `frameNumber` uniform.
    ///
    /// After the dispatch a shader-storage memory barrier is issued and
    /// binding point 0 is unbound.
    pub fn dispatch(self: *ComputeShader, ssbo_id: u32, entity_count: u32, frame_number: u32) void {
        if (entity_count == 0) return;

        // constants from sandbox.zig
        const screen_w: f32 = @floatFromInt(sandbox.SCREEN_WIDTH);
        const screen_h: f32 = @floatFromInt(sandbox.SCREEN_HEIGHT);
        const center_x: f32 = screen_w / 2.0;
        const center_y: f32 = screen_h / 2.0;
        // NOTE(review): hard-coded here; presumably these mirror the
        // sandbox.zig constants named in the trailing comments - confirm.
        const respawn_radius: f32 = 10.0; // RESPAWN_THRESHOLD
        const entity_speed: f32 = 2.0; // ENTITY_SPEED

        // bind compute shader
        rl.gl.rlEnableShader(self.program_id);

        // set uniforms
        rl.gl.rlSetUniform(self.entity_count_loc, &entity_count, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_uint), 1);
        rl.gl.rlSetUniform(self.frame_number_loc, &frame_number, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_uint), 1);

        const screen_size = [2]f32{ screen_w, screen_h };
        rl.gl.rlSetUniform(self.screen_size_loc, &screen_size, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);

        const center = [2]f32{ center_x, center_y };
        rl.gl.rlSetUniform(self.center_loc, &center, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);

        rl.gl.rlSetUniform(self.respawn_radius_loc, &respawn_radius, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
        rl.gl.rlSetUniform(self.entity_speed_loc, &entity_speed, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);

        // bind SSBO to binding point 0
        rl.gl.rlBindShaderBuffer(ssbo_id, 0);

        // dispatch compute workgroups: ceil(entity_count / 256)
        // (assumes the shader declares a local size of 256 - confirm in entity_update.comp)
        const groups = (entity_count + 255) / 256;
        rl.gl.rlComputeShaderDispatch(groups, 1, 1);

        // memory barrier - ensure compute writes are visible to vertex shader
        self.glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

        // unbind
        rl.gl.rlBindShaderBuffer(0, 0);
    }
};
|
||||||
|
|
@ -287,34 +287,69 @@ test "update respawns entity at edge when reaching center" {
|
||||||
try std.testing.expect(on_left or on_right or on_top or on_bottom);
|
try std.testing.expect(on_left or on_right or on_top or on_bottom);
|
||||||
}
|
}
|
||||||
|
|
||||||
// GPU entity for SSBO rendering (position + color only, no velocity)
|
// GPU entity for SSBO rendering (16 bytes, matches compute shader layout)
|
||||||
pub const GpuEntity = extern struct {
|
pub const GpuEntity = extern struct {
|
||||||
x: f32,
|
x: f32,
|
||||||
y: f32,
|
y: f32,
|
||||||
|
packed_vel: i32, // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
|
||||||
color: u32,
|
color: u32,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Encode a 2D velocity as one i32 for the GPU entity layout.
// Each component is scaled by 256 (8.8 fixed point), clamped to the i16
// range, then stored with vx in the upper 16 bits and vy in the lower 16 bits.
pub fn packVelocity(vx: f32, vy: f32) i32 {
    const scale: f32 = 256.0;
    const min_fixed: f32 = -32768.0;
    const max_fixed: f32 = 32767.0;

    const x_bits: i16 = @intFromFloat(std.math.clamp(vx * scale, min_fixed, max_fixed));
    const y_bits: i16 = @intFromFloat(std.math.clamp(vy * scale, min_fixed, max_fixed));

    // sign-extend each half to i32, then place vx above the masked vy
    const upper: i32 = @as(i32, x_bits) << 16;
    const lower: i32 = @as(i32, y_bits) & 0xFFFF;
    return upper | lower;
}
|
||||||
|
|
||||||
test "GpuEntity struct has correct size for SSBO" {
|
test "GpuEntity struct has correct size for SSBO" {
|
||||||
// SSBO layout: x(4) + y(4) + color(4) = 12 bytes
|
// SSBO layout: x(4) + y(4) + packed_vel(4) + color(4) = 16 bytes
|
||||||
try std.testing.expectEqual(@as(usize, 12), @sizeOf(GpuEntity));
|
try std.testing.expectEqual(@as(usize, 16), @sizeOf(GpuEntity));
|
||||||
}
|
}
|
||||||
|
|
||||||
test "GpuEntity can be created from Entity" {
|
test "GpuEntity can be created from Entity" {
|
||||||
const entity = Entity{
|
const entity = Entity{
|
||||||
.x = 100.0,
|
.x = 100.0,
|
||||||
.y = 200.0,
|
.y = 200.0,
|
||||||
.vx = 1.5, // ignored for GPU
|
.vx = 1.5,
|
||||||
.vy = -0.5, // ignored for GPU
|
.vy = -0.5,
|
||||||
.color = 0x00FFFF,
|
.color = 0x00FFFF,
|
||||||
};
|
};
|
||||||
|
|
||||||
const gpu_entity = GpuEntity{
|
const gpu_entity = GpuEntity{
|
||||||
.x = entity.x,
|
.x = entity.x,
|
||||||
.y = entity.y,
|
.y = entity.y,
|
||||||
|
.packed_vel = packVelocity(entity.vx, entity.vy),
|
||||||
.color = entity.color,
|
.color = entity.color,
|
||||||
};
|
};
|
||||||
|
|
||||||
try std.testing.expectEqual(@as(f32, 100.0), gpu_entity.x);
|
try std.testing.expectEqual(@as(f32, 100.0), gpu_entity.x);
|
||||||
try std.testing.expectEqual(@as(f32, 200.0), gpu_entity.y);
|
try std.testing.expectEqual(@as(f32, 200.0), gpu_entity.y);
|
||||||
try std.testing.expectEqual(@as(u32, 0x00FFFF), gpu_entity.color);
|
try std.testing.expectEqual(@as(u32, 0x00FFFF), gpu_entity.color);
|
||||||
|
|
||||||
|
// unpack and verify velocity (should round-trip within precision)
|
||||||
|
const vx_unpacked = @as(f32, @floatFromInt(@as(i16, @truncate(gpu_entity.packed_vel >> 16)))) / 256.0;
|
||||||
|
const vy_unpacked = @as(f32, @floatFromInt(@as(i16, @truncate(gpu_entity.packed_vel)))) / 256.0;
|
||||||
|
try std.testing.expectApproxEqAbs(@as(f32, 1.5), vx_unpacked, 0.004);
|
||||||
|
try std.testing.expectApproxEqAbs(@as(f32, -0.5), vy_unpacked, 0.004);
|
||||||
|
}
|
||||||
|
|
||||||
|
test "packVelocity round-trips correctly" {
    // 8.8 fixed point has a step of 1/256 (~0.0039), so allow one step of error
    const tolerance: f32 = 0.004;

    // one all-positive and one all-negative velocity pair
    const samples = [_][2]f32{
        .{ 2.0, 1.5 },
        .{ -1.0, -2.5 },
    };

    for (samples) |vel| {
        const bits = packVelocity(vel[0], vel[1]);

        // vx lives in the high 16 bits, vy in the low 16 bits
        const hi: i16 = @truncate(bits >> 16);
        const lo: i16 = @truncate(bits);
        const vx = @as(f32, @floatFromInt(hi)) / 256.0;
        const vy = @as(f32, @floatFromInt(lo)) / 256.0;

        try std.testing.expectApproxEqAbs(vel[0], vx, tolerance);
        try std.testing.expectApproxEqAbs(vel[1], vy, tolerance);
    }

    // zero velocity must pack to an all-zero word
    try std.testing.expectEqual(@as(i32, 0), packVelocity(0.0, 0.0));
}
|
||||||
|
|
|
||||||
|
|
@ -3,9 +3,11 @@
|
||||||
|
|
||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const rl = @import("raylib");
|
const rl = @import("raylib");
|
||||||
|
const ztracy = @import("ztracy");
|
||||||
const sandbox = @import("sandbox.zig");
|
const sandbox = @import("sandbox.zig");
|
||||||
const ui = @import("ui.zig");
|
const ui = @import("ui.zig");
|
||||||
const SsboRenderer = @import("ssbo_renderer.zig").SsboRenderer;
|
const SsboRenderer = @import("ssbo_renderer.zig").SsboRenderer;
|
||||||
|
const ComputeShader = @import("compute.zig").ComputeShader;
|
||||||
|
|
||||||
const SCREEN_WIDTH = sandbox.SCREEN_WIDTH;
|
const SCREEN_WIDTH = sandbox.SCREEN_WIDTH;
|
||||||
const SCREEN_HEIGHT = sandbox.SCREEN_HEIGHT;
|
const SCREEN_HEIGHT = sandbox.SCREEN_HEIGHT;
|
||||||
|
|
@ -31,6 +33,11 @@ const BENCH_RAMP_AMOUNT: usize = 50_000; // entities added per ramp
|
||||||
const BENCH_EXIT_THRESHOLD_MS: f32 = 25.0; // exit when frame time exceeds this
|
const BENCH_EXIT_THRESHOLD_MS: f32 = 25.0; // exit when frame time exceeds this
|
||||||
const BENCH_EXIT_SUSTAIN: f32 = 1.0; // must stay above threshold for this long
|
const BENCH_EXIT_SUSTAIN: f32 = 1.0; // must stay above threshold for this long
|
||||||
|
|
||||||
|
// zoom settings
|
||||||
|
const ZOOM_MIN: f32 = 1.0;
|
||||||
|
const ZOOM_MAX: f32 = 10.0;
|
||||||
|
const ZOOM_SPEED: f32 = 0.1; // multiplier per scroll tick
|
||||||
|
|
||||||
const BenchmarkLogger = struct {
|
const BenchmarkLogger = struct {
|
||||||
file: ?std.fs.File,
|
file: ?std.fs.File,
|
||||||
last_logged_frame_ms: f32,
|
last_logged_frame_ms: f32,
|
||||||
|
|
@ -157,6 +164,7 @@ pub fn main() !void {
|
||||||
var use_instancing = false;
|
var use_instancing = false;
|
||||||
var use_ssbo = true;
|
var use_ssbo = true;
|
||||||
var use_vsync = false;
|
var use_vsync = false;
|
||||||
|
var use_compute = true; // GPU compute is now default
|
||||||
var args = try std.process.argsWithAllocator(std.heap.page_allocator);
|
var args = try std.process.argsWithAllocator(std.heap.page_allocator);
|
||||||
defer args.deinit();
|
defer args.deinit();
|
||||||
_ = args.skip(); // skip program name
|
_ = args.skip(); // skip program name
|
||||||
|
|
@ -170,6 +178,8 @@ pub fn main() !void {
|
||||||
use_ssbo = false; // legacy rlgl batched path
|
use_ssbo = false; // legacy rlgl batched path
|
||||||
} else if (std.mem.eql(u8, arg, "--vsync")) {
|
} else if (std.mem.eql(u8, arg, "--vsync")) {
|
||||||
use_vsync = true;
|
use_vsync = true;
|
||||||
|
} else if (std.mem.eql(u8, arg, "--cpu")) {
|
||||||
|
use_compute = false; // fallback to CPU update path
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -179,6 +189,11 @@ pub fn main() !void {
|
||||||
rl.initWindow(@intCast(SCREEN_WIDTH), @intCast(SCREEN_HEIGHT), "lofivor sandbox");
|
rl.initWindow(@intCast(SCREEN_WIDTH), @intCast(SCREEN_HEIGHT), "lofivor sandbox");
|
||||||
defer rl.closeWindow();
|
defer rl.closeWindow();
|
||||||
|
|
||||||
|
// show background immediately (avoid black screen during init)
|
||||||
|
rl.beginDrawing();
|
||||||
|
rl.clearBackground(BG_COLOR);
|
||||||
|
rl.endDrawing();
|
||||||
|
|
||||||
// use larger batch buffer: 16384 elements vs default 8192
|
// use larger batch buffer: 16384 elements vs default 8192
|
||||||
// fewer flushes = less driver overhead per frame
|
// fewer flushes = less driver overhead per frame
|
||||||
const numElements: i32 = 8192 * 4; // quads = 4 verts
|
const numElements: i32 = 8192 * 4; // quads = 4 verts
|
||||||
|
|
@ -246,6 +261,26 @@ pub fn main() !void {
|
||||||
if (ssbo_renderer) |*r| r.deinit();
|
if (ssbo_renderer) |*r| r.deinit();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// compute shader setup (on by default; skipped when --cpu is passed)
|
||||||
|
var compute_shader: ?ComputeShader = null;
|
||||||
|
|
||||||
|
if (use_compute) {
|
||||||
|
if (!use_ssbo) {
|
||||||
|
std.debug.print("--compute requires SSBO mode (default), ignoring\n", .{});
|
||||||
|
} else {
|
||||||
|
compute_shader = ComputeShader.init();
|
||||||
|
if (compute_shader == null) {
|
||||||
|
std.debug.print("failed to initialize compute shader, falling back to CPU\n", .{});
|
||||||
|
} else {
|
||||||
|
std.debug.print("compute shader mode enabled\n", .{});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
defer {
|
||||||
|
if (compute_shader) |*c| c.deinit();
|
||||||
|
}
|
||||||
|
|
||||||
// load UI font (embedded)
|
// load UI font (embedded)
|
||||||
const font_data = @embedFile("verdanab.ttf");
|
const font_data = @embedFile("verdanab.ttf");
|
||||||
const ui_font = rl.loadFontFromMemory(".ttf", font_data, 32, null) catch {
|
const ui_font = rl.loadFontFromMemory(".ttf", font_data, 32, null) catch {
|
||||||
|
|
@ -259,6 +294,11 @@ pub fn main() !void {
|
||||||
var rng = prng.random();
|
var rng = prng.random();
|
||||||
|
|
||||||
var paused = false;
|
var paused = false;
|
||||||
|
|
||||||
|
// camera state for zoom/pan
|
||||||
|
var zoom: f32 = 1.0;
|
||||||
|
var pan = @Vector(2, f32){ 0, 0 };
|
||||||
|
|
||||||
var logger = BenchmarkLogger.init();
|
var logger = BenchmarkLogger.init();
|
||||||
defer logger.deinit();
|
defer logger.deinit();
|
||||||
|
|
||||||
|
|
@ -266,6 +306,7 @@ pub fn main() !void {
|
||||||
var update_time_us: i64 = 0;
|
var update_time_us: i64 = 0;
|
||||||
var render_time_us: i64 = 0;
|
var render_time_us: i64 = 0;
|
||||||
var elapsed: f32 = 0;
|
var elapsed: f32 = 0;
|
||||||
|
var frame_number: u32 = 0;
|
||||||
|
|
||||||
// auto-benchmark state
|
// auto-benchmark state
|
||||||
var last_ramp_time: f32 = 0;
|
var last_ramp_time: f32 = 0;
|
||||||
|
|
@ -311,24 +352,47 @@ pub fn main() !void {
|
||||||
} else {
|
} else {
|
||||||
// manual controls
|
// manual controls
|
||||||
handleInput(&entities, &rng, &paused);
|
handleInput(&entities, &rng, &paused);
|
||||||
|
if (handleCamera(&zoom, &pan)) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// update
|
// update
|
||||||
if (!paused) {
|
if (!paused) {
|
||||||
|
const tracy_update = ztracy.ZoneN(@src(), "update");
|
||||||
|
defer tracy_update.End();
|
||||||
const update_start = std.time.microTimestamp();
|
const update_start = std.time.microTimestamp();
|
||||||
sandbox.update(&entities, &rng);
|
|
||||||
|
if (compute_shader == null) {
|
||||||
|
// CPU update path (positions + respawn)
|
||||||
|
sandbox.update(&entities, &rng);
|
||||||
|
}
|
||||||
|
// GPU compute path handles update in render section before draw
|
||||||
|
|
||||||
update_time_us = std.time.microTimestamp() - update_start;
|
update_time_us = std.time.microTimestamp() - update_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
// render
|
// render
|
||||||
|
const tracy_render = ztracy.ZoneN(@src(), "render");
|
||||||
|
defer tracy_render.End();
|
||||||
const render_start = std.time.microTimestamp();
|
const render_start = std.time.microTimestamp();
|
||||||
|
|
||||||
rl.beginDrawing();
|
rl.beginDrawing();
|
||||||
rl.clearBackground(BG_COLOR);
|
rl.clearBackground(BG_COLOR);
|
||||||
|
|
||||||
if (use_ssbo) {
|
if (use_ssbo) {
|
||||||
// SSBO instanced rendering path (12 bytes per entity)
|
// dispatch compute shader before render (if enabled)
|
||||||
ssbo_renderer.?.render(&entities);
|
if (compute_shader) |*cs| {
|
||||||
|
if (!paused) {
|
||||||
|
const tracy_compute = ztracy.ZoneN(@src(), "compute_dispatch");
|
||||||
|
defer tracy_compute.End();
|
||||||
|
cs.dispatch(ssbo_renderer.?.ssbo_id, @intCast(entities.count), frame_number);
|
||||||
|
frame_number +%= 1;
|
||||||
|
}
|
||||||
|
// GPU compute mode - only upload new entities, positions updated on GPU
|
||||||
|
ssbo_renderer.?.renderComputeMode(&entities, zoom, pan);
|
||||||
|
} else {
|
||||||
|
// CPU mode - upload entity data to GPU
|
||||||
|
ssbo_renderer.?.render(&entities, zoom, pan);
|
||||||
|
}
|
||||||
} else if (use_instancing) {
|
} else if (use_instancing) {
|
||||||
// GPU instancing path (64 bytes per entity)
|
// GPU instancing path (64 bytes per entity)
|
||||||
const xforms = transforms.?;
|
const xforms = transforms.?;
|
||||||
|
|
@ -379,7 +443,8 @@ pub fn main() !void {
|
||||||
|
|
||||||
// metrics overlay (skip in bench mode for cleaner headless run)
|
// metrics overlay (skip in bench mode for cleaner headless run)
|
||||||
if (!bench_mode) {
|
if (!bench_mode) {
|
||||||
ui.drawMetrics(&entities, update_time_us, render_time_us, paused, ui_font);
|
ui.drawMetrics(&entities, update_time_us, render_time_us, paused, zoom, ui_font);
|
||||||
|
ui.drawMemory(entities.count, ui_font);
|
||||||
}
|
}
|
||||||
|
|
||||||
rl.endDrawing();
|
rl.endDrawing();
|
||||||
|
|
@ -390,6 +455,9 @@ pub fn main() !void {
|
||||||
const update_ms = @as(f32, @floatFromInt(update_time_us)) / 1000.0;
|
const update_ms = @as(f32, @floatFromInt(update_time_us)) / 1000.0;
|
||||||
const render_ms = @as(f32, @floatFromInt(render_time_us)) / 1000.0;
|
const render_ms = @as(f32, @floatFromInt(render_time_us)) / 1000.0;
|
||||||
logger.log(elapsed, entities.count, frame_ms, update_ms, render_ms);
|
logger.log(elapsed, entities.count, frame_ms, update_ms, render_ms);
|
||||||
|
|
||||||
|
// tracy frame mark
|
||||||
|
ztracy.FrameMark();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -444,4 +512,86 @@ fn handleInput(entities: *sandbox.Entities, rng: *std.Random, paused: *bool) voi
|
||||||
if (rl.isKeyPressed(.space)) {
|
if (rl.isKeyPressed(.space)) {
|
||||||
paused.* = !paused.*;
|
paused.* = !paused.*;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// toggle ui: tab
|
||||||
|
if (rl.isKeyPressed(.tab)) {
|
||||||
|
ui.show_ui = !ui.show_ui;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Processes one frame of camera input: scroll-wheel zoom, mouse-drag pan,
/// and the reset / quit keys.
///
/// `zoom` and `pan` are updated in place. Returns true only when `q` is
/// pressed while the view is not zoomed in (the caller breaks out of its
/// loop on true); returns false in every other case.
fn handleCamera(zoom: *f32, pan: *@Vector(2, f32)) bool {
    const origin = @Vector(2, f32){ 0, 0 };

    // --- scroll wheel: zoom about the cursor ---
    const scroll = rl.getMouseWheelMove();
    if (scroll != 0) {
        const cursor = rl.getMousePosition();
        const prev = zoom.*;

        // one tick multiplies (scroll up) or divides (scroll down) by 1 + ZOOM_SPEED
        const step = 1.0 + ZOOM_SPEED;
        const scaled = if (scroll > 0) prev * step else prev * (1.0 / step);
        const next = std.math.clamp(scaled, ZOOM_MIN, ZOOM_MAX);

        if (next != prev) {
            // Keep the point under the cursor fixed:
            //   anchor  = cursor / prev + pan
            //   new pan = anchor - cursor / next
            const anchor_x = (cursor.x / prev) + pan.*[0];
            const anchor_y = (cursor.y / prev) + pan.*[1];
            pan.*[0] = anchor_x - (cursor.x / next);
            pan.*[1] = anchor_y - (cursor.y / next);
            zoom.* = next;

            clampPan(pan, zoom.*);
        }
    }

    // --- drag with any mouse button: pan (only while zoomed in) ---
    if (zoom.* > 1.0) {
        const dragging = rl.isMouseButtonDown(.left) or
            rl.isMouseButtonDown(.right) or
            rl.isMouseButtonDown(.middle);
        if (dragging) {
            const moved = rl.getMouseDelta();
            // NOTE(review): x subtracts the drag delta while y adds it. Under the
            // convention used by the zoom branch above (screen = (world - pan) * zoom)
            // the two axes therefore respond in opposite senses. Whether that is
            // intended depends on how the vertex shader orients y -- confirm.
            pan.*[0] -= moved.x / zoom.*;
            pan.*[1] += moved.y / zoom.*;
            clampPan(pan, zoom.*);
        }
    }

    // --- Return / keypad Enter: reset the view ---
    if (rl.isKeyPressed(.enter) or rl.isKeyPressed(.kp_enter)) {
        zoom.* = 1.0;
        pan.* = origin;
    }

    // --- q: reset when zoomed in, otherwise ask the caller to quit ---
    if (rl.isKeyPressed(.q)) {
        if (!(zoom.* > 1.0)) return true;
        zoom.* = 1.0;
        pan.* = origin;
    }

    return false;
}
|
||||||
|
|
||||||
|
/// Restricts `pan` so that the visible window stays inside the
/// SCREEN_WIDTH x SCREEN_HEIGHT area.
///
/// At a given `zoom` the window covers (size / zoom) along each axis, so the
/// largest allowed offset is size - size / zoom, floored at zero.
fn clampPan(pan: *@Vector(2, f32), zoom: f32) void {
    const full_w: f32 = @floatFromInt(SCREEN_WIDTH);
    const full_h: f32 = @floatFromInt(SCREEN_HEIGHT);

    const limit_x = @max(0, full_w - (full_w / zoom));
    const limit_y = @max(0, full_h - (full_h / zoom));

    pan.*[0] = std.math.clamp(pan.*[0], 0, limit_x);
    pan.*[1] = std.math.clamp(pan.*[1], 0, limit_y);
}
|
||||||
|
|
|
||||||
|
|
@ -4,10 +4,11 @@
|
||||||
layout(location = 0) in vec2 position;
|
layout(location = 0) in vec2 position;
|
||||||
layout(location = 1) in vec2 texCoord;
|
layout(location = 1) in vec2 texCoord;
|
||||||
|
|
||||||
// entity data from SSBO
|
// entity data from SSBO (16 bytes, matches compute shader layout)
|
||||||
struct Entity {
|
struct Entity {
|
||||||
float x;
|
float x;
|
||||||
float y;
|
float y;
|
||||||
|
int packedVel; // vx high 16 bits, vy low 16 bits (fixed-point 8.8), unused in vertex shader
|
||||||
uint color;
|
uint color;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -17,6 +18,8 @@ layout(std430, binding = 0) readonly buffer EntityData {
|
||||||
|
|
||||||
// screen size for NDC conversion
|
// screen size for NDC conversion
|
||||||
uniform vec2 screenSize;
|
uniform vec2 screenSize;
|
||||||
|
uniform float zoom;
|
||||||
|
uniform vec2 pan;
|
||||||
|
|
||||||
out vec2 fragTexCoord;
|
out vec2 fragTexCoord;
|
||||||
out vec3 fragColor;
|
out vec3 fragColor;
|
||||||
|
|
@ -25,13 +28,13 @@ void main() {
|
||||||
// get entity data from SSBO
|
// get entity data from SSBO
|
||||||
Entity e = entities[gl_InstanceID];
|
Entity e = entities[gl_InstanceID];
|
||||||
|
|
||||||
// convert entity position to NDC
|
// apply pan offset and zoom to convert to NDC
|
||||||
// entity coords are in screen pixels, convert to [-1, 1]
|
// pan is in entity (world) pixels and is subtracted before zoom is applied; zoom scales the view
|
||||||
float ndcX = (e.x / screenSize.x) * 2.0 - 1.0;
|
float ndcX = ((e.x - pan.x) * zoom / screenSize.x) * 2.0 - 1.0;
|
||||||
float ndcY = (e.y / screenSize.y) * 2.0 - 1.0;
|
float ndcY = ((e.y - pan.y) * zoom / screenSize.y) * 2.0 - 1.0;
|
||||||
|
|
||||||
// quad size in NDC (16 pixels)
|
// quad size in NDC (16 pixels, scaled by zoom)
|
||||||
float quadSizeNdc = 16.0 / screenSize.x;
|
float quadSizeNdc = (16.0 * zoom) / screenSize.x;
|
||||||
|
|
||||||
// offset by quad corner position
|
// offset by quad corner position
|
||||||
gl_Position = vec4(ndcX + position.x * quadSizeNdc,
|
gl_Position = vec4(ndcX + position.x * quadSizeNdc,
|
||||||
|
|
|
||||||
97
src/shaders/entity_update.comp
Normal file
97
src/shaders/entity_update.comp
Normal file
|
|
@ -0,0 +1,97 @@
|
||||||
|
#version 430
|
||||||
|
|
||||||
|
layout(local_size_x = 256) in;
|
||||||
|
|
||||||
|
struct Entity {
|
||||||
|
float x;
|
||||||
|
float y;
|
||||||
|
int packedVel; // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
|
||||||
|
uint color;
|
||||||
|
};
|
||||||
|
|
||||||
|
layout(std430, binding = 0) buffer Entities {
|
||||||
|
Entity entities[];
|
||||||
|
};
|
||||||
|
|
||||||
|
uniform uint entityCount;
|
||||||
|
uniform uint frameNumber;
|
||||||
|
uniform vec2 screenSize;
|
||||||
|
uniform vec2 center;
|
||||||
|
uniform float respawnRadius;
|
||||||
|
uniform float entitySpeed;
|
||||||
|
|
||||||
|
// PCG-style GPU RNG step: advances `state` in place and returns a
// pseudo-random 32-bit uint (not a float; randFloat() does that conversion).
|
||||||
|
uint pcg(inout uint state) {
|
||||||
|
state = state * 747796405u + 2891336453u; // linear congruential step on the caller's state
|
||||||
|
uint word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u; // shift amount is taken from the top 4 bits of state
|
||||||
|
return (word >> 22u) ^ word; // final xorshift mix
|
||||||
|
}
|
||||||
|
|
||||||
|
// Uniform pseudo-random float in [0, 1); advances `state` via pcg().
//
// Only the top 24 bits of the pcg() output are used. Every 24-bit integer is
// exactly representable as a 32-bit float, so the largest result is
// (2^24 - 1) / 2^24, strictly below 1.0. Dividing the full 32-bit value by
// 4294967296.0 could return exactly 1.0, because uints close to 2^32 round
// up to 4294967296.0 when converted to float.
float randFloat(inout uint state) {
    return float(pcg(state) >> 8u) * (1.0 / 16777216.0);
}
|
||||||
|
|
||||||
|
// Packs a 2D velocity into one int as two signed 8.8 fixed-point values:
// vx in the high 16 bits, vy in the low 16 bits. Each component is scaled
// by 256 and clamped to the int16 range before conversion.
int packVelocity(float vx, float vy) {
    const float SCALE = 256.0;
    ivec2 q = ivec2(clamp(vec2(vx, vy) * SCALE, -32768.0, 32767.0));
    // mask the low half so a negative vy cannot overwrite the vx bits
    return (q.x << 16) | (q.y & 0xFFFF);
}
|
||||||
|
|
||||||
|
// Per-entity update. Each invocation moves one entity by its stored velocity.
// Once the entity is within respawnRadius of `center`, it is respawned on a
// random screen edge, aimed back at the center at entitySpeed, and given a
// new random color.
void main() {
    uint idx = gl_GlobalInvocationID.x;
    if (idx >= entityCount) {
        return; // invocations past the live entity count do nothing
    }

    Entity ent = entities[idx];

    // decode the 8.8 fixed-point velocity: vx in the high half, vy in the low half
    int velBits = ent.packedVel;
    float velX = float(velBits >> 16) / 256.0;
    float velY = float((velBits << 16) >> 16) / 256.0; // shift up then down to sign-extend

    // integrate position
    ent.x += velX;
    ent.y += velY;

    // compare squared distance to the center against the squared radius
    float offX = ent.x - center.x;
    float offY = ent.y - center.y;
    bool arrived = offX * offX + offY * offY < respawnRadius * respawnRadius;

    if (arrived) {
        // seed the RNG from the entity index and the frame number
        uint seed = idx * 1103515245u + frameNumber * 12345u + 1u;

        // RNG draw order matters: edge first, then position along the edge
        uint side = pcg(seed) & 3u;
        float along = randFloat(seed);

        switch (side) {
            case 0u: // top
                ent.x = along * screenSize.x;
                ent.y = 0.0;
                break;
            case 1u: // bottom
                ent.x = along * screenSize.x;
                ent.y = screenSize.y;
                break;
            case 2u: // left
                ent.x = 0.0;
                ent.y = along * screenSize.y;
                break;
            default: // right
                ent.x = screenSize.x;
                ent.y = along * screenSize.y;
                break;
        }

        // aim at the center with magnitude entitySpeed
        float toCenterX = center.x - ent.x;
        float toCenterY = center.y - ent.y;
        float span = sqrt(toCenterX * toCenterX + toCenterY * toCenterY);
        float newVelX = (toCenterX / span) * entitySpeed;
        float newVelY = (toCenterY / span) * entitySpeed;
        ent.packedVel = packVelocity(newVelX, newVelY);

        // three further draws give the color, packed as 0xRRGGBB
        uint red = pcg(seed) & 0xFFu;
        uint green = pcg(seed) & 0xFFu;
        uint blue = pcg(seed) & 0xFFu;
        ent.color = (red << 16u) | (green << 8u) | blue;
    }

    entities[idx] = ent;
}
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const rl = @import("raylib");
|
const rl = @import("raylib");
|
||||||
|
const ztracy = @import("ztracy");
|
||||||
const sandbox = @import("sandbox.zig");
|
const sandbox = @import("sandbox.zig");
|
||||||
|
|
||||||
const SCREEN_WIDTH = sandbox.SCREEN_WIDTH;
|
const SCREEN_WIDTH = sandbox.SCREEN_WIDTH;
|
||||||
|
|
@ -19,8 +20,11 @@ pub const SsboRenderer = struct {
|
||||||
ssbo_id: u32,
|
ssbo_id: u32,
|
||||||
screen_size_loc: i32,
|
screen_size_loc: i32,
|
||||||
circle_texture_loc: i32,
|
circle_texture_loc: i32,
|
||||||
|
zoom_loc: i32,
|
||||||
|
pan_loc: i32,
|
||||||
circle_texture_id: u32,
|
circle_texture_id: u32,
|
||||||
gpu_buffer: []sandbox.GpuEntity,
|
gpu_buffer: []sandbox.GpuEntity,
|
||||||
|
last_entity_count: usize, // track count to detect when entities are added
|
||||||
|
|
||||||
const QUAD_SIZE: f32 = 16.0;
|
const QUAD_SIZE: f32 = 16.0;
|
||||||
|
|
||||||
|
|
@ -53,6 +57,8 @@ pub const SsboRenderer = struct {
|
||||||
// get uniform locations
|
// get uniform locations
|
||||||
const screen_size_loc = rl.gl.rlGetLocationUniform(shader_id, "screenSize");
|
const screen_size_loc = rl.gl.rlGetLocationUniform(shader_id, "screenSize");
|
||||||
const circle_texture_loc = rl.gl.rlGetLocationUniform(shader_id, "circleTexture");
|
const circle_texture_loc = rl.gl.rlGetLocationUniform(shader_id, "circleTexture");
|
||||||
|
const zoom_loc = rl.gl.rlGetLocationUniform(shader_id, "zoom");
|
||||||
|
const pan_loc = rl.gl.rlGetLocationUniform(shader_id, "pan");
|
||||||
|
|
||||||
if (screen_size_loc < 0) {
|
if (screen_size_loc < 0) {
|
||||||
std.debug.print("ssbo: warning - screenSize uniform not found\n", .{});
|
std.debug.print("ssbo: warning - screenSize uniform not found\n", .{});
|
||||||
|
|
@ -94,7 +100,7 @@ pub const SsboRenderer = struct {
|
||||||
rl.gl.rlSetVertexAttribute(1, 2, rl.gl.rl_float, false, 4 * @sizeOf(f32), 2 * @sizeOf(f32));
|
rl.gl.rlSetVertexAttribute(1, 2, rl.gl.rl_float, false, 4 * @sizeOf(f32), 2 * @sizeOf(f32));
|
||||||
rl.gl.rlEnableVertexAttribute(1);
|
rl.gl.rlEnableVertexAttribute(1);
|
||||||
|
|
||||||
// create SSBO for entity data (12 bytes per entity, 1M entities = 12MB)
|
// create SSBO for entity data (16 bytes per entity, 1M entities = 16MB)
|
||||||
const ssbo_size: u32 = @intCast(sandbox.MAX_ENTITIES * @sizeOf(sandbox.GpuEntity));
|
const ssbo_size: u32 = @intCast(sandbox.MAX_ENTITIES * @sizeOf(sandbox.GpuEntity));
|
||||||
const ssbo_id = rl.gl.rlLoadShaderBuffer(ssbo_size, null, rl.gl.rl_dynamic_draw);
|
const ssbo_id = rl.gl.rlLoadShaderBuffer(ssbo_size, null, rl.gl.rl_dynamic_draw);
|
||||||
if (ssbo_id == 0) {
|
if (ssbo_id == 0) {
|
||||||
|
|
@ -116,8 +122,11 @@ pub const SsboRenderer = struct {
|
||||||
.ssbo_id = ssbo_id,
|
.ssbo_id = ssbo_id,
|
||||||
.screen_size_loc = screen_size_loc,
|
.screen_size_loc = screen_size_loc,
|
||||||
.circle_texture_loc = circle_texture_loc,
|
.circle_texture_loc = circle_texture_loc,
|
||||||
|
.zoom_loc = zoom_loc,
|
||||||
|
.pan_loc = pan_loc,
|
||||||
.circle_texture_id = circle_texture.id,
|
.circle_texture_id = circle_texture.id,
|
||||||
.gpu_buffer = gpu_buffer,
|
.gpu_buffer = gpu_buffer,
|
||||||
|
.last_entity_count = 0,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -129,25 +138,80 @@ pub const SsboRenderer = struct {
|
||||||
std.heap.page_allocator.free(self.gpu_buffer);
|
std.heap.page_allocator.free(self.gpu_buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn render(self: *SsboRenderer, entities: *const sandbox.Entities) void {
|
pub fn render(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32)) void {
|
||||||
|
self.renderInternal(entities, zoom, pan, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn renderComputeMode(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32)) void {
|
||||||
if (entities.count == 0) return;
|
if (entities.count == 0) return;
|
||||||
|
|
||||||
// flush raylib's internal render batch before our custom GL calls
|
// flush raylib's internal render batch before our custom GL calls
|
||||||
rl.gl.rlDrawRenderBatchActive();
|
rl.gl.rlDrawRenderBatchActive();
|
||||||
|
|
||||||
// copy entity data to GPU buffer (position + color only)
|
// upload NEW entities when count increases (entities added on CPU)
|
||||||
for (entities.items[0..entities.count], 0..) |entity, i| {
|
if (entities.count > self.last_entity_count) {
|
||||||
self.gpu_buffer[i] = .{
|
const zone = ztracy.ZoneN(@src(), "ssbo_upload_new");
|
||||||
.x = entity.x,
|
defer zone.End();
|
||||||
.y = entity.y,
|
|
||||||
.color = entity.color,
|
// copy new entities to GPU buffer
|
||||||
};
|
for (entities.items[self.last_entity_count..entities.count], self.last_entity_count..) |entity, i| {
|
||||||
|
self.gpu_buffer[i] = .{
|
||||||
|
.x = entity.x,
|
||||||
|
.y = entity.y,
|
||||||
|
.packed_vel = sandbox.packVelocity(entity.vx, entity.vy),
|
||||||
|
.color = entity.color,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// upload only the new portion to SSBO
|
||||||
|
const offset: u32 = @intCast(self.last_entity_count * @sizeOf(sandbox.GpuEntity));
|
||||||
|
const new_count = entities.count - self.last_entity_count;
|
||||||
|
const data_size: u32 = @intCast(new_count * @sizeOf(sandbox.GpuEntity));
|
||||||
|
rl.gl.rlUpdateShaderBuffer(self.ssbo_id, &self.gpu_buffer[self.last_entity_count], data_size, offset);
|
||||||
|
|
||||||
|
self.last_entity_count = entities.count;
|
||||||
|
} else if (entities.count < self.last_entity_count) {
|
||||||
|
// entities were removed, update count
|
||||||
|
self.last_entity_count = entities.count;
|
||||||
}
|
}
|
||||||
|
|
||||||
// upload to SSBO
|
self.drawInstanced(entities.count, zoom, pan);
|
||||||
const data_size: u32 = @intCast(entities.count * @sizeOf(sandbox.GpuEntity));
|
}
|
||||||
rl.gl.rlUpdateShaderBuffer(self.ssbo_id, self.gpu_buffer.ptr, data_size, 0);
|
|
||||||
|
|
||||||
|
/// Shared render path: optionally refreshes the SSBO from `entities`, then
/// issues the instanced draw with the given `zoom` and `pan`.
///
/// Does nothing when there are no entities. When `skip_upload` is true the
/// CPU-side copy and the SSBO upload are both skipped and only the draw runs.
fn renderInternal(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32), skip_upload: bool) void {
    const live_count = entities.count;
    if (live_count == 0) return;

    // flush raylib's internal render batch before our custom GL calls
    rl.gl.rlDrawRenderBatchActive();

    if (!skip_upload) {
        // stage position + packed velocity + color for every live entity
        {
            const copy_zone = ztracy.ZoneN(@src(), "ssbo_copy");
            defer copy_zone.End();

            var i: usize = 0;
            while (i < live_count) : (i += 1) {
                const src = entities.items[i];
                self.gpu_buffer[i] = .{
                    .x = src.x,
                    .y = src.y,
                    .packed_vel = sandbox.packVelocity(src.vx, src.vy),
                    .color = src.color,
                };
            }
        }

        // push the staged range to the SSBO, starting at offset 0
        {
            const upload_zone = ztracy.ZoneN(@src(), "ssbo_upload");
            defer upload_zone.End();

            const byte_len: u32 = @intCast(live_count * @sizeOf(sandbox.GpuEntity));
            rl.gl.rlUpdateShaderBuffer(self.ssbo_id, self.gpu_buffer.ptr, byte_len, 0);
        }
    }

    self.drawInstanced(live_count, zoom, pan);
}
|
||||||
|
|
||||||
|
fn drawInstanced(self: *SsboRenderer, entity_count: usize, zoom: f32, pan: @Vector(2, f32)) void {
|
||||||
// bind shader
|
// bind shader
|
||||||
rl.gl.rlEnableShader(self.shader_id);
|
rl.gl.rlEnableShader(self.shader_id);
|
||||||
|
|
||||||
|
|
@ -155,6 +219,13 @@ pub const SsboRenderer = struct {
|
||||||
const screen_size = [2]f32{ @floatFromInt(SCREEN_WIDTH), @floatFromInt(SCREEN_HEIGHT) };
|
const screen_size = [2]f32{ @floatFromInt(SCREEN_WIDTH), @floatFromInt(SCREEN_HEIGHT) };
|
||||||
rl.gl.rlSetUniform(self.screen_size_loc, &screen_size, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
rl.gl.rlSetUniform(self.screen_size_loc, &screen_size, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
||||||
|
|
||||||
|
// set zoom uniform
|
||||||
|
rl.gl.rlSetUniform(self.zoom_loc, &zoom, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
|
||||||
|
|
||||||
|
// set pan uniform
|
||||||
|
const pan_arr = [2]f32{ pan[0], pan[1] };
|
||||||
|
rl.gl.rlSetUniform(self.pan_loc, &pan_arr, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
||||||
|
|
||||||
// bind texture
|
// bind texture
|
||||||
rl.gl.rlActiveTextureSlot(0);
|
rl.gl.rlActiveTextureSlot(0);
|
||||||
rl.gl.rlEnableTexture(self.circle_texture_id);
|
rl.gl.rlEnableTexture(self.circle_texture_id);
|
||||||
|
|
@ -170,9 +241,13 @@ pub const SsboRenderer = struct {
|
||||||
rl.gl.rlSetBlendMode(@intFromEnum(rl.gl.rlBlendMode.rl_blend_alpha));
|
rl.gl.rlSetBlendMode(@intFromEnum(rl.gl.rlBlendMode.rl_blend_alpha));
|
||||||
|
|
||||||
// bind VAO and draw
|
// bind VAO and draw
|
||||||
_ = rl.gl.rlEnableVertexArray(self.vao_id);
|
{
|
||||||
rl.gl.rlEnableVertexBuffer(self.vbo_id);
|
const zone = ztracy.ZoneN(@src(), "ssbo_draw");
|
||||||
rl.gl.rlDrawVertexArrayInstanced(0, 6, @intCast(entities.count));
|
defer zone.End();
|
||||||
|
_ = rl.gl.rlEnableVertexArray(self.vao_id);
|
||||||
|
rl.gl.rlEnableVertexBuffer(self.vbo_id);
|
||||||
|
rl.gl.rlDrawVertexArrayInstanced(0, 6, @intCast(entity_count));
|
||||||
|
}
|
||||||
|
|
||||||
// cleanup - restore raylib's expected state
|
// cleanup - restore raylib's expected state
|
||||||
rl.gl.rlDisableVertexArray();
|
rl.gl.rlDisableVertexArray();
|
||||||
|
|
|
||||||
70
src/ui.zig
70
src/ui.zig
|
|
@ -19,13 +19,23 @@ pub const box_padding: f32 = 8;
|
||||||
pub const text_color = rl.Color.white;
|
pub const text_color = rl.Color.white;
|
||||||
pub const dim_text_color = rl.Color.gray;
|
pub const dim_text_color = rl.Color.gray;
|
||||||
pub const highlight_color = rl.Color.yellow;
|
pub const highlight_color = rl.Color.yellow;
|
||||||
|
pub const fps_good_color = rl.Color.green;
|
||||||
|
pub const fps_bad_color = rl.Color.red;
|
||||||
pub const box_bg = rl.Color{ .r = 0, .g = 0, .b = 0, .a = 200 };
|
pub const box_bg = rl.Color{ .r = 0, .g = 0, .b = 0, .a = 200 };
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// state
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
pub var show_ui: bool = true;
|
||||||
|
|
||||||
// =============================================================================
|
// =============================================================================
|
||||||
// drawing functions
|
// drawing functions
|
||||||
// =============================================================================
|
// =============================================================================
|
||||||
|
|
||||||
pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, font: rl.Font) void {
|
pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, zoom: f32, font: rl.Font) void {
|
||||||
|
if (!show_ui) return;
|
||||||
|
|
||||||
var buf: [256]u8 = undefined;
|
var buf: [256]u8 = undefined;
|
||||||
|
|
||||||
// fps box (above metrics)
|
// fps box (above metrics)
|
||||||
|
|
@ -33,13 +43,16 @@ pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us:
|
||||||
rl.drawRectangle(5, 5, 180, fps_box_height, box_bg);
|
rl.drawRectangle(5, 5, 180, fps_box_height, box_bg);
|
||||||
const frame_ms = rl.getFrameTime() * 1000.0;
|
const frame_ms = rl.getFrameTime() * 1000.0;
|
||||||
const fps = if (frame_ms > 0) 1000.0 / frame_ms else 0;
|
const fps = if (frame_ms > 0) 1000.0 / frame_ms else 0;
|
||||||
const fps_text = std.fmt.bufPrintZ(&buf, "FPS: {d:.0}", .{fps}) catch "?";
|
rl.drawTextEx(font, "FPS: ", .{ .x = padding, .y = padding }, font_size, 0, text_color);
|
||||||
rl.drawTextEx(font, fps_text, .{ .x = padding, .y = padding }, font_size, 0, text_color);
|
const fps_text = std.fmt.bufPrintZ(&buf, "{d:.0}", .{fps}) catch "?";
|
||||||
|
const fps_color = if (fps >= 60.0) fps_good_color else fps_bad_color;
|
||||||
|
const label_width = rl.measureTextEx(font, "FPS: ", font_size, 0).x;
|
||||||
|
rl.drawTextEx(font, fps_text, .{ .x = padding + label_width, .y = padding }, font_size, 0, fps_color);
|
||||||
|
|
||||||
// metrics box (below fps)
|
// metrics box (below fps)
|
||||||
const metrics_y: i32 = 5 + fps_box_height + 5;
|
const metrics_y: i32 = 5 + fps_box_height + 5;
|
||||||
var y: f32 = @as(f32, @floatFromInt(metrics_y)) + box_padding;
|
var y: f32 = @as(f32, @floatFromInt(metrics_y)) + box_padding;
|
||||||
const bg_height: i32 = if (paused) 130 else 100;
|
const bg_height: i32 = if (paused) 150 else 120;
|
||||||
rl.drawRectangle(5, metrics_y, 180, bg_height, box_bg);
|
rl.drawRectangle(5, metrics_y, 180, bg_height, box_bg);
|
||||||
|
|
||||||
// entity count
|
// entity count
|
||||||
|
|
@ -64,6 +77,11 @@ pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us:
|
||||||
rl.drawTextEx(font, render_text, .{ .x = padding, .y = y }, font_size, 0, text_color);
|
rl.drawTextEx(font, render_text, .{ .x = padding, .y = y }, font_size, 0, text_color);
|
||||||
y += line_height;
|
y += line_height;
|
||||||
|
|
||||||
|
// zoom level
|
||||||
|
const zoom_text = std.fmt.bufPrintZ(&buf, "zoom: {d:.1}x", .{zoom}) catch "?";
|
||||||
|
rl.drawTextEx(font, zoom_text, .{ .x = padding, .y = y }, font_size, 0, if (zoom > 1.0) highlight_color else text_color);
|
||||||
|
y += line_height;
|
||||||
|
|
||||||
// paused indicator
|
// paused indicator
|
||||||
if (paused) {
|
if (paused) {
|
||||||
y += line_height;
|
y += line_height;
|
||||||
|
|
@ -74,8 +92,43 @@ pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us:
|
||||||
drawControls(font, metrics_y + bg_height);
|
drawControls(font, metrics_y + bg_height);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn drawMemory(entity_count: usize, font: rl.Font) void {
|
||||||
|
if (!show_ui) return;
|
||||||
|
|
||||||
|
var buf: [256]u8 = undefined;
|
||||||
|
|
||||||
|
const box_width: i32 = 160;
|
||||||
|
const box_height: i32 = @intFromFloat(line_height * 3 + box_padding * 2);
|
||||||
|
const box_x: i32 = @as(i32, @intCast(sandbox.SCREEN_WIDTH)) - box_width - 5;
|
||||||
|
const box_y: i32 = 5;
|
||||||
|
|
||||||
|
rl.drawRectangle(box_x, box_y, box_width, box_height, box_bg);
|
||||||
|
|
||||||
|
var y: f32 = @as(f32, @floatFromInt(box_y)) + box_padding;
|
||||||
|
const x: f32 = @floatFromInt(box_x + @as(i32, @intFromFloat(box_padding)));
|
||||||
|
|
||||||
|
// entity memory (CPU side)
|
||||||
|
const entity_bytes = entity_count * @sizeOf(sandbox.Entity);
|
||||||
|
const entity_mb = @as(f32, @floatFromInt(entity_bytes)) / (1024.0 * 1024.0);
|
||||||
|
const entity_text = std.fmt.bufPrintZ(&buf, "cpu: {d:.1} MB", .{entity_mb}) catch "?";
|
||||||
|
rl.drawTextEx(font, entity_text, .{ .x = x, .y = y }, font_size, 0, text_color);
|
||||||
|
y += line_height;
|
||||||
|
|
||||||
|
// GPU buffer memory (SSBO)
|
||||||
|
const gpu_bytes = entity_count * @sizeOf(sandbox.GpuEntity);
|
||||||
|
const gpu_mb = @as(f32, @floatFromInt(gpu_bytes)) / (1024.0 * 1024.0);
|
||||||
|
const gpu_text = std.fmt.bufPrintZ(&buf, "gpu: {d:.1} MB", .{gpu_mb}) catch "?";
|
||||||
|
rl.drawTextEx(font, gpu_text, .{ .x = x, .y = y }, font_size, 0, text_color);
|
||||||
|
y += line_height;
|
||||||
|
|
||||||
|
// total
|
||||||
|
const total_mb = entity_mb + gpu_mb;
|
||||||
|
const total_text = std.fmt.bufPrintZ(&buf, "total: {d:.1} MB", .{total_mb}) catch "?";
|
||||||
|
rl.drawTextEx(font, total_text, .{ .x = x, .y = y }, font_size, 0, dim_text_color);
|
||||||
|
}
|
||||||
|
|
||||||
fn drawControls(font: rl.Font, metrics_bottom: i32) void {
|
fn drawControls(font: rl.Font, metrics_bottom: i32) void {
|
||||||
const ctrl_box_height: i32 = @intFromFloat(small_line_height * 4 + box_padding * 2);
|
const ctrl_box_height: i32 = @intFromFloat(small_line_height * 7 + box_padding * 2);
|
||||||
const ctrl_box_y: i32 = metrics_bottom + 5;
|
const ctrl_box_y: i32 = metrics_bottom + 5;
|
||||||
rl.drawRectangle(5, ctrl_box_y, 175, ctrl_box_height, box_bg);
|
rl.drawRectangle(5, ctrl_box_y, 175, ctrl_box_height, box_bg);
|
||||||
|
|
||||||
|
|
@ -84,8 +137,11 @@ fn drawControls(font: rl.Font, metrics_bottom: i32) void {
|
||||||
const controls = [_][]const u8{
|
const controls = [_][]const u8{
|
||||||
"+/-: 10k entities",
|
"+/-: 10k entities",
|
||||||
"shift +/-: 50k",
|
"shift +/-: 50k",
|
||||||
"space: pause",
|
"scroll: zoom",
|
||||||
"r: reset",
|
"drag: pan (zoomed)",
|
||||||
|
"space: pause, r: reset",
|
||||||
|
"q: zoom out / quit",
|
||||||
|
"tab: toggle ui",
|
||||||
};
|
};
|
||||||
|
|
||||||
for (controls) |text| {
|
for (controls) |text| {
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue