Compare commits
No commits in common. "main" and "0.5.0" have entirely different histories.
35 changed files with 67 additions and 2537 deletions
|
|
@ -1,14 +1,12 @@
|
||||||
name: release
|
name: release
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
release:
|
||||||
tags:
|
types: [published]
|
||||||
- '*'
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build:
|
build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: codeberg-small
|
||||||
container: catthehacker/ubuntu:act-latest
|
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
|
@ -37,32 +35,16 @@ jobs:
|
||||||
|
|
||||||
- name: Upload to release
|
- name: Upload to release
|
||||||
env:
|
env:
|
||||||
FORGEJO_TOKEN: ${{ secrets.FORGEJO_TOKEN }}
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
TAG="${{ github.ref_name }}"
|
RELEASE_ID="${{ github.event.release.id }}"
|
||||||
API_BASE="${{ github.server_url }}/api/v1"
|
API_URL="${{ github.api_url }}/repos/${{ github.repository }}/releases/${RELEASE_ID}/assets"
|
||||||
REPO="${{ github.repository }}"
|
|
||||||
|
|
||||||
# check if release exists
|
|
||||||
RELEASE_ID=$(curl -sf \
|
|
||||||
-H "Authorization: token ${FORGEJO_TOKEN}" \
|
|
||||||
"${API_BASE}/repos/${REPO}/releases/tags/${TAG}" | jq -r '.id // empty')
|
|
||||||
|
|
||||||
if [ -z "$RELEASE_ID" ]; then
|
|
||||||
echo "Creating release for ${TAG}..."
|
|
||||||
RELEASE_ID=$(curl -sf \
|
|
||||||
-H "Authorization: token ${FORGEJO_TOKEN}" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{"tag_name":"'"${TAG}"'","name":"'"${TAG}"'"}' \
|
|
||||||
"${API_BASE}/repos/${REPO}/releases" | jq -r '.id')
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Release ID: ${RELEASE_ID}"
|
|
||||||
|
|
||||||
for file in lofivor-linux-x86_64 lofivor-windows-x86_64.exe; do
|
for file in lofivor-linux-x86_64 lofivor-windows-x86_64.exe; do
|
||||||
echo "Uploading $file..."
|
echo "Uploading $file..."
|
||||||
curl -sf \
|
curl -X POST \
|
||||||
-H "Authorization: token ${FORGEJO_TOKEN}" \
|
-H "Authorization: token ${GITHUB_TOKEN}" \
|
||||||
-F "attachment=@${file}" \
|
-H "Content-Type: application/octet-stream" \
|
||||||
"${API_BASE}/repos/${REPO}/releases/${RELEASE_ID}/assets?name=${file}"
|
--data-binary @"$file" \
|
||||||
|
"${API_URL}?name=${file}"
|
||||||
done
|
done
|
||||||
|
|
|
||||||
27
.github/workflows/release.yml
vendored
27
.github/workflows/release.yml
vendored
|
|
@ -10,14 +10,9 @@ jobs:
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- os: ubuntu-latest
|
- os: ubuntu-latest
|
||||||
target: native
|
artifact: sandbox-linux-x86_64
|
||||||
artifact: lofivor-linux-x86_64
|
|
||||||
- os: ubuntu-latest
|
|
||||||
target: x86_64-windows-gnu
|
|
||||||
artifact: lofivor-windows-x86_64.exe
|
|
||||||
- os: macos-latest
|
- os: macos-latest
|
||||||
target: native
|
artifact: sandbox-macos-aarch64
|
||||||
artifact: lofivor-macos-aarch64
|
|
||||||
|
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
|
|
||||||
|
|
@ -31,24 +26,12 @@ jobs:
|
||||||
version: 0.15.2
|
version: 0.15.2
|
||||||
|
|
||||||
- name: Install X11 dependencies (Linux)
|
- name: Install X11 dependencies (Linux)
|
||||||
if: matrix.os == 'ubuntu-latest' && matrix.target == 'native'
|
if: matrix.os == 'ubuntu-latest'
|
||||||
run: sudo apt-get update && sudo apt-get install -y libx11-dev libxcursor-dev libxrandr-dev libxinerama-dev libxi-dev libxext-dev libxfixes-dev libgl1-mesa-dev
|
run: sudo apt-get update && sudo apt-get install -y libx11-dev libxcursor-dev libxrandr-dev libxinerama-dev libxi-dev libxext-dev libxfixes-dev libgl1-mesa-dev
|
||||||
|
|
||||||
- name: Build native
|
- run: zig build -Doptimize=ReleaseFast
|
||||||
if: matrix.target == 'native'
|
|
||||||
run: zig build -Doptimize=ReleaseFast
|
|
||||||
|
|
||||||
- name: Build cross-compile
|
- run: mv zig-out/bin/sandbox ${{ matrix.artifact }}
|
||||||
if: matrix.target != 'native'
|
|
||||||
run: zig build -Dtarget=${{ matrix.target }} -Doptimize=ReleaseFast
|
|
||||||
|
|
||||||
- name: Rename artifact (Unix)
|
|
||||||
if: "!contains(matrix.artifact, '.exe')"
|
|
||||||
run: mv zig-out/bin/sandbox ${{ matrix.artifact }}
|
|
||||||
|
|
||||||
- name: Rename artifact (Windows)
|
|
||||||
if: contains(matrix.artifact, '.exe')
|
|
||||||
run: mv zig-out/bin/sandbox.exe ${{ matrix.artifact }}
|
|
||||||
|
|
||||||
- uses: actions/upload-artifact@v4
|
- uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
|
|
|
||||||
|
|
@ -82,8 +82,8 @@ these target the rendering bottleneck since update loop is already fast.
|
||||||
|
|
||||||
| technique | description | expected gain |
|
| technique | description | expected gain |
|
||||||
| ---------------------- | -------------------------------------------------------------------- | ------------------------------- |
|
| ---------------------- | -------------------------------------------------------------------- | ------------------------------- |
|
||||||
| SSBO instance data | pack (x, y, color) = 12 bytes instead of 64-byte matrices | done - see optimization 5 |
|
| ~~SSBO instance data~~ | ~~pack (x, y, color) = 12 bytes instead of 64-byte matrices~~ | **done** - see optimization 5 |
|
||||||
| compute shader updates | move entity positions to GPU entirely, avoid CPU→GPU sync | done - see optimization 6 |
|
| compute shader updates | move entity positions to GPU entirely, avoid CPU→GPU sync | significant |
|
||||||
| OpenGL vs Vulkan | test raylib's Vulkan backend | unknown |
|
| OpenGL vs Vulkan | test raylib's Vulkan backend | unknown |
|
||||||
| discrete GPU testing | test on dedicated GPU where instancing/SSBO shine | significant (different hw) |
|
| discrete GPU testing | test on dedicated GPU where instancing/SSBO shine | significant (different hw) |
|
||||||
|
|
||||||
|
|
@ -126,33 +126,6 @@ currently not the bottleneck - update stays <1ms at 100k. these become relevant
|
||||||
| entity pools | pre-allocated, reusable entity slots | reduces allocation overhead |
|
| entity pools | pre-allocated, reusable entity slots | reduces allocation overhead |
|
||||||
| component packing | minimize struct padding | better cache utilization |
|
| component packing | minimize struct padding | better cache utilization |
|
||||||
|
|
||||||
#### estimated gains summary
|
|
||||||
|
|
||||||
| Optimization | Expected Gain | Why |
|
|
||||||
|------------------------|---------------|---------------------------------------------------|
|
|
||||||
| SIMD updates | 0% | Update already on GPU |
|
|
||||||
| Multithreaded update | 0% | Update already on GPU |
|
|
||||||
| Cache-friendly layouts | 0% | CPU doesn't iterate entities |
|
|
||||||
| Fixed-point math | 0% or worse | GPUs are optimized for float |
|
|
||||||
| SoA vs AoS | ~5% | Only helps data upload, not bottleneck |
|
|
||||||
| Frustum culling | 5-15% | Most entities converge to center anyway |
|
|
||||||
| LOD rendering | 20-40% | Real gains - fewer fragments for distant entities |
|
|
||||||
| Temporal techniques | ~50% | But with visual artifacts (flickering) |
|
|
||||||
|
|
||||||
Realistic total if you did everything: ~30-50% improvement
|
|
||||||
|
|
||||||
That'd take you from ~1.4M @ 38fps to maybe ~1.8-2M @ 38fps, or ~1.4M @ 50-55fps.
|
|
||||||
|
|
||||||
What would actually move the needle:
|
|
||||||
- GPU-side frustum culling in compute shader (cull before render, not after)
|
|
||||||
- Point sprites instead of quads for distant entities (4 vertices → 1)
|
|
||||||
- Indirect draw calls (GPU decides what to render, CPU never touches entity data)
|
|
||||||
|
|
||||||
Your real bottleneck is fill rate and vertex throughput on HD 530 integrated
|
|
||||||
graphics. The CPU side is already essentially free.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## testing methodology
|
## testing methodology
|
||||||
|
|
|
||||||
|
|
@ -4,8 +4,6 @@ lofivor
|
||||||
sandbox stress test for measuring entity rendering performance on weak hardware.
|
sandbox stress test for measuring entity rendering performance on weak hardware.
|
||||||
written in zig with raylib.
|
written in zig with raylib.
|
||||||
|
|
||||||
(lofivor aka lofi-survivor)
|
|
||||||
|
|
||||||
build & run
|
build & run
|
||||||
-----------
|
-----------
|
||||||
|
|
||||||
|
|
|
||||||
24
TODO.md
24
TODO.md
|
|
@ -59,7 +59,7 @@ further options (if needed):
|
||||||
- [x] increase raylib batch buffer (currently 8192 vertices = 2048 quads)
|
- [x] increase raylib batch buffer (currently 8192 vertices = 2048 quads)
|
||||||
- [x] GPU instancing (single draw call for all entities)
|
- [x] GPU instancing (single draw call for all entities)
|
||||||
- [x] SSBO instance data (12 bytes vs 64-byte matrices)
|
- [x] SSBO instance data (12 bytes vs 64-byte matrices)
|
||||||
- [x] compute shader entity updates (raylib supports via rlgl)
|
- [ ] compute shader entity updates (if raylib supports)
|
||||||
- [ ] compare OpenGL vs Vulkan backend
|
- [ ] compare OpenGL vs Vulkan backend
|
||||||
|
|
||||||
findings (i5-6500T / HD 530):
|
findings (i5-6500T / HD 530):
|
||||||
|
|
@ -68,18 +68,14 @@ findings (i5-6500T / HD 530):
|
||||||
- instancing doesn't help on integrated graphics (shared RAM, no PCIe savings)
|
- instancing doesn't help on integrated graphics (shared RAM, no PCIe savings)
|
||||||
- bottleneck is memory bandwidth, not draw call overhead
|
- bottleneck is memory bandwidth, not draw call overhead
|
||||||
- rlgl batching is already near-optimal for this hardware
|
- rlgl batching is already near-optimal for this hardware
|
||||||
- compute shaders: update time ~5ms → ~0ms at 150k entities (CPU freed entirely)
|
|
||||||
|
|
||||||
## future optimization concepts (GPU-focused)
|
## future optimization concepts
|
||||||
|
|
||||||
- [ ] GPU-side frustum culling in compute shader
|
- [ ] SIMD entity updates (AVX2/SSE)
|
||||||
- [ ] point sprites for distant/small entities (4 verts → 1)
|
- [ ] struct-of-arrays vs array-of-structs benchmark
|
||||||
- [ ] indirect draw calls (glDrawArraysIndirect)
|
- [ ] multithreaded update loop (thread pool)
|
||||||
|
- [ ] cache-friendly memory layouts
|
||||||
## future optimization concepts (CPU - not currently bottleneck)
|
- [ ] LOD rendering (skip distant entities or reduce detail)
|
||||||
|
- [ ] frustum culling (only render visible)
|
||||||
- [ ] SIMD / SoA / multithreading (if game logic makes CPU hot again)
|
- [ ] temporal techniques (update subset per frame)
|
||||||
|
- [ ] fixed-point vs floating-point math
|
||||||
## other ideas that aren't about optimization
|
|
||||||
|
|
||||||
- [ ] scanline shader
|
|
||||||
|
|
|
||||||
13
build.zig
13
build.zig
|
|
@ -4,9 +4,6 @@ pub fn build(b: *std.Build) void {
|
||||||
const target = b.standardTargetOptions(.{});
|
const target = b.standardTargetOptions(.{});
|
||||||
const optimize = b.standardOptimizeOption(.{});
|
const optimize = b.standardOptimizeOption(.{});
|
||||||
|
|
||||||
// tracy profiling (run with -Dtracy=true)
|
|
||||||
const enable_tracy = b.option(bool, "tracy", "Enable Tracy profiler") orelse false;
|
|
||||||
|
|
||||||
const raylib_dep = b.dependency("raylib_zig", .{
|
const raylib_dep = b.dependency("raylib_zig", .{
|
||||||
.target = target,
|
.target = target,
|
||||||
.optimize = optimize,
|
.optimize = optimize,
|
||||||
|
|
@ -27,16 +24,6 @@ pub fn build(b: *std.Build) void {
|
||||||
sandbox_exe.root_module.addImport("raylib", raylib_dep.module("raylib"));
|
sandbox_exe.root_module.addImport("raylib", raylib_dep.module("raylib"));
|
||||||
sandbox_exe.linkLibrary(raylib_dep.artifact("raylib"));
|
sandbox_exe.linkLibrary(raylib_dep.artifact("raylib"));
|
||||||
|
|
||||||
// tracy integration (optional)
|
|
||||||
const ztracy = b.dependency("ztracy", .{
|
|
||||||
.enable_ztracy = enable_tracy,
|
|
||||||
.on_demand = true, // allow connecting after app starts
|
|
||||||
});
|
|
||||||
sandbox_exe.root_module.addImport("ztracy", ztracy.module("root"));
|
|
||||||
if (enable_tracy) {
|
|
||||||
sandbox_exe.linkLibrary(ztracy.artifact("tracy"));
|
|
||||||
}
|
|
||||||
|
|
||||||
b.installArtifact(sandbox_exe);
|
b.installArtifact(sandbox_exe);
|
||||||
|
|
||||||
const sandbox_run_cmd = b.addRunArtifact(sandbox_exe);
|
const sandbox_run_cmd = b.addRunArtifact(sandbox_exe);
|
||||||
|
|
|
||||||
|
|
@ -7,10 +7,6 @@
|
||||||
.url = "git+https://github.com/raylib-zig/raylib-zig#a4d18b2d1cf8fdddec68b5b084535fca0475f466",
|
.url = "git+https://github.com/raylib-zig/raylib-zig#a4d18b2d1cf8fdddec68b5b084535fca0475f466",
|
||||||
.hash = "raylib_zig-5.6.0-dev-KE8REL5MBQAf3p497t52Xw9P7ojndIkVOWPXnLiLLw2P",
|
.hash = "raylib_zig-5.6.0-dev-KE8REL5MBQAf3p497t52Xw9P7ojndIkVOWPXnLiLLw2P",
|
||||||
},
|
},
|
||||||
.ztracy = .{
|
|
||||||
.url = "git+https://github.com/zig-gamedev/ztracy?ref=main#e7b401dea9ce006f8b236e3a2ca1a9f3d5c3e896",
|
|
||||||
.hash = "ztracy-0.14.0-dev-zHJSq78GGQC904aYvBPn6OOvRVOq_opAwDfeHZdvQyej",
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
.paths = .{
|
.paths = .{
|
||||||
"build.zig",
|
"build.zig",
|
||||||
|
|
|
||||||
|
|
@ -1,292 +0,0 @@
|
||||||
lofivor glossary
|
|
||||||
================
|
|
||||||
|
|
||||||
terms that come up when optimizing graphics.
|
|
||||||
|
|
||||||
|
|
||||||
clock cycle
|
|
||||||
-----------
|
|
||||||
|
|
||||||
one "tick" of the processor's internal clock.
|
|
||||||
|
|
||||||
a CPU or GPU has a crystal oscillator that vibrates at a fixed rate.
|
|
||||||
each vibration = one cycle. the processor does some work each cycle.
|
|
||||||
|
|
||||||
1 GHz = 1 billion cycles per second
|
|
||||||
1 MHz = 1 million cycles per second
|
|
||||||
|
|
||||||
so a 1 GHz processor has 1 billion opportunities to do work per second.
|
|
||||||
|
|
||||||
"one operation per cycle" is idealized. real work often takes multiple
|
|
||||||
cycles (memory access: 100+ cycles, division: 10-20 cycles, add: 1 cycle).
|
|
||||||
|
|
||||||
your HD 530 runs at ~950 MHz, so roughly 950 million cycles per second.
|
|
||||||
at 60fps, that's about 15.8 million cycles per frame.
|
|
||||||
|
|
||||||
|
|
||||||
fill rate
|
|
||||||
---------
|
|
||||||
|
|
||||||
pixels written per second. measured in megapixels/s or gigapixels/s.
|
|
||||||
|
|
||||||
fill rate = ROPs * clock speed * pixels per ROP per clock
|
|
||||||
|
|
||||||
your HD 530: 3 ROPs * 950 MHz * 1 = 2.85 GPixels/s theoretical max.
|
|
||||||
|
|
||||||
|
|
||||||
overdraw
|
|
||||||
--------
|
|
||||||
|
|
||||||
drawing the same pixel multiple times per frame.
|
|
||||||
|
|
||||||
if two entities overlap, the back one gets drawn, then the front one
|
|
||||||
overwrites it. the back one's work was wasted.
|
|
||||||
|
|
||||||
overdraw ratio = total pixels drawn / screen pixels
|
|
||||||
|
|
||||||
1080p = 2.07M pixels. if you draw 20M pixels, overdraw = ~10x.
|
|
||||||
|
|
||||||
|
|
||||||
bandwidth
|
|
||||||
---------
|
|
||||||
|
|
||||||
data transfer rate. measured in bytes/second (GB/s, MB/s).
|
|
||||||
|
|
||||||
memory bandwidth = how fast data moves between processor and RAM.
|
|
||||||
|
|
||||||
your HD 530 shares DDR4 with the CPU: ~30 GB/s total.
|
|
||||||
a discrete GPU has dedicated VRAM: 200-900 GB/s.
|
|
||||||
|
|
||||||
|
|
||||||
latency
|
|
||||||
-------
|
|
||||||
|
|
||||||
time delay. measured in nanoseconds (ns) or cycles.
|
|
||||||
|
|
||||||
memory latency = time to fetch data from RAM.
|
|
||||||
- L1 cache: ~4 cycles
|
|
||||||
- L2 cache: ~12 cycles
|
|
||||||
- L3 cache: ~40 cycles
|
|
||||||
- main RAM: ~200 cycles
|
|
||||||
|
|
||||||
this is why cache matters. a miss that goes all the way to main RAM (~200 cycles) is ~50x slower than an L1 hit (~4 cycles).
|
|
||||||
|
|
||||||
|
|
||||||
throughput vs latency
|
|
||||||
---------------------
|
|
||||||
|
|
||||||
latency = how long ONE thing takes.
|
|
||||||
throughput = how many things per second.
|
|
||||||
|
|
||||||
a pipeline can have high latency but high throughput.
|
|
||||||
|
|
||||||
example: a car wash takes 10 minutes (latency).
|
|
||||||
but if cars enter every 1 minute, throughput is 60 cars/hour.
|
|
||||||
|
|
||||||
GPUs hide latency with throughput. one thread waits for memory?
|
|
||||||
switch to another thread. thousands of threads keep the GPU busy.
|
|
||||||
|
|
||||||
|
|
||||||
draw call
|
|
||||||
---------
|
|
||||||
|
|
||||||
one command from CPU to GPU: "draw this batch of geometry."
|
|
||||||
|
|
||||||
each draw call has overhead:
|
|
||||||
- CPU prepares command buffer
|
|
||||||
- driver validates state
|
|
||||||
- GPU switches context
|
|
||||||
|
|
||||||
1 draw call for 1M triangles: fast.
|
|
||||||
1M draw calls for 1M triangles: slow.
|
|
||||||
|
|
||||||
lofivor uses 1 draw call for all entities (instanced rendering).
|
|
||||||
|
|
||||||
|
|
||||||
instancing
|
|
||||||
----------
|
|
||||||
|
|
||||||
drawing many copies of the same geometry in one draw call.
|
|
||||||
|
|
||||||
instead of: draw triangle, draw triangle, draw triangle...
|
|
||||||
you say: draw this triangle 1 million times, here are the positions.
|
|
||||||
|
|
||||||
the GPU handles the replication. massively more efficient.
|
|
||||||
|
|
||||||
|
|
||||||
shader
|
|
||||||
------
|
|
||||||
|
|
||||||
a small program that runs on the GPU.
|
|
||||||
|
|
||||||
the name is historical - early shaders calculated shading/lighting.
|
|
||||||
but today: a shader is just software running on GPU hardware.
|
|
||||||
it doesn't have to do with shading at all.
|
|
||||||
|
|
||||||
more precisely: a shader turns one piece of data into another piece of data.
|
|
||||||
- vertex shader: positions → screen coordinates
|
|
||||||
- fragment shader: fragments → pixel colors
|
|
||||||
- compute shader: data → data (anything)
|
|
||||||
|
|
||||||
GPUs are massively parallel, so shaders run on thousands of inputs at once.
|
|
||||||
CPUs have stagnated; GPUs keep getting faster. modern engines like UE5
|
|
||||||
increasingly use shaders for work that used to be CPU-only.
|
|
||||||
|
|
||||||
|
|
||||||
SSBO (shader storage buffer object)
|
|
||||||
-----------------------------------
|
|
||||||
|
|
||||||
a block of GPU memory that shaders can read/write.
|
|
||||||
|
|
||||||
unlike uniforms (small, read-only), SSBOs can be large and writable.
|
|
||||||
lofivor stores all entity data in an SSBO: positions, velocities, colors.
|
|
||||||
|
|
||||||
|
|
||||||
compute shader
|
|
||||||
--------------
|
|
||||||
|
|
||||||
a shader that does general computation, not rendering.
|
|
||||||
|
|
||||||
runs on GPU cores but doesn't output pixels. just processes data.
|
|
||||||
lofivor uses compute shaders to update entity positions.
|
|
||||||
|
|
||||||
because compute exists, shaders can be anything: physics, AI, sorting,
|
|
||||||
image processing. the GPU is a general-purpose parallel processor.
|
|
||||||
|
|
||||||
|
|
||||||
fragment / pixel shader
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
program that runs once per pixel (actually per "fragment").
|
|
||||||
|
|
||||||
determines the final color of each pixel. this is where:
|
|
||||||
- texture sampling happens
|
|
||||||
- lighting calculations happen
|
|
||||||
- the expensive math lives
|
|
||||||
|
|
||||||
lofivor's fragment shader: sample texture, multiply by color. trivial.
|
|
||||||
AAA game fragment shader: 500+ instructions. expensive.
|
|
||||||
|
|
||||||
|
|
||||||
vertex shader
|
|
||||||
-------------
|
|
||||||
|
|
||||||
program that runs once per vertex.
|
|
||||||
|
|
||||||
transforms 3D positions to screen positions. lofivor's vertex shader
|
|
||||||
reads from SSBO and positions the quad corners.
|
|
||||||
|
|
||||||
|
|
||||||
ROP (render output unit)
|
|
||||||
------------------------
|
|
||||||
|
|
||||||
final stage of GPU pipeline. writes pixels to framebuffer.
|
|
||||||
|
|
||||||
handles: depth test, stencil test, blending, antialiasing.
|
|
||||||
your bottleneck on HD 530. see docs/rops.txt.
|
|
||||||
|
|
||||||
|
|
||||||
TMU (texture mapping unit)
|
|
||||||
--------------------------
|
|
||||||
|
|
||||||
samples textures. reads pixel colors from texture memory.
|
|
||||||
|
|
||||||
your HD 530 has 24 TMUs. they're fast (22.8 GTexels/s).
|
|
||||||
texture sampling is cheap relative to ROPs on this hardware.
|
|
||||||
|
|
||||||
|
|
||||||
EU (execution unit)
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
intel's term for shader cores.
|
|
||||||
|
|
||||||
your HD 530 has 24 EUs, each with 8 ALUs = 192 ALUs total.
|
|
||||||
these run your vertex, fragment, and compute shaders.
|
|
||||||
|
|
||||||
|
|
||||||
ALU (arithmetic logic unit)
|
|
||||||
---------------------------
|
|
||||||
|
|
||||||
does math. add, multiply, compare, bitwise operations.
|
|
||||||
|
|
||||||
one ALU can do one operation per cycle (simple ops).
|
|
||||||
complex ops (sqrt, sin, cos) take multiple cycles.
|
|
||||||
|
|
||||||
|
|
||||||
framebuffer
|
|
||||||
-----------
|
|
||||||
|
|
||||||
the image being rendered. lives in GPU memory.
|
|
||||||
|
|
||||||
at 1080p with 32-bit color: 1920 * 1080 * 4 = 8.3 MB.
|
|
||||||
double-buffered (front + back): 16.6 MB.
|
|
||||||
|
|
||||||
|
|
||||||
vsync
|
|
||||||
-----
|
|
||||||
|
|
||||||
synchronizing frame presentation with monitor refresh.
|
|
||||||
|
|
||||||
without vsync: tearing (half old frame, half new frame).
|
|
||||||
with vsync: smooth, but if you miss the 16.7ms deadline (at 60 Hz), you wait for the next refresh.
|
|
||||||
|
|
||||||
|
|
||||||
frame budget
|
|
||||||
------------
|
|
||||||
|
|
||||||
time available per frame.
|
|
||||||
|
|
||||||
60 fps = 16.67 ms per frame
|
|
||||||
30 fps = 33.33 ms per frame
|
|
||||||
|
|
||||||
everything (CPU + GPU) must complete within budget or frames drop.
|
|
||||||
|
|
||||||
|
|
||||||
pipeline stall
|
|
||||||
--------------
|
|
||||||
|
|
||||||
GPU waiting for something. bad for performance.
|
|
||||||
|
|
||||||
causes:
|
|
||||||
- waiting for memory (cache miss)
|
|
||||||
- waiting for previous stage to finish
|
|
||||||
- synchronization points (barriers)
|
|
||||||
- `discard` in fragment shader (breaks early-z)
|
|
||||||
|
|
||||||
|
|
||||||
early-z
|
|
||||||
-------
|
|
||||||
|
|
||||||
optimization: test depth BEFORE running fragment shader.
|
|
||||||
|
|
||||||
if pixel will be occluded, skip the expensive shader work.
|
|
||||||
`discard` breaks this because the GPU can't know whether a fragment survives (and should write depth) until the shader runs.
|
|
||||||
|
|
||||||
|
|
||||||
LOD (level of detail)
|
|
||||||
---------------------
|
|
||||||
|
|
||||||
using simpler geometry/textures for distant objects.
|
|
||||||
|
|
||||||
far away = fewer pixels = less detail needed.
|
|
||||||
saves vertices, texture bandwidth, and fill rate.
|
|
||||||
|
|
||||||
|
|
||||||
frustum culling
|
|
||||||
---------------
|
|
||||||
|
|
||||||
don't draw what's outside the camera view.
|
|
||||||
|
|
||||||
the "frustum" is the pyramid-shaped visible region.
|
|
||||||
anything outside = wasted work. cull it before sending to GPU.
|
|
||||||
|
|
||||||
|
|
||||||
spatial partitioning
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
organizing entities by position for fast queries.
|
|
||||||
|
|
||||||
types: grid, quadtree, octree, BVH.
|
|
||||||
|
|
||||||
"which entities are near point X?" goes from O(n) to O(log n).
|
|
||||||
essential for collision detection at scale.
|
|
||||||
|
|
@ -1,119 +0,0 @@
|
||||||
# intel hd 530 optimization guide for lofivor
|
|
||||||
|
|
||||||
based on hardware specs and empirical testing.
|
|
||||||
|
|
||||||
## hardware constraints
|
|
||||||
|
|
||||||
from `intel_hd_graphics_530.txt`:
|
|
||||||
|
|
||||||
| resource | value | implication |
|
|
||||||
| ---------- | ------- | ------------- |
|
|
||||||
| ROPs | 3 | fill rate limited - this is our ceiling |
|
|
||||||
| TMUs | 24 | texture sampling is relatively fast |
|
|
||||||
| memory | shared DDR4 ~30GB/s | bandwidth is precious, no VRAM |
|
|
||||||
| pixel rate | 2.85 GPixel/s | max theoretical throughput |
|
|
||||||
| EUs | 24 (192 ALUs) | decent compute, weak vs discrete |
|
|
||||||
| L3 cache | 768 KB | small, cache misses hurt |
|
|
||||||
|
|
||||||
the bottleneck is ROPs (fill rate), not vertices or compute.
|
|
||||||
|
|
||||||
## what works (proven)
|
|
||||||
|
|
||||||
### SSBO instance data
|
|
||||||
- 16 bytes per entity vs 64 bytes (matrices)
|
|
||||||
- minimizes bandwidth on shared memory bus
|
|
||||||
- result: ~5x improvement over instancing
|
|
||||||
|
|
||||||
### compute shader updates
|
|
||||||
- GPU does position/velocity updates
|
|
||||||
- no CPU→GPU sync per frame
|
|
||||||
- result: update time essentially free
|
|
||||||
|
|
||||||
### texture sampling
|
|
||||||
- 22.8 GTexel/s is fast relative to other units
|
|
||||||
- pre-baked circle texture beats procedural math
|
|
||||||
- result: 2x faster than procedural fragment shader
|
|
||||||
|
|
||||||
### instanced triangles/quads
|
|
||||||
- most optimized driver path
|
|
||||||
- intel mesa heavily optimizes this
|
|
||||||
- result: baseline, hard to beat
|
|
||||||
|
|
||||||
## what doesn't work (proven)
|
|
||||||
|
|
||||||
### point sprites
|
|
||||||
- theoretically 6x fewer vertices
|
|
||||||
- reality: 2.4x SLOWER on this hardware
|
|
||||||
- triangle rasterizer is more optimized
|
|
||||||
- see `docs/point_sprites_experiment.md`
|
|
||||||
|
|
||||||
### procedural fragment shaders
|
|
||||||
- `length()`, `smoothstep()`, `discard` are expensive
|
|
||||||
- EUs are weaker than discrete GPUs
|
|
||||||
- `discard` breaks early-z optimization
|
|
||||||
- result: 3.7x slower than texture sampling
|
|
||||||
|
|
||||||
### complex fragment math
|
|
||||||
- only 24 EUs, each running 8 ALUs
|
|
||||||
- transcendentals (sqrt, sin, cos) are 4x slower than FMAD
|
|
||||||
- avoid in hot path
|
|
||||||
|
|
||||||
## what to try next (theoretical)
|
|
||||||
|
|
||||||
### likely to help
|
|
||||||
|
|
||||||
| technique | why it should work | expected gain |
|
|
||||||
| ----------- | ------------------- | --------------- |
|
|
||||||
| frustum culling (GPU) | reduce fill rate, which is bottleneck | 10-30% depending on view |
|
|
||||||
| smaller points when zoomed out (LOD) | fewer pixels per entity = less ROP work | 20-40% |
|
|
||||||
| early-z / depth pre-pass | skip fragment work for occluded pixels | moderate |
|
|
||||||
|
|
||||||
### unlikely to help
|
|
||||||
|
|
||||||
| technique | why it won't help |
|
|
||||||
| ----------- | ------------------ |
|
|
||||||
| more vertex optimization | already fill rate bound, not vertex bound |
|
|
||||||
| SIMD on CPU | updates already on GPU |
|
|
||||||
| multithreading | CPU isn't the bottleneck |
|
|
||||||
| different vertex layouts | negligible vs fill rate |
|
|
||||||
|
|
||||||
### uncertain (need to test)
|
|
||||||
|
|
||||||
| technique | notes |
|
|
||||||
| ----------- | ------- |
|
|
||||||
| vulkan backend | might have less driver overhead, or might not matter |
|
|
||||||
| indirect draw calls | GPU decides what to render, but we're not CPU bound |
|
|
||||||
| fp16 in shaders | HD 530 has 2:1 fp16 ratio, might help fragment shader |
|
|
||||||
|
|
||||||
## key insights
|
|
||||||
|
|
||||||
1. fill rate is king - with only 3 ROPs, everything comes down to how many
|
|
||||||
pixels we're writing. optimizations that don't reduce pixel count won't
|
|
||||||
help.
|
|
||||||
|
|
||||||
2. shared memory hurts - no dedicated VRAM means CPU and GPU compete for
|
|
||||||
bandwidth. keep data transfers minimal.
|
|
||||||
|
|
||||||
3. driver optimization matters - the "common path" (triangles) is more
|
|
||||||
optimized than alternatives (points). don't be clever.
|
|
||||||
|
|
||||||
4. texture sampling is cheap - 22.8 GTexel/s is fast. prefer texture
|
|
||||||
lookups over ALU math in fragment shaders.
|
|
||||||
|
|
||||||
5. avoid discard - breaks early-z, causes pipeline stalls. alpha blending
|
|
||||||
is faster than discard.
|
|
||||||
|
|
||||||
## current ceiling
|
|
||||||
|
|
||||||
~950k entities @ 57fps (SSBO + compute + quads)
|
|
||||||
|
|
||||||
to go higher, we need to reduce fill rate:
|
|
||||||
- cull offscreen entities
|
|
||||||
- reduce entity size when zoomed out
|
|
||||||
- or accept lower fps at higher counts
|
|
||||||
|
|
||||||
## references
|
|
||||||
|
|
||||||
- intel gen9 compute architecture whitepaper
|
|
||||||
- empirical benchmarks in `benchmark_current_i56500t.log`
|
|
||||||
- point sprites experiment in `docs/point_sprites_experiment.md`
|
|
||||||
|
|
@ -1,31 +0,0 @@
|
||||||
# hysteresis in lofivor
|
|
||||||
|
|
||||||
## the problem without it
|
|
||||||
|
|
||||||
say your target is 8.33ms. your frame times naturally jitter: 8.2, 8.4, 8.3, 8.5, 8.2...
|
|
||||||
|
|
||||||
without hysteresis, every time it crosses 8.33ms you'd log "crossed threshold!" - potentially dozens of times per second. the log becomes useless noise.
|
|
||||||
|
|
||||||
## how the code works
|
|
||||||
|
|
||||||
from `sandbox_main.zig` lines 74-89:
|
|
||||||
|
|
||||||
```
|
|
||||||
was_above=false → need frame_ms > 10.33 (target + 2.0 margin) to flip to true
|
|
||||||
was_above=true → need frame_ms < 8.33 (target) to flip back to false
|
|
||||||
```
|
|
||||||
|
|
||||||
this creates a "dead zone" between 8.33 and 10.33ms where no state change happens.
|
|
||||||
|
|
||||||
## the magnet analogy
|
|
||||||
|
|
||||||
the `was_above_target` boolean is like the magnet's current polarity. the frame time "pushing" past thresholds is like the magnetic field. the key insight: **the threshold you need to cross depends on which side you're currently on.**
|
|
||||||
|
|
||||||
if you're in "good" state, you need a significant spike (>10.33ms) before you flip to "bad". if you're in "bad" state, you only need to drop below 8.33ms to recover. this asymmetry is the hysteresis.
|
|
||||||
|
|
||||||
## real-world examples
|
|
||||||
|
|
||||||
- thermostat: heat on at 68°F, off at 72°F (prevents rapid on/off cycling)
|
|
||||||
- schmitt trigger in electronics: same concept, prevents noise from causing oscillation
|
|
||||||
|
|
||||||
the `THRESHOLD_MARGIN` of 2.0ms is the "width" of the hysteresis band - bigger = more stable but less responsive.
|
|
||||||
|
|
@ -1,54 +0,0 @@
|
||||||
# Zoom/Pan Camera Design
|
|
||||||
|
|
||||||
A viewport camera for zooming into and panning around the simulation without affecting entity behavior.
|
|
||||||
|
|
||||||
## Core Behavior
|
|
||||||
|
|
||||||
### Zoom
|
|
||||||
- Scroll wheel zooms toward mouse cursor position
|
|
||||||
- Range: 1x (default floor) to 10x (ceiling)
|
|
||||||
- Instant response, no animation
|
|
||||||
- Esc or Space resets to 1x and clears pan offset
|
|
||||||
|
|
||||||
### Pan
|
|
||||||
- Any mouse button (left/middle/right) + drag pans the viewport
|
|
||||||
- Only available when zoom > 1x
|
|
||||||
- Bounded to simulation area - cannot pan into empty space
|
|
||||||
|
|
||||||
### UI
|
|
||||||
- Display current zoom level in existing panel under render info (e.g., `zoom: 2.3x`)
|
|
||||||
|
|
||||||
## Implementation Approach
|
|
||||||
|
|
||||||
### State
|
|
||||||
New camera state in `sandbox_main.zig`:
|
|
||||||
```zig
|
|
||||||
var zoom: f32 = 1.0;
|
|
||||||
var pan: @Vector(2, f32) = .{ 0, 0 };
|
|
||||||
```
|
|
||||||
|
|
||||||
### Shader Changes
|
|
||||||
Modify `entity.vert` to accept `zoom` and `pan` uniforms:
|
|
||||||
- Apply pan offset before converting to NDC
|
|
||||||
- Scale by zoom factor
|
|
||||||
- Scale quad size by zoom so entities appear larger
|
|
||||||
|
|
||||||
### Input Handling
|
|
||||||
- `getMouseWheelMove()` adjusts zoom (clamped 1.0–10.0)
|
|
||||||
- Zoom-toward-cursor: adjust pan to keep point under cursor stationary
|
|
||||||
- Mouse drag (any button) adjusts pan with bounds checking
|
|
||||||
- Esc/Space resets zoom to 1.0 and pan to (0, 0)
|
|
||||||
|
|
||||||
### Zoom-Toward-Cursor Math
|
|
||||||
When zooming from `oldZoom` to `newZoom` with cursor at `mousePos`:
|
|
||||||
```
|
|
||||||
worldMousePos = (mousePos / oldZoom) + pan
|
|
||||||
newPan = worldMousePos - (mousePos / newZoom)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Pan Bounds
|
|
||||||
Constrain pan so viewport stays within simulation area:
|
|
||||||
```
|
|
||||||
maxPan = simulationSize - (screenSize / zoom)
|
|
||||||
pan = clamp(pan, 0, maxPan)
|
|
||||||
```
|
|
||||||
|
|
@ -1,440 +0,0 @@
|
||||||
# Zoom/Pan Camera Implementation Plan
|
|
||||||
|
|
||||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
|
||||||
|
|
||||||
**Goal:** Add viewport zoom (scroll wheel toward cursor) and pan (any mouse drag when zoomed) to observe the simulation up close.
|
|
||||||
|
|
||||||
**Architecture:** Camera state (zoom, pan) lives in sandbox_main.zig. Passed to shader as uniforms. All rendering paths use the same camera state, but only SSBO path gets shader-based zoom (others would need separate work).
|
|
||||||
|
|
||||||
**Tech Stack:** Zig, raylib, GLSL 430
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 1: Add camera state and shader uniforms
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `src/sandbox_main.zig:266` (add state after `var paused`)
|
|
||||||
- Modify: `src/ssbo_renderer.zig:20-21` (add uniform locations to struct)
|
|
||||||
- Modify: `src/ssbo_renderer.zig:54-62` (get uniform locations in init)
|
|
||||||
- Modify: `src/ssbo_renderer.zig:154-156` (pass uniforms in render)
|
|
||||||
|
|
||||||
**Step 1: Add camera state to sandbox_main.zig**
|
|
||||||
|
|
||||||
After line 266 (`var paused = false;`), add:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
// camera state for zoom/pan
|
|
||||||
var zoom: f32 = 1.0;
|
|
||||||
var pan = @Vector(2, f32){ 0, 0 };
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 2: Add uniform locations to SsboRenderer struct**
|
|
||||||
|
|
||||||
In `src/ssbo_renderer.zig`, add to struct fields after line 21 (`circle_texture_loc`):
|
|
||||||
|
|
||||||
```zig
|
|
||||||
zoom_loc: i32,
|
|
||||||
pan_loc: i32,
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 3: Get uniform locations in init**
|
|
||||||
|
|
||||||
After line 55 (`const circle_texture_loc = ...`), add:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
const zoom_loc = rl.gl.rlGetLocationUniform(shader_id, "zoom");
|
|
||||||
const pan_loc = rl.gl.rlGetLocationUniform(shader_id, "pan");
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 4: Add fields to return struct**
|
|
||||||
|
|
||||||
In the return statement (around line 112), add:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
.zoom_loc = zoom_loc,
|
|
||||||
.pan_loc = pan_loc,
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 5: Pass uniforms in render method**
|
|
||||||
|
|
||||||
Change render signature to accept zoom/pan:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
pub fn render(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32)) void {
|
|
||||||
```
|
|
||||||
|
|
||||||
After line 156 (setting screenSize uniform), add:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
// set zoom uniform
|
|
||||||
rl.gl.rlSetUniform(self.zoom_loc, &zoom, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
|
|
||||||
|
|
||||||
// set pan uniform
|
|
||||||
const pan_arr = [2]f32{ pan[0], pan[1] };
|
|
||||||
rl.gl.rlSetUniform(self.pan_loc, &pan_arr, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 6: Update render call in sandbox_main.zig**
|
|
||||||
|
|
||||||
Change line 336 from:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
ssbo_renderer.?.render(&entities);
|
|
||||||
```
|
|
||||||
|
|
||||||
To:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
ssbo_renderer.?.render(&entities, zoom, pan);
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 7: Build and verify compiles**
|
|
||||||
|
|
||||||
Run: `zig build`
|
|
||||||
|
|
||||||
Expected: Compiles with no errors (shader won't use uniforms yet, but that's fine)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 2: Update vertex shader for zoom/pan
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `src/shaders/entity.vert`
|
|
||||||
|
|
||||||
**Step 1: Add uniforms**
|
|
||||||
|
|
||||||
After line 19 (`uniform vec2 screenSize;`), add:
|
|
||||||
|
|
||||||
```glsl
|
|
||||||
uniform float zoom;
|
|
||||||
uniform vec2 pan;
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 2: Update NDC calculation**
|
|
||||||
|
|
||||||
Replace lines 29-31:
|
|
||||||
|
|
||||||
```glsl
|
|
||||||
// convert entity position to NDC
|
|
||||||
// entity coords are in screen pixels, convert to [-1, 1]
|
|
||||||
float ndcX = (e.x / screenSize.x) * 2.0 - 1.0;
|
|
||||||
float ndcY = (e.y / screenSize.y) * 2.0 - 1.0;
|
|
||||||
```
|
|
||||||
|
|
||||||
With:
|
|
||||||
|
|
||||||
```glsl
|
|
||||||
// apply pan offset and zoom to convert to NDC
|
|
||||||
// pan is in screen pixels, zoom scales the view
|
|
||||||
float ndcX = ((e.x - pan.x) * zoom / screenSize.x) * 2.0 - 1.0;
|
|
||||||
float ndcY = ((e.y - pan.y) * zoom / screenSize.y) * 2.0 - 1.0;
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 3: Scale quad size by zoom**
|
|
||||||
|
|
||||||
Replace line 34:
|
|
||||||
|
|
||||||
```glsl
|
|
||||||
float quadSizeNdc = 16.0 / screenSize.x;
|
|
||||||
```
|
|
||||||
|
|
||||||
With:
|
|
||||||
|
|
||||||
```glsl
|
|
||||||
float quadSizeNdc = (16.0 * zoom) / screenSize.x;
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 4: Build and test**
|
|
||||||
|
|
||||||
Run: `zig build && ./zig-out/bin/lofivor`
|
|
||||||
|
|
||||||
Expected: Renders exactly as before (zoom=1.0, pan=0,0 should be identical to old behavior)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 3: Add zoom input handling
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `src/sandbox_main.zig` (handleInput function and main loop)
|
|
||||||
|
|
||||||
**Step 1: Add zoom constants**
|
|
||||||
|
|
||||||
After line 32 (BENCH_EXIT_SUSTAIN), add:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
// zoom settings
|
|
||||||
const ZOOM_MIN: f32 = 1.0;
|
|
||||||
const ZOOM_MAX: f32 = 10.0;
|
|
||||||
const ZOOM_SPEED: f32 = 0.1; // fractional zoom change per scroll tick (zoom factor = 1 + ZOOM_SPEED)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 2: Create handleCamera function**
|
|
||||||
|
|
||||||
After the `handleInput` function (around line 458), add:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
fn handleCamera(zoom: *f32, pan: *@Vector(2, f32)) void {
|
|
||||||
const wheel = rl.getMouseWheelMove();
|
|
||||||
|
|
||||||
if (wheel != 0) {
|
|
||||||
const mouse_pos = rl.getMousePosition();
|
|
||||||
const old_zoom = zoom.*;
|
|
||||||
|
|
||||||
// calculate new zoom
|
|
||||||
const zoom_factor = if (wheel > 0) (1.0 + ZOOM_SPEED) else (1.0 / (1.0 + ZOOM_SPEED));
|
|
||||||
var new_zoom = old_zoom * zoom_factor;
|
|
||||||
new_zoom = std.math.clamp(new_zoom, ZOOM_MIN, ZOOM_MAX);
|
|
||||||
|
|
||||||
if (new_zoom != old_zoom) {
|
|
||||||
// zoom toward mouse cursor:
|
|
||||||
// keep the world point under the cursor stationary
|
|
||||||
// world_pos = (screen_pos / old_zoom) + old_pan
|
|
||||||
// new_pan = world_pos - (screen_pos / new_zoom)
|
|
||||||
const world_x = (mouse_pos.x / old_zoom) + pan.*[0];
|
|
||||||
const world_y = (mouse_pos.y / old_zoom) + pan.*[1];
|
|
||||||
pan.*[0] = world_x - (mouse_pos.x / new_zoom);
|
|
||||||
pan.*[1] = world_y - (mouse_pos.y / new_zoom);
|
|
||||||
zoom.* = new_zoom;
|
|
||||||
|
|
||||||
// clamp pan to bounds
|
|
||||||
clampPan(pan, zoom.*);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// reset on Esc (the Space reset is added in Step 4; Space also toggles pause in handleInput)
|
|
||||||
if (rl.isKeyPressed(.escape)) {
|
|
||||||
zoom.* = 1.0;
|
|
||||||
pan.* = @Vector(2, f32){ 0, 0 };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn clampPan(pan: *@Vector(2, f32), zoom: f32) void {
|
|
||||||
// when zoomed in, limit pan so viewport stays in simulation bounds
|
|
||||||
// visible area = screen_size / zoom
|
|
||||||
// max pan = world_size - visible_area
|
|
||||||
const screen_w: f32 = @floatFromInt(SCREEN_WIDTH);
|
|
||||||
const screen_h: f32 = @floatFromInt(SCREEN_HEIGHT);
|
|
||||||
const visible_w = screen_w / zoom;
|
|
||||||
const visible_h = screen_h / zoom;
|
|
||||||
|
|
||||||
const max_pan_x = @max(0, screen_w - visible_w);
|
|
||||||
const max_pan_y = @max(0, screen_h - visible_h);
|
|
||||||
|
|
||||||
pan.*[0] = std.math.clamp(pan.*[0], 0, max_pan_x);
|
|
||||||
pan.*[1] = std.math.clamp(pan.*[1], 0, max_pan_y);
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 3: Call handleCamera in main loop**
|
|
||||||
|
|
||||||
In the main loop, after the `handleInput` call (line 318), add:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
handleCamera(&zoom, &pan);
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 4: Also reset zoom when Space is pressed**
|
|
||||||
|
|
||||||
In `handleInput`, modify the space key handler (around line 450):
|
|
||||||
|
|
||||||
```zig
|
|
||||||
// pause: space (also resets zoom in handleCamera context)
|
|
||||||
if (rl.isKeyPressed(.space)) {
|
|
||||||
paused.* = !paused.*;
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
However, `handleInput` doesn't have access to zoom/pan, so the handler above only gains an updated comment. To actually perform the reset we need to either:
|
|
||||||
- Pass zoom/pan to handleInput
|
|
||||||
- Handle space reset in handleCamera
|
|
||||||
|
|
||||||
Let's handle it in handleCamera. Add after the escape check:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
// Space also resets zoom (pause is handled separately in handleInput)
|
|
||||||
if (rl.isKeyPressed(.space)) {
|
|
||||||
zoom.* = 1.0;
|
|
||||||
pan.* = @Vector(2, f32){ 0, 0 };
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 5: Build and test zoom**
|
|
||||||
|
|
||||||
Run: `zig build && ./zig-out/bin/lofivor`
|
|
||||||
|
|
||||||
Test:
|
|
||||||
1. Scroll up - entities should get bigger (zoom in toward cursor)
|
|
||||||
2. Scroll down - entities get smaller (but not below 1x)
|
|
||||||
3. Press Esc or Space - resets to default view
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 4: Add pan input handling
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `src/sandbox_main.zig` (handleCamera function)
|
|
||||||
|
|
||||||
**Step 1: Add pan logic to handleCamera**
|
|
||||||
|
|
||||||
Add this after the zoom handling, before the reset checks:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
// pan with any mouse button drag (only when zoomed in)
|
|
||||||
if (zoom.* > 1.0) {
|
|
||||||
const any_button = rl.isMouseButtonDown(.left) or
|
|
||||||
rl.isMouseButtonDown(.right) or
|
|
||||||
rl.isMouseButtonDown(.middle);
|
|
||||||
if (any_button) {
|
|
||||||
const delta = rl.getMouseDelta();
|
|
||||||
// pan in opposite direction of drag (drag right = view moves left = pan decreases)
|
|
||||||
pan.*[0] -= delta.x / zoom.*;
|
|
||||||
pan.*[1] -= delta.y / zoom.*;
|
|
||||||
clampPan(pan, zoom.*);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 2: Build and test pan**
|
|
||||||
|
|
||||||
Run: `zig build && ./zig-out/bin/lofivor`
|
|
||||||
|
|
||||||
Test:
|
|
||||||
1. Scroll to zoom in past 1x
|
|
||||||
2. Click and drag with any mouse button - viewport should pan
|
|
||||||
3. Try to pan past edges - should be bounded
|
|
||||||
4. At 1x zoom, dragging should do nothing
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 5: Add zoom display to UI
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `src/ui.zig:34` (drawMetrics signature)
|
|
||||||
- Modify: `src/ui.zig:71-72` (add zoom line after render)
|
|
||||||
- Modify: `src/sandbox_main.zig:387` (pass zoom to drawMetrics)
|
|
||||||
|
|
||||||
**Step 1: Update drawMetrics signature**
|
|
||||||
|
|
||||||
Change line 34:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, font: rl.Font) void {
|
|
||||||
```
|
|
||||||
|
|
||||||
To:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, zoom: f32, font: rl.Font) void {
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 2: Increase box height for zoom line**
|
|
||||||
|
|
||||||
Change line 50:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
const bg_height: i32 = if (paused) 130 else 100;
|
|
||||||
```
|
|
||||||
|
|
||||||
To:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
const bg_height: i32 = if (paused) 150 else 120;
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 3: Add zoom display after render line**
|
|
||||||
|
|
||||||
After line 72 (render_text draw), add:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
y += line_height;
|
|
||||||
|
|
||||||
// zoom level
|
|
||||||
const zoom_text = std.fmt.bufPrintZ(&buf, "zoom: {d:.1}x", .{zoom}) catch "?";
|
|
||||||
rl.drawTextEx(font, zoom_text, .{ .x = padding, .y = y }, font_size, 0, if (zoom > 1.0) highlight_color else text_color);
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 4: Update call in sandbox_main.zig**
|
|
||||||
|
|
||||||
Change line 387:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
ui.drawMetrics(&entities, update_time_us, render_time_us, paused, ui_font);
|
|
||||||
```
|
|
||||||
|
|
||||||
To:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
ui.drawMetrics(&entities, update_time_us, render_time_us, paused, zoom, ui_font);
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 5: Build and test UI**
|
|
||||||
|
|
||||||
Run: `zig build && ./zig-out/bin/lofivor`
|
|
||||||
|
|
||||||
Test:
|
|
||||||
1. UI should show "zoom: 1.0x" in white
|
|
||||||
2. Scroll to zoom - should update and turn yellow when > 1x
|
|
||||||
3. Reset with Esc - back to white 1.0x
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 6: Update controls legend
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `src/ui.zig:120-139` (drawControls function)
|
|
||||||
|
|
||||||
**Step 1: Update controls list and box height**
|
|
||||||
|
|
||||||
Change line 121:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
const ctrl_box_height: i32 = @intFromFloat(small_line_height * 5 + box_padding * 2);
|
|
||||||
```
|
|
||||||
|
|
||||||
To:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
const ctrl_box_height: i32 = @intFromFloat(small_line_height * 7 + box_padding * 2);
|
|
||||||
```
|
|
||||||
|
|
||||||
Change the controls array (lines 127-133):
|
|
||||||
|
|
||||||
```zig
|
|
||||||
const controls = [_][]const u8{
|
|
||||||
"+/-: 10k entities",
|
|
||||||
"shift +/-: 50k",
|
|
||||||
"scroll: zoom",
|
|
||||||
"drag: pan (zoomed)",
|
|
||||||
"space: pause/reset",
|
|
||||||
"esc: reset zoom",
|
|
||||||
"tab: toggle ui",
|
|
||||||
};
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 2: Build and final test**
|
|
||||||
|
|
||||||
Run: `zig build && ./zig-out/bin/lofivor`
|
|
||||||
|
|
||||||
Full test:
|
|
||||||
1. Scroll wheel zooms toward cursor (1x-10x)
|
|
||||||
2. Any mouse drag pans when zoomed > 1x
|
|
||||||
3. Pan is bounded to simulation area
|
|
||||||
4. Esc resets zoom/pan
|
|
||||||
5. Space toggles pause AND resets zoom/pan
|
|
||||||
6. UI shows zoom level (yellow when zoomed)
|
|
||||||
7. Controls legend shows new controls
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 7: Commit
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git add src/sandbox_main.zig src/ssbo_renderer.zig src/shaders/entity.vert src/ui.zig
|
|
||||||
git commit -m "feat: add zoom/pan camera
|
|
||||||
|
|
||||||
- scroll wheel zooms toward cursor (1x-10x range)
|
|
||||||
- any mouse button drag pans when zoomed
|
|
||||||
- pan bounded to simulation area
|
|
||||||
- esc/space resets to default view
|
|
||||||
- zoom level shown in metrics panel"
|
|
||||||
```
|
|
||||||
|
|
@ -1,170 +0,0 @@
|
||||||
# compute shader entity updates
|
|
||||||
|
|
||||||
move entity position math to GPU, eliminate CPU→GPU sync per frame.
|
|
||||||
|
|
||||||
## context
|
|
||||||
|
|
||||||
current bottleneck: per-frame `rlUpdateShaderBuffer()` uploads all entity data from CPU to GPU. at 950k entities that's 19MB/frame. targeting 10M entities would be 160MB/frame.
|
|
||||||
|
|
||||||
solution: keep entity data on GPU entirely. compute shader updates positions, vertex shader renders. CPU just dispatches.
|
|
||||||
|
|
||||||
## data structures
|
|
||||||
|
|
||||||
**GpuEntity (16 bytes, std430):**
|
|
||||||
```glsl
|
|
||||||
struct Entity {
|
|
||||||
float x; // world position
|
|
||||||
float y;
|
|
||||||
int packedVel; // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
|
|
||||||
uint color; // 0xRRGGBB
|
|
||||||
};
|
|
||||||
```
|
|
||||||
|
|
||||||
**zig side:**
|
|
||||||
```zig
|
|
||||||
const GpuEntity = extern struct {
|
|
||||||
x: f32,
|
|
||||||
y: f32,
|
|
||||||
packed_vel: i32,
|
|
||||||
color: u32,
|
|
||||||
};
|
|
||||||
|
|
||||||
fn packVelocity(vx: f32, vy: f32) i32 {
|
|
||||||
const vx_fixed: i16 = @intFromFloat(vx * 256.0);
|
|
||||||
const vy_fixed: i16 = @intFromFloat(vy * 256.0);
|
|
||||||
return (@as(i32, vx_fixed) << 16) | (@as(i32, vy_fixed) & 0xFFFF);
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## compute shader
|
|
||||||
|
|
||||||
`src/shaders/entity_update.comp`:
|
|
||||||
```glsl
|
|
||||||
#version 430
|
|
||||||
layout(local_size_x = 256) in;
|
|
||||||
|
|
||||||
layout(std430, binding = 0) buffer Entities {
|
|
||||||
Entity entities[];
|
|
||||||
};
|
|
||||||
|
|
||||||
uniform uint entityCount;
|
|
||||||
uniform uint frameNumber;
|
|
||||||
uniform vec2 screenSize;
|
|
||||||
uniform vec2 center;
|
|
||||||
uniform float respawnRadius;
|
|
||||||
|
|
||||||
void main() {
|
|
||||||
uint id = gl_GlobalInvocationID.x;
|
|
||||||
if (id >= entityCount) return;
|
|
||||||
|
|
||||||
Entity e = entities[id];
|
|
||||||
|
|
||||||
// unpack velocity
|
|
||||||
float vx = float(e.packedVel >> 16) / 256.0;
|
|
||||||
float vy = float((e.packedVel << 16) >> 16) / 256.0;
|
|
||||||
|
|
||||||
// update position
|
|
||||||
e.x += vx;
|
|
||||||
e.y += vy;
|
|
||||||
|
|
||||||
// respawn check
|
|
||||||
float dx = e.x - center.x;
|
|
||||||
float dy = e.y - center.y;
|
|
||||||
if (dx*dx + dy*dy < respawnRadius * respawnRadius) {
|
|
||||||
// GPU RNG
|
|
||||||
uint seed = id * 1103515245u + frameNumber * 12345u;
|
|
||||||
seed = seed * 747796405u + 2891336453u;
|
|
||||||
|
|
||||||
uint edge = seed & 3u;
|
|
||||||
float t = float((seed >> 2) & 0xFFFFu) / 65535.0;
|
|
||||||
|
|
||||||
// spawn on edge with velocity toward center
|
|
||||||
// (full edge logic in implementation)
|
|
||||||
}
|
|
||||||
|
|
||||||
entities[id] = e;
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## integration
|
|
||||||
|
|
||||||
raylib doesn't wrap compute shaders. use raw GL calls via `compute.zig`:
|
|
||||||
|
|
||||||
```zig
|
|
||||||
pub fn dispatch(entity_count: u32, frame: u32) void {
|
|
||||||
gl.glUseProgram(program);
|
|
||||||
gl.glUniform1ui(entity_count_loc, entity_count);
|
|
||||||
gl.glUniform1ui(frame_loc, frame);
|
|
||||||
// ... other uniforms
|
|
||||||
|
|
||||||
const groups = (entity_count + 255) / 256;
|
|
||||||
gl.glDispatchCompute(groups, 1, 1);
|
|
||||||
gl.glMemoryBarrier(gl.GL_SHADER_STORAGE_BARRIER_BIT);
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## frame flow
|
|
||||||
|
|
||||||
**before:**
|
|
||||||
```
|
|
||||||
CPU: update positions (5ms at 950k)
|
|
||||||
CPU: copy to gpu_buffer
|
|
||||||
CPU→GPU: rlUpdateShaderBuffer() ← bottleneck
|
|
||||||
GPU: render
|
|
||||||
```
|
|
||||||
|
|
||||||
**after:**
|
|
||||||
```
|
|
||||||
GPU: compute dispatch (~0ms CPU time)
|
|
||||||
GPU: memory barrier
|
|
||||||
GPU: render
|
|
||||||
```
|
|
||||||
|
|
||||||
## implementation steps
|
|
||||||
|
|
||||||
each step is a commit point if desired.
|
|
||||||
|
|
||||||
### step 1: GpuEntity struct expansion
|
|
||||||
- modify `GpuEntity` in sandbox.zig: add `packed_vel` field
|
|
||||||
- add `packVelocity()` helper
|
|
||||||
- update ssbo_renderer to handle 16-byte stride
|
|
||||||
- verify existing rendering still works
|
|
||||||
|
|
||||||
### step 2: compute shader infrastructure
|
|
||||||
- create `src/compute.zig` with GL bindings
|
|
||||||
- create `src/shaders/entity_update.comp` (position update only, no respawn yet)
|
|
||||||
- load and compile compute shader in sandbox_main.zig
|
|
||||||
- dispatch before render, verify positions update
|
|
||||||
|
|
||||||
### step 3: respawn logic
|
|
||||||
- add GPU RNG to compute shader
|
|
||||||
- implement edge spawning + velocity calculation
|
|
||||||
- remove CPU update loop from sandbox.zig
|
|
||||||
|
|
||||||
### step 4: cleanup ✓
|
|
||||||
- `--compute` is now default, `--cpu` flag for fallback/comparison
|
|
||||||
- justfile updated: `just bench` (compute), `just bench-cpu` (comparison)
|
|
||||||
- verbose debug output reduced
|
|
||||||
|
|
||||||
## files changed
|
|
||||||
|
|
||||||
**new:**
|
|
||||||
- `src/shaders/entity_update.comp`
|
|
||||||
- `src/compute.zig`
|
|
||||||
|
|
||||||
**modified:**
|
|
||||||
- `src/sandbox.zig` — GpuEntity struct, packVelocity(), remove CPU update
|
|
||||||
- `src/ssbo_renderer.zig` — remove per-frame upload
|
|
||||||
- `src/sandbox_main.zig` — init compute, dispatch in frame loop
|
|
||||||
|
|
||||||
## risks
|
|
||||||
|
|
||||||
1. **driver quirks** — intel HD 530 compute support is fine but older, may hit edge cases
|
|
||||||
2. **debugging** — GPU code harder to debug, start with small counts
|
|
||||||
3. **fallback** — keep a CLI flag to A/B test against the existing SSBO path (`--compute` during development; once compute is the default, `--cpu` selects the fallback)
|
|
||||||
|
|
||||||
## expected results
|
|
||||||
|
|
||||||
- CPU update time: ~5ms → ~0ms
|
|
||||||
- no per-frame buffer upload
|
|
||||||
- target: 1M+ entities, pushing toward 10M ceiling
|
|
||||||
|
|
@ -1,89 +0,0 @@
|
||||||
# point sprites experiment
|
|
||||||
|
|
||||||
branch: `point-sprites` (point-sprites work)
|
|
||||||
date: 2024-12
|
|
||||||
hardware: intel hd 530 (skylake gt2, i5-6500T)
|
|
||||||
|
|
||||||
## hypothesis
|
|
||||||
|
|
||||||
point sprites should be faster than quads because:
|
|
||||||
- 1 vertex per entity instead of 6 (quad = 2 triangles)
|
|
||||||
- less vertex throughput
|
|
||||||
- `gl_PointCoord` provides texture coords automatically
|
|
||||||
|
|
||||||
## implementation
|
|
||||||
|
|
||||||
### vertex shader changes
|
|
||||||
- removed quad vertex attributes (position, texcoord)
|
|
||||||
- use `gl_PointSize = 16.0 * zoom` for size control
|
|
||||||
- position calculated from SSBO data only
|
|
||||||
|
|
||||||
### fragment shader changes
|
|
||||||
- use `gl_PointCoord` instead of vertex texcoord
|
|
||||||
- sample circle texture for alpha
|
|
||||||
|
|
||||||
### renderer changes
|
|
||||||
- load `glEnable` and `glDrawArraysInstanced` via `rlGetProcAddress`
|
|
||||||
- enable `GL_PROGRAM_POINT_SIZE`
|
|
||||||
- draw with `glDrawArraysInstanced(GL_POINTS, 0, 1, count)`
|
|
||||||
- removed VBO (no vertex data needed)
|
|
||||||
|
|
||||||
## results
|
|
||||||
|
|
||||||
### attempt 1: procedural circle in fragment shader
|
|
||||||
|
|
||||||
```glsl
|
|
||||||
vec2 coord = gl_PointCoord - vec2(0.5);
|
|
||||||
float dist = length(coord);
|
|
||||||
float alpha = 1.0 - smoothstep(0.4, 0.5, dist);
|
|
||||||
if (alpha < 0.01) discard;
|
|
||||||
```
|
|
||||||
|
|
||||||
**benchmark @ 350k entities:**
|
|
||||||
- point sprites: 23ms render, 43fps
|
|
||||||
- quads (main): 6.2ms render, 151fps
|
|
||||||
- **result: 3.7x SLOWER**
|
|
||||||
|
|
||||||
**why:** `discard` breaks early-z optimization, `length()` and `smoothstep()` are ALU-heavy, intel integrated GPUs are weak at fragment shader math.
|
|
||||||
|
|
||||||
### attempt 2: texture sampling
|
|
||||||
|
|
||||||
```glsl
|
|
||||||
float alpha = texture(circleTexture, gl_PointCoord).r;
|
|
||||||
finalColor = vec4(fragColor, alpha);
|
|
||||||
```
|
|
||||||
|
|
||||||
**benchmark @ 450k entities:**
|
|
||||||
- point sprites: 19.1ms render, 52fps
|
|
||||||
- quads (main): 8.0ms render, 122fps
|
|
||||||
- **result: 2.4x SLOWER**
|
|
||||||
|
|
||||||
better than procedural, but still significantly slower than quads.
|
|
||||||
|
|
||||||
## analysis
|
|
||||||
|
|
||||||
the theoretical advantage (1/6 vertices) doesn't translate to real performance because:
|
|
||||||
|
|
||||||
1. **triangle path is more optimized** - intel's driver heavily optimizes the standard triangle rasterization path. point sprites use a less-traveled code path.
|
|
||||||
|
|
||||||
2. **fill rate is the bottleneck** - HD 530 has only 3 ROPs. we're bound by how fast we can write pixels, not by vertex count. reducing vertices from 6 to 1 doesn't help when fill rate is the constraint.
|
|
||||||
|
|
||||||
3. **point size overhead** - each point requires computing `gl_PointSize` and setting up the point sprite rasterization, which may have per-vertex overhead.
|
|
||||||
|
|
||||||
4. **texture cache behavior** - `gl_PointCoord` may have worse cache locality than explicit vertex texcoords.
|
|
||||||
|
|
||||||
## conclusion
|
|
||||||
|
|
||||||
**point sprites are a regression on intel hd 530.**
|
|
||||||
|
|
||||||
the optimization makes theoretical sense but fails in practice on this hardware. the quad/triangle path is simply more optimized in intel's mesa driver.
|
|
||||||
|
|
||||||
**keep this branch for testing on discrete GPUs** where point sprites might actually help (nvidia/amd have different optimization priorities).
|
|
||||||
|
|
||||||
## lessons learned
|
|
||||||
|
|
||||||
1. always benchmark, don't assume
|
|
||||||
2. "fewer vertices" doesn't always mean faster
|
|
||||||
3. integrated GPU optimization is different from discrete
|
|
||||||
4. the most optimized path is usually the most common path (triangles)
|
|
||||||
5. fill rate matters more than vertex count at high entity counts
|
|
||||||
201
docs/rops.txt
201
docs/rops.txt
|
|
@ -1,201 +0,0 @@
|
||||||
rops: render output units
|
|
||||||
=========================
|
|
||||||
|
|
||||||
what they are, where they came from, and what yours can do.
|
|
||||||
|
|
||||||
|
|
||||||
what is a rop?
|
|
||||||
--------------
|
|
||||||
|
|
||||||
ROP = Render Output Unit (originally "Raster Operations Pipeline")
|
|
||||||
|
|
||||||
it's the final stage of the GPU pipeline. after all the fancy shader
|
|
||||||
math is done, the ROP is the unit that actually writes pixels to memory.
|
|
||||||
|
|
||||||
think of it as the bottleneck between "calculated" and "visible."
|
|
||||||
|
|
||||||
a ROP does:
|
|
||||||
- depth testing (is this pixel in front of what's already there?)
|
|
||||||
- stencil testing (mask operations)
|
|
||||||
- blending (alpha, additive, etc)
|
|
||||||
- anti-aliasing resolve
|
|
||||||
- writing the final color to the framebuffer
|
|
||||||
|
|
||||||
one ROP can write one pixel per clock cycle (roughly).
|
|
||||||
|
|
||||||
|
|
||||||
the first rop
|
|
||||||
-------------
|
|
||||||
|
|
||||||
the term comes from the IBM 8514/A (1987), which had dedicated hardware
|
|
||||||
for "raster operations" - bitwise operations on pixels (AND, OR, XOR).
|
|
||||||
this was revolutionary because before this, the CPU did all pixel math.
|
|
||||||
|
|
||||||
but the modern ROP as we know it emerged with:
|
|
||||||
|
|
||||||
NVIDIA NV1 (1995)
|
|
||||||
one of the first chips with dedicated pixel output hardware
|
|
||||||
could do ~1 million textured pixels/second
|
|
||||||
|
|
||||||
3dfx Voodoo (1996)
|
|
||||||
the card that defined the modern GPU pipeline
|
|
||||||
had 1 TMU + 1 pixel pipeline (essentially 1 ROP)
|
|
||||||
could push 45 million pixels/second
|
|
||||||
that ONE pipeline ran Quake at 640x480
|
|
||||||
|
|
||||||
NVIDIA GeForce 256 (1999)
|
|
||||||
"the first GPU" - named itself with that term
|
|
||||||
4 pixel pipelines = 4 ROPs
|
|
||||||
480 million pixels/second
|
|
||||||
|
|
||||||
so the original consumer 3D cards had... 1 ROP. and they ran Quake.
|
|
||||||
|
|
||||||
|
|
||||||
what one rop can do
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
let's do the math.
|
|
||||||
|
|
||||||
one ROP at 100 MHz (3dfx Voodoo era):
|
|
||||||
100 million cycles/second
|
|
||||||
~1 pixel per cycle
|
|
||||||
= 100 megapixels/second
|
|
||||||
|
|
||||||
at 640x480 @ 60fps:
|
|
||||||
640 * 480 * 60 = 18.4 megapixels/second needed
|
|
||||||
|
|
||||||
so ONE ROP at 100MHz could handle 640x480 with ~5x headroom for overdraw.
|
|
||||||
|
|
||||||
at 1024x768 @ 60fps:
|
|
||||||
1024 * 768 * 60 = 47 megapixels/second
|
|
||||||
|
|
||||||
now you're at 2x overdraw max. still playable, but tight.
|
|
||||||
|
|
||||||
|
|
||||||
one modern rop
|
|
||||||
--------------
|
|
||||||
|
|
||||||
a single modern ROP runs at ~1-2 GHz and can do more per cycle:
|
|
||||||
- multiple color outputs (MRT)
|
|
||||||
- 64-bit or 128-bit color formats
|
|
||||||
- compressed writes
|
|
||||||
|
|
||||||
rough estimate for one ROP at 1.5 GHz:
|
|
||||||
~1.5 billion pixels/second base throughput
|
|
||||||
|
|
||||||
at 1920x1080 @ 60fps:
|
|
||||||
1920 * 1080 * 60 = 124 megapixels/second
|
|
||||||
|
|
||||||
one ROP could handle 1080p with 12x overdraw headroom.
|
|
||||||
|
|
||||||
at 4K @ 60fps:
|
|
||||||
3840 * 2160 * 60 = 497 megapixels/second
|
|
||||||
|
|
||||||
one ROP could handle 4K with 3x overdraw. tight, but possible.
|
|
||||||
|
|
||||||
|
|
||||||
your three rops (intel hd 530)
|
|
||||||
------------------------------
|
|
||||||
|
|
||||||
HD 530 specs:
|
|
||||||
- 3 ROPs
|
|
||||||
- ~950 MHz boost clock
|
|
||||||
- theoretical: 2.85 GPixels/second
|
|
||||||
|
|
||||||
let's break that down:
|
|
||||||
|
|
||||||
at 1080p @ 60fps (124 MP/s needed):
|
|
||||||
2850 / 124 = 23x overdraw budget
|
|
||||||
|
|
||||||
that's actually generous! you could draw each pixel 23 times.
|
|
||||||
|
|
||||||
so why does lofivor struggle at 1M entities?
|
|
||||||
|
|
||||||
because 1M entities at 4x4 pixels = 16M pixels minimum.
|
|
||||||
but with overlap? let's say average 10x overdraw:
|
|
||||||
160M pixels/frame
|
|
||||||
at 60fps = 9.6 billion pixels/second
|
|
||||||
|
|
||||||
your ceiling is 2.85 billion.
|
|
||||||
|
|
||||||
so you're 3.4x over budget. that's why you top out around 300k-400k
|
|
||||||
before frame drops (which matches empirical testing).
|
|
||||||
|
|
||||||
|
|
||||||
the real constraint
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
ROPs don't work in isolation. they're limited by:
|
|
||||||
|
|
||||||
1. MEMORY BANDWIDTH
|
|
||||||
each pixel write = memory access
|
|
||||||
HD 530 shares DDR4 with CPU (~30 GB/s)
|
|
||||||
at 32-bit color: 30GB/s / 4 bytes = 7.5 billion pixels/second max
|
|
||||||
but you're competing with CPU, texture reads, etc.
|
|
||||||
realistic: maybe 2-3 billion pixels for framebuffer writes
|
|
||||||
|
|
||||||
2. TEXTURE SAMPLING
|
|
||||||
if fragment shader samples textures, TMUs must keep up
|
|
||||||
HD 530 has 24 TMUs, so this isn't the bottleneck
|
|
||||||
|
|
||||||
3. SHADER EXECUTION
|
|
||||||
ROPs wait for fragments to be shaded
|
|
||||||
if shaders are slow, ROPs starve
|
|
||||||
lofivor's shaders are trivial, so this isn't the bottleneck
|
|
||||||
|
|
||||||
for lofivor specifically: your 3 ROPs are THE ceiling.
|
|
||||||
|
|
||||||
|
|
||||||
what could you do with more rops?
|
|
||||||
---------------------------------
|
|
||||||
|
|
||||||
comparison:
|
|
||||||
|
|
||||||
Intel HD 530: 3 ROPs, 2.85 GPixels/s
|
|
||||||
GTX 1060: 48 ROPs, 72 GPixels/s
|
|
||||||
RTX 3080: 96 ROPs, 164 GPixels/s
|
|
||||||
RTX 4090: 176 ROPs, 443 GPixels/s
|
|
||||||
|
|
||||||
with a GTX 1060 (25x your fill rate):
|
|
||||||
lofivor could probably hit 5-10 million entities
|
|
||||||
|
|
||||||
with an RTX 4090 (155x your fill rate):
|
|
||||||
tens of millions, limited by other factors
|
|
||||||
|
|
||||||
|
|
||||||
perspective: what 3 rops means historically
|
|
||||||
-------------------------------------------
|
|
||||||
|
|
||||||
your HD 530 has roughly the fill rate of:
|
|
||||||
- GeForce 4 Ti 4600 (2002): 4 ROPs, 1.2 GPixels/s
|
|
||||||
- Radeon 9700 Pro (2002): 8 ROPs, 2.6 GPixels/s
|
|
||||||
|
|
||||||
you're running hardware that, in raw pixel output, matches GPUs from
|
|
||||||
20+ years ago. but with modern features (compute shaders, SSBO, etc).
|
|
||||||
|
|
||||||
this is why lofivor is interesting: you're achieving 700k+ entities
|
|
||||||
on fill-rate-equivalent hardware that originally ran games with
|
|
||||||
maybe 10,000 triangles on screen.
|
|
||||||
|
|
||||||
the difference is technique. those 2002 games did complex per-pixel
|
|
||||||
lighting, shadows, multiple texture passes. lofivor does one texture
|
|
||||||
sample and one blend. same fill rate, 100x the entities.
|
|
||||||
|
|
||||||
|
|
||||||
the lesson
|
|
||||||
----------
|
|
||||||
|
|
||||||
ROPs are simple: they write pixels.
|
|
||||||
|
|
||||||
the number you have determines your pixel budget.
|
|
||||||
everything else (shaders, vertices, CPU logic) only matters if
|
|
||||||
the ROPs aren't your bottleneck.
|
|
||||||
|
|
||||||
with 3 ROPs, you have roughly 2.85 billion pixels/second.
|
|
||||||
spend them wisely:
|
|
||||||
- cull what's offscreen (don't spend pixels on invisible things)
|
|
||||||
- shrink distant objects (LOD saves pixels)
|
|
||||||
- reduce overlap (spatial organization)
|
|
||||||
- keep shaders simple (don't starve the ROPs)
|
|
||||||
|
|
||||||
your 3 ROPs can do remarkable things. Quake ran on 1.
|
|
||||||
|
|
@ -1,316 +0,0 @@
|
||||||
why rendering millions of entities is hard
|
|
||||||
=========================================
|
|
||||||
|
|
||||||
and what "hard" actually means, from first principles.
|
|
||||||
|
|
||||||
|
|
||||||
the simple answer
|
|
||||||
-----------------
|
|
||||||
|
|
||||||
every frame, your computer does work. work takes time. you have 16.7
|
|
||||||
milliseconds to do all the work before the next frame (at 60fps).
|
|
||||||
|
|
||||||
if the work takes longer than 16.7ms, you miss the deadline. frames drop.
|
|
||||||
the game stutters.
|
|
||||||
|
|
||||||
10 million entities means 10 million units of work. whether that fits in
|
|
||||||
16.7ms depends on how much work each unit is.
|
|
||||||
|
|
||||||
|
|
||||||
what is "work" anyway?
|
|
||||||
----------------------
|
|
||||||
|
|
||||||
let's trace what happens when you draw one entity:
|
|
||||||
|
|
||||||
1. CPU: "here's an entity at position (340, 512), color cyan"
|
|
||||||
2. that data travels over a bus to the GPU
|
|
||||||
3. GPU: receives the data, stores it in memory
|
|
||||||
4. GPU: runs a vertex shader (figures out where on screen)
|
|
||||||
5. GPU: runs a fragment shader (figures out what color each pixel is)
|
|
||||||
6. GPU: writes pixels to the framebuffer
|
|
||||||
7. framebuffer gets sent to your monitor
|
|
||||||
|
|
||||||
each step has a speed limit. the slowest step is your bottleneck.
|
|
||||||
|
|
||||||
|
|
||||||
the bottlenecks, explained simply
|
|
||||||
---------------------------------
|
|
||||||
|
|
||||||
MEMORY BANDWIDTH
|
|
||||||
how fast data can move around. measured in GB/s.
|
|
||||||
|
|
||||||
think of it like a highway. you can have a fast car (processor), but
|
|
||||||
if the highway is jammed, you're stuck in traffic.
|
|
||||||
|
|
||||||
an integrated GPU (like Intel HD 530) shares the highway with the CPU.
|
|
||||||
a discrete GPU (like an RTX card) has its own private highway.
|
|
||||||
|
|
||||||
this is why lofivor's SSBO optimization helped so much: shrinking
|
|
||||||
entity data from 64 bytes to 12 bytes means 5x less traffic.
|
|
||||||
|
|
||||||
DRAW CALLS
|
|
||||||
every time you say "GPU, draw this thing", there's overhead.
|
|
||||||
the CPU and GPU have to synchronize, state gets set up, etc.
|
|
||||||
|
|
||||||
1 draw call for 1 million entities: fast
|
|
||||||
1 million draw calls for 1 million entities: slow
|
|
||||||
|
|
||||||
this is why batching matters. not the drawing itself, but the
|
|
||||||
*coordination* of drawing.
|
|
||||||
|
|
||||||
FILL RATE
|
|
||||||
how many pixels the GPU can color per second.
|
|
||||||
|
|
||||||
a 4x4 pixel entity = 16 pixels
|
|
||||||
1 million entities = 16 million pixels minimum
|
|
||||||
|
|
||||||
but your screen is only ~2 million pixels (1920x1080). so entities
|
|
||||||
overlap. "overdraw" means coloring the same pixel multiple times.
|
|
||||||
|
|
||||||
10 million overlapping entities might touch each pixel 50+ times.
|
|
||||||
that's 100+ million pixel operations. per frame.
|
|
||||||
|
|
||||||
SHADER COMPLEXITY
|
|
||||||
the GPU runs a tiny program for each vertex and each pixel.
|
|
||||||
|
|
||||||
simple: "put it here, color it this" = fast
|
|
||||||
complex: "calculate lighting from 8 sources, sample 4 textures,
|
|
||||||
apply normal mapping, do fresnel..." = slow
|
|
||||||
|
|
||||||
lofivor's shaders are trivial. AAA game shaders are not.
|
|
||||||
|
|
||||||
CPU-GPU SYNCHRONIZATION
|
|
||||||
the CPU and GPU work in parallel, but sometimes they have to wait
|
|
||||||
for each other.
|
|
||||||
|
|
||||||
if the CPU needs to read GPU results, it stalls.
|
|
||||||
if the GPU needs new data and the CPU is busy, it stalls.
|
|
||||||
|
|
||||||
good code keeps them both busy without waiting.
|
|
||||||
|
|
||||||
|
|
||||||
why "real games" hit CPU walls
|
|
||||||
------------------------------
|
|
||||||
|
|
||||||
rendering is just putting colors on pixels. that's the GPU's job.
|
|
||||||
|
|
||||||
but games aren't just rendering. they're also:
|
|
||||||
|
|
||||||
- COLLISION DETECTION
|
|
||||||
does entity A overlap entity B?
|
|
||||||
|
|
||||||
naive approach: check every pair
|
|
||||||
1,000 entities = 500,000 checks (n squared / 2)
|
|
||||||
10,000 entities = 50,000,000 checks
|
|
||||||
1,000,000 entities = 500,000,000,000 checks
|
|
||||||
|
|
||||||
that's 500 billion. per frame. not happening.
|
|
||||||
|
|
||||||
smart approach: spatial partitioning (grids, quadtrees)
|
|
||||||
only check nearby entities. but still, at millions of entities,
|
|
||||||
even "nearby" is a lot.
|
|
||||||
|
|
||||||
- AI / BEHAVIOR
|
|
||||||
each entity decides what to do.
|
|
||||||
|
|
||||||
simple: move toward player. cheap.
|
|
||||||
complex: pathfind around obstacles, consider threats, coordinate
|
|
||||||
with allies, remember state. expensive.
|
|
||||||
|
|
||||||
lofivor entities just drift in a direction. no decisions.
|
|
||||||
a real game enemy makes decisions every frame.
|
|
||||||
|
|
||||||
- PHYSICS
|
|
||||||
entities push each other, bounce, have mass and friction.
|
|
||||||
every interaction is math. lots of entities = lots of math.
|
|
||||||
|
|
||||||
- GAME LOGIC
|
|
||||||
damage calculations, spawning, leveling, cooldowns, buffs...
|
|
||||||
all of this runs on the CPU, every frame.
|
|
||||||
|
|
||||||
so: lofivor can render 700k entities because they don't DO anything.
|
|
||||||
a game with 700k entities that think, collide, and interact would
|
|
||||||
need god-tier optimization or would simply not run.
|
|
||||||
|
|
||||||
|
|
||||||
what makes AAA games slow on old hardware?
|
|
||||||
------------------------------------------
|
|
||||||
|
|
||||||
it's not entity count. most AAA games have maybe hundreds of
|
|
||||||
"entities" on screen. it's everything else:
|
|
||||||
|
|
||||||
TEXTURE RESOLUTION
|
|
||||||
a 4K texture (4096x4096) is ~16.8 million pixels, or ~67 MB of raw RGBA data. per texture.
|
|
||||||
one character might have 10+ textures (diffuse, normal, specular,
|
|
||||||
roughness, ambient occlusion...).
|
|
||||||
|
|
||||||
old hardware: less VRAM, slower texture sampling.
|
|
||||||
|
|
||||||
SHADER COMPLEXITY
|
|
||||||
modern materials simulate light physics. subsurface scattering,
|
|
||||||
global illumination, ray-traced reflections.
|
|
||||||
|
|
||||||
each pixel might do hundreds of math operations.
|
|
||||||
|
|
||||||
POST-PROCESSING
|
|
||||||
bloom, motion blur, depth of field, ambient occlusion, anti-aliasing.
|
|
||||||
full-screen passes that touch every pixel multiple times.
|
|
||||||
|
|
||||||
MESH COMPLEXITY
|
|
||||||
a character might be 100,000 triangles.
|
|
||||||
10 characters = 1 million triangles.
|
|
||||||
each triangle goes through the vertex shader.
|
|
||||||
|
|
||||||
SHADOWS
|
|
||||||
render the scene again from the light's perspective.
|
|
||||||
for each light. every frame.
|
|
||||||
|
|
||||||
AAA games are doing 100x more work per pixel than lofivor.
|
|
||||||
lofivor is doing 100x more pixels than AAA games.
|
|
||||||
|
|
||||||
different problems.
|
|
||||||
|
|
||||||
|
|
||||||
the "abuse" vs "respect" distinction
|
|
||||||
------------------------------------
|
|
||||||
|
|
||||||
abuse: making the hardware do unnecessary work.
|
|
||||||
respect: achieving your goal with minimal waste.
|
|
||||||
|
|
||||||
examples of abuse (that lofivor fixed):
|
|
||||||
|
|
||||||
- sending 64 bytes (a full matrix) when you need 12 bytes (x, y, color)
|
|
||||||
- one draw call per entity when you could batch
|
|
||||||
- calculating transforms on CPU when GPU could do it
|
|
||||||
- clearing the screen twice
|
|
||||||
- uploading the same data every frame
|
|
||||||
|
|
||||||
examples of abuse in the wild:
|
|
||||||
|
|
||||||
- electron apps using a whole browser to show a chat window
|
|
||||||
- games that re-render static UI every frame
|
|
||||||
- loading 4K textures for objects that appear 20 pixels tall
|
|
||||||
- running AI pathfinding for off-screen entities
|
|
||||||
|
|
||||||
the hardware has limits. respecting them means fitting your game
|
|
||||||
within those limits through smart decisions. abusing them means
|
|
||||||
throwing cycles at problems you created yourself.
|
|
||||||
|
|
||||||
|
|
||||||
so can you do 1 million entities with juice on old hardware?
|
|
||||||
------------------------------------------------------------
|
|
||||||
|
|
||||||
yes, with the right decisions.
|
|
||||||
|
|
||||||
what "juice" typically means:
|
|
||||||
- screen shake (free, just offset the camera)
|
|
||||||
- particle effects (separate system, heavily optimized)
|
|
||||||
- flash/hit feedback (change a color value)
|
|
||||||
- sound (different system entirely)
|
|
||||||
|
|
||||||
particles are special: they're designed for millions of tiny things.
|
|
||||||
they don't collide, don't think, often don't even persist (spawn,
|
|
||||||
drift, fade, die). GPU particle systems are essentially what lofivor
|
|
||||||
became: minimal data, instanced rendering.
|
|
||||||
|
|
||||||
what would kill you at 1 million:
|
|
||||||
- per-entity collision
|
|
||||||
- per-entity AI
|
|
||||||
- per-entity sprite variety (texture switches)
|
|
||||||
- per-entity complex shaders
|
|
||||||
|
|
||||||
what you could do:
|
|
||||||
- 1 million particles (visual only, no logic)
|
|
||||||
- 10,000 enemies with collision/AI + 990,000 particles
|
|
||||||
- 100,000 enemies with simple behavior + spatial hash collision
|
|
||||||
|
|
||||||
the secret: most of what looks like "millions of things" in games
|
|
||||||
is actually a small number of meaningful entities + a large number
|
|
||||||
of dumb particles.
|
|
||||||
|
|
||||||
|
|
||||||
the laws of physics (sort of)
|
|
||||||
-----------------------------
|
|
||||||
|
|
||||||
there are hard limits:
|
|
||||||
|
|
||||||
MEMORY BUS BANDWIDTH
|
|
||||||
a DDR4 system might move 25 GB/s.
|
|
||||||
1 million entities at 12 bytes each = 12 MB.
|
|
||||||
at 60fps = 720 MB/s just for entity data.
|
|
||||||
that's only 3% of bandwidth. plenty of room.
|
|
||||||
|
|
||||||
but a naive approach (64 bytes, plus overhead) could be
|
|
||||||
10x worse. suddenly you're at 30%.
|
|
||||||
|
|
||||||
CLOCK CYCLES
|
|
||||||
a 3GHz CPU does 3 billion operations per second.
|
|
||||||
at 60fps, that's 50 million operations per frame.
|
|
||||||
1 million entities = 50 operations each.
|
|
||||||
|
|
||||||
50 operations is: a few multiplies, some loads/stores, a branch.
|
|
||||||
that's barely enough for "move in a direction".
|
|
||||||
pathfinding? AI? collision? not a chance.
|
|
||||||
|
|
||||||
PARALLELISM
|
|
||||||
GPUs have thousands of cores but they're simple.
|
|
||||||
CPUs have few cores but they're smart.
|
|
||||||
|
|
||||||
entity rendering: perfectly parallel (GPU wins)
|
|
||||||
entity decision-making: often sequential (CPU bound)
|
|
||||||
|
|
||||||
so yes, physics constrains us. but "physics" here means:
|
|
||||||
- how fast electrons move through silicon
|
|
||||||
- how much data fits on a wire
|
|
||||||
- how many transistors fit on a chip
|
|
||||||
|
|
||||||
within those limits, there's room. lots of room, if you're clever.
|
|
||||||
lofivor went from 5k to 700k by being clever, not by breaking physics.
|
|
||||||
|
|
||||||
|
|
||||||
the actual lesson
|
|
||||||
-----------------
|
|
||||||
|
|
||||||
the limit isn't really "the hardware can't do it."
|
|
||||||
|
|
||||||
the limit is "the hardware can't do it THE WAY YOU'RE DOING IT."
|
|
||||||
|
|
||||||
every optimization in lofivor was finding a different way:
|
|
||||||
- don't draw circles, blit textures
|
|
||||||
- don't call functions, submit vertices directly
|
|
||||||
- don't send matrices, send packed structs
|
|
||||||
- don't update on CPU, use compute shaders
|
|
||||||
|
|
||||||
the hardware was always capable of 700k. the code wasn't asking right.
|
|
||||||
|
|
||||||
this is true at every level. that old laptop struggling with 10k
|
|
||||||
entities in some game? probably not the laptop's fault. probably
|
|
||||||
the game is doing something wasteful that doesn't need to be.
|
|
||||||
|
|
||||||
"runs poorly on old hardware" often means "we didn't try to make
|
|
||||||
it run on old hardware" not "it's impossible on old hardware."
|
|
||||||
|
|
||||||
|
|
||||||
closing thought
|
|
||||||
---------------
|
|
||||||
|
|
||||||
10 million is a lot. but 1 million? 2 million?
|
|
||||||
|
|
||||||
with discipline: yes.
|
|
||||||
with decisions that respect the hardware: yes.
|
|
||||||
with awareness of what's actually expensive: yes.
|
|
||||||
|
|
||||||
the knowledge of what's expensive is the key.
|
|
||||||
|
|
||||||
most developers don't have it. they use high-level abstractions
|
|
||||||
that hide the cost. they've never seen a frame budget or a
|
|
||||||
bandwidth calculation.
|
|
||||||
|
|
||||||
lofivor is a learning tool. the journey from 5k to 700k teaches
|
|
||||||
where the costs are. once you see them, you can't unsee them.
|
|
||||||
|
|
||||||
you start asking: "what is this actually doing? what does it cost?
|
|
||||||
is there a cheaper way?"
|
|
||||||
|
|
||||||
that's the skill. not the specific techniques—those change with
|
|
||||||
hardware. the skill is asking the questions.
|
|
||||||
35
journal.txt
35
journal.txt
|
|
@ -206,38 +206,3 @@ total improvement from baseline:
|
||||||
- SSBO: 60fps @ ~700k entities
|
- SSBO: 60fps @ ~700k entities
|
||||||
- ~140x improvement overall!
|
- ~140x improvement overall!
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
optimization 6: compute shader updates
|
|
||||||
--------------------------------------
|
|
||||||
technique: move entity position + respawn logic from CPU to GPU compute shader
|
|
||||||
code: compute.zig, shaders/entity_update.comp, ssbo_renderer.zig
|
|
||||||
version: 0.7.0
|
|
||||||
|
|
||||||
struct GpuEntity {
|
|
||||||
x: f32, // 4 bytes
|
|
||||||
y: f32, // 4 bytes
|
|
||||||
packed_vel: i32, // 4 bytes (vx/vy in fixed-point 8.8)
|
|
||||||
color: u32, // 4 bytes
|
|
||||||
}; // = 16 bytes total (was 12)
|
|
||||||
|
|
||||||
changes:
|
|
||||||
- entity_update.comp: position update, center check, edge respawn, velocity calc
|
|
||||||
- GPU RNG: PCG-style PRNG seeded with entity id + frame number
|
|
||||||
- ssbo_renderer: renderComputeMode() only uploads NEW entities (when count grows)
|
|
||||||
- CPU update loop skipped entirely when compute enabled
|
|
||||||
|
|
||||||
benchmark results (i5-6500T / HD 530):
|
|
||||||
- update time: ~5ms → ~0ms at 150k entities
|
|
||||||
- render time unchanged (GPU-bound as before)
|
|
||||||
- total frame time improvement at high entity counts
|
|
||||||
|
|
||||||
analysis: CPU was doing ~150k position updates + distance checks + respawn logic
|
|
||||||
per frame. now GPU does it in parallel via 256-thread workgroups. CPU only uploads
|
|
||||||
new entities when user adds them, not per-frame. memory barrier ensures compute
|
|
||||||
writes visible to vertex shader before draw.
|
|
||||||
|
|
||||||
flags:
|
|
||||||
- --compute: GPU compute updates (now default)
|
|
||||||
- --cpu: fallback to CPU update path for comparison
|
|
||||||
|
|
||||||
|
|
|
||||||
16
justfile
16
justfile
|
|
@ -42,20 +42,11 @@ check:
|
||||||
test:
|
test:
|
||||||
zig build test
|
zig build test
|
||||||
|
|
||||||
# run sandbox (GPU compute is default)
|
# auto-benchmark (ramps entities until performance degrades, works on linux/windows)
|
||||||
sandbox:
|
|
||||||
zig build -Doptimize=ReleaseFast run
|
|
||||||
|
|
||||||
# auto-benchmark (ramps entities until performance degrades)
|
|
||||||
bench:
|
bench:
|
||||||
zig build -Doptimize=ReleaseFast run -- --bench
|
zig build -Doptimize=ReleaseFast run -- --bench
|
||||||
cat benchmark.log
|
cat benchmark.log
|
||||||
|
|
||||||
# benchmark with CPU update path (for comparison)
|
|
||||||
bench-cpu:
|
|
||||||
zig build -Doptimize=ReleaseFast run -- --bench --cpu
|
|
||||||
cat benchmark.log
|
|
||||||
|
|
||||||
# software-rendered benchmark (for CI/headless servers)
|
# software-rendered benchmark (for CI/headless servers)
|
||||||
[linux]
|
[linux]
|
||||||
bench-sw:
|
bench-sw:
|
||||||
|
|
@ -67,8 +58,3 @@ bench-sw:
|
||||||
bench-sw:
|
bench-sw:
|
||||||
@echo "bench-sw: windows doesn't have xvfb equivalent"
|
@echo "bench-sw: windows doesn't have xvfb equivalent"
|
||||||
@echo "use 'just bench' if you have a GPU, or run in WSL/linux CI"
|
@echo "use 'just bench' if you have a GPU, or run in WSL/linux CI"
|
||||||
|
|
||||||
[linux]
|
|
||||||
profile port="9876":
|
|
||||||
# start Tracy: tracy-profiler -a 127.0.0.1 -p {{port}}
|
|
||||||
zig build -Dtracy=true -Doptimize=ReleaseFast && TRACY_PORT={{port}} ./zig-out/bin/sandbox
|
|
||||||
|
|
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
the baseline: one draw call per entity, pure and simple
|
|
||||||
|
|
||||||
- individual rl.drawCircle() calls in a loop
|
|
||||||
- ~5k entities at 60fps before frame times tank
|
|
||||||
- linear scaling: 10k = ~43ms, 20k = ~77ms
|
|
||||||
- render-bound (update loop stays under 1ms even at 30k)
|
|
||||||
- each circle is its own GPU draw call
|
|
||||||
- the starting point for optimization experiments
|
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
pre-render once, blit many: 10x improvement
|
|
||||||
|
|
||||||
- render circle to 16x16 texture at startup
|
|
||||||
- drawTexture() per entity instead of drawCircle()
|
|
||||||
- raylib batches same-texture draws internally
|
|
||||||
- ~50k entities at 60fps
|
|
||||||
- simple change, big win
|
|
||||||
- still one function call per entity, but GPU work is batched
|
|
||||||
|
|
@ -1,9 +0,0 @@
|
||||||
bypass the wrapper, go straight to rlgl: 2x more
|
|
||||||
|
|
||||||
- skip drawTexture(), submit vertices directly via rl.gl
|
|
||||||
- manually build quads: rlTexCoord2f + rlVertex2f per corner
|
|
||||||
- rlBegin/rlEnd wraps the whole entity loop
|
|
||||||
- ~100k entities at 60fps
|
|
||||||
- eliminates per-call function overhead
|
|
||||||
- vertices go straight to GPU buffer
|
|
||||||
- 20x improvement over baseline
|
|
||||||
|
|
@ -1,11 +0,0 @@
|
||||||
bigger buffer, fewer flushes: squeezing out more headroom
|
|
||||||
|
|
||||||
- increased raylib batch buffer from 8192 to 32768 vertices
|
|
||||||
- ~140k entities at 60fps on i5-6500T
|
|
||||||
- ~40% improvement over default buffer
|
|
||||||
- fewer GPU flushes per frame
|
|
||||||
- also added: release workflows for github and forgejo
|
|
||||||
- added OPTIMIZATIONS.md documenting the journey
|
|
||||||
- added README, UI panel with FPS display
|
|
||||||
- heap allocated entity array to support 1 million entities
|
|
||||||
- per-entity RGB colors
|
|
||||||
|
|
@ -1,13 +0,0 @@
|
||||||
gpu instancing: a disappointing discovery
|
|
||||||
|
|
||||||
- drawMeshInstanced() with per-entity transform matrices
|
|
||||||
- ~150k entities at 60fps - barely better than rlgl batching
|
|
||||||
- negligible improvement on integrated graphics
|
|
||||||
- why it didn't help:
|
|
||||||
- integrated GPU shares system RAM (no PCIe transfer savings)
|
|
||||||
- 64-byte matrix per entity vs ~80 bytes for rlgl vertices
|
|
||||||
- bottleneck is memory bandwidth, not draw call overhead
|
|
||||||
- rlgl batching already minimizes draw calls effectively
|
|
||||||
- orthographic camera setup for 2D-like rendering
|
|
||||||
- heap-allocated transforms buffer (64MB too big for stack)
|
|
||||||
- lesson learned: not all "advanced" techniques are wins
|
|
||||||
|
|
@ -1,17 +0,0 @@
|
||||||
ssbo breakthrough: 5x gain by shrinking the data
|
|
||||||
|
|
||||||
- pack entity data (x, y, color) into 12-byte struct
|
|
||||||
- upload via shader storage buffer object (SSBO)
|
|
||||||
- ~700k entities at 60fps (i5-6500T / HD 530)
|
|
||||||
- ~950k entities at ~57fps
|
|
||||||
- 5x improvement over previous best
|
|
||||||
- 140x total from baseline
|
|
||||||
- why it works:
|
|
||||||
- 12 bytes vs 64 bytes (matrices) = 5.3x less bandwidth
|
|
||||||
- 12 bytes vs 80 bytes (rlgl vertices) = 6.7x less bandwidth
|
|
||||||
- no CPU-side matrix calculations
|
|
||||||
- GPU does NDC conversion and color unpacking
|
|
||||||
- custom vertex/fragment shaders
|
|
||||||
- single rlDrawVertexArrayInstanced() call for all entities
|
|
||||||
- shaders embedded at build time
|
|
||||||
- removed FPS cap, added optional vsync arg
|
|
||||||
|
|
@ -1,5 +0,0 @@
|
||||||
cross-platform release: adding windows to the party
|
|
||||||
|
|
||||||
- updated github release workflow
|
|
||||||
- builds for both linux and windows now
|
|
||||||
- no code changes, just CI/CD work
|
|
||||||
|
|
@ -1,10 +0,0 @@
|
||||||
zoom and pan: making millions of entities explorable
|
|
||||||
|
|
||||||
- mouse wheel zoom
|
|
||||||
- click and drag panning
|
|
||||||
- orthographic camera transforms
|
|
||||||
- memory panel showing entity buffer sizes
|
|
||||||
- background draws immediately (no flicker)
|
|
||||||
- tab key toggles UI panels
|
|
||||||
- explained "lofivor" name in README (lo-fi survivor)
|
|
||||||
- shader updated for zoom/pan transforms
|
|
||||||
|
|
@ -1,5 +0,0 @@
|
||||||
quick exit: zoom out then quit
|
|
||||||
|
|
||||||
- q key first zooms out, second press quits
|
|
||||||
- nice way to see the full entity field before closing
|
|
||||||
- minor UI text fix
|
|
||||||
|
|
@ -1,11 +0,0 @@
|
||||||
compute shader: moving physics to the GPU
|
|
||||||
|
|
||||||
- entity position updates now run on GPU via compute shader
|
|
||||||
- GPU-based RNG for entity velocity randomization
|
|
||||||
- full simulation loop stays on GPU, no CPU roundtrip
|
|
||||||
- new compute.zig module for shader management
|
|
||||||
- GpuEntity struct with position, velocity, and color
|
|
||||||
- tracy profiling integration
|
|
||||||
- FPS display turns green (good) or red (bad)
|
|
||||||
- added design docs for zoom/pan and compute shader work
|
|
||||||
- cross-platform alignment fixes for shader data
|
|
||||||
111
src/compute.zig
111
src/compute.zig
|
|
@ -1,111 +0,0 @@
|
||||||
// compute shader module for GPU entity updates
|
|
||||||
// wraps raw GL calls that raylib doesn't expose directly
|
|
||||||
|
|
||||||
const std = @import("std");
|
|
||||||
const rl = @import("raylib");
|
|
||||||
const sandbox = @import("sandbox.zig");
|
|
||||||
|
|
||||||
const comp_source = @embedFile("shaders/entity_update.comp");
|
|
||||||
|
|
||||||
// GL constants not exposed by raylib-zig
|
|
||||||
const GL_SHADER_STORAGE_BARRIER_BIT: u32 = 0x00002000;
|
|
||||||
|
|
||||||
// function pointer type for glMemoryBarrier
|
|
||||||
const GlMemoryBarrierFn = *const fn (barriers: u32) callconv(.c) void;
|
|
||||||
|
|
||||||
pub const ComputeShader = struct {
|
|
||||||
program_id: u32,
|
|
||||||
entity_count_loc: i32,
|
|
||||||
frame_number_loc: i32,
|
|
||||||
screen_size_loc: i32,
|
|
||||||
center_loc: i32,
|
|
||||||
respawn_radius_loc: i32,
|
|
||||||
entity_speed_loc: i32,
|
|
||||||
glMemoryBarrier: GlMemoryBarrierFn,
|
|
||||||
|
|
||||||
pub fn init() ?ComputeShader {
|
|
||||||
// load glMemoryBarrier dynamically
|
|
||||||
const barrier_ptr = rl.gl.rlGetProcAddress("glMemoryBarrier");
|
|
||||||
const glMemoryBarrier: GlMemoryBarrierFn = @ptrCast(@alignCast(barrier_ptr));
|
|
||||||
|
|
||||||
// compile compute shader
|
|
||||||
const shader_id = rl.gl.rlCompileShader(comp_source, rl.gl.rl_compute_shader);
|
|
||||||
if (shader_id == 0) {
|
|
||||||
std.debug.print("compute: failed to compile compute shader\n", .{});
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
// link compute program
|
|
||||||
const program_id = rl.gl.rlLoadComputeShaderProgram(shader_id);
|
|
||||||
if (program_id == 0) {
|
|
||||||
std.debug.print("compute: failed to link compute program\n", .{});
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
// get uniform locations
|
|
||||||
const entity_count_loc = rl.gl.rlGetLocationUniform(program_id, "entityCount");
|
|
||||||
const frame_number_loc = rl.gl.rlGetLocationUniform(program_id, "frameNumber");
|
|
||||||
const screen_size_loc = rl.gl.rlGetLocationUniform(program_id, "screenSize");
|
|
||||||
const center_loc = rl.gl.rlGetLocationUniform(program_id, "center");
|
|
||||||
const respawn_radius_loc = rl.gl.rlGetLocationUniform(program_id, "respawnRadius");
|
|
||||||
const entity_speed_loc = rl.gl.rlGetLocationUniform(program_id, "entitySpeed");
|
|
||||||
|
|
||||||
std.debug.print("compute: shader loaded\n", .{});
|
|
||||||
|
|
||||||
return .{
|
|
||||||
.program_id = program_id,
|
|
||||||
.entity_count_loc = entity_count_loc,
|
|
||||||
.frame_number_loc = frame_number_loc,
|
|
||||||
.screen_size_loc = screen_size_loc,
|
|
||||||
.center_loc = center_loc,
|
|
||||||
.respawn_radius_loc = respawn_radius_loc,
|
|
||||||
.entity_speed_loc = entity_speed_loc,
|
|
||||||
.glMemoryBarrier = glMemoryBarrier,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn deinit(self: *ComputeShader) void {
|
|
||||||
rl.gl.rlUnloadShaderProgram(self.program_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn dispatch(self: *ComputeShader, ssbo_id: u32, entity_count: u32, frame_number: u32) void {
|
|
||||||
if (entity_count == 0) return;
|
|
||||||
|
|
||||||
// constants from sandbox.zig
|
|
||||||
const screen_w: f32 = @floatFromInt(sandbox.SCREEN_WIDTH);
|
|
||||||
const screen_h: f32 = @floatFromInt(sandbox.SCREEN_HEIGHT);
|
|
||||||
const center_x: f32 = screen_w / 2.0;
|
|
||||||
const center_y: f32 = screen_h / 2.0;
|
|
||||||
const respawn_radius: f32 = 10.0; // RESPAWN_THRESHOLD
|
|
||||||
const entity_speed: f32 = 2.0; // ENTITY_SPEED
|
|
||||||
|
|
||||||
// bind compute shader
|
|
||||||
rl.gl.rlEnableShader(self.program_id);
|
|
||||||
|
|
||||||
// set uniforms
|
|
||||||
rl.gl.rlSetUniform(self.entity_count_loc, &entity_count, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_uint), 1);
|
|
||||||
rl.gl.rlSetUniform(self.frame_number_loc, &frame_number, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_uint), 1);
|
|
||||||
|
|
||||||
const screen_size = [2]f32{ screen_w, screen_h };
|
|
||||||
rl.gl.rlSetUniform(self.screen_size_loc, &screen_size, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
|
||||||
|
|
||||||
const center = [2]f32{ center_x, center_y };
|
|
||||||
rl.gl.rlSetUniform(self.center_loc, &center, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
|
||||||
|
|
||||||
rl.gl.rlSetUniform(self.respawn_radius_loc, &respawn_radius, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
|
|
||||||
rl.gl.rlSetUniform(self.entity_speed_loc, &entity_speed, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
|
|
||||||
|
|
||||||
// bind SSBO to binding point 0
|
|
||||||
rl.gl.rlBindShaderBuffer(ssbo_id, 0);
|
|
||||||
|
|
||||||
// dispatch compute workgroups: ceil(entity_count / 256)
|
|
||||||
const groups = (entity_count + 255) / 256;
|
|
||||||
rl.gl.rlComputeShaderDispatch(groups, 1, 1);
|
|
||||||
|
|
||||||
// memory barrier - ensure compute writes are visible to vertex shader
|
|
||||||
self.glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
|
|
||||||
|
|
||||||
// unbind
|
|
||||||
rl.gl.rlBindShaderBuffer(0, 0);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
@ -287,69 +287,34 @@ test "update respawns entity at edge when reaching center" {
|
||||||
try std.testing.expect(on_left or on_right or on_top or on_bottom);
|
try std.testing.expect(on_left or on_right or on_top or on_bottom);
|
||||||
}
|
}
|
||||||
|
|
||||||
// GPU entity for SSBO rendering (16 bytes, matches compute shader layout)
|
// GPU entity for SSBO rendering (position + color only, no velocity)
|
||||||
pub const GpuEntity = extern struct {
|
pub const GpuEntity = extern struct {
|
||||||
x: f32,
|
x: f32,
|
||||||
y: f32,
|
y: f32,
|
||||||
packed_vel: i32, // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
|
|
||||||
color: u32,
|
color: u32,
|
||||||
};
|
};
|
||||||
|
|
||||||
// pack two f32 velocities into a single i32 (fixed-point 8.8 format)
|
|
||||||
pub fn packVelocity(vx: f32, vy: f32) i32 {
|
|
||||||
const vx_fixed: i16 = @intFromFloat(std.math.clamp(vx * 256.0, -32768.0, 32767.0));
|
|
||||||
const vy_fixed: i16 = @intFromFloat(std.math.clamp(vy * 256.0, -32768.0, 32767.0));
|
|
||||||
return (@as(i32, vx_fixed) << 16) | (@as(i32, vy_fixed) & 0xFFFF);
|
|
||||||
}
|
|
||||||
|
|
||||||
test "GpuEntity struct has correct size for SSBO" {
|
test "GpuEntity struct has correct size for SSBO" {
|
||||||
// SSBO layout: x(4) + y(4) + packed_vel(4) + color(4) = 16 bytes
|
// SSBO layout: x(4) + y(4) + color(4) = 12 bytes
|
||||||
try std.testing.expectEqual(@as(usize, 16), @sizeOf(GpuEntity));
|
try std.testing.expectEqual(@as(usize, 12), @sizeOf(GpuEntity));
|
||||||
}
|
}
|
||||||
|
|
||||||
test "GpuEntity can be created from Entity" {
|
test "GpuEntity can be created from Entity" {
|
||||||
const entity = Entity{
|
const entity = Entity{
|
||||||
.x = 100.0,
|
.x = 100.0,
|
||||||
.y = 200.0,
|
.y = 200.0,
|
||||||
.vx = 1.5,
|
.vx = 1.5, // ignored for GPU
|
||||||
.vy = -0.5,
|
.vy = -0.5, // ignored for GPU
|
||||||
.color = 0x00FFFF,
|
.color = 0x00FFFF,
|
||||||
};
|
};
|
||||||
|
|
||||||
const gpu_entity = GpuEntity{
|
const gpu_entity = GpuEntity{
|
||||||
.x = entity.x,
|
.x = entity.x,
|
||||||
.y = entity.y,
|
.y = entity.y,
|
||||||
.packed_vel = packVelocity(entity.vx, entity.vy),
|
|
||||||
.color = entity.color,
|
.color = entity.color,
|
||||||
};
|
};
|
||||||
|
|
||||||
try std.testing.expectEqual(@as(f32, 100.0), gpu_entity.x);
|
try std.testing.expectEqual(@as(f32, 100.0), gpu_entity.x);
|
||||||
try std.testing.expectEqual(@as(f32, 200.0), gpu_entity.y);
|
try std.testing.expectEqual(@as(f32, 200.0), gpu_entity.y);
|
||||||
try std.testing.expectEqual(@as(u32, 0x00FFFF), gpu_entity.color);
|
try std.testing.expectEqual(@as(u32, 0x00FFFF), gpu_entity.color);
|
||||||
|
|
||||||
// unpack and verify velocity (should round-trip within precision)
|
|
||||||
const vx_unpacked = @as(f32, @floatFromInt(@as(i16, @truncate(gpu_entity.packed_vel >> 16)))) / 256.0;
|
|
||||||
const vy_unpacked = @as(f32, @floatFromInt(@as(i16, @truncate(gpu_entity.packed_vel)))) / 256.0;
|
|
||||||
try std.testing.expectApproxEqAbs(@as(f32, 1.5), vx_unpacked, 0.004);
|
|
||||||
try std.testing.expectApproxEqAbs(@as(f32, -0.5), vy_unpacked, 0.004);
|
|
||||||
}
|
|
||||||
|
|
||||||
test "packVelocity round-trips correctly" {
|
|
||||||
// test positive values
|
|
||||||
const packed1 = packVelocity(2.0, 1.5);
|
|
||||||
const vx1 = @as(f32, @floatFromInt(@as(i16, @truncate(packed1 >> 16)))) / 256.0;
|
|
||||||
const vy1 = @as(f32, @floatFromInt(@as(i16, @truncate(packed1)))) / 256.0;
|
|
||||||
try std.testing.expectApproxEqAbs(@as(f32, 2.0), vx1, 0.004);
|
|
||||||
try std.testing.expectApproxEqAbs(@as(f32, 1.5), vy1, 0.004);
|
|
||||||
|
|
||||||
// test negative values
|
|
||||||
const packed2 = packVelocity(-1.0, -2.5);
|
|
||||||
const vx2 = @as(f32, @floatFromInt(@as(i16, @truncate(packed2 >> 16)))) / 256.0;
|
|
||||||
const vy2 = @as(f32, @floatFromInt(@as(i16, @truncate(packed2)))) / 256.0;
|
|
||||||
try std.testing.expectApproxEqAbs(@as(f32, -1.0), vx2, 0.004);
|
|
||||||
try std.testing.expectApproxEqAbs(@as(f32, -2.5), vy2, 0.004);
|
|
||||||
|
|
||||||
// test zero
|
|
||||||
const packed3 = packVelocity(0.0, 0.0);
|
|
||||||
try std.testing.expectEqual(@as(i32, 0), packed3);
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -3,11 +3,9 @@
|
||||||
|
|
||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const rl = @import("raylib");
|
const rl = @import("raylib");
|
||||||
const ztracy = @import("ztracy");
|
|
||||||
const sandbox = @import("sandbox.zig");
|
const sandbox = @import("sandbox.zig");
|
||||||
const ui = @import("ui.zig");
|
const ui = @import("ui.zig");
|
||||||
const SsboRenderer = @import("ssbo_renderer.zig").SsboRenderer;
|
const SsboRenderer = @import("ssbo_renderer.zig").SsboRenderer;
|
||||||
const ComputeShader = @import("compute.zig").ComputeShader;
|
|
||||||
|
|
||||||
const SCREEN_WIDTH = sandbox.SCREEN_WIDTH;
|
const SCREEN_WIDTH = sandbox.SCREEN_WIDTH;
|
||||||
const SCREEN_HEIGHT = sandbox.SCREEN_HEIGHT;
|
const SCREEN_HEIGHT = sandbox.SCREEN_HEIGHT;
|
||||||
|
|
@ -33,11 +31,6 @@ const BENCH_RAMP_AMOUNT: usize = 50_000; // entities added per ramp
|
||||||
const BENCH_EXIT_THRESHOLD_MS: f32 = 25.0; // exit when frame time exceeds this
|
const BENCH_EXIT_THRESHOLD_MS: f32 = 25.0; // exit when frame time exceeds this
|
||||||
const BENCH_EXIT_SUSTAIN: f32 = 1.0; // must stay above threshold for this long
|
const BENCH_EXIT_SUSTAIN: f32 = 1.0; // must stay above threshold for this long
|
||||||
|
|
||||||
// zoom settings
|
|
||||||
const ZOOM_MIN: f32 = 1.0;
|
|
||||||
const ZOOM_MAX: f32 = 10.0;
|
|
||||||
const ZOOM_SPEED: f32 = 0.1; // multiplier per scroll tick
|
|
||||||
|
|
||||||
const BenchmarkLogger = struct {
|
const BenchmarkLogger = struct {
|
||||||
file: ?std.fs.File,
|
file: ?std.fs.File,
|
||||||
last_logged_frame_ms: f32,
|
last_logged_frame_ms: f32,
|
||||||
|
|
@ -164,7 +157,6 @@ pub fn main() !void {
|
||||||
var use_instancing = false;
|
var use_instancing = false;
|
||||||
var use_ssbo = true;
|
var use_ssbo = true;
|
||||||
var use_vsync = false;
|
var use_vsync = false;
|
||||||
var use_compute = true; // GPU compute is now default
|
|
||||||
var args = try std.process.argsWithAllocator(std.heap.page_allocator);
|
var args = try std.process.argsWithAllocator(std.heap.page_allocator);
|
||||||
defer args.deinit();
|
defer args.deinit();
|
||||||
_ = args.skip(); // skip program name
|
_ = args.skip(); // skip program name
|
||||||
|
|
@ -178,8 +170,6 @@ pub fn main() !void {
|
||||||
use_ssbo = false; // legacy rlgl batched path
|
use_ssbo = false; // legacy rlgl batched path
|
||||||
} else if (std.mem.eql(u8, arg, "--vsync")) {
|
} else if (std.mem.eql(u8, arg, "--vsync")) {
|
||||||
use_vsync = true;
|
use_vsync = true;
|
||||||
} else if (std.mem.eql(u8, arg, "--cpu")) {
|
|
||||||
use_compute = false; // fallback to CPU update path
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -189,11 +179,6 @@ pub fn main() !void {
|
||||||
rl.initWindow(@intCast(SCREEN_WIDTH), @intCast(SCREEN_HEIGHT), "lofivor sandbox");
|
rl.initWindow(@intCast(SCREEN_WIDTH), @intCast(SCREEN_HEIGHT), "lofivor sandbox");
|
||||||
defer rl.closeWindow();
|
defer rl.closeWindow();
|
||||||
|
|
||||||
// show background immediately (avoid black screen during init)
|
|
||||||
rl.beginDrawing();
|
|
||||||
rl.clearBackground(BG_COLOR);
|
|
||||||
rl.endDrawing();
|
|
||||||
|
|
||||||
// use larger batch buffer: 16384 elements vs default 8192
|
// use larger batch buffer: 16384 elements vs default 8192
|
||||||
// fewer flushes = less driver overhead per frame
|
// fewer flushes = less driver overhead per frame
|
||||||
const numElements: i32 = 8192 * 4; // quads = 4 verts
|
const numElements: i32 = 8192 * 4; // quads = 4 verts
|
||||||
|
|
@ -261,26 +246,6 @@ pub fn main() !void {
|
||||||
if (ssbo_renderer) |*r| r.deinit();
|
if (ssbo_renderer) |*r| r.deinit();
|
||||||
}
|
}
|
||||||
|
|
||||||
// compute shader setup (only if --compute flag)
|
|
||||||
var compute_shader: ?ComputeShader = null;
|
|
||||||
|
|
||||||
if (use_compute) {
|
|
||||||
if (!use_ssbo) {
|
|
||||||
std.debug.print("--compute requires SSBO mode (default), ignoring\n", .{});
|
|
||||||
} else {
|
|
||||||
compute_shader = ComputeShader.init();
|
|
||||||
if (compute_shader == null) {
|
|
||||||
std.debug.print("failed to initialize compute shader, falling back to CPU\n", .{});
|
|
||||||
} else {
|
|
||||||
std.debug.print("compute shader mode enabled\n", .{});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
defer {
|
|
||||||
if (compute_shader) |*c| c.deinit();
|
|
||||||
}
|
|
||||||
|
|
||||||
// load UI font (embedded)
|
// load UI font (embedded)
|
||||||
const font_data = @embedFile("verdanab.ttf");
|
const font_data = @embedFile("verdanab.ttf");
|
||||||
const ui_font = rl.loadFontFromMemory(".ttf", font_data, 32, null) catch {
|
const ui_font = rl.loadFontFromMemory(".ttf", font_data, 32, null) catch {
|
||||||
|
|
@ -294,11 +259,6 @@ pub fn main() !void {
|
||||||
var rng = prng.random();
|
var rng = prng.random();
|
||||||
|
|
||||||
var paused = false;
|
var paused = false;
|
||||||
|
|
||||||
// camera state for zoom/pan
|
|
||||||
var zoom: f32 = 1.0;
|
|
||||||
var pan = @Vector(2, f32){ 0, 0 };
|
|
||||||
|
|
||||||
var logger = BenchmarkLogger.init();
|
var logger = BenchmarkLogger.init();
|
||||||
defer logger.deinit();
|
defer logger.deinit();
|
||||||
|
|
||||||
|
|
@ -306,7 +266,6 @@ pub fn main() !void {
|
||||||
var update_time_us: i64 = 0;
|
var update_time_us: i64 = 0;
|
||||||
var render_time_us: i64 = 0;
|
var render_time_us: i64 = 0;
|
||||||
var elapsed: f32 = 0;
|
var elapsed: f32 = 0;
|
||||||
var frame_number: u32 = 0;
|
|
||||||
|
|
||||||
// auto-benchmark state
|
// auto-benchmark state
|
||||||
var last_ramp_time: f32 = 0;
|
var last_ramp_time: f32 = 0;
|
||||||
|
|
@ -352,47 +311,24 @@ pub fn main() !void {
|
||||||
} else {
|
} else {
|
||||||
// manual controls
|
// manual controls
|
||||||
handleInput(&entities, &rng, &paused);
|
handleInput(&entities, &rng, &paused);
|
||||||
if (handleCamera(&zoom, &pan)) break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// update
|
// update
|
||||||
if (!paused) {
|
if (!paused) {
|
||||||
const tracy_update = ztracy.ZoneN(@src(), "update");
|
|
||||||
defer tracy_update.End();
|
|
||||||
const update_start = std.time.microTimestamp();
|
const update_start = std.time.microTimestamp();
|
||||||
|
sandbox.update(&entities, &rng);
|
||||||
if (compute_shader == null) {
|
|
||||||
// CPU update path (positions + respawn)
|
|
||||||
sandbox.update(&entities, &rng);
|
|
||||||
}
|
|
||||||
// GPU compute path handles update in render section before draw
|
|
||||||
|
|
||||||
update_time_us = std.time.microTimestamp() - update_start;
|
update_time_us = std.time.microTimestamp() - update_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
// render
|
// render
|
||||||
const tracy_render = ztracy.ZoneN(@src(), "render");
|
|
||||||
defer tracy_render.End();
|
|
||||||
const render_start = std.time.microTimestamp();
|
const render_start = std.time.microTimestamp();
|
||||||
|
|
||||||
rl.beginDrawing();
|
rl.beginDrawing();
|
||||||
rl.clearBackground(BG_COLOR);
|
rl.clearBackground(BG_COLOR);
|
||||||
|
|
||||||
if (use_ssbo) {
|
if (use_ssbo) {
|
||||||
// dispatch compute shader before render (if enabled)
|
// SSBO instanced rendering path (12 bytes per entity)
|
||||||
if (compute_shader) |*cs| {
|
ssbo_renderer.?.render(&entities);
|
||||||
if (!paused) {
|
|
||||||
const tracy_compute = ztracy.ZoneN(@src(), "compute_dispatch");
|
|
||||||
defer tracy_compute.End();
|
|
||||||
cs.dispatch(ssbo_renderer.?.ssbo_id, @intCast(entities.count), frame_number);
|
|
||||||
frame_number +%= 1;
|
|
||||||
}
|
|
||||||
// GPU compute mode - only upload new entities, positions updated on GPU
|
|
||||||
ssbo_renderer.?.renderComputeMode(&entities, zoom, pan);
|
|
||||||
} else {
|
|
||||||
// CPU mode - upload entity data to GPU
|
|
||||||
ssbo_renderer.?.render(&entities, zoom, pan);
|
|
||||||
}
|
|
||||||
} else if (use_instancing) {
|
} else if (use_instancing) {
|
||||||
// GPU instancing path (64 bytes per entity)
|
// GPU instancing path (64 bytes per entity)
|
||||||
const xforms = transforms.?;
|
const xforms = transforms.?;
|
||||||
|
|
@ -443,8 +379,7 @@ pub fn main() !void {
|
||||||
|
|
||||||
// metrics overlay (skip in bench mode for cleaner headless run)
|
// metrics overlay (skip in bench mode for cleaner headless run)
|
||||||
if (!bench_mode) {
|
if (!bench_mode) {
|
||||||
ui.drawMetrics(&entities, update_time_us, render_time_us, paused, zoom, ui_font);
|
ui.drawMetrics(&entities, update_time_us, render_time_us, paused, ui_font);
|
||||||
ui.drawMemory(entities.count, ui_font);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
rl.endDrawing();
|
rl.endDrawing();
|
||||||
|
|
@ -455,9 +390,6 @@ pub fn main() !void {
|
||||||
const update_ms = @as(f32, @floatFromInt(update_time_us)) / 1000.0;
|
const update_ms = @as(f32, @floatFromInt(update_time_us)) / 1000.0;
|
||||||
const render_ms = @as(f32, @floatFromInt(render_time_us)) / 1000.0;
|
const render_ms = @as(f32, @floatFromInt(render_time_us)) / 1000.0;
|
||||||
logger.log(elapsed, entities.count, frame_ms, update_ms, render_ms);
|
logger.log(elapsed, entities.count, frame_ms, update_ms, render_ms);
|
||||||
|
|
||||||
// tracy frame mark
|
|
||||||
ztracy.FrameMark();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -512,86 +444,4 @@ fn handleInput(entities: *sandbox.Entities, rng: *std.Random, paused: *bool) voi
|
||||||
if (rl.isKeyPressed(.space)) {
|
if (rl.isKeyPressed(.space)) {
|
||||||
paused.* = !paused.*;
|
paused.* = !paused.*;
|
||||||
}
|
}
|
||||||
|
|
||||||
// toggle ui: tab
|
|
||||||
if (rl.isKeyPressed(.tab)) {
|
|
||||||
ui.show_ui = !ui.show_ui;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn handleCamera(zoom: *f32, pan: *@Vector(2, f32)) bool {
|
|
||||||
const wheel = rl.getMouseWheelMove();
|
|
||||||
|
|
||||||
if (wheel != 0) {
|
|
||||||
const mouse_pos = rl.getMousePosition();
|
|
||||||
const old_zoom = zoom.*;
|
|
||||||
|
|
||||||
// calculate new zoom
|
|
||||||
const zoom_factor = if (wheel > 0) (1.0 + ZOOM_SPEED) else (1.0 / (1.0 + ZOOM_SPEED));
|
|
||||||
var new_zoom = old_zoom * zoom_factor;
|
|
||||||
new_zoom = std.math.clamp(new_zoom, ZOOM_MIN, ZOOM_MAX);
|
|
||||||
|
|
||||||
if (new_zoom != old_zoom) {
|
|
||||||
// zoom toward mouse cursor:
|
|
||||||
// keep the world point under the cursor stationary
|
|
||||||
// world_pos = (screen_pos / old_zoom) + old_pan
|
|
||||||
// new_pan = world_pos - (screen_pos / new_zoom)
|
|
||||||
const world_x = (mouse_pos.x / old_zoom) + pan.*[0];
|
|
||||||
const world_y = (mouse_pos.y / old_zoom) + pan.*[1];
|
|
||||||
pan.*[0] = world_x - (mouse_pos.x / new_zoom);
|
|
||||||
pan.*[1] = world_y - (mouse_pos.y / new_zoom);
|
|
||||||
zoom.* = new_zoom;
|
|
||||||
|
|
||||||
// clamp pan to bounds
|
|
||||||
clampPan(pan, zoom.*);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// pan with any mouse button drag (only when zoomed in)
|
|
||||||
if (zoom.* > 1.0) {
|
|
||||||
const any_button = rl.isMouseButtonDown(.left) or
|
|
||||||
rl.isMouseButtonDown(.right) or
|
|
||||||
rl.isMouseButtonDown(.middle);
|
|
||||||
if (any_button) {
|
|
||||||
const delta = rl.getMouseDelta();
|
|
||||||
// drag down = view down, drag right = view right
|
|
||||||
pan.*[0] -= delta.x / zoom.*;
|
|
||||||
pan.*[1] += delta.y / zoom.*;
|
|
||||||
clampPan(pan, zoom.*);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// reset on Return or Enter
|
|
||||||
if (rl.isKeyPressed(.enter) or rl.isKeyPressed(.kp_enter)) {
|
|
||||||
zoom.* = 1.0;
|
|
||||||
pan.* = @Vector(2, f32){ 0, 0 };
|
|
||||||
}
|
|
||||||
|
|
||||||
// q: reset zoom if zoomed in, otherwise quit
|
|
||||||
if (rl.isKeyPressed(.q)) {
|
|
||||||
if (zoom.* > 1.0) {
|
|
||||||
zoom.* = 1.0;
|
|
||||||
pan.* = @Vector(2, f32){ 0, 0 };
|
|
||||||
} else {
|
|
||||||
return true; // signal to quit
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn clampPan(pan: *@Vector(2, f32), zoom: f32) void {
|
|
||||||
// when zoomed in, limit pan so viewport stays in simulation bounds
|
|
||||||
// visible area = screen_size / zoom
|
|
||||||
// max pan = world_size - visible_area
|
|
||||||
const screen_w: f32 = @floatFromInt(SCREEN_WIDTH);
|
|
||||||
const screen_h: f32 = @floatFromInt(SCREEN_HEIGHT);
|
|
||||||
const visible_w = screen_w / zoom;
|
|
||||||
const visible_h = screen_h / zoom;
|
|
||||||
|
|
||||||
const max_pan_x = @max(0, screen_w - visible_w);
|
|
||||||
const max_pan_y = @max(0, screen_h - visible_h);
|
|
||||||
|
|
||||||
pan.*[0] = std.math.clamp(pan.*[0], 0, max_pan_x);
|
|
||||||
pan.*[1] = std.math.clamp(pan.*[1], 0, max_pan_y);
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,11 +4,10 @@
|
||||||
layout(location = 0) in vec2 position;
|
layout(location = 0) in vec2 position;
|
||||||
layout(location = 1) in vec2 texCoord;
|
layout(location = 1) in vec2 texCoord;
|
||||||
|
|
||||||
// entity data from SSBO (16 bytes, matches compute shader layout)
|
// entity data from SSBO
|
||||||
struct Entity {
|
struct Entity {
|
||||||
float x;
|
float x;
|
||||||
float y;
|
float y;
|
||||||
int packedVel; // vx high 16 bits, vy low 16 bits (fixed-point 8.8), unused in vertex shader
|
|
||||||
uint color;
|
uint color;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -18,8 +17,6 @@ layout(std430, binding = 0) readonly buffer EntityData {
|
||||||
|
|
||||||
// screen size for NDC conversion
|
// screen size for NDC conversion
|
||||||
uniform vec2 screenSize;
|
uniform vec2 screenSize;
|
||||||
uniform float zoom;
|
|
||||||
uniform vec2 pan;
|
|
||||||
|
|
||||||
out vec2 fragTexCoord;
|
out vec2 fragTexCoord;
|
||||||
out vec3 fragColor;
|
out vec3 fragColor;
|
||||||
|
|
@ -28,13 +25,13 @@ void main() {
|
||||||
// get entity data from SSBO
|
// get entity data from SSBO
|
||||||
Entity e = entities[gl_InstanceID];
|
Entity e = entities[gl_InstanceID];
|
||||||
|
|
||||||
// apply pan offset and zoom to convert to NDC
|
// convert entity position to NDC
|
||||||
// pan is in screen pixels, zoom scales the view
|
// entity coords are in screen pixels, convert to [-1, 1]
|
||||||
float ndcX = ((e.x - pan.x) * zoom / screenSize.x) * 2.0 - 1.0;
|
float ndcX = (e.x / screenSize.x) * 2.0 - 1.0;
|
||||||
float ndcY = ((e.y - pan.y) * zoom / screenSize.y) * 2.0 - 1.0;
|
float ndcY = (e.y / screenSize.y) * 2.0 - 1.0;
|
||||||
|
|
||||||
// quad size in NDC (16 pixels, scaled by zoom)
|
// quad size in NDC (16 pixels)
|
||||||
float quadSizeNdc = (16.0 * zoom) / screenSize.x;
|
float quadSizeNdc = 16.0 / screenSize.x;
|
||||||
|
|
||||||
// offset by quad corner position
|
// offset by quad corner position
|
||||||
gl_Position = vec4(ndcX + position.x * quadSizeNdc,
|
gl_Position = vec4(ndcX + position.x * quadSizeNdc,
|
||||||
|
|
|
||||||
|
|
@ -1,97 +0,0 @@
|
||||||
#version 430
|
|
||||||
|
|
||||||
layout(local_size_x = 256) in;
|
|
||||||
|
|
||||||
struct Entity {
|
|
||||||
float x;
|
|
||||||
float y;
|
|
||||||
int packedVel; // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
|
|
||||||
uint color;
|
|
||||||
};
|
|
||||||
|
|
||||||
layout(std430, binding = 0) buffer Entities {
|
|
||||||
Entity entities[];
|
|
||||||
};
|
|
||||||
|
|
||||||
uniform uint entityCount;
|
|
||||||
uniform uint frameNumber;
|
|
||||||
uniform vec2 screenSize;
|
|
||||||
uniform vec2 center;
|
|
||||||
uniform float respawnRadius;
|
|
||||||
uniform float entitySpeed;
|
|
||||||
|
|
||||||
// PCG-style GPU RNG - returns value in [0, 1)
|
|
||||||
uint pcg(inout uint state) {
|
|
||||||
state = state * 747796405u + 2891336453u;
|
|
||||||
uint word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
|
|
||||||
return (word >> 22u) ^ word;
|
|
||||||
}
|
|
||||||
|
|
||||||
float randFloat(inout uint state) {
|
|
||||||
return float(pcg(state)) / 4294967296.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// pack velocity into fixed-point 8.8 format
|
|
||||||
int packVelocity(float vx, float vy) {
|
|
||||||
int vx_fixed = int(clamp(vx * 256.0, -32768.0, 32767.0));
|
|
||||||
int vy_fixed = int(clamp(vy * 256.0, -32768.0, 32767.0));
|
|
||||||
return (vx_fixed << 16) | (vy_fixed & 0xFFFF);
|
|
||||||
}
|
|
||||||
|
|
||||||
void main() {
|
|
||||||
uint id = gl_GlobalInvocationID.x;
|
|
||||||
if (id >= entityCount) return;
|
|
||||||
|
|
||||||
Entity e = entities[id];
|
|
||||||
|
|
||||||
// unpack velocity (fixed-point 8.8)
|
|
||||||
float vx = float(e.packedVel >> 16) / 256.0;
|
|
||||||
float vy = float((e.packedVel << 16) >> 16) / 256.0; // sign-extend low 16 bits
|
|
||||||
|
|
||||||
// update position
|
|
||||||
e.x += vx;
|
|
||||||
e.y += vy;
|
|
||||||
|
|
||||||
// check if reached center - respawn at edge
|
|
||||||
float dx = e.x - center.x;
|
|
||||||
float dy = e.y - center.y;
|
|
||||||
if (dx*dx + dy*dy < respawnRadius * respawnRadius) {
|
|
||||||
// init RNG with entity id and frame number
|
|
||||||
uint rng = id * 1103515245u + frameNumber * 12345u + 1u;
|
|
||||||
|
|
||||||
// pick random edge: 0=top, 1=bottom, 2=left, 3=right
|
|
||||||
uint edge = pcg(rng) & 3u;
|
|
||||||
float t = randFloat(rng);
|
|
||||||
|
|
||||||
// spawn on edge
|
|
||||||
if (edge == 0u) { // top
|
|
||||||
e.x = t * screenSize.x;
|
|
||||||
e.y = 0.0;
|
|
||||||
} else if (edge == 1u) { // bottom
|
|
||||||
e.x = t * screenSize.x;
|
|
||||||
e.y = screenSize.y;
|
|
||||||
} else if (edge == 2u) { // left
|
|
||||||
e.x = 0.0;
|
|
||||||
e.y = t * screenSize.y;
|
|
||||||
} else { // right
|
|
||||||
e.x = screenSize.x;
|
|
||||||
e.y = t * screenSize.y;
|
|
||||||
}
|
|
||||||
|
|
||||||
// velocity toward center
|
|
||||||
dx = center.x - e.x;
|
|
||||||
dy = center.y - e.y;
|
|
||||||
float dist = sqrt(dx*dx + dy*dy);
|
|
||||||
vx = (dx / dist) * entitySpeed;
|
|
||||||
vy = (dy / dist) * entitySpeed;
|
|
||||||
e.packedVel = packVelocity(vx, vy);
|
|
||||||
|
|
||||||
// new random color
|
|
||||||
uint r = pcg(rng) & 0xFFu;
|
|
||||||
uint g = pcg(rng) & 0xFFu;
|
|
||||||
uint b = pcg(rng) & 0xFFu;
|
|
||||||
e.color = (r << 16u) | (g << 8u) | b;
|
|
||||||
}
|
|
||||||
|
|
||||||
entities[id] = e;
|
|
||||||
}
|
|
||||||
|
|
@ -3,7 +3,6 @@
|
||||||
|
|
||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const rl = @import("raylib");
|
const rl = @import("raylib");
|
||||||
const ztracy = @import("ztracy");
|
|
||||||
const sandbox = @import("sandbox.zig");
|
const sandbox = @import("sandbox.zig");
|
||||||
|
|
||||||
const SCREEN_WIDTH = sandbox.SCREEN_WIDTH;
|
const SCREEN_WIDTH = sandbox.SCREEN_WIDTH;
|
||||||
|
|
@ -20,11 +19,8 @@ pub const SsboRenderer = struct {
|
||||||
ssbo_id: u32,
|
ssbo_id: u32,
|
||||||
screen_size_loc: i32,
|
screen_size_loc: i32,
|
||||||
circle_texture_loc: i32,
|
circle_texture_loc: i32,
|
||||||
zoom_loc: i32,
|
|
||||||
pan_loc: i32,
|
|
||||||
circle_texture_id: u32,
|
circle_texture_id: u32,
|
||||||
gpu_buffer: []sandbox.GpuEntity,
|
gpu_buffer: []sandbox.GpuEntity,
|
||||||
last_entity_count: usize, // track count to detect when entities are added
|
|
||||||
|
|
||||||
const QUAD_SIZE: f32 = 16.0;
|
const QUAD_SIZE: f32 = 16.0;
|
||||||
|
|
||||||
|
|
@ -57,8 +53,6 @@ pub const SsboRenderer = struct {
|
||||||
// get uniform locations
|
// get uniform locations
|
||||||
const screen_size_loc = rl.gl.rlGetLocationUniform(shader_id, "screenSize");
|
const screen_size_loc = rl.gl.rlGetLocationUniform(shader_id, "screenSize");
|
||||||
const circle_texture_loc = rl.gl.rlGetLocationUniform(shader_id, "circleTexture");
|
const circle_texture_loc = rl.gl.rlGetLocationUniform(shader_id, "circleTexture");
|
||||||
const zoom_loc = rl.gl.rlGetLocationUniform(shader_id, "zoom");
|
|
||||||
const pan_loc = rl.gl.rlGetLocationUniform(shader_id, "pan");
|
|
||||||
|
|
||||||
if (screen_size_loc < 0) {
|
if (screen_size_loc < 0) {
|
||||||
std.debug.print("ssbo: warning - screenSize uniform not found\n", .{});
|
std.debug.print("ssbo: warning - screenSize uniform not found\n", .{});
|
||||||
|
|
@ -100,7 +94,7 @@ pub const SsboRenderer = struct {
|
||||||
rl.gl.rlSetVertexAttribute(1, 2, rl.gl.rl_float, false, 4 * @sizeOf(f32), 2 * @sizeOf(f32));
|
rl.gl.rlSetVertexAttribute(1, 2, rl.gl.rl_float, false, 4 * @sizeOf(f32), 2 * @sizeOf(f32));
|
||||||
rl.gl.rlEnableVertexAttribute(1);
|
rl.gl.rlEnableVertexAttribute(1);
|
||||||
|
|
||||||
// create SSBO for entity data (16 bytes per entity, 1M entities = 16MB)
|
// create SSBO for entity data (12 bytes per entity, 1M entities = 12MB)
|
||||||
const ssbo_size: u32 = @intCast(sandbox.MAX_ENTITIES * @sizeOf(sandbox.GpuEntity));
|
const ssbo_size: u32 = @intCast(sandbox.MAX_ENTITIES * @sizeOf(sandbox.GpuEntity));
|
||||||
const ssbo_id = rl.gl.rlLoadShaderBuffer(ssbo_size, null, rl.gl.rl_dynamic_draw);
|
const ssbo_id = rl.gl.rlLoadShaderBuffer(ssbo_size, null, rl.gl.rl_dynamic_draw);
|
||||||
if (ssbo_id == 0) {
|
if (ssbo_id == 0) {
|
||||||
|
|
@ -122,11 +116,8 @@ pub const SsboRenderer = struct {
|
||||||
.ssbo_id = ssbo_id,
|
.ssbo_id = ssbo_id,
|
||||||
.screen_size_loc = screen_size_loc,
|
.screen_size_loc = screen_size_loc,
|
||||||
.circle_texture_loc = circle_texture_loc,
|
.circle_texture_loc = circle_texture_loc,
|
||||||
.zoom_loc = zoom_loc,
|
|
||||||
.pan_loc = pan_loc,
|
|
||||||
.circle_texture_id = circle_texture.id,
|
.circle_texture_id = circle_texture.id,
|
||||||
.gpu_buffer = gpu_buffer,
|
.gpu_buffer = gpu_buffer,
|
||||||
.last_entity_count = 0,
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -138,80 +129,25 @@ pub const SsboRenderer = struct {
|
||||||
std.heap.page_allocator.free(self.gpu_buffer);
|
std.heap.page_allocator.free(self.gpu_buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn render(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32)) void {
|
pub fn render(self: *SsboRenderer, entities: *const sandbox.Entities) void {
|
||||||
self.renderInternal(entities, zoom, pan, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn renderComputeMode(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32)) void {
|
|
||||||
if (entities.count == 0) return;
|
if (entities.count == 0) return;
|
||||||
|
|
||||||
// flush raylib's internal render batch before our custom GL calls
|
// flush raylib's internal render batch before our custom GL calls
|
||||||
rl.gl.rlDrawRenderBatchActive();
|
rl.gl.rlDrawRenderBatchActive();
|
||||||
|
|
||||||
// upload NEW entities when count increases (entities added on CPU)
|
// copy entity data to GPU buffer (position + color only)
|
||||||
if (entities.count > self.last_entity_count) {
|
for (entities.items[0..entities.count], 0..) |entity, i| {
|
||||||
const zone = ztracy.ZoneN(@src(), "ssbo_upload_new");
|
self.gpu_buffer[i] = .{
|
||||||
defer zone.End();
|
.x = entity.x,
|
||||||
|
.y = entity.y,
|
||||||
// copy new entities to GPU buffer
|
.color = entity.color,
|
||||||
for (entities.items[self.last_entity_count..entities.count], self.last_entity_count..) |entity, i| {
|
};
|
||||||
self.gpu_buffer[i] = .{
|
|
||||||
.x = entity.x,
|
|
||||||
.y = entity.y,
|
|
||||||
.packed_vel = sandbox.packVelocity(entity.vx, entity.vy),
|
|
||||||
.color = entity.color,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
// upload only the new portion to SSBO
|
|
||||||
const offset: u32 = @intCast(self.last_entity_count * @sizeOf(sandbox.GpuEntity));
|
|
||||||
const new_count = entities.count - self.last_entity_count;
|
|
||||||
const data_size: u32 = @intCast(new_count * @sizeOf(sandbox.GpuEntity));
|
|
||||||
rl.gl.rlUpdateShaderBuffer(self.ssbo_id, &self.gpu_buffer[self.last_entity_count], data_size, offset);
|
|
||||||
|
|
||||||
self.last_entity_count = entities.count;
|
|
||||||
} else if (entities.count < self.last_entity_count) {
|
|
||||||
// entities were removed, update count
|
|
||||||
self.last_entity_count = entities.count;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
self.drawInstanced(entities.count, zoom, pan);
|
// upload to SSBO
|
||||||
}
|
const data_size: u32 = @intCast(entities.count * @sizeOf(sandbox.GpuEntity));
|
||||||
|
rl.gl.rlUpdateShaderBuffer(self.ssbo_id, self.gpu_buffer.ptr, data_size, 0);
|
||||||
|
|
||||||
fn renderInternal(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32), skip_upload: bool) void {
|
|
||||||
if (entities.count == 0) return;
|
|
||||||
|
|
||||||
// flush raylib's internal render batch before our custom GL calls
|
|
||||||
rl.gl.rlDrawRenderBatchActive();
|
|
||||||
|
|
||||||
if (!skip_upload) {
|
|
||||||
// copy entity data to GPU buffer (position + packed velocity + color)
|
|
||||||
{
|
|
||||||
const zone = ztracy.ZoneN(@src(), "ssbo_copy");
|
|
||||||
defer zone.End();
|
|
||||||
for (entities.items[0..entities.count], 0..) |entity, i| {
|
|
||||||
self.gpu_buffer[i] = .{
|
|
||||||
.x = entity.x,
|
|
||||||
.y = entity.y,
|
|
||||||
.packed_vel = sandbox.packVelocity(entity.vx, entity.vy),
|
|
||||||
.color = entity.color,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// upload to SSBO
|
|
||||||
{
|
|
||||||
const zone = ztracy.ZoneN(@src(), "ssbo_upload");
|
|
||||||
defer zone.End();
|
|
||||||
const data_size: u32 = @intCast(entities.count * @sizeOf(sandbox.GpuEntity));
|
|
||||||
rl.gl.rlUpdateShaderBuffer(self.ssbo_id, self.gpu_buffer.ptr, data_size, 0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
self.drawInstanced(entities.count, zoom, pan);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn drawInstanced(self: *SsboRenderer, entity_count: usize, zoom: f32, pan: @Vector(2, f32)) void {
|
|
||||||
// bind shader
|
// bind shader
|
||||||
rl.gl.rlEnableShader(self.shader_id);
|
rl.gl.rlEnableShader(self.shader_id);
|
||||||
|
|
||||||
|
|
@ -219,13 +155,6 @@ pub const SsboRenderer = struct {
|
||||||
const screen_size = [2]f32{ @floatFromInt(SCREEN_WIDTH), @floatFromInt(SCREEN_HEIGHT) };
|
const screen_size = [2]f32{ @floatFromInt(SCREEN_WIDTH), @floatFromInt(SCREEN_HEIGHT) };
|
||||||
rl.gl.rlSetUniform(self.screen_size_loc, &screen_size, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
rl.gl.rlSetUniform(self.screen_size_loc, &screen_size, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
||||||
|
|
||||||
// set zoom uniform
|
|
||||||
rl.gl.rlSetUniform(self.zoom_loc, &zoom, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
|
|
||||||
|
|
||||||
// set pan uniform
|
|
||||||
const pan_arr = [2]f32{ pan[0], pan[1] };
|
|
||||||
rl.gl.rlSetUniform(self.pan_loc, &pan_arr, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
|
||||||
|
|
||||||
// bind texture
|
// bind texture
|
||||||
rl.gl.rlActiveTextureSlot(0);
|
rl.gl.rlActiveTextureSlot(0);
|
||||||
rl.gl.rlEnableTexture(self.circle_texture_id);
|
rl.gl.rlEnableTexture(self.circle_texture_id);
|
||||||
|
|
@ -241,13 +170,9 @@ pub const SsboRenderer = struct {
|
||||||
rl.gl.rlSetBlendMode(@intFromEnum(rl.gl.rlBlendMode.rl_blend_alpha));
|
rl.gl.rlSetBlendMode(@intFromEnum(rl.gl.rlBlendMode.rl_blend_alpha));
|
||||||
|
|
||||||
// bind VAO and draw
|
// bind VAO and draw
|
||||||
{
|
_ = rl.gl.rlEnableVertexArray(self.vao_id);
|
||||||
const zone = ztracy.ZoneN(@src(), "ssbo_draw");
|
rl.gl.rlEnableVertexBuffer(self.vbo_id);
|
||||||
defer zone.End();
|
rl.gl.rlDrawVertexArrayInstanced(0, 6, @intCast(entities.count));
|
||||||
_ = rl.gl.rlEnableVertexArray(self.vao_id);
|
|
||||||
rl.gl.rlEnableVertexBuffer(self.vbo_id);
|
|
||||||
rl.gl.rlDrawVertexArrayInstanced(0, 6, @intCast(entity_count));
|
|
||||||
}
|
|
||||||
|
|
||||||
// cleanup - restore raylib's expected state
|
// cleanup - restore raylib's expected state
|
||||||
rl.gl.rlDisableVertexArray();
|
rl.gl.rlDisableVertexArray();
|
||||||
|
|
|
||||||
70
src/ui.zig
70
src/ui.zig
|
|
@ -19,23 +19,13 @@ pub const box_padding: f32 = 8;
|
||||||
pub const text_color = rl.Color.white;
|
pub const text_color = rl.Color.white;
|
||||||
pub const dim_text_color = rl.Color.gray;
|
pub const dim_text_color = rl.Color.gray;
|
||||||
pub const highlight_color = rl.Color.yellow;
|
pub const highlight_color = rl.Color.yellow;
|
||||||
pub const fps_good_color = rl.Color.green;
|
|
||||||
pub const fps_bad_color = rl.Color.red;
|
|
||||||
pub const box_bg = rl.Color{ .r = 0, .g = 0, .b = 0, .a = 200 };
|
pub const box_bg = rl.Color{ .r = 0, .g = 0, .b = 0, .a = 200 };
|
||||||
|
|
||||||
// =============================================================================
|
|
||||||
// state
|
|
||||||
// =============================================================================
|
|
||||||
|
|
||||||
pub var show_ui: bool = true;
|
|
||||||
|
|
||||||
// =============================================================================
|
// =============================================================================
|
||||||
// drawing functions
|
// drawing functions
|
||||||
// =============================================================================
|
// =============================================================================
|
||||||
|
|
||||||
pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, zoom: f32, font: rl.Font) void {
|
pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, font: rl.Font) void {
|
||||||
if (!show_ui) return;
|
|
||||||
|
|
||||||
var buf: [256]u8 = undefined;
|
var buf: [256]u8 = undefined;
|
||||||
|
|
||||||
// fps box (above metrics)
|
// fps box (above metrics)
|
||||||
|
|
@ -43,16 +33,13 @@ pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us:
|
||||||
rl.drawRectangle(5, 5, 180, fps_box_height, box_bg);
|
rl.drawRectangle(5, 5, 180, fps_box_height, box_bg);
|
||||||
const frame_ms = rl.getFrameTime() * 1000.0;
|
const frame_ms = rl.getFrameTime() * 1000.0;
|
||||||
const fps = if (frame_ms > 0) 1000.0 / frame_ms else 0;
|
const fps = if (frame_ms > 0) 1000.0 / frame_ms else 0;
|
||||||
rl.drawTextEx(font, "FPS: ", .{ .x = padding, .y = padding }, font_size, 0, text_color);
|
const fps_text = std.fmt.bufPrintZ(&buf, "FPS: {d:.0}", .{fps}) catch "?";
|
||||||
const fps_text = std.fmt.bufPrintZ(&buf, "{d:.0}", .{fps}) catch "?";
|
rl.drawTextEx(font, fps_text, .{ .x = padding, .y = padding }, font_size, 0, text_color);
|
||||||
const fps_color = if (fps >= 60.0) fps_good_color else fps_bad_color;
|
|
||||||
const label_width = rl.measureTextEx(font, "FPS: ", font_size, 0).x;
|
|
||||||
rl.drawTextEx(font, fps_text, .{ .x = padding + label_width, .y = padding }, font_size, 0, fps_color);
|
|
||||||
|
|
||||||
// metrics box (below fps)
|
// metrics box (below fps)
|
||||||
const metrics_y: i32 = 5 + fps_box_height + 5;
|
const metrics_y: i32 = 5 + fps_box_height + 5;
|
||||||
var y: f32 = @as(f32, @floatFromInt(metrics_y)) + box_padding;
|
var y: f32 = @as(f32, @floatFromInt(metrics_y)) + box_padding;
|
||||||
const bg_height: i32 = if (paused) 150 else 120;
|
const bg_height: i32 = if (paused) 130 else 100;
|
||||||
rl.drawRectangle(5, metrics_y, 180, bg_height, box_bg);
|
rl.drawRectangle(5, metrics_y, 180, bg_height, box_bg);
|
||||||
|
|
||||||
// entity count
|
// entity count
|
||||||
|
|
@ -77,11 +64,6 @@ pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us:
|
||||||
rl.drawTextEx(font, render_text, .{ .x = padding, .y = y }, font_size, 0, text_color);
|
rl.drawTextEx(font, render_text, .{ .x = padding, .y = y }, font_size, 0, text_color);
|
||||||
y += line_height;
|
y += line_height;
|
||||||
|
|
||||||
// zoom level
|
|
||||||
const zoom_text = std.fmt.bufPrintZ(&buf, "zoom: {d:.1}x", .{zoom}) catch "?";
|
|
||||||
rl.drawTextEx(font, zoom_text, .{ .x = padding, .y = y }, font_size, 0, if (zoom > 1.0) highlight_color else text_color);
|
|
||||||
y += line_height;
|
|
||||||
|
|
||||||
// paused indicator
|
// paused indicator
|
||||||
if (paused) {
|
if (paused) {
|
||||||
y += line_height;
|
y += line_height;
|
||||||
|
|
@ -92,43 +74,8 @@ pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us:
|
||||||
drawControls(font, metrics_y + bg_height);
|
drawControls(font, metrics_y + bg_height);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn drawMemory(entity_count: usize, font: rl.Font) void {
|
|
||||||
if (!show_ui) return;
|
|
||||||
|
|
||||||
var buf: [256]u8 = undefined;
|
|
||||||
|
|
||||||
const box_width: i32 = 160;
|
|
||||||
const box_height: i32 = @intFromFloat(line_height * 3 + box_padding * 2);
|
|
||||||
const box_x: i32 = @as(i32, @intCast(sandbox.SCREEN_WIDTH)) - box_width - 5;
|
|
||||||
const box_y: i32 = 5;
|
|
||||||
|
|
||||||
rl.drawRectangle(box_x, box_y, box_width, box_height, box_bg);
|
|
||||||
|
|
||||||
var y: f32 = @as(f32, @floatFromInt(box_y)) + box_padding;
|
|
||||||
const x: f32 = @floatFromInt(box_x + @as(i32, @intFromFloat(box_padding)));
|
|
||||||
|
|
||||||
// entity memory (CPU side)
|
|
||||||
const entity_bytes = entity_count * @sizeOf(sandbox.Entity);
|
|
||||||
const entity_mb = @as(f32, @floatFromInt(entity_bytes)) / (1024.0 * 1024.0);
|
|
||||||
const entity_text = std.fmt.bufPrintZ(&buf, "cpu: {d:.1} MB", .{entity_mb}) catch "?";
|
|
||||||
rl.drawTextEx(font, entity_text, .{ .x = x, .y = y }, font_size, 0, text_color);
|
|
||||||
y += line_height;
|
|
||||||
|
|
||||||
// GPU buffer memory (SSBO)
|
|
||||||
const gpu_bytes = entity_count * @sizeOf(sandbox.GpuEntity);
|
|
||||||
const gpu_mb = @as(f32, @floatFromInt(gpu_bytes)) / (1024.0 * 1024.0);
|
|
||||||
const gpu_text = std.fmt.bufPrintZ(&buf, "gpu: {d:.1} MB", .{gpu_mb}) catch "?";
|
|
||||||
rl.drawTextEx(font, gpu_text, .{ .x = x, .y = y }, font_size, 0, text_color);
|
|
||||||
y += line_height;
|
|
||||||
|
|
||||||
// total
|
|
||||||
const total_mb = entity_mb + gpu_mb;
|
|
||||||
const total_text = std.fmt.bufPrintZ(&buf, "total: {d:.1} MB", .{total_mb}) catch "?";
|
|
||||||
rl.drawTextEx(font, total_text, .{ .x = x, .y = y }, font_size, 0, dim_text_color);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn drawControls(font: rl.Font, metrics_bottom: i32) void {
|
fn drawControls(font: rl.Font, metrics_bottom: i32) void {
|
||||||
const ctrl_box_height: i32 = @intFromFloat(small_line_height * 7 + box_padding * 2);
|
const ctrl_box_height: i32 = @intFromFloat(small_line_height * 4 + box_padding * 2);
|
||||||
const ctrl_box_y: i32 = metrics_bottom + 5;
|
const ctrl_box_y: i32 = metrics_bottom + 5;
|
||||||
rl.drawRectangle(5, ctrl_box_y, 175, ctrl_box_height, box_bg);
|
rl.drawRectangle(5, ctrl_box_y, 175, ctrl_box_height, box_bg);
|
||||||
|
|
||||||
|
|
@ -137,11 +84,8 @@ fn drawControls(font: rl.Font, metrics_bottom: i32) void {
|
||||||
const controls = [_][]const u8{
|
const controls = [_][]const u8{
|
||||||
"+/-: 10k entities",
|
"+/-: 10k entities",
|
||||||
"shift +/-: 50k",
|
"shift +/-: 50k",
|
||||||
"scroll: zoom",
|
"space: pause",
|
||||||
"drag: pan (zoomed)",
|
"r: reset",
|
||||||
"space: pause, r: reset",
|
|
||||||
"q: zoom out / quit",
|
|
||||||
"tab: toggle ui",
|
|
||||||
};
|
};
|
||||||
|
|
||||||
for (controls) |text| {
|
for (controls) |text| {
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue