Compare commits
26 commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 02fd358611 | |||
| 5b890b18e4 | |||
| 55b0d7fab7 | |||
| a842800ede | |||
| 0568204cb7 | |||
| 516b4af458 | |||
| 6dcafc8f3c | |||
| 9f3495b882 | |||
| 90bb30b6c6 | |||
| 9e8226de32 | |||
| 62d010bdc0 | |||
| 45c37bfcd2 | |||
| 5fd82000cf | |||
| c30b9c0ed0 | |||
| ebe28e5669 | |||
| 7b43b5726e | |||
| d0dcb701f8 | |||
| e1d5dc136e | |||
| 3e2e39100a | |||
| 26383ed79e | |||
| 1782bc8db7 | |||
| 3f9e33feaf | |||
| d8bc9ac927 | |||
| 9e48f2cc2d | |||
| 123322494e | |||
| 2deaa66a78 |
35 changed files with 2550 additions and 75 deletions
|
|
@ -1,12 +1,14 @@
|
|||
name: release
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [published]
|
||||
push:
|
||||
tags:
|
||||
- '*'
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: codeberg-small
|
||||
runs-on: ubuntu-latest
|
||||
container: catthehacker/ubuntu:act-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
|
@ -35,16 +37,32 @@ jobs:
|
|||
|
||||
- name: Upload to release
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
FORGEJO_TOKEN: ${{ secrets.FORGEJO_TOKEN }}
|
||||
run: |
|
||||
RELEASE_ID="${{ github.event.release.id }}"
|
||||
API_URL="${{ github.api_url }}/repos/${{ github.repository }}/releases/${RELEASE_ID}/assets"
|
||||
TAG="${{ github.ref_name }}"
|
||||
API_BASE="${{ github.server_url }}/api/v1"
|
||||
REPO="${{ github.repository }}"
|
||||
|
||||
# check if release exists
|
||||
RELEASE_ID=$(curl -sf \
|
||||
-H "Authorization: token ${FORGEJO_TOKEN}" \
|
||||
"${API_BASE}/repos/${REPO}/releases/tags/${TAG}" | jq -r '.id // empty')
|
||||
|
||||
if [ -z "$RELEASE_ID" ]; then
|
||||
echo "Creating release for ${TAG}..."
|
||||
RELEASE_ID=$(curl -sf \
|
||||
-H "Authorization: token ${FORGEJO_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"tag_name":"'"${TAG}"'","name":"'"${TAG}"'"}' \
|
||||
"${API_BASE}/repos/${REPO}/releases" | jq -r '.id')
|
||||
fi
|
||||
|
||||
echo "Release ID: ${RELEASE_ID}"
|
||||
|
||||
for file in lofivor-linux-x86_64 lofivor-windows-x86_64.exe; do
|
||||
echo "Uploading $file..."
|
||||
curl -X POST \
|
||||
-H "Authorization: token ${GITHUB_TOKEN}" \
|
||||
-H "Content-Type: application/octet-stream" \
|
||||
--data-binary @"$file" \
|
||||
"${API_URL}?name=${file}"
|
||||
curl -sf \
|
||||
-H "Authorization: token ${FORGEJO_TOKEN}" \
|
||||
-F "attachment=@${file}" \
|
||||
"${API_BASE}/repos/${REPO}/releases/${RELEASE_ID}/assets?name=${file}"
|
||||
done
|
||||
|
|
|
|||
27
.github/workflows/release.yml
vendored
27
.github/workflows/release.yml
vendored
|
|
@ -10,9 +10,14 @@ jobs:
|
|||
matrix:
|
||||
include:
|
||||
- os: ubuntu-latest
|
||||
artifact: sandbox-linux-x86_64
|
||||
target: native
|
||||
artifact: lofivor-linux-x86_64
|
||||
- os: ubuntu-latest
|
||||
target: x86_64-windows-gnu
|
||||
artifact: lofivor-windows-x86_64.exe
|
||||
- os: macos-latest
|
||||
artifact: sandbox-macos-aarch64
|
||||
target: native
|
||||
artifact: lofivor-macos-aarch64
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
|
|
@ -26,12 +31,24 @@ jobs:
|
|||
version: 0.15.2
|
||||
|
||||
- name: Install X11 dependencies (Linux)
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
if: matrix.os == 'ubuntu-latest' && matrix.target == 'native'
|
||||
run: sudo apt-get update && sudo apt-get install -y libx11-dev libxcursor-dev libxrandr-dev libxinerama-dev libxi-dev libxext-dev libxfixes-dev libgl1-mesa-dev
|
||||
|
||||
- run: zig build -Doptimize=ReleaseFast
|
||||
- name: Build native
|
||||
if: matrix.target == 'native'
|
||||
run: zig build -Doptimize=ReleaseFast
|
||||
|
||||
- run: mv zig-out/bin/sandbox ${{ matrix.artifact }}
|
||||
- name: Build cross-compile
|
||||
if: matrix.target != 'native'
|
||||
run: zig build -Dtarget=${{ matrix.target }} -Doptimize=ReleaseFast
|
||||
|
||||
- name: Rename artifact (Unix)
|
||||
if: "!contains(matrix.artifact, '.exe')"
|
||||
run: mv zig-out/bin/sandbox ${{ matrix.artifact }}
|
||||
|
||||
- name: Rename artifact (Windows)
|
||||
if: contains(matrix.artifact, '.exe')
|
||||
run: mv zig-out/bin/sandbox.exe ${{ matrix.artifact }}
|
||||
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
|
|
|
|||
|
|
@ -82,8 +82,8 @@ these target the rendering bottleneck since update loop is already fast.
|
|||
|
||||
| technique | description | expected gain |
|
||||
| ---------------------- | -------------------------------------------------------------------- | ------------------------------- |
|
||||
| ~~SSBO instance data~~ | ~~pack (x, y, color) = 12 bytes instead of 64-byte matrices~~ | **done** - see optimization 5 |
|
||||
| compute shader updates | move entity positions to GPU entirely, avoid CPU→GPU sync | significant |
|
||||
| SSBO instance data | pack (x, y, color) = 12 bytes instead of 64-byte matrices | done - see optimization 5 |
|
||||
| compute shader updates | move entity positions to GPU entirely, avoid CPU→GPU sync | done - see optimization 6 |
|
||||
| OpenGL vs Vulkan | test raylib's Vulkan backend | unknown |
|
||||
| discrete GPU testing | test on dedicated GPU where instancing/SSBO shine | significant (different hw) |
|
||||
|
||||
|
|
@ -126,6 +126,33 @@ currently not the bottleneck - update stays <1ms at 100k. these become relevant
|
|||
| entity pools | pre-allocated, reusable entity slots | reduces allocation overhead |
|
||||
| component packing | minimize struct padding | better cache utilization |
|
||||
|
||||
#### estimated gains summary
|
||||
|
||||
| Optimization | Expected Gain | Why |
|
||||
|------------------------|---------------|---------------------------------------------------|
|
||||
| SIMD updates | 0% | Update already on GPU |
|
||||
| Multithreaded update | 0% | Update already on GPU |
|
||||
| Cache-friendly layouts | 0% | CPU doesn't iterate entities |
|
||||
| Fixed-point math | 0% or worse | GPUs are optimized for float |
|
||||
| SoA vs AoS | ~5% | Only helps data upload, not bottleneck |
|
||||
| Frustum culling | 5-15% | Most entities converge to center anyway |
|
||||
| LOD rendering | 20-40% | Real gains - fewer fragments for distant entities |
|
||||
| Temporal techniques | ~50% | But with visual artifacts (flickering) |
|
||||
|
||||
Realistic total if you did everything: ~30-50% improvement
|
||||
|
||||
That'd take you from ~1.4M @ 38fps to maybe ~1.8-2M @ 38fps, or ~1.4M @ 50-55fps.
|
||||
|
||||
What would actually move the needle:
|
||||
- GPU-side frustum culling in compute shader (cull before render, not after)
|
||||
- Point sprites instead of quads for distant entities (4 vertices → 1)
|
||||
- Indirect draw calls (GPU decides what to render, CPU never touches entity data)
|
||||
|
||||
Your real bottleneck is fill rate and vertex throughput on HD 530 integrated
|
||||
graphics. The CPU side is already essentially free.
|
||||
|
||||
|
||||
|
||||
---
|
||||
|
||||
## testing methodology
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ lofivor
|
|||
sandbox stress test for measuring entity rendering performance on weak hardware.
|
||||
written in zig with raylib.
|
||||
|
||||
(lofivor aka lofi-survivor)
|
||||
|
||||
build & run
|
||||
-----------
|
||||
|
||||
|
|
@ -12,8 +14,8 @@ build & run
|
|||
controls
|
||||
--------
|
||||
|
||||
+/- add/remove 1000 entities
|
||||
shift hold for 10x (10000 entities)
|
||||
+/- add/remove 10k entities
|
||||
shift hold for 50k
|
||||
space pause/resume
|
||||
r reset
|
||||
|
||||
|
|
|
|||
24
TODO.md
24
TODO.md
|
|
@ -59,7 +59,7 @@ further options (if needed):
|
|||
- [x] increase raylib batch buffer (currently 8192 vertices = 2048 quads)
|
||||
- [x] GPU instancing (single draw call for all entities)
|
||||
- [x] SSBO instance data (12 bytes vs 64-byte matrices)
|
||||
- [ ] compute shader entity updates (if raylib supports)
|
||||
- [x] compute shader entity updates (raylib supports via rlgl)
|
||||
- [ ] compare OpenGL vs Vulkan backend
|
||||
|
||||
findings (i5-6500T / HD 530):
|
||||
|
|
@ -68,14 +68,18 @@ findings (i5-6500T / HD 530):
|
|||
- instancing doesn't help on integrated graphics (shared RAM, no PCIe savings)
|
||||
- bottleneck is memory bandwidth, not draw call overhead
|
||||
- rlgl batching is already near-optimal for this hardware
|
||||
- compute shaders: update time ~5ms → ~0ms at 150k entities (CPU freed entirely)
|
||||
|
||||
## future optimization concepts
|
||||
## future optimization concepts (GPU-focused)
|
||||
|
||||
- [ ] SIMD entity updates (AVX2/SSE)
|
||||
- [ ] struct-of-arrays vs array-of-structs benchmark
|
||||
- [ ] multithreaded update loop (thread pool)
|
||||
- [ ] cache-friendly memory layouts
|
||||
- [ ] LOD rendering (skip distant entities or reduce detail)
|
||||
- [ ] frustum culling (only render visible)
|
||||
- [ ] temporal techniques (update subset per frame)
|
||||
- [ ] fixed-point vs floating-point math
|
||||
- [ ] GPU-side frustum culling in compute shader
|
||||
- [ ] point sprites for distant/small entities (4 verts → 1)
|
||||
- [ ] indirect draw calls (glDrawArraysIndirect)
|
||||
|
||||
## future optimization concepts (CPU - not currently bottleneck)
|
||||
|
||||
- [ ] SIMD / SoA / multithreading (if game logic makes CPU hot again)
|
||||
|
||||
## other ideas that aren't about optimization
|
||||
|
||||
- [ ] scanline shader
|
||||
|
|
|
|||
13
build.zig
13
build.zig
|
|
@ -4,6 +4,9 @@ pub fn build(b: *std.Build) void {
|
|||
const target = b.standardTargetOptions(.{});
|
||||
const optimize = b.standardOptimizeOption(.{});
|
||||
|
||||
// tracy profiling (run with -Dtracy=true)
|
||||
const enable_tracy = b.option(bool, "tracy", "Enable Tracy profiler") orelse false;
|
||||
|
||||
const raylib_dep = b.dependency("raylib_zig", .{
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
|
|
@ -24,6 +27,16 @@ pub fn build(b: *std.Build) void {
|
|||
sandbox_exe.root_module.addImport("raylib", raylib_dep.module("raylib"));
|
||||
sandbox_exe.linkLibrary(raylib_dep.artifact("raylib"));
|
||||
|
||||
// tracy integration (optional)
|
||||
const ztracy = b.dependency("ztracy", .{
|
||||
.enable_ztracy = enable_tracy,
|
||||
.on_demand = true, // allow connecting after app starts
|
||||
});
|
||||
sandbox_exe.root_module.addImport("ztracy", ztracy.module("root"));
|
||||
if (enable_tracy) {
|
||||
sandbox_exe.linkLibrary(ztracy.artifact("tracy"));
|
||||
}
|
||||
|
||||
b.installArtifact(sandbox_exe);
|
||||
|
||||
const sandbox_run_cmd = b.addRunArtifact(sandbox_exe);
|
||||
|
|
|
|||
|
|
@ -7,6 +7,10 @@
|
|||
.url = "git+https://github.com/raylib-zig/raylib-zig#a4d18b2d1cf8fdddec68b5b084535fca0475f466",
|
||||
.hash = "raylib_zig-5.6.0-dev-KE8REL5MBQAf3p497t52Xw9P7ojndIkVOWPXnLiLLw2P",
|
||||
},
|
||||
.ztracy = .{
|
||||
.url = "git+https://github.com/zig-gamedev/ztracy?ref=main#e7b401dea9ce006f8b236e3a2ca1a9f3d5c3e896",
|
||||
.hash = "ztracy-0.14.0-dev-zHJSq78GGQC904aYvBPn6OOvRVOq_opAwDfeHZdvQyej",
|
||||
},
|
||||
},
|
||||
.paths = .{
|
||||
"build.zig",
|
||||
|
|
|
|||
292
docs/GLOSSARY.txt
Normal file
292
docs/GLOSSARY.txt
Normal file
|
|
@ -0,0 +1,292 @@
|
|||
lofivor glossary
|
||||
================
|
||||
|
||||
terms that come up when optimizing graphics.
|
||||
|
||||
|
||||
clock cycle
|
||||
-----------
|
||||
|
||||
one "tick" of the processor's internal clock.
|
||||
|
||||
a CPU or GPU has a crystal oscillator that vibrates at a fixed rate.
|
||||
each vibration = one cycle. the processor does some work each cycle.
|
||||
|
||||
1 GHz = 1 billion cycles per second
|
||||
1 MHz = 1 million cycles per second
|
||||
|
||||
so a 1 GHz processor has 1 billion opportunities to do work per second.
|
||||
|
||||
"one operation per cycle" is idealized. real work often takes multiple
|
||||
cycles (memory access: 100+ cycles, division: 10-20 cycles, add: 1 cycle).
|
||||
|
||||
your HD 530 runs at ~950 MHz, so roughly 950 million cycles per second.
|
||||
at 60fps, that's about 15.8 million cycles per frame.
|
||||
|
||||
|
||||
fill rate
|
||||
---------
|
||||
|
||||
pixels written per second. measured in megapixels/s or gigapixels/s.
|
||||
|
||||
fill rate = ROPs * clock speed * pixels per clock
|
||||
|
||||
your HD 530: 3 ROPs * 950 MHz * 1 = 2.85 GPixels/s theoretical max.
|
||||
|
||||
|
||||
overdraw
|
||||
--------
|
||||
|
||||
drawing the same pixel multiple times per frame.
|
||||
|
||||
if two entities overlap, the back one gets drawn, then the front one
|
||||
overwrites it. the back one's work was wasted.
|
||||
|
||||
overdraw ratio = total pixels drawn / screen pixels
|
||||
|
||||
1080p = 2.07M pixels. if you draw 20M pixels, overdraw = ~10x.
|
||||
|
||||
|
||||
bandwidth
|
||||
---------
|
||||
|
||||
data transfer rate. measured in bytes/second (GB/s, MB/s).
|
||||
|
||||
memory bandwidth = how fast data moves between processor and RAM.
|
||||
|
||||
your HD 530 shares DDR4 with the CPU: ~30 GB/s total.
|
||||
a discrete GPU has dedicated VRAM: 200-900 GB/s.
|
||||
|
||||
|
||||
latency
|
||||
-------
|
||||
|
||||
time delay. measured in nanoseconds (ns) or cycles.
|
||||
|
||||
memory latency = time to fetch data from RAM.
|
||||
- L1 cache: ~4 cycles
|
||||
- L2 cache: ~12 cycles
|
||||
- L3 cache: ~40 cycles
|
||||
- main RAM: ~200 cycles
|
||||
|
||||
this is why cache matters. a cache miss = 50x slower than a hit.
|
||||
|
||||
|
||||
throughput vs latency
|
||||
---------------------
|
||||
|
||||
latency = how long ONE thing takes.
|
||||
throughput = how many things per second.
|
||||
|
||||
a pipeline can have high latency but high throughput.
|
||||
|
||||
example: a car wash takes 10 minutes (latency).
|
||||
but if cars enter every 1 minute, throughput is 60 cars/hour.
|
||||
|
||||
GPUs hide latency with throughput. one thread waits for memory?
|
||||
switch to another thread. thousands of threads keep the GPU busy.
|
||||
|
||||
|
||||
draw call
|
||||
---------
|
||||
|
||||
one command from CPU to GPU: "draw this batch of geometry."
|
||||
|
||||
each draw call has overhead:
|
||||
- CPU prepares command buffer
|
||||
- driver validates state
|
||||
- GPU switches context
|
||||
|
||||
1 draw call for 1M triangles: fast.
|
||||
1M draw calls for 1M triangles: slow.
|
||||
|
||||
lofivor uses 1 draw call for all entities (instanced rendering).
|
||||
|
||||
|
||||
instancing
|
||||
----------
|
||||
|
||||
drawing many copies of the same geometry in one draw call.
|
||||
|
||||
instead of: draw triangle, draw triangle, draw triangle...
|
||||
you say: draw this triangle 1 million times, here are the positions.
|
||||
|
||||
the GPU handles the replication. massively more efficient.
|
||||
|
||||
|
||||
shader
|
||||
------
|
||||
|
||||
a small program that runs on the GPU.
|
||||
|
||||
the name is historical - early shaders calculated shading/lighting.
|
||||
but today: a shader is just software running on GPU hardware.
|
||||
it doesn't have to do with shading at all.
|
||||
|
||||
more precisely: a shader turns one piece of data into another piece of data.
|
||||
- vertex shader: positions → screen coordinates
|
||||
- fragment shader: fragments → pixel colors
|
||||
- compute shader: data → data (anything)
|
||||
|
||||
GPUs are massively parallel, so shaders run on thousands of inputs at once.
|
||||
CPUs have stagnated; GPUs keep getting faster. modern engines like UE5
|
||||
increasingly use shaders for work that used to be CPU-only.
|
||||
|
||||
|
||||
SSBO (shader storage buffer object)
|
||||
-----------------------------------
|
||||
|
||||
a block of GPU memory that shaders can read/write.
|
||||
|
||||
unlike uniforms (small, read-only), SSBOs can be large and writable.
|
||||
lofivor stores all entity data in an SSBO: positions, velocities, colors.
|
||||
|
||||
|
||||
compute shader
|
||||
--------------
|
||||
|
||||
a shader that does general computation, not rendering.
|
||||
|
||||
runs on GPU cores but doesn't output pixels. just processes data.
|
||||
lofivor uses compute shaders to update entity positions.
|
||||
|
||||
because compute exists, shaders can be anything: physics, AI, sorting,
|
||||
image processing. the GPU is a general-purpose parallel processor.
|
||||
|
||||
|
||||
fragment / pixel shader
|
||||
-----------------------
|
||||
|
||||
program that runs once per pixel (actually per "fragment").
|
||||
|
||||
determines the final color of each pixel. this is where:
|
||||
- texture sampling happens
|
||||
- lighting calculations happen
|
||||
- the expensive math lives
|
||||
|
||||
lofivor's fragment shader: sample texture, multiply by color. trivial.
|
||||
AAA game fragment shader: 500+ instructions. expensive.
|
||||
|
||||
|
||||
vertex shader
|
||||
-------------
|
||||
|
||||
program that runs once per vertex.
|
||||
|
||||
transforms 3D positions to screen positions. lofivor's vertex shader
|
||||
reads from SSBO and positions the quad corners.
|
||||
|
||||
|
||||
ROP (render output unit)
|
||||
------------------------
|
||||
|
||||
final stage of GPU pipeline. writes pixels to framebuffer.
|
||||
|
||||
handles: depth test, stencil test, blending, antialiasing.
|
||||
your bottleneck on HD 530. see docs/rops.txt.
|
||||
|
||||
|
||||
TMU (texture mapping unit)
|
||||
--------------------------
|
||||
|
||||
samples textures. reads pixel colors from texture memory.
|
||||
|
||||
your HD 530 has 24 TMUs. they're fast (22.8 GTexels/s).
|
||||
texture sampling is cheap relative to ROPs on this hardware.
|
||||
|
||||
|
||||
EU (execution unit)
|
||||
-------------------
|
||||
|
||||
intel's term for shader cores.
|
||||
|
||||
your HD 530 has 24 EUs, each with 8 ALUs = 192 ALUs total.
|
||||
these run your vertex, fragment, and compute shaders.
|
||||
|
||||
|
||||
ALU (arithmetic logic unit)
|
||||
---------------------------
|
||||
|
||||
does math. add, multiply, compare, bitwise operations.
|
||||
|
||||
one ALU can do one operation per cycle (simple ops).
|
||||
complex ops (sqrt, sin, cos) take multiple cycles.
|
||||
|
||||
|
||||
framebuffer
|
||||
-----------
|
||||
|
||||
the image being rendered. lives in GPU memory.
|
||||
|
||||
at 1080p with 32-bit color: 1920 * 1080 * 4 = 8.3 MB.
|
||||
double-buffered (front + back): 16.6 MB.
|
||||
|
||||
|
||||
vsync
|
||||
-----
|
||||
|
||||
synchronizing frame presentation with monitor refresh.
|
||||
|
||||
without vsync: tearing (half old frame, half new frame).
|
||||
with vsync: smooth, but if a frame misses the refresh interval (16.7ms at 60 Hz), it waits for the next refresh.
|
||||
|
||||
|
||||
frame budget
|
||||
------------
|
||||
|
||||
time available per frame.
|
||||
|
||||
60 fps = 16.67 ms per frame
|
||||
30 fps = 33.33 ms per frame
|
||||
|
||||
everything (CPU + GPU) must complete within budget or frames drop.
|
||||
|
||||
|
||||
pipeline stall
|
||||
--------------
|
||||
|
||||
GPU waiting for something. bad for performance.
|
||||
|
||||
causes:
|
||||
- waiting for memory (cache miss)
|
||||
- waiting for previous stage to finish
|
||||
- synchronization points (barriers)
|
||||
- `discard` in fragment shader (breaks early-z)
|
||||
|
||||
|
||||
early-z
|
||||
-------
|
||||
|
||||
optimization: test depth BEFORE running fragment shader.
|
||||
|
||||
if pixel will be occluded, skip the expensive shader work.
|
||||
`discard` breaks this because the GPU can't know whether the fragment survives (and so can't commit its depth early) until the shader runs.
|
||||
|
||||
|
||||
LOD (level of detail)
|
||||
---------------------
|
||||
|
||||
using simpler geometry/textures for distant objects.
|
||||
|
||||
far away = fewer pixels = less detail needed.
|
||||
saves vertices, texture bandwidth, and fill rate.
|
||||
|
||||
|
||||
frustum culling
|
||||
---------------
|
||||
|
||||
don't draw what's outside the camera view.
|
||||
|
||||
the "frustum" is the pyramid-shaped visible region.
|
||||
anything outside = wasted work. cull it before sending to GPU.
|
||||
|
||||
|
||||
spatial partitioning
|
||||
--------------------
|
||||
|
||||
organizing entities by position for fast queries.
|
||||
|
||||
types: grid, quadtree, octree, BVH.
|
||||
|
||||
"which entities are near point X?" goes from O(n) to O(log n).
|
||||
essential for collision detection at scale.
|
||||
119
docs/hd530_optimization_guide.md
Normal file
119
docs/hd530_optimization_guide.md
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
# intel hd 530 optimization guide for lofivor
|
||||
|
||||
based on hardware specs and empirical testing.
|
||||
|
||||
## hardware constraints
|
||||
|
||||
from `intel_hd_graphics_530.txt`:
|
||||
|
||||
| resource | value | implication |
|
||||
| ---------- | ------- | ------------- |
|
||||
| ROPs | 3 | fill rate limited - this is our ceiling |
|
||||
| TMUs | 24 | texture sampling is relatively fast |
|
||||
| memory | shared DDR4 ~30GB/s | bandwidth is precious, no VRAM |
|
||||
| pixel rate | 2.85 GPixel/s | max theoretical throughput |
|
||||
| EUs | 24 (192 ALUs) | decent compute, weak vs discrete |
|
||||
| L3 cache | 768 KB | small, cache misses hurt |
|
||||
|
||||
the bottleneck is ROPs (fill rate), not vertices or compute.
|
||||
|
||||
## what works (proven)
|
||||
|
||||
### SSBO instance data
|
||||
- 16 bytes per entity vs 64 bytes (matrices)
|
||||
- minimizes bandwidth on shared memory bus
|
||||
- result: ~5x improvement over instancing
|
||||
|
||||
### compute shader updates
|
||||
- GPU does position/velocity updates
|
||||
- no CPU→GPU sync per frame
|
||||
- result: update time essentially free
|
||||
|
||||
### texture sampling
|
||||
- 22.8 GTexel/s is fast relative to other units
|
||||
- pre-baked circle texture beats procedural math
|
||||
- result: 2x faster than procedural fragment shader
|
||||
|
||||
### instanced triangles/quads
|
||||
- most optimized driver path
|
||||
- intel mesa heavily optimizes this
|
||||
- result: baseline, hard to beat
|
||||
|
||||
## what doesn't work (proven)
|
||||
|
||||
### point sprites
|
||||
- theoretically 6x fewer vertices
|
||||
- reality: 2.4x SLOWER on this hardware
|
||||
- triangle rasterizer is more optimized
|
||||
- see `docs/point_sprites_experiment.md`
|
||||
|
||||
### procedural fragment shaders
|
||||
- `length()`, `smoothstep()`, `discard` are expensive
|
||||
- EUs are weaker than discrete GPUs
|
||||
- `discard` breaks early-z optimization
|
||||
- result: 3.7x slower than texture sampling
|
||||
|
||||
### complex fragment math
|
||||
- only 24 EUs, each running 8 ALUs
|
||||
- transcendentals (sqrt, sin, cos) are 4x slower than FMAD
|
||||
- avoid in hot path
|
||||
|
||||
## what to try next (theoretical)
|
||||
|
||||
### likely to help
|
||||
|
||||
| technique | why it should work | expected gain |
|
||||
| ----------- | ------------------- | --------------- |
|
||||
| frustum culling (GPU) | reduce fill rate, which is bottleneck | 10-30% depending on view |
|
||||
| smaller points when zoomed out (LOD) | fewer pixels per entity = less ROP work | 20-40% |
|
||||
| early-z / depth pre-pass | skip fragment work for occluded pixels | moderate |
|
||||
|
||||
### unlikely to help
|
||||
|
||||
| technique | why it won't help |
|
||||
| ----------- | ------------------ |
|
||||
| more vertex optimization | already fill rate bound, not vertex bound |
|
||||
| SIMD on CPU | updates already on GPU |
|
||||
| multithreading | CPU isn't the bottleneck |
|
||||
| different vertex layouts | negligible vs fill rate |
|
||||
|
||||
### uncertain (need to test)
|
||||
|
||||
| technique | notes |
|
||||
| ----------- | ------- |
|
||||
| vulkan backend | might have less driver overhead, or might not matter |
|
||||
| indirect draw calls | GPU decides what to render, but we're not CPU bound |
|
||||
| fp16 in shaders | HD 530 has 2:1 fp16 ratio, might help fragment shader |
|
||||
|
||||
## key insights
|
||||
|
||||
1. fill rate is king - with only 3 ROPs, everything comes down to how many
|
||||
pixels we're writing. optimizations that don't reduce pixel count won't
|
||||
help.
|
||||
|
||||
2. shared memory hurts - no dedicated VRAM means CPU and GPU compete for
|
||||
bandwidth. keep data transfers minimal.
|
||||
|
||||
3. driver optimization matters - the "common path" (triangles) is more
|
||||
optimized than alternatives (points). don't be clever.
|
||||
|
||||
4. texture sampling is cheap - 22.8 GTexel/s is fast. prefer texture
|
||||
lookups over ALU math in fragment shaders.
|
||||
|
||||
5. avoid discard - breaks early-z, causes pipeline stalls. alpha blending
|
||||
is faster than discard.
|
||||
|
||||
## current ceiling
|
||||
|
||||
~950k entities @ 57fps (SSBO + compute + quads)
|
||||
|
||||
to go higher, we need to reduce fill rate:
|
||||
- cull offscreen entities
|
||||
- reduce entity size when zoomed out
|
||||
- or accept lower fps at higher counts
|
||||
|
||||
## references
|
||||
|
||||
- intel gen9 compute architecture whitepaper
|
||||
- empirical benchmarks in `benchmark_current_i56500t.log`
|
||||
- point sprites experiment in `docs/point_sprites_experiment.md`
|
||||
31
docs/hysteria.md
Normal file
31
docs/hysteria.md
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
# hysteresis in lofivor
|
||||
|
||||
## the problem without it
|
||||
|
||||
say your target is 8.33ms. your frame times naturally jitter: 8.2, 8.4, 8.3, 8.5, 8.2...
|
||||
|
||||
without hysteresis, every time it crosses 8.33ms you'd log "crossed threshold!" - potentially dozens of times per second. the log becomes useless noise.
|
||||
|
||||
## how the code works
|
||||
|
||||
from `sandbox_main.zig` lines 74-89:
|
||||
|
||||
```
|
||||
was_above=false → need frame_ms > 10.33 (target + 2.0 margin) to flip to true
|
||||
was_above=true → need frame_ms < 8.33 (target) to flip back to false
|
||||
```
|
||||
|
||||
this creates a "dead zone" between 8.33 and 10.33ms where no state change happens.
|
||||
|
||||
## the magnet analogy
|
||||
|
||||
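a magnetized piece of iron keeps its polarity until an opposing field gets strong enough to flip it - a weak opposing field changes nothing. the `was_above_target` boolean is like the magnet's current polarity. the frame time "pushing" past thresholds is like the magnetic field. the key insight: **the threshold you need to cross depends on which side you're currently on.**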
|
||||
|
||||
if you're in "good" state, you need a significant spike (>10.33ms) before you flip to "bad". if you're in "bad" state, you only need to drop below 8.33ms to recover. this asymmetry is the hysteresis.
|
||||
|
||||
## real-world examples
|
||||
|
||||
- thermostat: heat on at 68°F, off at 72°F (prevents rapid on/off cycling)
|
||||
- schmitt trigger in electronics: same concept, prevents noise from causing oscillation
|
||||
|
||||
the `THRESHOLD_MARGIN` of 2.0ms is the "width" of the hysteresis band - bigger = more stable but less responsive.
|
||||
54
docs/plans/2025-12-16-zoom-pan-design.md
Normal file
54
docs/plans/2025-12-16-zoom-pan-design.md
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
# Zoom/Pan Camera Design
|
||||
|
||||
A viewport camera for zooming into and panning around the simulation without affecting entity behavior.
|
||||
|
||||
## Core Behavior
|
||||
|
||||
### Zoom
|
||||
- Scroll wheel zooms toward mouse cursor position
|
||||
- Range: 1x (default floor) to 10x (ceiling)
|
||||
- Instant response, no animation
|
||||
- Esc or Space resets to 1x and clears pan offset
|
||||
|
||||
### Pan
|
||||
- Any mouse button (left/middle/right) + drag pans the viewport
|
||||
- Only available when zoom > 1x
|
||||
- Bounded to simulation area - cannot pan into empty space
|
||||
|
||||
### UI
|
||||
- Display current zoom level in existing panel under render info (e.g., `zoom: 2.3x`)
|
||||
|
||||
## Implementation Approach
|
||||
|
||||
### State
|
||||
New camera state in `sandbox_main.zig`:
|
||||
```zig
|
||||
var zoom: f32 = 1.0;
|
||||
var pan: @Vector(2, f32) = .{ 0, 0 };
|
||||
```
|
||||
|
||||
### Shader Changes
|
||||
Modify `entity.vert` to accept `zoom` and `pan` uniforms:
|
||||
- Apply pan offset before converting to NDC
|
||||
- Scale by zoom factor
|
||||
- Scale quad size by zoom so entities appear larger
|
||||
|
||||
### Input Handling
|
||||
- `getMouseWheelMove()` adjusts zoom (clamped 1.0–10.0)
|
||||
- Zoom-toward-cursor: adjust pan to keep point under cursor stationary
|
||||
- Mouse drag (any button) adjusts pan with bounds checking
|
||||
- Esc/Space resets zoom to 1.0 and pan to (0, 0)
|
||||
|
||||
### Zoom-Toward-Cursor Math
|
||||
When zooming from `oldZoom` to `newZoom` with cursor at `mousePos`:
|
||||
```
|
||||
worldMousePos = (mousePos / oldZoom) + pan
|
||||
newPan = worldMousePos - (mousePos / newZoom)
|
||||
```
|
||||
|
||||
### Pan Bounds
|
||||
Constrain pan so viewport stays within simulation area:
|
||||
```
|
||||
maxPan = simulationSize - (screenSize / zoom)
|
||||
pan = clamp(pan, 0, maxPan)
|
||||
```
|
||||
440
docs/plans/2025-12-16-zoom-pan-plan.md
Normal file
440
docs/plans/2025-12-16-zoom-pan-plan.md
Normal file
|
|
@ -0,0 +1,440 @@
|
|||
# Zoom/Pan Camera Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||
|
||||
**Goal:** Add viewport zoom (scroll wheel toward cursor) and pan (any mouse drag when zoomed) to observe the simulation up close.
|
||||
|
||||
**Architecture:** Camera state (zoom, pan) lives in sandbox_main.zig. Passed to shader as uniforms. All rendering paths use the same camera state, but only SSBO path gets shader-based zoom (others would need separate work).
|
||||
|
||||
**Tech Stack:** Zig, raylib, GLSL 430
|
||||
|
||||
---
|
||||
|
||||
### Task 1: Add camera state and shader uniforms
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/sandbox_main.zig:266` (add state after `var paused`)
|
||||
- Modify: `src/ssbo_renderer.zig:20-21` (add uniform locations to struct)
|
||||
- Modify: `src/ssbo_renderer.zig:54-62` (get uniform locations in init)
|
||||
- Modify: `src/ssbo_renderer.zig:154-156` (pass uniforms in render)
|
||||
|
||||
**Step 1: Add camera state to sandbox_main.zig**
|
||||
|
||||
After line 266 (`var paused = false;`), add:
|
||||
|
||||
```zig
|
||||
// camera state for zoom/pan
|
||||
var zoom: f32 = 1.0;
|
||||
var pan = @Vector(2, f32){ 0, 0 };
|
||||
```
|
||||
|
||||
**Step 2: Add uniform locations to SsboRenderer struct**
|
||||
|
||||
In `src/ssbo_renderer.zig`, add to struct fields after line 21 (`circle_texture_loc`):
|
||||
|
||||
```zig
|
||||
zoom_loc: i32,
|
||||
pan_loc: i32,
|
||||
```
|
||||
|
||||
**Step 3: Get uniform locations in init**
|
||||
|
||||
After line 55 (`const circle_texture_loc = ...`), add:
|
||||
|
||||
```zig
|
||||
const zoom_loc = rl.gl.rlGetLocationUniform(shader_id, "zoom");
|
||||
const pan_loc = rl.gl.rlGetLocationUniform(shader_id, "pan");
|
||||
```
|
||||
|
||||
**Step 4: Add fields to return struct**
|
||||
|
||||
In the return statement (around line 112), add:
|
||||
|
||||
```zig
|
||||
.zoom_loc = zoom_loc,
|
||||
.pan_loc = pan_loc,
|
||||
```
|
||||
|
||||
**Step 5: Pass uniforms in render method**
|
||||
|
||||
Change render signature to accept zoom/pan:
|
||||
|
||||
```zig
|
||||
pub fn render(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32)) void {
|
||||
```
|
||||
|
||||
After line 156 (setting screenSize uniform), add:
|
||||
|
||||
```zig
|
||||
// set zoom uniform
|
||||
rl.gl.rlSetUniform(self.zoom_loc, &zoom, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
|
||||
|
||||
// set pan uniform
|
||||
const pan_arr = [2]f32{ pan[0], pan[1] };
|
||||
rl.gl.rlSetUniform(self.pan_loc, &pan_arr, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
||||
```
|
||||
|
||||
**Step 6: Update render call in sandbox_main.zig**
|
||||
|
||||
Change line 336 from:
|
||||
|
||||
```zig
|
||||
ssbo_renderer.?.render(&entities);
|
||||
```
|
||||
|
||||
To:
|
||||
|
||||
```zig
|
||||
ssbo_renderer.?.render(&entities, zoom, pan);
|
||||
```
|
||||
|
||||
**Step 7: Build and verify compiles**
|
||||
|
||||
Run: `zig build`
|
||||
|
||||
Expected: Compiles with no errors (shader won't use uniforms yet, but that's fine)
|
||||
|
||||
---
|
||||
|
||||
### Task 2: Update vertex shader for zoom/pan
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/shaders/entity.vert`
|
||||
|
||||
**Step 1: Add uniforms**
|
||||
|
||||
After line 19 (`uniform vec2 screenSize;`), add:
|
||||
|
||||
```glsl
|
||||
uniform float zoom;
|
||||
uniform vec2 pan;
|
||||
```
|
||||
|
||||
**Step 2: Update NDC calculation**
|
||||
|
||||
Replace lines 29-31:
|
||||
|
||||
```glsl
|
||||
// convert entity position to NDC
|
||||
// entity coords are in screen pixels, convert to [-1, 1]
|
||||
float ndcX = (e.x / screenSize.x) * 2.0 - 1.0;
|
||||
float ndcY = (e.y / screenSize.y) * 2.0 - 1.0;
|
||||
```
|
||||
|
||||
With:
|
||||
|
||||
```glsl
|
||||
// apply pan offset and zoom to convert to NDC
|
||||
// pan is in screen pixels, zoom scales the view
|
||||
float ndcX = ((e.x - pan.x) * zoom / screenSize.x) * 2.0 - 1.0;
|
||||
float ndcY = ((e.y - pan.y) * zoom / screenSize.y) * 2.0 - 1.0;
|
||||
```
|
||||
|
||||
**Step 3: Scale quad size by zoom**
|
||||
|
||||
Replace line 34:
|
||||
|
||||
```glsl
|
||||
float quadSizeNdc = 16.0 / screenSize.x;
|
||||
```
|
||||
|
||||
With:
|
||||
|
||||
```glsl
|
||||
float quadSizeNdc = (16.0 * zoom) / screenSize.x;
|
||||
```
|
||||
|
||||
**Step 4: Build and test**
|
||||
|
||||
Run: `zig build && ./zig-out/bin/lofivor`
|
||||
|
||||
Expected: Renders exactly as before (zoom=1.0, pan=0,0 should be identical to old behavior)
|
||||
|
||||
---
|
||||
|
||||
### Task 3: Add zoom input handling
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/sandbox_main.zig` (handleInput function and main loop)
|
||||
|
||||
**Step 1: Add zoom constants**
|
||||
|
||||
After line 32 (BENCH_EXIT_SUSTAIN), add:
|
||||
|
||||
```zig
|
||||
// zoom settings
|
||||
const ZOOM_MIN: f32 = 1.0;
|
||||
const ZOOM_MAX: f32 = 10.0;
|
||||
const ZOOM_SPEED: f32 = 0.1; // fractional zoom change per scroll tick (factor = 1 + ZOOM_SPEED)
|
||||
```
|
||||
|
||||
**Step 2: Create handleCamera function**
|
||||
|
||||
After the `handleInput` function (around line 458), add:
|
||||
|
||||
```zig
|
||||
fn handleCamera(zoom: *f32, pan: *@Vector(2, f32)) void {
|
||||
const wheel = rl.getMouseWheelMove();
|
||||
|
||||
if (wheel != 0) {
|
||||
const mouse_pos = rl.getMousePosition();
|
||||
const old_zoom = zoom.*;
|
||||
|
||||
// calculate new zoom
|
||||
const zoom_factor = if (wheel > 0) (1.0 + ZOOM_SPEED) else (1.0 / (1.0 + ZOOM_SPEED));
|
||||
var new_zoom = old_zoom * zoom_factor;
|
||||
new_zoom = std.math.clamp(new_zoom, ZOOM_MIN, ZOOM_MAX);
|
||||
|
||||
if (new_zoom != old_zoom) {
|
||||
// zoom toward mouse cursor:
|
||||
// keep the world point under the cursor stationary
|
||||
// world_pos = (screen_pos / old_zoom) + old_pan
|
||||
// new_pan = world_pos - (screen_pos / new_zoom)
|
||||
const world_x = (mouse_pos.x / old_zoom) + pan.*[0];
|
||||
const world_y = (mouse_pos.y / old_zoom) + pan.*[1];
|
||||
pan.*[0] = world_x - (mouse_pos.x / new_zoom);
|
||||
pan.*[1] = world_y - (mouse_pos.y / new_zoom);
|
||||
zoom.* = new_zoom;
|
||||
|
||||
// clamp pan to bounds
|
||||
clampPan(pan, zoom.*);
|
||||
}
|
||||
}
|
||||
|
||||
// reset on Esc (the Space reset is added in Step 4; Space also toggles pause in handleInput)
|
||||
if (rl.isKeyPressed(.escape)) {
|
||||
zoom.* = 1.0;
|
||||
pan.* = @Vector(2, f32){ 0, 0 };
|
||||
}
|
||||
}
|
||||
|
||||
fn clampPan(pan: *@Vector(2, f32), zoom: f32) void {
|
||||
// when zoomed in, limit pan so viewport stays in simulation bounds
|
||||
// visible area = screen_size / zoom
|
||||
// max pan = world_size - visible_area
|
||||
const screen_w: f32 = @floatFromInt(SCREEN_WIDTH);
|
||||
const screen_h: f32 = @floatFromInt(SCREEN_HEIGHT);
|
||||
const visible_w = screen_w / zoom;
|
||||
const visible_h = screen_h / zoom;
|
||||
|
||||
const max_pan_x = @max(0, screen_w - visible_w);
|
||||
const max_pan_y = @max(0, screen_h - visible_h);
|
||||
|
||||
pan.*[0] = std.math.clamp(pan.*[0], 0, max_pan_x);
|
||||
pan.*[1] = std.math.clamp(pan.*[1], 0, max_pan_y);
|
||||
}
|
||||
```
|
||||
|
||||
**Step 3: Call handleCamera in main loop**
|
||||
|
||||
In the main loop, after the `handleInput` call (line 318), add:
|
||||
|
||||
```zig
|
||||
handleCamera(&zoom, &pan);
|
||||
```
|
||||
|
||||
**Step 4: Also reset zoom when Space is pressed**
|
||||
|
||||
In `handleInput`, modify the space key handler (around line 450):
|
||||
|
||||
```zig
|
||||
// pause: space (also resets zoom in handleCamera context)
|
||||
if (rl.isKeyPressed(.space)) {
|
||||
paused.* = !paused.*;
|
||||
}
|
||||
```
|
||||
|
||||
However, `handleInput` doesn't have access to zoom/pan, so the reset cannot happen there. We need to either:
|
||||
- Pass zoom/pan to handleInput
|
||||
- Handle space reset in handleCamera
|
||||
|
||||
Let's handle it in handleCamera. Add after the escape check:
|
||||
|
||||
```zig
|
||||
// Space also resets zoom (pause is handled separately in handleInput)
|
||||
if (rl.isKeyPressed(.space)) {
|
||||
zoom.* = 1.0;
|
||||
pan.* = @Vector(2, f32){ 0, 0 };
|
||||
}
|
||||
```
|
||||
|
||||
**Step 5: Build and test zoom**
|
||||
|
||||
Run: `zig build && ./zig-out/bin/lofivor`
|
||||
|
||||
Test:
|
||||
1. Scroll up - entities should get bigger (zoom in toward cursor)
|
||||
2. Scroll down - entities get smaller (but not below 1x)
|
||||
3. Press Esc or Space - resets to default view
|
||||
|
||||
---
|
||||
|
||||
### Task 4: Add pan input handling
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/sandbox_main.zig` (handleCamera function)
|
||||
|
||||
**Step 1: Add pan logic to handleCamera**
|
||||
|
||||
Add this after the zoom handling, before the reset checks:
|
||||
|
||||
```zig
|
||||
// pan with any mouse button drag (only when zoomed in)
|
||||
if (zoom.* > 1.0) {
|
||||
const any_button = rl.isMouseButtonDown(.left) or
|
||||
rl.isMouseButtonDown(.right) or
|
||||
rl.isMouseButtonDown(.middle);
|
||||
if (any_button) {
|
||||
const delta = rl.getMouseDelta();
|
||||
// subtract the drag delta so the content follows the cursor (drag right = content moves right = pan decreases)
|
||||
pan.*[0] -= delta.x / zoom.*;
|
||||
pan.*[1] -= delta.y / zoom.*;
|
||||
clampPan(pan, zoom.*);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Step 2: Build and test pan**
|
||||
|
||||
Run: `zig build && ./zig-out/bin/lofivor`
|
||||
|
||||
Test:
|
||||
1. Scroll to zoom in past 1x
|
||||
2. Click and drag with any mouse button - viewport should pan
|
||||
3. Try to pan past edges - should be bounded
|
||||
4. At 1x zoom, dragging should do nothing
|
||||
|
||||
---
|
||||
|
||||
### Task 5: Add zoom display to UI
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ui.zig:34` (drawMetrics signature)
|
||||
- Modify: `src/ui.zig:71-72` (add zoom line after render)
|
||||
- Modify: `src/sandbox_main.zig:387` (pass zoom to drawMetrics)
|
||||
|
||||
**Step 1: Update drawMetrics signature**
|
||||
|
||||
Change line 34:
|
||||
|
||||
```zig
|
||||
pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, font: rl.Font) void {
|
||||
```
|
||||
|
||||
To:
|
||||
|
||||
```zig
|
||||
pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, zoom: f32, font: rl.Font) void {
|
||||
```
|
||||
|
||||
**Step 2: Increase box height for zoom line**
|
||||
|
||||
Change line 50:
|
||||
|
||||
```zig
|
||||
const bg_height: i32 = if (paused) 130 else 100;
|
||||
```
|
||||
|
||||
To:
|
||||
|
||||
```zig
|
||||
const bg_height: i32 = if (paused) 150 else 120;
|
||||
```
|
||||
|
||||
**Step 3: Add zoom display after render line**
|
||||
|
||||
After line 72 (render_text draw), add:
|
||||
|
||||
```zig
|
||||
y += line_height;
|
||||
|
||||
// zoom level
|
||||
const zoom_text = std.fmt.bufPrintZ(&buf, "zoom: {d:.1}x", .{zoom}) catch "?";
|
||||
rl.drawTextEx(font, zoom_text, .{ .x = padding, .y = y }, font_size, 0, if (zoom > 1.0) highlight_color else text_color);
|
||||
```
|
||||
|
||||
**Step 4: Update call in sandbox_main.zig**
|
||||
|
||||
Change line 387:
|
||||
|
||||
```zig
|
||||
ui.drawMetrics(&entities, update_time_us, render_time_us, paused, ui_font);
|
||||
```
|
||||
|
||||
To:
|
||||
|
||||
```zig
|
||||
ui.drawMetrics(&entities, update_time_us, render_time_us, paused, zoom, ui_font);
|
||||
```
|
||||
|
||||
**Step 5: Build and test UI**
|
||||
|
||||
Run: `zig build && ./zig-out/bin/lofivor`
|
||||
|
||||
Test:
|
||||
1. UI should show "zoom: 1.0x" in white
|
||||
2. Scroll to zoom - should update and turn yellow when > 1x
|
||||
3. Reset with Esc - back to white 1.0x
|
||||
|
||||
---
|
||||
|
||||
### Task 6: Update controls legend
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ui.zig:120-139` (drawControls function)
|
||||
|
||||
**Step 1: Update controls list and box height**
|
||||
|
||||
Change line 121:
|
||||
|
||||
```zig
|
||||
const ctrl_box_height: i32 = @intFromFloat(small_line_height * 5 + box_padding * 2);
|
||||
```
|
||||
|
||||
To:
|
||||
|
||||
```zig
|
||||
const ctrl_box_height: i32 = @intFromFloat(small_line_height * 7 + box_padding * 2);
|
||||
```
|
||||
|
||||
Change the controls array (lines 127-133):
|
||||
|
||||
```zig
|
||||
const controls = [_][]const u8{
|
||||
"+/-: 10k entities",
|
||||
"shift +/-: 50k",
|
||||
"scroll: zoom",
|
||||
"drag: pan (zoomed)",
|
||||
"space: pause/reset",
|
||||
"esc: reset zoom",
|
||||
"tab: toggle ui",
|
||||
};
|
||||
```
|
||||
|
||||
**Step 2: Build and final test**
|
||||
|
||||
Run: `zig build && ./zig-out/bin/lofivor`
|
||||
|
||||
Full test:
|
||||
1. Scroll wheel zooms toward cursor (1x-10x)
|
||||
2. Any mouse drag pans when zoomed > 1x
|
||||
3. Pan is bounded to simulation area
|
||||
4. Esc resets zoom/pan
|
||||
5. Space toggles pause AND resets zoom/pan
|
||||
6. UI shows zoom level (yellow when zoomed)
|
||||
7. Controls legend shows new controls
|
||||
|
||||
---
|
||||
|
||||
### Task 7: Commit
|
||||
|
||||
```bash
|
||||
git add src/sandbox_main.zig src/ssbo_renderer.zig src/shaders/entity.vert src/ui.zig
|
||||
git commit -m "feat: add zoom/pan camera
|
||||
|
||||
- scroll wheel zooms toward cursor (1x-10x range)
|
||||
- any mouse button drag pans when zoomed
|
||||
- pan bounded to simulation area
|
||||
- esc/space resets to default view
|
||||
- zoom level shown in metrics panel"
|
||||
```
|
||||
170
docs/plans/2025-12-17-compute-shader-updates.md
Normal file
170
docs/plans/2025-12-17-compute-shader-updates.md
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
# compute shader entity updates
|
||||
|
||||
move entity position math to GPU, eliminate CPU→GPU sync per frame.
|
||||
|
||||
## context
|
||||
|
||||
current bottleneck: per-frame `rlUpdateShaderBuffer()` uploads all entity data from CPU to GPU. at 16 bytes per entity, 950k entities is ~15MB/frame. targeting 10M entities would be 160MB/frame.
|
||||
|
||||
solution: keep entity data on GPU entirely. compute shader updates positions, vertex shader renders. CPU just dispatches.
|
||||
|
||||
## data structures
|
||||
|
||||
**GpuEntity (16 bytes, std430):**
|
||||
```glsl
|
||||
struct Entity {
|
||||
float x; // world position
|
||||
float y;
|
||||
int packedVel; // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
|
||||
uint color; // 0xRRGGBB
|
||||
};
|
||||
```
|
||||
|
||||
**zig side:**
|
||||
```zig
|
||||
const GpuEntity = extern struct {
|
||||
x: f32,
|
||||
y: f32,
|
||||
packed_vel: i32,
|
||||
color: u32,
|
||||
};
|
||||
|
||||
fn packVelocity(vx: f32, vy: f32) i32 {
|
||||
const vx_fixed: i16 = @intFromFloat(vx * 256.0);
|
||||
const vy_fixed: i16 = @intFromFloat(vy * 256.0);
|
||||
return (@as(i32, vx_fixed) << 16) | (@as(i32, vy_fixed) & 0xFFFF);
|
||||
}
|
||||
```
|
||||
|
||||
## compute shader
|
||||
|
||||
`src/shaders/entity_update.comp`:
|
||||
```glsl
|
||||
#version 430
|
||||
layout(local_size_x = 256) in;
|
||||
|
||||
layout(std430, binding = 0) buffer Entities {
|
||||
Entity entities[];
|
||||
};
|
||||
|
||||
uniform uint entityCount;
|
||||
uniform uint frameNumber;
|
||||
uniform vec2 screenSize;
|
||||
uniform vec2 center;
|
||||
uniform float respawnRadius;
|
||||
|
||||
void main() {
|
||||
uint id = gl_GlobalInvocationID.x;
|
||||
if (id >= entityCount) return;
|
||||
|
||||
Entity e = entities[id];
|
||||
|
||||
// unpack velocity
|
||||
float vx = float(e.packedVel >> 16) / 256.0;
|
||||
float vy = float((e.packedVel << 16) >> 16) / 256.0;
|
||||
|
||||
// update position
|
||||
e.x += vx;
|
||||
e.y += vy;
|
||||
|
||||
// respawn check
|
||||
float dx = e.x - center.x;
|
||||
float dy = e.y - center.y;
|
||||
if (dx*dx + dy*dy < respawnRadius * respawnRadius) {
|
||||
// GPU RNG
|
||||
uint seed = id * 1103515245u + frameNumber * 12345u;
|
||||
seed = seed * 747796405u + 2891336453u;
|
||||
|
||||
uint edge = seed & 3u;
|
||||
float t = float((seed >> 2) & 0xFFFFu) / 65535.0;
|
||||
|
||||
// spawn on edge with velocity toward center
|
||||
// (full edge logic in implementation)
|
||||
}
|
||||
|
||||
entities[id] = e;
|
||||
}
|
||||
```
|
||||
|
||||
## integration
|
||||
|
||||
raylib doesn't wrap compute shaders. use raw GL calls via `compute.zig`:
|
||||
|
||||
```zig
|
||||
pub fn dispatch(entity_count: u32, frame: u32) void {
|
||||
gl.glUseProgram(program);
|
||||
gl.glUniform1ui(entity_count_loc, entity_count);
|
||||
gl.glUniform1ui(frame_loc, frame);
|
||||
// ... other uniforms
|
||||
|
||||
const groups = (entity_count + 255) / 256;
|
||||
gl.glDispatchCompute(groups, 1, 1);
|
||||
gl.glMemoryBarrier(gl.GL_SHADER_STORAGE_BARRIER_BIT);
|
||||
}
|
||||
```
|
||||
|
||||
## frame flow
|
||||
|
||||
**before:**
|
||||
```
|
||||
CPU: update positions (5ms at 950k)
|
||||
CPU: copy to gpu_buffer
|
||||
CPU→GPU: rlUpdateShaderBuffer() ← bottleneck
|
||||
GPU: render
|
||||
```
|
||||
|
||||
**after:**
|
||||
```
|
||||
GPU: compute dispatch (~0ms CPU time)
|
||||
GPU: memory barrier
|
||||
GPU: render
|
||||
```
|
||||
|
||||
## implementation steps
|
||||
|
||||
each step is a commit point if desired.
|
||||
|
||||
### step 1: GpuEntity struct expansion
|
||||
- modify `GpuEntity` in sandbox.zig: add `packed_vel` field
|
||||
- add `packVelocity()` helper
|
||||
- update ssbo_renderer to handle 16-byte stride
|
||||
- verify existing rendering still works
|
||||
|
||||
### step 2: compute shader infrastructure
|
||||
- create `src/compute.zig` with GL bindings
|
||||
- create `src/shaders/entity_update.comp` (position update only, no respawn yet)
|
||||
- load and compile compute shader in sandbox_main.zig
|
||||
- dispatch before render, verify positions update
|
||||
|
||||
### step 3: respawn logic
|
||||
- add GPU RNG to compute shader
|
||||
- implement edge spawning + velocity calculation
|
||||
- remove CPU update loop from sandbox.zig
|
||||
|
||||
### step 4: cleanup ✓
|
||||
- `--compute` is now default, `--cpu` flag for fallback/comparison
|
||||
- justfile updated: `just bench` (compute), `just bench-cpu` (comparison)
|
||||
- verbose debug output reduced
|
||||
|
||||
## files changed
|
||||
|
||||
**new:**
|
||||
- `src/shaders/entity_update.comp`
|
||||
- `src/compute.zig`
|
||||
|
||||
**modified:**
|
||||
- `src/sandbox.zig` — GpuEntity struct, packVelocity(), remove CPU update
|
||||
- `src/ssbo_renderer.zig` — remove per-frame upload
|
||||
- `src/sandbox_main.zig` — init compute, dispatch in frame loop
|
||||
|
||||
## risks
|
||||
|
||||
1. **driver quirks** — intel HD 530 compute support is fine but older, may hit edge cases
|
||||
2. **debugging** — GPU code harder to debug, start with small counts
|
||||
3. **fallback** — keep a flag to A/B test against the existing SSBO path (`--compute` opt-in during development; once compute is the default, `--cpu` selects the old path)
|
||||
|
||||
## expected results
|
||||
|
||||
- CPU update time: ~5ms → ~0ms
|
||||
- no per-frame buffer upload
|
||||
- target: 1M+ entities, pushing toward 10M ceiling
|
||||
89
docs/point_sprites_experiment.md
Normal file
89
docs/point_sprites_experiment.md
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
# point sprites experiment
|
||||
|
||||
branch: `point-sprites` (point-sprites work)
|
||||
date: 2025-12
|
||||
hardware: intel hd 530 (skylake gt2, i5-6500T)
|
||||
|
||||
## hypothesis
|
||||
|
||||
point sprites should be faster than quads because:
|
||||
- 1 vertex per entity instead of 6 (quad = 2 triangles)
|
||||
- less vertex throughput
|
||||
- `gl_PointCoord` provides texture coords automatically
|
||||
|
||||
## implementation
|
||||
|
||||
### vertex shader changes
|
||||
- removed quad vertex attributes (position, texcoord)
|
||||
- use `gl_PointSize = 16.0 * zoom` for size control
|
||||
- position calculated from SSBO data only
|
||||
|
||||
### fragment shader changes
|
||||
- use `gl_PointCoord` instead of vertex texcoord
|
||||
- sample circle texture for alpha
|
||||
|
||||
### renderer changes
|
||||
- load `glEnable` and `glDrawArraysInstanced` via `rlGetProcAddress`
|
||||
- enable `GL_PROGRAM_POINT_SIZE`
|
||||
- draw with `glDrawArraysInstanced(GL_POINTS, 0, 1, count)`
|
||||
- removed VBO (no vertex data needed)
|
||||
|
||||
## results
|
||||
|
||||
### attempt 1: procedural circle in fragment shader
|
||||
|
||||
```glsl
|
||||
vec2 coord = gl_PointCoord - vec2(0.5);
|
||||
float dist = length(coord);
|
||||
float alpha = 1.0 - smoothstep(0.4, 0.5, dist);
|
||||
if (alpha < 0.01) discard;
|
||||
```
|
||||
|
||||
**benchmark @ 350k entities:**
|
||||
- point sprites: 23ms render, 43fps
|
||||
- quads (main): 6.2ms render, 151fps
|
||||
- **result: 3.7x SLOWER**
|
||||
|
||||
**why:** `discard` breaks early-z optimization, `length()` and `smoothstep()` are ALU-heavy, intel integrated GPUs are weak at fragment shader math.
|
||||
|
||||
### attempt 2: texture sampling
|
||||
|
||||
```glsl
|
||||
float alpha = texture(circleTexture, gl_PointCoord).r;
|
||||
finalColor = vec4(fragColor, alpha);
|
||||
```
|
||||
|
||||
**benchmark @ 450k entities:**
|
||||
- point sprites: 19.1ms render, 52fps
|
||||
- quads (main): 8.0ms render, 122fps
|
||||
- **result: 2.4x SLOWER**
|
||||
|
||||
better than procedural, but still significantly slower than quads.
|
||||
|
||||
## analysis
|
||||
|
||||
the theoretical advantage (1/6 vertices) doesn't translate to real performance because:
|
||||
|
||||
1. **triangle path is more optimized** - intel's driver heavily optimizes the standard triangle rasterization path. point sprites use a less-traveled code path.
|
||||
|
||||
2. **fill rate is the bottleneck** - HD 530 has only 3 ROPs. we're bound by how fast we can write pixels, not by vertex count. reducing vertices from 6 to 1 doesn't help when fill rate is the constraint.
|
||||
|
||||
3. **point size overhead** - each point requires computing `gl_PointSize` and setting up the point sprite rasterization, which may have per-vertex overhead.
|
||||
|
||||
4. **texture cache behavior** - `gl_PointCoord` may have worse cache locality than explicit vertex texcoords.
|
||||
|
||||
## conclusion
|
||||
|
||||
**point sprites are a regression on intel hd 530.**
|
||||
|
||||
the optimization makes theoretical sense but fails in practice on this hardware. the quad/triangle path is simply more optimized in intel's mesa driver.
|
||||
|
||||
**keep this branch for testing on discrete GPUs** where point sprites might actually help (nvidia/amd have different optimization priorities).
|
||||
|
||||
## lessons learned
|
||||
|
||||
1. always benchmark, don't assume
|
||||
2. "fewer vertices" doesn't always mean faster
|
||||
3. integrated GPU optimization is different from discrete
|
||||
4. the most optimized path is usually the most common path (triangles)
|
||||
5. fill rate matters more than vertex count at high entity counts
|
||||
201
docs/rops.txt
Normal file
201
docs/rops.txt
Normal file
|
|
@ -0,0 +1,201 @@
|
|||
rops: render output units
|
||||
=========================
|
||||
|
||||
what they are, where they came from, and what yours can do.
|
||||
|
||||
|
||||
what is a rop?
|
||||
--------------
|
||||
|
||||
ROP = Render Output Unit (originally "Raster Operations Pipeline")
|
||||
|
||||
it's the final stage of the GPU pipeline. after all the fancy shader
|
||||
math is done, the ROP is the unit that actually writes pixels to memory.
|
||||
|
||||
think of it as the bottleneck between "calculated" and "visible."
|
||||
|
||||
a ROP does:
|
||||
- depth testing (is this pixel in front of what's already there?)
|
||||
- stencil testing (mask operations)
|
||||
- blending (alpha, additive, etc)
|
||||
- anti-aliasing resolve
|
||||
- writing the final color to the framebuffer
|
||||
|
||||
one ROP can write one pixel per clock cycle (roughly).
|
||||
|
||||
|
||||
the first rop
|
||||
-------------
|
||||
|
||||
the term comes from the IBM 8514/A (1987), which had dedicated hardware
|
||||
for "raster operations" - bitwise operations on pixels (AND, OR, XOR).
|
||||
this was revolutionary because before this, the CPU did all pixel math.
|
||||
|
||||
but the modern ROP as we know it emerged with:
|
||||
|
||||
NVIDIA NV1 (1995)
|
||||
one of the first chips with dedicated pixel output hardware
|
||||
could do ~1 million textured pixels/second
|
||||
|
||||
3dfx Voodoo (1996)
|
||||
the card that defined the modern GPU pipeline
|
||||
had 1 TMU + 1 pixel pipeline (essentially 1 ROP)
|
||||
could push 45 million pixels/second
|
||||
that ONE pipeline ran Quake at 640x480
|
||||
|
||||
NVIDIA GeForce 256 (1999)
|
||||
"the first GPU" - named itself with that term
|
||||
4 pixel pipelines = 4 ROPs
|
||||
480 million pixels/second
|
||||
|
||||
so the original consumer 3D cards had... 1 ROP. and they ran Quake.
|
||||
|
||||
|
||||
what one rop can do
|
||||
-------------------
|
||||
|
||||
let's do the math.
|
||||
|
||||
one ROP at 100 MHz (3dfx Voodoo era):
|
||||
100 million cycles/second
|
||||
~1 pixel per cycle
|
||||
= 100 megapixels/second
|
||||
|
||||
at 640x480 @ 60fps:
|
||||
640 * 480 * 60 = 18.4 megapixels/second needed
|
||||
|
||||
so ONE ROP at 100MHz could handle 640x480 with ~5x headroom for overdraw.
|
||||
|
||||
at 1024x768 @ 60fps:
|
||||
1024 * 768 * 60 = 47 megapixels/second
|
||||
|
||||
now you're at 2x overdraw max. still playable, but tight.
|
||||
|
||||
|
||||
one modern rop
|
||||
--------------
|
||||
|
||||
a single modern ROP runs at ~1-2 GHz and can do more per cycle:
|
||||
- multiple color outputs (MRT)
|
||||
- 64-bit or 128-bit color formats
|
||||
- compressed writes
|
||||
|
||||
rough estimate for one ROP at 1.5 GHz:
|
||||
~1.5 billion pixels/second base throughput
|
||||
|
||||
at 1920x1080 @ 60fps:
|
||||
1920 * 1080 * 60 = 124 megapixels/second
|
||||
|
||||
one ROP could handle 1080p with 12x overdraw headroom.
|
||||
|
||||
at 4K @ 60fps:
|
||||
3840 * 2160 * 60 = 497 megapixels/second
|
||||
|
||||
one ROP could handle 4K with 3x overdraw. tight, but possible.
|
||||
|
||||
|
||||
your three rops (intel hd 530)
|
||||
------------------------------
|
||||
|
||||
HD 530 specs:
|
||||
- 3 ROPs
|
||||
- ~950 MHz boost clock
|
||||
- theoretical: 2.85 GPixels/second
|
||||
|
||||
let's break that down:
|
||||
|
||||
at 1080p @ 60fps (124 MP/s needed):
|
||||
2850 / 124 = 23x overdraw budget
|
||||
|
||||
that's actually generous! you could draw each pixel 23 times.
|
||||
|
||||
so why does lofivor struggle at 1M entities?
|
||||
|
||||
because 1M entities at 4x4 pixels = 16M pixels minimum.
|
||||
but with overlap? let's say average 10x overdraw:
|
||||
160M pixels/frame
|
||||
at 60fps = 9.6 billion pixels/second
|
||||
|
||||
your ceiling is 2.85 billion.
|
||||
|
||||
so you're 3.4x over budget. that's why you top out around 300k-400k
|
||||
before frame drops (which matches empirical testing).
|
||||
|
||||
|
||||
the real constraint
|
||||
-------------------
|
||||
|
||||
ROPs don't work in isolation. they're limited by:
|
||||
|
||||
1. MEMORY BANDWIDTH
|
||||
each pixel write = memory access
|
||||
HD 530 shares DDR4 with CPU (~30 GB/s)
|
||||
at 32-bit color: 30GB/s / 4 bytes = 7.5 billion pixels/second max
|
||||
but you're competing with CPU, texture reads, etc.
|
||||
realistic: maybe 2-3 billion pixels for framebuffer writes
|
||||
|
||||
2. TEXTURE SAMPLING
|
||||
if fragment shader samples textures, TMUs must keep up
|
||||
HD 530 has 24 TMUs, so this isn't the bottleneck
|
||||
|
||||
3. SHADER EXECUTION
|
||||
ROPs wait for fragments to be shaded
|
||||
if shaders are slow, ROPs starve
|
||||
lofivor's shaders are trivial, so this isn't the bottleneck
|
||||
|
||||
for lofivor specifically: your 3 ROPs are THE ceiling.
|
||||
|
||||
|
||||
what could you do with more rops?
|
||||
---------------------------------
|
||||
|
||||
comparison:
|
||||
|
||||
Intel HD 530: 3 ROPs, 2.85 GPixels/s
|
||||
GTX 1060: 48 ROPs, 72 GPixels/s
|
||||
RTX 3080: 96 ROPs, 164 GPixels/s
|
||||
RTX 4090: 176 ROPs, 443 GPixels/s
|
||||
|
||||
with a GTX 1060 (25x your fill rate):
|
||||
lofivor could probably hit 5-10 million entities
|
||||
|
||||
with an RTX 4090 (155x your fill rate):
|
||||
tens of millions, limited by other factors
|
||||
|
||||
|
||||
perspective: what 3 rops means historically
|
||||
-------------------------------------------
|
||||
|
||||
your HD 530 has roughly the fill rate of:
|
||||
- GeForce 4 Ti 4600 (2002): 4 ROPs, 1.2 GPixels/s
|
||||
- Radeon 9700 Pro (2002): 8 ROPs, 2.6 GPixels/s
|
||||
|
||||
you're running hardware that, in raw pixel output, matches GPUs from
|
||||
20+ years ago. but with modern features (compute shaders, SSBO, etc).
|
||||
|
||||
this is why lofivor is interesting: you're achieving 700k+ entities
|
||||
on fill-rate-equivalent hardware that originally ran games with
|
||||
maybe 10,000 triangles on screen.
|
||||
|
||||
the difference is technique. those 2002 games did complex per-pixel
|
||||
lighting, shadows, multiple texture passes. lofivor does one texture
|
||||
sample and one blend. same fill rate, 100x the entities.
|
||||
|
||||
|
||||
the lesson
|
||||
----------
|
||||
|
||||
ROPs are simple: they write pixels.
|
||||
|
||||
the number you have determines your pixel budget.
|
||||
everything else (shaders, vertices, CPU logic) only matters if
|
||||
the ROPs aren't your bottleneck.
|
||||
|
||||
with 3 ROPs, you have roughly 2.85 billion pixels/second.
|
||||
spend them wisely:
|
||||
- cull what's offscreen (don't spend pixels on invisible things)
|
||||
- shrink distant objects (LOD saves pixels)
|
||||
- reduce overlap (spatial organization)
|
||||
- keep shaders simple (don't starve the ROPs)
|
||||
|
||||
your 3 ROPs can do remarkable things. Quake ran on 1.
|
||||
316
docs/why-millions-is-hard.txt
Normal file
316
docs/why-millions-is-hard.txt
Normal file
|
|
@ -0,0 +1,316 @@
|
|||
why rendering millions of entities is hard
|
||||
=========================================
|
||||
|
||||
and what "hard" actually means, from first principles.
|
||||
|
||||
|
||||
the simple answer
|
||||
-----------------
|
||||
|
||||
every frame, your computer does work. work takes time. you have 16.7
|
||||
milliseconds to do all the work before the next frame (at 60fps).
|
||||
|
||||
if the work takes longer than 16.7ms, you miss the deadline. frames drop.
|
||||
the game stutters.
|
||||
|
||||
10 million entities means 10 million units of work. whether that fits in
|
||||
16.7ms depends on how much work each unit is.
|
||||
|
||||
|
||||
what is "work" anyway?
|
||||
----------------------
|
||||
|
||||
let's trace what happens when you draw one entity:
|
||||
|
||||
1. CPU: "here's an entity at position (340, 512), color cyan"
|
||||
2. that data travels over a bus to the GPU
|
||||
3. GPU: receives the data, stores it in memory
|
||||
4. GPU: runs a vertex shader (figures out where on screen)
|
||||
5. GPU: runs a fragment shader (figures out what color each pixel is)
|
||||
6. GPU: writes pixels to the framebuffer
|
||||
7. framebuffer gets sent to your monitor
|
||||
|
||||
each step has a speed limit. the slowest step is your bottleneck.
|
||||
|
||||
|
||||
the bottlenecks, explained simply
|
||||
---------------------------------
|
||||
|
||||
MEMORY BANDWIDTH
|
||||
how fast data can move around. measured in GB/s.
|
||||
|
||||
think of it like a highway. you can have a fast car (processor), but
|
||||
if the highway is jammed, you're stuck in traffic.
|
||||
|
||||
an integrated GPU (like Intel HD 530) shares the highway with the CPU.
|
||||
a discrete GPU (like an RTX card) has its own private highway.
|
||||
|
||||
this is why lofivor's SSBO optimization helped so much: shrinking
|
||||
entity data from 64 bytes to 12 bytes means 5x less traffic.
|
||||
|
||||
DRAW CALLS
|
||||
every time you say "GPU, draw this thing", there's overhead.
|
||||
the CPU and GPU have to synchronize, state gets set up, etc.
|
||||
|
||||
1 draw call for 1 million entities: fast
|
||||
1 million draw calls for 1 million entities: slow
|
||||
|
||||
this is why batching matters. not the drawing itself, but the
|
||||
*coordination* of drawing.
|
||||
|
||||
FILL RATE
|
||||
how many pixels the GPU can color per second.
|
||||
|
||||
a 4x4 pixel entity = 16 pixels
|
||||
1 million entities = 16 million pixels minimum
|
||||
|
||||
but your screen is only ~2 million pixels (1920x1080). so entities
|
||||
overlap. "overdraw" means coloring the same pixel multiple times.
|
||||
|
||||
10 million overlapping entities might touch each pixel 50+ times.
|
||||
that's 100+ million pixel operations per frame.
|
||||
|
||||
SHADER COMPLEXITY
|
||||
the GPU runs a tiny program for each vertex and each pixel.
|
||||
|
||||
simple: "put it here, color it this" = fast
|
||||
complex: "calculate lighting from 8 sources, sample 4 textures,
|
||||
apply normal mapping, do fresnel..." = slow
|
||||
|
||||
lofivor's shaders are trivial. AAA game shaders are not.
|
||||
|
||||
CPU-GPU SYNCHRONIZATION
|
||||
the CPU and GPU work in parallel, but sometimes they have to wait
|
||||
for each other.
|
||||
|
||||
if the CPU needs to read GPU results, it stalls.
|
||||
if the GPU needs new data and the CPU is busy, it stalls.
|
||||
|
||||
good code keeps them both busy without waiting.
|
||||
|
||||
|
||||
why "real games" hit CPU walls
|
||||
------------------------------
|
||||
|
||||
rendering is just putting colors on pixels. that's the GPU's job.
|
||||
|
||||
but games aren't just rendering. they're also:
|
||||
|
||||
- COLLISION DETECTION
|
||||
does entity A overlap entity B?
|
||||
|
||||
naive approach: check every pair
|
||||
1,000 entities = 500,000 checks (n squared / 2)
|
||||
10,000 entities = 50,000,000 checks
|
||||
1,000,000 entities = 500,000,000,000 checks
|
||||
|
||||
that's 500 billion. per frame. not happening.
|
||||
|
||||
smart approach: spatial partitioning (grids, quadtrees)
|
||||
only check nearby entities. but still, at millions of entities,
|
||||
even "nearby" is a lot.
|
||||
|
||||
- AI / BEHAVIOR
|
||||
each entity decides what to do.
|
||||
|
||||
simple: move toward player. cheap.
|
||||
complex: pathfind around obstacles, consider threats, coordinate
|
||||
with allies, remember state. expensive.
|
||||
|
||||
lofivor entities just drift in a direction. no decisions.
|
||||
a real game enemy makes decisions every frame.
|
||||
|
||||
- PHYSICS
|
||||
entities push each other, bounce, have mass and friction.
|
||||
every interaction is math. lots of entities = lots of math.
|
||||
|
||||
- GAME LOGIC
|
||||
damage calculations, spawning, leveling, cooldowns, buffs...
|
||||
all of this runs on the CPU, every frame.
|
||||
|
||||
so: lofivor can render 700k entities because they don't DO anything.
|
||||
a game with 700k entities that think, collide, and interact would
|
||||
need god-tier optimization or would simply not run.
|
||||
|
||||
|
||||
what makes AAA games slow on old hardware?
|
||||
------------------------------------------
|
||||
|
||||
it's not entity count. most AAA games have maybe hundreds of
|
||||
"entities" on screen. it's everything else:
|
||||
|
||||
TEXTURE RESOLUTION
|
||||
a 4K texture (4096x4096) is ~16.8 million pixels, about 67 MB uncompressed. per texture.
|
||||
one character might have 10+ textures (diffuse, normal, specular,
|
||||
roughness, ambient occlusion...).
|
||||
|
||||
old hardware: less VRAM, slower texture sampling.
|
||||
|
||||
SHADER COMPLEXITY
|
||||
modern materials simulate light physics. subsurface scattering,
|
||||
global illumination, ray-traced reflections.
|
||||
|
||||
each pixel might do hundreds of math operations.
|
||||
|
||||
POST-PROCESSING
|
||||
bloom, motion blur, depth of field, ambient occlusion, anti-aliasing.
|
||||
full-screen passes that touch every pixel multiple times.
|
||||
|
||||
MESH COMPLEXITY
|
||||
a character might be 100,000 triangles.
|
||||
10 characters = 1 million triangles.
|
||||
each triangle goes through the vertex shader.
|
||||
|
||||
SHADOWS
|
||||
render the scene again from the light's perspective.
|
||||
for each light. every frame.
|
||||
|
||||
AAA games are doing 100x more work per pixel than lofivor.
|
||||
lofivor is doing 100x more pixels than AAA games.
|
||||
|
||||
different problems.
|
||||
|
||||
|
||||
the "abuse" vs "respect" distinction
|
||||
------------------------------------
|
||||
|
||||
abuse: making the hardware do unnecessary work.
|
||||
respect: achieving your goal with minimal waste.
|
||||
|
||||
examples of abuse (that lofivor fixed):
|
||||
|
||||
- sending 64 bytes (a full matrix) when you need 12 bytes (x, y, color)
|
||||
- one draw call per entity when you could batch
|
||||
- calculating transforms on CPU when GPU could do it
|
||||
- clearing the screen twice
|
||||
- uploading the same data every frame
|
||||
|
||||
examples of abuse in the wild:
|
||||
|
||||
- electron apps using a whole browser to show a chat window
|
||||
- games that re-render static UI every frame
|
||||
- loading 4K textures for objects that appear 20 pixels tall
|
||||
- running AI pathfinding for off-screen entities
|
||||
|
||||
the hardware has limits. respecting them means fitting your game
|
||||
within those limits through smart decisions. abusing them means
|
||||
throwing cycles at problems you created yourself.
|
||||
|
||||
|
||||
so can you do 1 million entities with juice on old hardware?
|
||||
------------------------------------------------------------
|
||||
|
||||
yes, with the right decisions.
|
||||
|
||||
what "juice" typically means:
|
||||
- screen shake (free, just offset the camera)
|
||||
- particle effects (separate system, heavily optimized)
|
||||
- flash/hit feedback (change a color value)
|
||||
- sound (different system entirely)
|
||||
|
||||
particles are special: they're designed for millions of tiny things.
|
||||
they don't collide, don't think, often don't even persist (spawn,
|
||||
drift, fade, die). GPU particle systems are essentially what lofivor
|
||||
became: minimal data, instanced rendering.
|
||||
|
||||
what would kill you at 1 million:
|
||||
- per-entity collision
|
||||
- per-entity AI
|
||||
- per-entity sprite variety (texture switches)
|
||||
- per-entity complex shaders
|
||||
|
||||
what you could do:
|
||||
- 1 million particles (visual only, no logic)
|
||||
- 10,000 enemies with collision/AI + 990,000 particles
|
||||
- 100,000 enemies with simple behavior + spatial hash collision
|
||||
|
||||
the secret: most of what looks like "millions of things" in games
|
||||
is actually a small number of meaningful entities + a large number
|
||||
of dumb particles.
|
||||
|
||||
|
||||
the laws of physics (sort of)
|
||||
-----------------------------
|
||||
|
||||
there are hard limits:
|
||||
|
||||
MEMORY BUS BANDWIDTH
|
||||
a DDR4 system might move 25 GB/s.
|
||||
1 million entities at 12 bytes each = 12 MB.
|
||||
at 60fps = 720 MB/s just for entity data.
|
||||
that's only 3% of bandwidth. plenty of room.
|
||||
|
||||
but a naive approach (64 bytes, plus overhead) could be
|
||||
10x worse. suddenly you're at 30%.
|
||||
|
||||
CLOCK CYCLES
|
||||
a 3GHz CPU core does roughly 3 billion simple operations per second.
|
||||
at 60fps, that's 50 million operations per frame.
|
||||
1 million entities = 50 operations each.
|
||||
|
||||
50 operations is: a few multiplies, some loads/stores, a branch.
|
||||
that's barely enough for "move in a direction".
|
||||
pathfinding? AI? collision? not a chance.
|
||||
|
||||
PARALLELISM
|
||||
GPUs have thousands of cores but they're simple.
|
||||
CPUs have few cores but they're smart.
|
||||
|
||||
entity rendering: perfectly parallel (GPU wins)
|
||||
entity decision-making: often sequential (CPU bound)
|
||||
|
||||
so yes, physics constrains us. but "physics" here means:
|
||||
- how fast electrons move through silicon
|
||||
- how much data fits on a wire
|
||||
- how many transistors fit on a chip
|
||||
|
||||
within those limits, there's room. lots of room, if you're clever.
|
||||
lofivor went from 5k to 700k by being clever, not by breaking physics.
|
||||
|
||||
|
||||
the actual lesson
|
||||
-----------------
|
||||
|
||||
the limit isn't really "the hardware can't do it."
|
||||
|
||||
the limit is "the hardware can't do it THE WAY YOU'RE DOING IT."
|
||||
|
||||
every optimization in lofivor was finding a different way:
|
||||
- don't draw circles, blit textures
|
||||
- don't call functions, submit vertices directly
|
||||
- don't send matrices, send packed structs
|
||||
- don't update on CPU, use compute shaders
|
||||
|
||||
the hardware was always capable of 700k. the code wasn't asking right.
|
||||
|
||||
this is true at every level. that old laptop struggling with 10k
|
||||
entities in some game? probably not the laptop's fault. probably
|
||||
the game is doing something wasteful that doesn't need to be.
|
||||
|
||||
"runs poorly on old hardware" often means "we didn't try to make
|
||||
it run on old hardware" not "it's impossible on old hardware."
|
||||
|
||||
|
||||
closing thought
|
||||
---------------
|
||||
|
||||
10 million is a lot. but 1 million? 2 million?
|
||||
|
||||
with discipline: yes.
|
||||
with decisions that respect the hardware: yes.
|
||||
with awareness of what's actually expensive: yes.
|
||||
|
||||
the knowledge of what's expensive is the key.
|
||||
|
||||
most developers don't have it. they use high-level abstractions
|
||||
that hide the cost. they've never seen a frame budget or a
|
||||
bandwidth calculation.
|
||||
|
||||
lofivor is a learning tool. the journey from 5k to 700k teaches
|
||||
where the costs are. once you see them, you can't unsee them.
|
||||
|
||||
you start asking: "what is this actually doing? what does it cost?
|
||||
is there a cheaper way?"
|
||||
|
||||
that's the skill. not the specific techniques—those change with
|
||||
hardware. the skill is asking the questions.
|
||||
35
journal.txt
35
journal.txt
|
|
@ -206,3 +206,38 @@ total improvement from baseline:
|
|||
- SSBO: 60fps @ ~700k entities
|
||||
- ~140x improvement overall!
|
||||
|
||||
---
|
||||
|
||||
optimization 6: compute shader updates
|
||||
--------------------------------------
|
||||
technique: move entity position + respawn logic from CPU to GPU compute shader
|
||||
code: compute.zig, shaders/entity_update.comp, ssbo_renderer.zig
|
||||
version: 0.7.0
|
||||
|
||||
struct GpuEntity {
|
||||
x: f32, // 4 bytes
|
||||
y: f32, // 4 bytes
|
||||
packed_vel: i32, // 4 bytes (vx/vy in fixed-point 8.8)
|
||||
color: u32, // 4 bytes
|
||||
}; // = 16 bytes total (was 12)
|
||||
|
||||
changes:
|
||||
- entity_update.comp: position update, center check, edge respawn, velocity calc
|
||||
- GPU RNG: PCG-style PRNG seeded with entity id + frame number
|
||||
- ssbo_renderer: renderComputeMode() only uploads NEW entities (when count grows)
|
||||
- CPU update loop skipped entirely when compute enabled
|
||||
|
||||
benchmark results (i5-6500T / HD 530):
|
||||
- update time: ~5ms → ~0ms at 150k entities
|
||||
- render time unchanged (GPU-bound as before)
|
||||
- total frame time improvement at high entity counts
|
||||
|
||||
analysis: CPU was doing ~150k position updates + distance checks + respawn logic
|
||||
per frame. now GPU does it in parallel via 256-thread workgroups. CPU only uploads
|
||||
new entities when user adds them, not per-frame. memory barrier ensures compute
|
||||
writes visible to vertex shader before draw.
|
||||
|
||||
flags:
|
||||
- --compute: GPU compute updates (now default)
|
||||
- --cpu: fallback to CPU update path for comparison
|
||||
|
||||
|
|
|
|||
16
justfile
16
justfile
|
|
@ -42,11 +42,20 @@ check:
|
|||
test:
|
||||
zig build test
|
||||
|
||||
# auto-benchmark (ramps entities until performance degrades, works on linux/windows)
|
||||
# run sandbox (GPU compute is default)
|
||||
sandbox:
|
||||
zig build -Doptimize=ReleaseFast run
|
||||
|
||||
# auto-benchmark (ramps entities until performance degrades)
|
||||
bench:
|
||||
zig build -Doptimize=ReleaseFast run -- --bench
|
||||
cat benchmark.log
|
||||
|
||||
# benchmark with CPU update path (for comparison)
|
||||
bench-cpu:
|
||||
zig build -Doptimize=ReleaseFast run -- --bench --cpu
|
||||
cat benchmark.log
|
||||
|
||||
# software-rendered benchmark (for CI/headless servers)
|
||||
[linux]
|
||||
bench-sw:
|
||||
|
|
@ -58,3 +67,8 @@ bench-sw:
|
|||
bench-sw:
|
||||
@echo "bench-sw: windows doesn't have xvfb equivalent"
|
||||
@echo "use 'just bench' if you have a GPU, or run in WSL/linux CI"
|
||||
|
||||
[linux]
|
||||
profile port="9876":
|
||||
# start Tracy: tracy-profiler -a 127.0.0.1 -p {{port}}
|
||||
zig build -Dtracy=true -Doptimize=ReleaseFast && TRACY_PORT={{port}} ./zig-out/bin/sandbox
|
||||
|
|
|
|||
8
releases/0.1.0-unoptimized.txt
Normal file
8
releases/0.1.0-unoptimized.txt
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
the baseline: one draw call per entity, pure and simple
|
||||
|
||||
- individual rl.drawCircle() calls in a loop
|
||||
- ~5k entities at 60fps before frame times tank
|
||||
- linear scaling: 10k = ~43ms, 20k = ~77ms
|
||||
- render-bound (update loop stays under 1ms even at 30k)
|
||||
- each circle is its own GPU draw call
|
||||
- the starting point for optimization experiments
|
||||
8
releases/0.2.0-texture_blitting.txt
Normal file
8
releases/0.2.0-texture_blitting.txt
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
pre-render once, blit many: 10x improvement
|
||||
|
||||
- render circle to 16x16 texture at startup
|
||||
- drawTexture() per entity instead of drawCircle()
|
||||
- raylib batches same-texture draws internally
|
||||
- ~50k entities at 60fps
|
||||
- simple change, big win
|
||||
- still one function call per entity, but GPU work is batched
|
||||
9
releases/0.3.0-quad_batching.txt
Normal file
9
releases/0.3.0-quad_batching.txt
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
bypass the wrapper, go straight to rlgl: 2x more
|
||||
|
||||
- skip drawTexture(), submit vertices directly via rl.gl
|
||||
- manually build quads: rlTexCoord2f + rlVertex2f per corner
|
||||
- rlBegin/rlEnd wraps the whole entity loop
|
||||
- ~100k entities at 60fps
|
||||
- eliminates per-call function overhead
|
||||
- vertices go straight to GPU buffer
|
||||
- 20x improvement over baseline
|
||||
11
releases/0.3.1-batch_buffer.txt
Normal file
11
releases/0.3.1-batch_buffer.txt
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
bigger buffer, fewer flushes: squeezing out more headroom
|
||||
|
||||
- increased raylib batch buffer from 8192 to 32768 vertices
|
||||
- ~140k entities at 60fps on i5-6500T
|
||||
- ~40% improvement over default buffer
|
||||
- fewer GPU flushes per frame
|
||||
- also added: release workflows for github and forgejo
|
||||
- added OPTIMIZATIONS.md documenting the journey
|
||||
- added README, UI panel with FPS display
|
||||
- heap allocated entity array to support 1 million entities
|
||||
- per-entity RGB colors
|
||||
13
releases/0.4.0-gpu_instancing.txt
Normal file
13
releases/0.4.0-gpu_instancing.txt
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
gpu instancing: a disappointing discovery
|
||||
|
||||
- drawMeshInstanced() with per-entity transform matrices
|
||||
- ~150k entities at 60fps - barely better than rlgl batching
|
||||
- negligible improvement on integrated graphics
|
||||
- why it didn't help:
|
||||
- integrated GPU shares system RAM (no PCIe transfer savings)
|
||||
- 64-byte matrix per entity vs ~80 bytes for rlgl vertices
|
||||
- bottleneck is memory bandwidth, not draw call overhead
|
||||
- rlgl batching already minimizes draw calls effectively
|
||||
- orthographic camera setup for 2D-like rendering
|
||||
- heap-allocated transforms buffer (64MB too big for stack)
|
||||
- lesson learned: not all "advanced" techniques are wins
|
||||
17
releases/0.5.0-ssbo_instancing.txt
Normal file
17
releases/0.5.0-ssbo_instancing.txt
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
ssbo breakthrough: 5x gain by shrinking the data
|
||||
|
||||
- pack entity data (x, y, color) into 12-byte struct
|
||||
- upload via shader storage buffer object (SSBO)
|
||||
- ~700k entities at 60fps (i5-6500T / HD 530)
|
||||
- ~950k entities at ~57fps
|
||||
- 5x improvement over previous best
|
||||
- 140x total from baseline
|
||||
- why it works:
|
||||
- 12 bytes vs 64 bytes (matrices) = 5.3x less bandwidth
|
||||
- 12 bytes vs 80 bytes (rlgl vertices) = 6.7x less bandwidth
|
||||
- no CPU-side matrix calculations
|
||||
- GPU does NDC conversion and color unpacking
|
||||
- custom vertex/fragment shaders
|
||||
- single rlDrawVertexArrayInstanced() call for all entities
|
||||
- shaders embedded at build time
|
||||
- removed FPS cap, added optional vsync arg
|
||||
5
releases/0.5.1-windows_build.txt
Normal file
5
releases/0.5.1-windows_build.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
cross-platform release: adding windows to the party
|
||||
|
||||
- updated github release workflow
|
||||
- builds for both linux and windows now
|
||||
- no code changes, just CI/CD work
|
||||
10
releases/0.6.0-zoom_zoom.txt
Normal file
10
releases/0.6.0-zoom_zoom.txt
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
zoom and pan: making millions of entities explorable
|
||||
|
||||
- mouse wheel zoom
|
||||
- click and drag panning
|
||||
- orthographic camera transforms
|
||||
- memory panel showing entity buffer sizes
|
||||
- background draws immediately (no flicker)
|
||||
- tab key toggles UI panels
|
||||
- explained "lofivor" name in README (lo-fi survivor)
|
||||
- shader updated for zoom/pan transforms
|
||||
5
releases/0.6.1-q_to_quit.txt
Normal file
5
releases/0.6.1-q_to_quit.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
quick exit: zoom out then quit
|
||||
|
||||
- q key first zooms out, second press quits
|
||||
- nice way to see the full entity field before closing
|
||||
- minor UI text fix
|
||||
11
releases/0.7.0-compute_shader.txt
Normal file
11
releases/0.7.0-compute_shader.txt
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
compute shader: moving physics to the GPU
|
||||
|
||||
- entity position updates now run on GPU via compute shader
|
||||
- GPU-based RNG for entity velocity randomization
|
||||
- full simulation loop stays on GPU, no CPU roundtrip
|
||||
- new compute.zig module for shader management
|
||||
- GpuEntity struct with position, velocity, and color
|
||||
- tracy profiling integration
|
||||
- FPS display turns green (good) or red (bad)
|
||||
- added design docs for zoom/pan and compute shader work
|
||||
- cross-platform alignment fixes for shader data
|
||||
111
src/compute.zig
Normal file
111
src/compute.zig
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
// compute shader module for GPU entity updates
|
||||
// wraps raw GL calls that raylib doesn't expose directly
|
||||
|
||||
const std = @import("std");
|
||||
const rl = @import("raylib");
|
||||
const sandbox = @import("sandbox.zig");
|
||||
|
||||
const comp_source = @embedFile("shaders/entity_update.comp");
|
||||
|
||||
// GL constants not exposed by raylib-zig
|
||||
const GL_SHADER_STORAGE_BARRIER_BIT: u32 = 0x00002000;
|
||||
|
||||
// function pointer type for glMemoryBarrier
|
||||
const GlMemoryBarrierFn = *const fn (barriers: u32) callconv(.c) void;
|
||||
|
||||
pub const ComputeShader = struct {
|
||||
program_id: u32,
|
||||
entity_count_loc: i32,
|
||||
frame_number_loc: i32,
|
||||
screen_size_loc: i32,
|
||||
center_loc: i32,
|
||||
respawn_radius_loc: i32,
|
||||
entity_speed_loc: i32,
|
||||
glMemoryBarrier: GlMemoryBarrierFn,
|
||||
|
||||
pub fn init() ?ComputeShader {
|
||||
// load glMemoryBarrier dynamically
|
||||
const barrier_ptr = rl.gl.rlGetProcAddress("glMemoryBarrier");
|
||||
const glMemoryBarrier: GlMemoryBarrierFn = @ptrCast(@alignCast(barrier_ptr));
|
||||
|
||||
// compile compute shader
|
||||
const shader_id = rl.gl.rlCompileShader(comp_source, rl.gl.rl_compute_shader);
|
||||
if (shader_id == 0) {
|
||||
std.debug.print("compute: failed to compile compute shader\n", .{});
|
||||
return null;
|
||||
}
|
||||
|
||||
// link compute program
|
||||
const program_id = rl.gl.rlLoadComputeShaderProgram(shader_id);
|
||||
if (program_id == 0) {
|
||||
std.debug.print("compute: failed to link compute program\n", .{});
|
||||
return null;
|
||||
}
|
||||
|
||||
// get uniform locations
|
||||
const entity_count_loc = rl.gl.rlGetLocationUniform(program_id, "entityCount");
|
||||
const frame_number_loc = rl.gl.rlGetLocationUniform(program_id, "frameNumber");
|
||||
const screen_size_loc = rl.gl.rlGetLocationUniform(program_id, "screenSize");
|
||||
const center_loc = rl.gl.rlGetLocationUniform(program_id, "center");
|
||||
const respawn_radius_loc = rl.gl.rlGetLocationUniform(program_id, "respawnRadius");
|
||||
const entity_speed_loc = rl.gl.rlGetLocationUniform(program_id, "entitySpeed");
|
||||
|
||||
std.debug.print("compute: shader loaded\n", .{});
|
||||
|
||||
return .{
|
||||
.program_id = program_id,
|
||||
.entity_count_loc = entity_count_loc,
|
||||
.frame_number_loc = frame_number_loc,
|
||||
.screen_size_loc = screen_size_loc,
|
||||
.center_loc = center_loc,
|
||||
.respawn_radius_loc = respawn_radius_loc,
|
||||
.entity_speed_loc = entity_speed_loc,
|
||||
.glMemoryBarrier = glMemoryBarrier,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn deinit(self: *ComputeShader) void {
|
||||
rl.gl.rlUnloadShaderProgram(self.program_id);
|
||||
}
|
||||
|
||||
pub fn dispatch(self: *ComputeShader, ssbo_id: u32, entity_count: u32, frame_number: u32) void {
|
||||
if (entity_count == 0) return;
|
||||
|
||||
// constants from sandbox.zig
|
||||
const screen_w: f32 = @floatFromInt(sandbox.SCREEN_WIDTH);
|
||||
const screen_h: f32 = @floatFromInt(sandbox.SCREEN_HEIGHT);
|
||||
const center_x: f32 = screen_w / 2.0;
|
||||
const center_y: f32 = screen_h / 2.0;
|
||||
const respawn_radius: f32 = 10.0; // RESPAWN_THRESHOLD
|
||||
const entity_speed: f32 = 2.0; // ENTITY_SPEED
|
||||
|
||||
// bind compute shader
|
||||
rl.gl.rlEnableShader(self.program_id);
|
||||
|
||||
// set uniforms
|
||||
rl.gl.rlSetUniform(self.entity_count_loc, &entity_count, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_uint), 1);
|
||||
rl.gl.rlSetUniform(self.frame_number_loc, &frame_number, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_uint), 1);
|
||||
|
||||
const screen_size = [2]f32{ screen_w, screen_h };
|
||||
rl.gl.rlSetUniform(self.screen_size_loc, &screen_size, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
||||
|
||||
const center = [2]f32{ center_x, center_y };
|
||||
rl.gl.rlSetUniform(self.center_loc, &center, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
||||
|
||||
rl.gl.rlSetUniform(self.respawn_radius_loc, &respawn_radius, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
|
||||
rl.gl.rlSetUniform(self.entity_speed_loc, &entity_speed, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
|
||||
|
||||
// bind SSBO to binding point 0
|
||||
rl.gl.rlBindShaderBuffer(ssbo_id, 0);
|
||||
|
||||
// dispatch compute workgroups: ceil(entity_count / 256)
|
||||
const groups = (entity_count + 255) / 256;
|
||||
rl.gl.rlComputeShaderDispatch(groups, 1, 1);
|
||||
|
||||
// memory barrier - ensure compute writes are visible to vertex shader
|
||||
self.glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
|
||||
|
||||
// unbind
|
||||
rl.gl.rlBindShaderBuffer(0, 0);
|
||||
}
|
||||
};
|
||||
|
|
@ -18,7 +18,7 @@ pub const Entity = struct {
|
|||
color: u32,
|
||||
};
|
||||
|
||||
pub const MAX_ENTITIES: usize = 1_000_000;
|
||||
pub const MAX_ENTITIES: usize = 10_000_000;
|
||||
|
||||
pub const Entities = struct {
|
||||
items: []Entity,
|
||||
|
|
@ -287,34 +287,69 @@ test "update respawns entity at edge when reaching center" {
|
|||
try std.testing.expect(on_left or on_right or on_top or on_bottom);
|
||||
}
|
||||
|
||||
// GPU entity for SSBO rendering (position + color only, no velocity)
|
||||
// GPU entity for SSBO rendering (16 bytes, matches compute shader layout)
|
||||
pub const GpuEntity = extern struct {
|
||||
x: f32,
|
||||
y: f32,
|
||||
packed_vel: i32, // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
|
||||
color: u32,
|
||||
};
|
||||
|
||||
// pack two f32 velocities into a single i32 (fixed-point 8.8 format)
|
||||
pub fn packVelocity(vx: f32, vy: f32) i32 {
|
||||
const vx_fixed: i16 = @intFromFloat(std.math.clamp(vx * 256.0, -32768.0, 32767.0));
|
||||
const vy_fixed: i16 = @intFromFloat(std.math.clamp(vy * 256.0, -32768.0, 32767.0));
|
||||
return (@as(i32, vx_fixed) << 16) | (@as(i32, vy_fixed) & 0xFFFF);
|
||||
}
|
||||
|
||||
test "GpuEntity struct has correct size for SSBO" {
|
||||
// SSBO layout: x(4) + y(4) + color(4) = 12 bytes
|
||||
try std.testing.expectEqual(@as(usize, 12), @sizeOf(GpuEntity));
|
||||
// SSBO layout: x(4) + y(4) + packed_vel(4) + color(4) = 16 bytes
|
||||
try std.testing.expectEqual(@as(usize, 16), @sizeOf(GpuEntity));
|
||||
}
|
||||
|
||||
test "GpuEntity can be created from Entity" {
|
||||
const entity = Entity{
|
||||
.x = 100.0,
|
||||
.y = 200.0,
|
||||
.vx = 1.5, // ignored for GPU
|
||||
.vy = -0.5, // ignored for GPU
|
||||
.vx = 1.5,
|
||||
.vy = -0.5,
|
||||
.color = 0x00FFFF,
|
||||
};
|
||||
|
||||
const gpu_entity = GpuEntity{
|
||||
.x = entity.x,
|
||||
.y = entity.y,
|
||||
.packed_vel = packVelocity(entity.vx, entity.vy),
|
||||
.color = entity.color,
|
||||
};
|
||||
|
||||
try std.testing.expectEqual(@as(f32, 100.0), gpu_entity.x);
|
||||
try std.testing.expectEqual(@as(f32, 200.0), gpu_entity.y);
|
||||
try std.testing.expectEqual(@as(u32, 0x00FFFF), gpu_entity.color);
|
||||
|
||||
// unpack and verify velocity (should round-trip within precision)
|
||||
const vx_unpacked = @as(f32, @floatFromInt(@as(i16, @truncate(gpu_entity.packed_vel >> 16)))) / 256.0;
|
||||
const vy_unpacked = @as(f32, @floatFromInt(@as(i16, @truncate(gpu_entity.packed_vel)))) / 256.0;
|
||||
try std.testing.expectApproxEqAbs(@as(f32, 1.5), vx_unpacked, 0.004);
|
||||
try std.testing.expectApproxEqAbs(@as(f32, -0.5), vy_unpacked, 0.004);
|
||||
}
|
||||
|
||||
test "packVelocity round-trips correctly" {
|
||||
// test positive values
|
||||
const packed1 = packVelocity(2.0, 1.5);
|
||||
const vx1 = @as(f32, @floatFromInt(@as(i16, @truncate(packed1 >> 16)))) / 256.0;
|
||||
const vy1 = @as(f32, @floatFromInt(@as(i16, @truncate(packed1)))) / 256.0;
|
||||
try std.testing.expectApproxEqAbs(@as(f32, 2.0), vx1, 0.004);
|
||||
try std.testing.expectApproxEqAbs(@as(f32, 1.5), vy1, 0.004);
|
||||
|
||||
// test negative values
|
||||
const packed2 = packVelocity(-1.0, -2.5);
|
||||
const vx2 = @as(f32, @floatFromInt(@as(i16, @truncate(packed2 >> 16)))) / 256.0;
|
||||
const vy2 = @as(f32, @floatFromInt(@as(i16, @truncate(packed2)))) / 256.0;
|
||||
try std.testing.expectApproxEqAbs(@as(f32, -1.0), vx2, 0.004);
|
||||
try std.testing.expectApproxEqAbs(@as(f32, -2.5), vy2, 0.004);
|
||||
|
||||
// test zero
|
||||
const packed3 = packVelocity(0.0, 0.0);
|
||||
try std.testing.expectEqual(@as(i32, 0), packed3);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,9 +3,11 @@
|
|||
|
||||
const std = @import("std");
|
||||
const rl = @import("raylib");
|
||||
const ztracy = @import("ztracy");
|
||||
const sandbox = @import("sandbox.zig");
|
||||
const ui = @import("ui.zig");
|
||||
const SsboRenderer = @import("ssbo_renderer.zig").SsboRenderer;
|
||||
const ComputeShader = @import("compute.zig").ComputeShader;
|
||||
|
||||
const SCREEN_WIDTH = sandbox.SCREEN_WIDTH;
|
||||
const SCREEN_HEIGHT = sandbox.SCREEN_HEIGHT;
|
||||
|
|
@ -20,7 +22,7 @@ const TEXTURE_SIZE: i32 = 16; // must be >= 2 * radius
|
|||
const MESH_SIZE: f32 = @floatFromInt(TEXTURE_SIZE); // match texture size
|
||||
|
||||
// logging thresholds
|
||||
const TARGET_FRAME_MS: f32 = 16.7; // 60fps
|
||||
const TARGET_FRAME_MS: f32 = 8.33; // 120fps
|
||||
const THRESHOLD_MARGIN: f32 = 2.0; // hysteresis margin to avoid bounce
|
||||
const JUMP_THRESHOLD_MS: f32 = 5.0; // log if frame time jumps by this much
|
||||
const HEARTBEAT_INTERVAL: f32 = 10.0; // seconds between periodic logs
|
||||
|
|
@ -31,6 +33,11 @@ const BENCH_RAMP_AMOUNT: usize = 50_000; // entities added per ramp
|
|||
const BENCH_EXIT_THRESHOLD_MS: f32 = 25.0; // exit when frame time exceeds this
|
||||
const BENCH_EXIT_SUSTAIN: f32 = 1.0; // must stay above threshold for this long
|
||||
|
||||
// zoom settings
|
||||
const ZOOM_MIN: f32 = 1.0;
|
||||
const ZOOM_MAX: f32 = 10.0;
|
||||
const ZOOM_SPEED: f32 = 0.1; // multiplier per scroll tick
|
||||
|
||||
const BenchmarkLogger = struct {
|
||||
file: ?std.fs.File,
|
||||
last_logged_frame_ms: f32,
|
||||
|
|
@ -156,6 +163,8 @@ pub fn main() !void {
|
|||
var bench_mode = false;
|
||||
var use_instancing = false;
|
||||
var use_ssbo = true;
|
||||
var use_vsync = false;
|
||||
var use_compute = true; // GPU compute is now default
|
||||
var args = try std.process.argsWithAllocator(std.heap.page_allocator);
|
||||
defer args.deinit();
|
||||
_ = args.skip(); // skip program name
|
||||
|
|
@ -167,12 +176,23 @@ pub fn main() !void {
|
|||
use_ssbo = false; // legacy GPU instancing path
|
||||
} else if (std.mem.eql(u8, arg, "--legacy")) {
|
||||
use_ssbo = false; // legacy rlgl batched path
|
||||
} else if (std.mem.eql(u8, arg, "--vsync")) {
|
||||
use_vsync = true;
|
||||
} else if (std.mem.eql(u8, arg, "--cpu")) {
|
||||
use_compute = false; // fallback to CPU update path
|
||||
}
|
||||
}
|
||||
|
||||
if (use_vsync) {
|
||||
rl.setConfigFlags(.{ .vsync_hint = true });
|
||||
}
|
||||
rl.initWindow(@intCast(SCREEN_WIDTH), @intCast(SCREEN_HEIGHT), "lofivor sandbox");
|
||||
defer rl.closeWindow();
|
||||
rl.setTargetFPS(60);
|
||||
|
||||
// show background immediately (avoid black screen during init)
|
||||
rl.beginDrawing();
|
||||
rl.clearBackground(BG_COLOR);
|
||||
rl.endDrawing();
|
||||
|
||||
// use larger batch buffer: 16384 elements vs default 8192
|
||||
// fewer flushes = less driver overhead per frame
|
||||
|
|
@ -241,6 +261,26 @@ pub fn main() !void {
|
|||
if (ssbo_renderer) |*r| r.deinit();
|
||||
}
|
||||
|
||||
// compute shader setup (only if --compute flag)
|
||||
var compute_shader: ?ComputeShader = null;
|
||||
|
||||
if (use_compute) {
|
||||
if (!use_ssbo) {
|
||||
std.debug.print("--compute requires SSBO mode (default), ignoring\n", .{});
|
||||
} else {
|
||||
compute_shader = ComputeShader.init();
|
||||
if (compute_shader == null) {
|
||||
std.debug.print("failed to initialize compute shader, falling back to CPU\n", .{});
|
||||
} else {
|
||||
std.debug.print("compute shader mode enabled\n", .{});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
defer {
|
||||
if (compute_shader) |*c| c.deinit();
|
||||
}
|
||||
|
||||
// load UI font (embedded)
|
||||
const font_data = @embedFile("verdanab.ttf");
|
||||
const ui_font = rl.loadFontFromMemory(".ttf", font_data, 32, null) catch {
|
||||
|
|
@ -254,6 +294,11 @@ pub fn main() !void {
|
|||
var rng = prng.random();
|
||||
|
||||
var paused = false;
|
||||
|
||||
// camera state for zoom/pan
|
||||
var zoom: f32 = 1.0;
|
||||
var pan = @Vector(2, f32){ 0, 0 };
|
||||
|
||||
var logger = BenchmarkLogger.init();
|
||||
defer logger.deinit();
|
||||
|
||||
|
|
@ -261,6 +306,7 @@ pub fn main() !void {
|
|||
var update_time_us: i64 = 0;
|
||||
var render_time_us: i64 = 0;
|
||||
var elapsed: f32 = 0;
|
||||
var frame_number: u32 = 0;
|
||||
|
||||
// auto-benchmark state
|
||||
var last_ramp_time: f32 = 0;
|
||||
|
|
@ -306,24 +352,47 @@ pub fn main() !void {
|
|||
} else {
|
||||
// manual controls
|
||||
handleInput(&entities, &rng, &paused);
|
||||
if (handleCamera(&zoom, &pan)) break;
|
||||
}
|
||||
|
||||
// update
|
||||
if (!paused) {
|
||||
const tracy_update = ztracy.ZoneN(@src(), "update");
|
||||
defer tracy_update.End();
|
||||
const update_start = std.time.microTimestamp();
|
||||
|
||||
if (compute_shader == null) {
|
||||
// CPU update path (positions + respawn)
|
||||
sandbox.update(&entities, &rng);
|
||||
}
|
||||
// GPU compute path handles update in render section before draw
|
||||
|
||||
update_time_us = std.time.microTimestamp() - update_start;
|
||||
}
|
||||
|
||||
// render
|
||||
const tracy_render = ztracy.ZoneN(@src(), "render");
|
||||
defer tracy_render.End();
|
||||
const render_start = std.time.microTimestamp();
|
||||
|
||||
rl.beginDrawing();
|
||||
rl.clearBackground(BG_COLOR);
|
||||
|
||||
if (use_ssbo) {
|
||||
// SSBO instanced rendering path (16 bytes per entity)
|
||||
ssbo_renderer.?.render(&entities);
|
||||
// dispatch compute shader before render (if enabled)
|
||||
if (compute_shader) |*cs| {
|
||||
if (!paused) {
|
||||
const tracy_compute = ztracy.ZoneN(@src(), "compute_dispatch");
|
||||
defer tracy_compute.End();
|
||||
cs.dispatch(ssbo_renderer.?.ssbo_id, @intCast(entities.count), frame_number);
|
||||
frame_number +%= 1;
|
||||
}
|
||||
// GPU compute mode - only upload new entities, positions updated on GPU
|
||||
ssbo_renderer.?.renderComputeMode(&entities, zoom, pan);
|
||||
} else {
|
||||
// CPU mode - upload entity data to GPU
|
||||
ssbo_renderer.?.render(&entities, zoom, pan);
|
||||
}
|
||||
} else if (use_instancing) {
|
||||
// GPU instancing path (64 bytes per entity)
|
||||
const xforms = transforms.?;
|
||||
|
|
@ -374,7 +443,8 @@ pub fn main() !void {
|
|||
|
||||
// metrics overlay (skip in bench mode for cleaner headless run)
|
||||
if (!bench_mode) {
|
||||
ui.drawMetrics(&entities, update_time_us, render_time_us, paused, ui_font);
|
||||
ui.drawMetrics(&entities, update_time_us, render_time_us, paused, zoom, ui_font);
|
||||
ui.drawMemory(entities.count, ui_font);
|
||||
}
|
||||
|
||||
rl.endDrawing();
|
||||
|
|
@ -385,6 +455,9 @@ pub fn main() !void {
|
|||
const update_ms = @as(f32, @floatFromInt(update_time_us)) / 1000.0;
|
||||
const render_ms = @as(f32, @floatFromInt(render_time_us)) / 1000.0;
|
||||
logger.log(elapsed, entities.count, frame_ms, update_ms, render_ms);
|
||||
|
||||
// tracy frame mark
|
||||
ztracy.FrameMark();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -397,7 +470,7 @@ var sub_timer: f32 = 0;
|
|||
fn handleInput(entities: *sandbox.Entities, rng: *std.Random, paused: *bool) void {
|
||||
const dt = rl.getFrameTime();
|
||||
const shift = rl.isKeyDown(.left_shift) or rl.isKeyDown(.right_shift);
|
||||
const add_count: usize = if (shift) 10000 else 1000;
|
||||
const add_count: usize = if (shift) 50_000 else 10_000;
|
||||
|
||||
const add_held = rl.isKeyDown(.equal) or rl.isKeyDown(.kp_add);
|
||||
const sub_held = rl.isKeyDown(.minus) or rl.isKeyDown(.kp_subtract);
|
||||
|
|
@ -439,4 +512,86 @@ fn handleInput(entities: *sandbox.Entities, rng: *std.Random, paused: *bool) voi
|
|||
if (rl.isKeyPressed(.space)) {
|
||||
paused.* = !paused.*;
|
||||
}
|
||||
|
||||
// toggle ui: tab
|
||||
if (rl.isKeyPressed(.tab)) {
|
||||
ui.show_ui = !ui.show_ui;
|
||||
}
|
||||
}
|
||||
|
||||
/// Processes one frame of camera input and mutates `zoom` / `pan` in place.
///
/// - mouse wheel: zoom in/out around the cursor, limited to ZOOM_MIN..ZOOM_MAX
/// - any mouse button held while zoomed in: drag to pan
/// - Enter / keypad Enter: reset zoom and pan
/// - Q: reset when zoomed in, otherwise request quit
///
/// Returns true when the caller should leave the main loop.
fn handleCamera(zoom: *f32, pan: *@Vector(2, f32)) bool {
    const scroll = rl.getMouseWheelMove();

    if (scroll != 0) {
        const cursor = rl.getMousePosition();
        const prev_zoom = zoom.*;

        // one scroll tick multiplies (or divides) the zoom by a fixed step
        const step = 1.0 + ZOOM_SPEED;
        const factor = if (scroll > 0) step else (1.0 / step);
        const next_zoom = std.math.clamp(prev_zoom * factor, ZOOM_MIN, ZOOM_MAX);

        if (next_zoom != prev_zoom) {
            // anchor the point under the cursor:
            //   anchor  = cursor / prev_zoom + pan
            //   new pan = anchor - cursor / next_zoom
            const anchor_x = (cursor.x / prev_zoom) + pan.*[0];
            const anchor_y = (cursor.y / prev_zoom) + pan.*[1];
            pan.*[0] = anchor_x - (cursor.x / next_zoom);
            pan.*[1] = anchor_y - (cursor.y / next_zoom);
            zoom.* = next_zoom;

            // keep the view inside the simulation area
            clampPan(pan, zoom.*);
        }
    }

    // dragging only makes sense once the view is smaller than the world
    if (zoom.* > 1.0) {
        const dragging = rl.isMouseButtonDown(.left) or
            rl.isMouseButtonDown(.right) or
            rl.isMouseButtonDown(.middle);
        if (dragging) {
            const motion = rl.getMouseDelta();
            // x is subtracted and y is added, scaled down by the zoom level
            pan.*[0] -= motion.x / zoom.*;
            pan.*[1] += motion.y / zoom.*;
            clampPan(pan, zoom.*);
        }
    }

    // Return / keypad Enter: back to the default view
    if (rl.isKeyPressed(.enter) or rl.isKeyPressed(.kp_enter)) {
        zoom.* = 1.0;
        pan.* = @Vector(2, f32){ 0, 0 };
    }

    // q: first press leaves the zoomed view, a press at default zoom quits
    if (rl.isKeyPressed(.q)) {
        if (!(zoom.* > 1.0)) return true;
        zoom.* = 1.0;
        pan.* = @Vector(2, f32){ 0, 0 };
    }

    return false;
}
|
||||
|
||||
/// Restricts `pan` so the zoomed viewport never leaves the simulation area.
///
/// Per axis the visible extent is `screen_size / zoom`, so the largest valid
/// pan is `screen_size - visible_extent` (never below zero); the smallest is 0.
fn clampPan(pan: *@Vector(2, f32), zoom: f32) void {
    const extent = [2]f32{ @floatFromInt(SCREEN_WIDTH), @floatFromInt(SCREEN_HEIGHT) };

    inline for (extent, 0..) |size, axis| {
        const visible = size / zoom;
        const limit = @max(0, size - visible);
        pan.*[axis] = std.math.clamp(pan.*[axis], 0, limit);
    }
}
|
||||
|
|
|
|||
|
|
@ -4,10 +4,11 @@
|
|||
layout(location = 0) in vec2 position;
|
||||
layout(location = 1) in vec2 texCoord;
|
||||
|
||||
// entity data from SSBO
|
||||
// entity data from SSBO (16 bytes, matches compute shader layout)
|
||||
struct Entity {
|
||||
float x;
|
||||
float y;
|
||||
int packedVel; // vx high 16 bits, vy low 16 bits (fixed-point 8.8), unused in vertex shader
|
||||
uint color;
|
||||
};
|
||||
|
||||
|
|
@ -17,6 +18,8 @@ layout(std430, binding = 0) readonly buffer EntityData {
|
|||
|
||||
// screen size for NDC conversion
|
||||
uniform vec2 screenSize;
|
||||
uniform float zoom;
|
||||
uniform vec2 pan;
|
||||
|
||||
out vec2 fragTexCoord;
|
||||
out vec3 fragColor;
|
||||
|
|
@ -25,13 +28,13 @@ void main() {
|
|||
// get entity data from SSBO
|
||||
Entity e = entities[gl_InstanceID];
|
||||
|
||||
// convert entity position to NDC
|
||||
// entity coords are in screen pixels, convert to [-1, 1]
|
||||
float ndcX = (e.x / screenSize.x) * 2.0 - 1.0;
|
||||
float ndcY = (e.y / screenSize.y) * 2.0 - 1.0;
|
||||
// apply pan offset and zoom to convert to NDC
|
||||
// pan is in screen pixels, zoom scales the view
|
||||
float ndcX = ((e.x - pan.x) * zoom / screenSize.x) * 2.0 - 1.0;
|
||||
float ndcY = ((e.y - pan.y) * zoom / screenSize.y) * 2.0 - 1.0;
|
||||
|
||||
// quad size in NDC (16 pixels)
|
||||
float quadSizeNdc = 16.0 / screenSize.x;
|
||||
// quad size in NDC (16 pixels, scaled by zoom)
|
||||
float quadSizeNdc = (16.0 * zoom) / screenSize.x;
|
||||
|
||||
// offset by quad corner position
|
||||
gl_Position = vec4(ndcX + position.x * quadSizeNdc,
|
||||
|
|
|
|||
97
src/shaders/entity_update.comp
Normal file
97
src/shaders/entity_update.comp
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
#version 430
|
||||
|
||||
layout(local_size_x = 256) in;
|
||||
|
||||
struct Entity {
|
||||
float x;
|
||||
float y;
|
||||
int packedVel; // vx high 16 bits, vy low 16 bits (fixed-point 8.8)
|
||||
uint color;
|
||||
};
|
||||
|
||||
layout(std430, binding = 0) buffer Entities {
|
||||
Entity entities[];
|
||||
};
|
||||
|
||||
uniform uint entityCount;
|
||||
uniform uint frameNumber;
|
||||
uniform vec2 screenSize;
|
||||
uniform vec2 center;
|
||||
uniform float respawnRadius;
|
||||
uniform float entitySpeed;
|
||||
|
||||
// PCG-style GPU RNG step: advances `state` in place (LCG update) and returns
// a permuted 32-bit word derived from the new state.
uint pcg(inout uint state) {
    state = state * 747796405u + 2891336453u;
    uint shift = (state >> 28u) + 4u;
    uint word = (state ^ (state >> shift)) * 277803737u;
    return word ^ (word >> 22u);
}
|
||||
|
||||
// Draws the next value from pcg() and maps it to a float in [0, 1).
//
// Only the top 24 bits are used: a 32-bit float has a 24-bit significand, so
// every 24-bit integer converts exactly and the largest result is
// (2^24 - 1) / 2^24 < 1. Dividing the full 32-bit word by 2^32 would round
// the largest inputs up to 2^32 and return exactly 1.0.
float randFloat(inout uint state) {
    return float(pcg(state) >> 8u) * (1.0 / 16777216.0);
}
|
||||
|
||||
// Encodes a velocity as two signed 8.8 fixed-point values in one int:
// vx in the high 16 bits, vy in the low 16 bits. Each component is scaled by
// 256 and clamped to the signed 16-bit range before truncation.
int packVelocity(float vx, float vy) {
    ivec2 fixedVel = ivec2(clamp(vec2(vx, vy) * 256.0, -32768.0, 32767.0));
    return (fixedVel.x << 16) | (fixedVel.y & 0xFFFF);
}
|
||||
|
||||
// Per-entity simulation step. Each invocation moves one entity by its stored
// velocity; an entity that ends up within respawnRadius of center is placed
// on a random screen edge with a new velocity toward center and a new color.
void main() {
    uint id = gl_GlobalInvocationID.x;
    // invocations past the live entity count have nothing to do
    if (id >= entityCount) return;

    Entity ent = entities[id];

    // decode velocity (fixed-point 8.8); the shift pair sign-extends the low half
    int vxFixed = ent.packedVel >> 16;
    int vyFixed = (ent.packedVel << 16) >> 16;
    float vx = float(vxFixed) / 256.0;
    float vy = float(vyFixed) / 256.0;

    // integrate position
    ent.x += vx;
    ent.y += vy;

    // still outside the respawn circle: store the moved entity and finish
    float dx = ent.x - center.x;
    float dy = ent.y - center.y;
    bool reachedCenter = dx*dx + dy*dy < respawnRadius * respawnRadius;
    if (!reachedCenter) {
        entities[id] = ent;
        return;
    }

    // seed the RNG from entity id and frame number
    uint rng = id * 1103515245u + frameNumber * 12345u + 1u;

    // random edge (0=top, 1=bottom, 2=left, 3=right) and position along it
    uint edge = pcg(rng) & 3u;
    float t = randFloat(rng);

    switch (edge) {
    case 0u: // top
        ent.x = t * screenSize.x;
        ent.y = 0.0;
        break;
    case 1u: // bottom
        ent.x = t * screenSize.x;
        ent.y = screenSize.y;
        break;
    case 2u: // left
        ent.x = 0.0;
        ent.y = t * screenSize.y;
        break;
    default: // right
        ent.x = screenSize.x;
        ent.y = t * screenSize.y;
        break;
    }

    // aim at the center with magnitude entitySpeed
    dx = center.x - ent.x;
    dy = center.y - ent.y;
    float dist = sqrt(dx*dx + dy*dy);
    vx = (dx / dist) * entitySpeed;
    vy = (dy / dist) * entitySpeed;
    ent.packedVel = packVelocity(vx, vy);

    // fresh random color, one byte per channel
    uint r = pcg(rng) & 0xFFu;
    uint g = pcg(rng) & 0xFFu;
    uint b = pcg(rng) & 0xFFu;
    ent.color = (r << 16u) | (g << 8u) | b;

    entities[id] = ent;
}
|
||||
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
const std = @import("std");
|
||||
const rl = @import("raylib");
|
||||
const ztracy = @import("ztracy");
|
||||
const sandbox = @import("sandbox.zig");
|
||||
|
||||
const SCREEN_WIDTH = sandbox.SCREEN_WIDTH;
|
||||
|
|
@ -19,8 +20,11 @@ pub const SsboRenderer = struct {
|
|||
ssbo_id: u32,
|
||||
screen_size_loc: i32,
|
||||
circle_texture_loc: i32,
|
||||
zoom_loc: i32,
|
||||
pan_loc: i32,
|
||||
circle_texture_id: u32,
|
||||
gpu_buffer: []sandbox.GpuEntity,
|
||||
last_entity_count: usize, // track count to detect when entities are added
|
||||
|
||||
const QUAD_SIZE: f32 = 16.0;
|
||||
|
||||
|
|
@ -53,6 +57,8 @@ pub const SsboRenderer = struct {
|
|||
// get uniform locations
|
||||
const screen_size_loc = rl.gl.rlGetLocationUniform(shader_id, "screenSize");
|
||||
const circle_texture_loc = rl.gl.rlGetLocationUniform(shader_id, "circleTexture");
|
||||
const zoom_loc = rl.gl.rlGetLocationUniform(shader_id, "zoom");
|
||||
const pan_loc = rl.gl.rlGetLocationUniform(shader_id, "pan");
|
||||
|
||||
if (screen_size_loc < 0) {
|
||||
std.debug.print("ssbo: warning - screenSize uniform not found\n", .{});
|
||||
|
|
@ -94,7 +100,7 @@ pub const SsboRenderer = struct {
|
|||
rl.gl.rlSetVertexAttribute(1, 2, rl.gl.rl_float, false, 4 * @sizeOf(f32), 2 * @sizeOf(f32));
|
||||
rl.gl.rlEnableVertexAttribute(1);
|
||||
|
||||
// create SSBO for entity data (12 bytes per entity, 1M entities = 12MB)
|
||||
// create SSBO for entity data (16 bytes per entity, 1M entities = 16MB)
|
||||
const ssbo_size: u32 = @intCast(sandbox.MAX_ENTITIES * @sizeOf(sandbox.GpuEntity));
|
||||
const ssbo_id = rl.gl.rlLoadShaderBuffer(ssbo_size, null, rl.gl.rl_dynamic_draw);
|
||||
if (ssbo_id == 0) {
|
||||
|
|
@ -116,8 +122,11 @@ pub const SsboRenderer = struct {
|
|||
.ssbo_id = ssbo_id,
|
||||
.screen_size_loc = screen_size_loc,
|
||||
.circle_texture_loc = circle_texture_loc,
|
||||
.zoom_loc = zoom_loc,
|
||||
.pan_loc = pan_loc,
|
||||
.circle_texture_id = circle_texture.id,
|
||||
.gpu_buffer = gpu_buffer,
|
||||
.last_entity_count = 0,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -129,25 +138,80 @@ pub const SsboRenderer = struct {
|
|||
std.heap.page_allocator.free(self.gpu_buffer);
|
||||
}
|
||||
|
||||
pub fn render(self: *SsboRenderer, entities: *const sandbox.Entities) void {
|
||||
pub fn render(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32)) void {
|
||||
self.renderInternal(entities, zoom, pan, false);
|
||||
}
|
||||
|
||||
/// Draws all entities in GPU-compute mode.
///
/// Only entities appended since the previous call (indices
/// `last_entity_count..entities.count`) are converted and written to the SSBO;
/// existing SSBO contents are left untouched. `last_entity_count` is the
/// watermark of how many entities have been uploaded so far.
///
/// `zoom` and `pan` are forwarded to `drawInstanced`.
pub fn renderComputeMode(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32)) void {
    const count = entities.count;

    if (count == 0) {
        // keep the watermark in sync even when nothing is drawn; otherwise
        // entities added after the count dropped to zero would look
        // already-uploaded and stale SSBO contents would be drawn instead
        self.last_entity_count = 0;
        return;
    }

    // flush raylib's internal render batch before our custom GL calls
    rl.gl.rlDrawRenderBatchActive();

    // upload NEW entities when count increases (entities added on CPU)
    if (count > self.last_entity_count) {
        const zone = ztracy.ZoneN(@src(), "ssbo_upload_new");
        defer zone.End();

        const first = self.last_entity_count;

        // copy new entities to GPU buffer
        for (entities.items[first..count], first..) |entity, i| {
            self.gpu_buffer[i] = .{
                .x = entity.x,
                .y = entity.y,
                .packed_vel = sandbox.packVelocity(entity.vx, entity.vy),
                .color = entity.color,
            };
        }

        // upload only the new portion to SSBO
        const offset: u32 = @intCast(first * @sizeOf(sandbox.GpuEntity));
        const data_size: u32 = @intCast((count - first) * @sizeOf(sandbox.GpuEntity));
        rl.gl.rlUpdateShaderBuffer(self.ssbo_id, &self.gpu_buffer[first], data_size, offset);
    }

    // covers growth (just uploaded) and shrinkage (entities were removed)
    self.last_entity_count = count;

    self.drawInstanced(count, zoom, pan);
}
|
||||
|
||||
/// Shared draw path: unless `skip_upload` is set, converts every live entity
/// into the staging buffer and uploads the whole range to the SSBO, then
/// issues the instanced draw with the given `zoom` and `pan`.
/// Does nothing when there are no entities.
fn renderInternal(self: *SsboRenderer, entities: *const sandbox.Entities, zoom: f32, pan: @Vector(2, f32), skip_upload: bool) void {
    const count = entities.count;
    if (count == 0) return;

    // raylib's pending batch must be flushed before issuing our own GL calls
    rl.gl.rlDrawRenderBatchActive();

    if (!skip_upload) {
        // stage: position + packed velocity + color for each entity
        {
            const copy_zone = ztracy.ZoneN(@src(), "ssbo_copy");
            defer copy_zone.End();

            const live = entities.items[0..count];
            const staging = self.gpu_buffer[0..count];
            for (live, staging) |entity, *slot| {
                slot.* = .{
                    .x = entity.x,
                    .y = entity.y,
                    .packed_vel = sandbox.packVelocity(entity.vx, entity.vy),
                    .color = entity.color,
                };
            }
        }

        // push the staged range to the start of the SSBO
        {
            const upload_zone = ztracy.ZoneN(@src(), "ssbo_upload");
            defer upload_zone.End();

            const byte_len: u32 = @intCast(count * @sizeOf(sandbox.GpuEntity));
            rl.gl.rlUpdateShaderBuffer(self.ssbo_id, self.gpu_buffer.ptr, byte_len, 0);
        }
    }

    self.drawInstanced(count, zoom, pan);
}
|
||||
|
||||
fn drawInstanced(self: *SsboRenderer, entity_count: usize, zoom: f32, pan: @Vector(2, f32)) void {
|
||||
// bind shader
|
||||
rl.gl.rlEnableShader(self.shader_id);
|
||||
|
||||
|
|
@ -155,6 +219,13 @@ pub const SsboRenderer = struct {
|
|||
const screen_size = [2]f32{ @floatFromInt(SCREEN_WIDTH), @floatFromInt(SCREEN_HEIGHT) };
|
||||
rl.gl.rlSetUniform(self.screen_size_loc, &screen_size, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
||||
|
||||
// set zoom uniform
|
||||
rl.gl.rlSetUniform(self.zoom_loc, &zoom, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_float), 1);
|
||||
|
||||
// set pan uniform
|
||||
const pan_arr = [2]f32{ pan[0], pan[1] };
|
||||
rl.gl.rlSetUniform(self.pan_loc, &pan_arr, @intFromEnum(rl.gl.rlShaderUniformDataType.rl_shader_uniform_vec2), 1);
|
||||
|
||||
// bind texture
|
||||
rl.gl.rlActiveTextureSlot(0);
|
||||
rl.gl.rlEnableTexture(self.circle_texture_id);
|
||||
|
|
@ -170,9 +241,13 @@ pub const SsboRenderer = struct {
|
|||
rl.gl.rlSetBlendMode(@intFromEnum(rl.gl.rlBlendMode.rl_blend_alpha));
|
||||
|
||||
// bind VAO and draw
|
||||
{
|
||||
const zone = ztracy.ZoneN(@src(), "ssbo_draw");
|
||||
defer zone.End();
|
||||
_ = rl.gl.rlEnableVertexArray(self.vao_id);
|
||||
rl.gl.rlEnableVertexBuffer(self.vbo_id);
|
||||
rl.gl.rlDrawVertexArrayInstanced(0, 6, @intCast(entities.count));
|
||||
rl.gl.rlDrawVertexArrayInstanced(0, 6, @intCast(entity_count));
|
||||
}
|
||||
|
||||
// cleanup - restore raylib's expected state
|
||||
rl.gl.rlDisableVertexArray();
|
||||
|
|
|
|||
74
src/ui.zig
74
src/ui.zig
|
|
@ -19,13 +19,23 @@ pub const box_padding: f32 = 8;
|
|||
pub const text_color = rl.Color.white;
|
||||
pub const dim_text_color = rl.Color.gray;
|
||||
pub const highlight_color = rl.Color.yellow;
|
||||
pub const fps_good_color = rl.Color.green;
|
||||
pub const fps_bad_color = rl.Color.red;
|
||||
pub const box_bg = rl.Color{ .r = 0, .g = 0, .b = 0, .a = 200 };
|
||||
|
||||
// =============================================================================
|
||||
// state
|
||||
// =============================================================================
|
||||
|
||||
pub var show_ui: bool = true;
|
||||
|
||||
// =============================================================================
|
||||
// drawing functions
|
||||
// =============================================================================
|
||||
|
||||
pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, font: rl.Font) void {
|
||||
pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us: i64, paused: bool, zoom: f32, font: rl.Font) void {
|
||||
if (!show_ui) return;
|
||||
|
||||
var buf: [256]u8 = undefined;
|
||||
|
||||
// fps box (above metrics)
|
||||
|
|
@ -33,13 +43,16 @@ pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us:
|
|||
rl.drawRectangle(5, 5, 180, fps_box_height, box_bg);
|
||||
const frame_ms = rl.getFrameTime() * 1000.0;
|
||||
const fps = if (frame_ms > 0) 1000.0 / frame_ms else 0;
|
||||
const fps_text = std.fmt.bufPrintZ(&buf, "FPS: {d:.0}", .{fps}) catch "?";
|
||||
rl.drawTextEx(font, fps_text, .{ .x = padding, .y = padding }, font_size, 0, text_color);
|
||||
rl.drawTextEx(font, "FPS: ", .{ .x = padding, .y = padding }, font_size, 0, text_color);
|
||||
const fps_text = std.fmt.bufPrintZ(&buf, "{d:.0}", .{fps}) catch "?";
|
||||
const fps_color = if (fps >= 60.0) fps_good_color else fps_bad_color;
|
||||
const label_width = rl.measureTextEx(font, "FPS: ", font_size, 0).x;
|
||||
rl.drawTextEx(font, fps_text, .{ .x = padding + label_width, .y = padding }, font_size, 0, fps_color);
|
||||
|
||||
// metrics box (below fps)
|
||||
const metrics_y: i32 = 5 + fps_box_height + 5;
|
||||
var y: f32 = @as(f32, @floatFromInt(metrics_y)) + box_padding;
|
||||
const bg_height: i32 = if (paused) 130 else 100;
|
||||
const bg_height: i32 = if (paused) 150 else 120;
|
||||
rl.drawRectangle(5, metrics_y, 180, bg_height, box_bg);
|
||||
|
||||
// entity count
|
||||
|
|
@ -64,6 +77,11 @@ pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us:
|
|||
rl.drawTextEx(font, render_text, .{ .x = padding, .y = y }, font_size, 0, text_color);
|
||||
y += line_height;
|
||||
|
||||
// zoom level
|
||||
const zoom_text = std.fmt.bufPrintZ(&buf, "zoom: {d:.1}x", .{zoom}) catch "?";
|
||||
rl.drawTextEx(font, zoom_text, .{ .x = padding, .y = y }, font_size, 0, if (zoom > 1.0) highlight_color else text_color);
|
||||
y += line_height;
|
||||
|
||||
// paused indicator
|
||||
if (paused) {
|
||||
y += line_height;
|
||||
|
|
@ -74,18 +92,56 @@ pub fn drawMetrics(entities: *const sandbox.Entities, update_us: i64, render_us:
|
|||
drawControls(font, metrics_y + bg_height);
|
||||
}
|
||||
|
||||
/// Draws the memory box in the top-right corner: CPU-side entity storage,
/// GPU buffer storage, and their sum, each in MB for `entity_count` entities.
/// Draws nothing while the UI is hidden.
pub fn drawMemory(entity_count: usize, font: rl.Font) void {
    if (!show_ui) return;

    const toMb = struct {
        fn convert(bytes: usize) f32 {
            return @as(f32, @floatFromInt(bytes)) / (1024.0 * 1024.0);
        }
    }.convert;

    var text_buf: [256]u8 = undefined;

    // box geometry: three text rows, anchored 5px from the right and top edges
    const width: i32 = 160;
    const height: i32 = @intFromFloat(line_height * 3 + box_padding * 2);
    const left: i32 = @as(i32, @intCast(sandbox.SCREEN_WIDTH)) - width - 5;
    const top: i32 = 5;
    rl.drawRectangle(left, top, width, height, box_bg);

    const text_x: f32 = @floatFromInt(left + @as(i32, @intFromFloat(box_padding)));
    var text_y: f32 = @as(f32, @floatFromInt(top)) + box_padding;

    // row 1: entity memory (CPU side)
    const cpu_mb = toMb(entity_count * @sizeOf(sandbox.Entity));
    const cpu_line = std.fmt.bufPrintZ(&text_buf, "cpu: {d:.1} MB", .{cpu_mb}) catch "?";
    rl.drawTextEx(font, cpu_line, .{ .x = text_x, .y = text_y }, font_size, 0, text_color);
    text_y += line_height;

    // row 2: GPU buffer memory (SSBO)
    const gpu_mb = toMb(entity_count * @sizeOf(sandbox.GpuEntity));
    const gpu_line = std.fmt.bufPrintZ(&text_buf, "gpu: {d:.1} MB", .{gpu_mb}) catch "?";
    rl.drawTextEx(font, gpu_line, .{ .x = text_x, .y = text_y }, font_size, 0, text_color);
    text_y += line_height;

    // row 3: combined total, dimmed
    const sum_line = std.fmt.bufPrintZ(&text_buf, "total: {d:.1} MB", .{cpu_mb + gpu_mb}) catch "?";
    rl.drawTextEx(font, sum_line, .{ .x = text_x, .y = text_y }, font_size, 0, dim_text_color);
}
|
||||
|
||||
fn drawControls(font: rl.Font, metrics_bottom: i32) void {
|
||||
const ctrl_box_height: i32 = @intFromFloat(small_line_height * 4 + box_padding * 2);
|
||||
const ctrl_box_height: i32 = @intFromFloat(small_line_height * 7 + box_padding * 2);
|
||||
const ctrl_box_y: i32 = metrics_bottom + 5;
|
||||
rl.drawRectangle(5, ctrl_box_y, 175, ctrl_box_height, box_bg);
|
||||
|
||||
var y: f32 = @as(f32, @floatFromInt(ctrl_box_y)) + box_padding;
|
||||
|
||||
const controls = [_][]const u8{
|
||||
"+/-: 1000 entities",
|
||||
"shift +/-: 10000",
|
||||
"space: pause",
|
||||
"r: reset",
|
||||
"+/-: 10k entities",
|
||||
"shift +/-: 50k",
|
||||
"scroll: zoom",
|
||||
"drag: pan (zoomed)",
|
||||
"space: pause, r: reset",
|
||||
"q: zoom out / quit",
|
||||
"tab: toggle ui",
|
||||
};
|
||||
|
||||
for (controls) |text| {
|
||||
|
|
|
|||
Loading…
Reference in a new issue