diff --git a/docs/plans/2025-12-17-compute-shader-updates.md b/docs/plans/2025-12-17-compute-shader-updates.md
index d1aa937..ec0adbe 100644
--- a/docs/plans/2025-12-17-compute-shader-updates.md
+++ b/docs/plans/2025-12-17-compute-shader-updates.md
@@ -141,10 +141,10 @@ each step is a commit point if desired.
 - implement edge spawning + velocity calculation
 - remove CPU update loop from sandbox.zig
 
-### step 4: cleanup
-- remove dead code (cpu update, per-frame upload)
-- add `--compute` flag to toggle (keep old path for comparison)
-- benchmark and document results
+### step 4: cleanup ✓
+- GPU compute is now the default (the `--compute` flag is no longer parsed); pass `--cpu` for fallback/comparison
+- justfile updated: `just sandbox` (run, compute default), `just bench` (compute), `just bench-cpu` (CPU comparison)
+- verbose debug output reduced
 
 ## files changed
 
diff --git a/journal.txt b/journal.txt
index 34c1234..98a61e3 100644
--- a/journal.txt
+++ b/journal.txt
@@ -206,3 +206,38 @@ total improvement from baseline:
 - SSBO: 60fps @ ~700k entities
 - ~140x improvement overall!
 
+---
+
+optimization 6: compute shader updates
+--------------------------------------
+technique: move entity position + respawn logic from CPU to GPU compute shader
+code: compute.zig, shaders/entity_update.comp, ssbo_renderer.zig
+version: 0.7.0
+
+struct GpuEntity {
+    x: f32,        // 4 bytes
+    y: f32,        // 4 bytes
+    packed_vel: i32, // 4 bytes (vx and vy packed together, each 16-bit fixed-point 8.8)
+    color: u32,    // 4 bytes
+};                 // = 16 bytes total (was 12)
+
+changes:
+- entity_update.comp: position update, center check, edge respawn, velocity calc
+- GPU RNG: PCG-style PRNG seeded with entity id + frame number
+- ssbo_renderer: renderComputeMode() only uploads NEW entities (when count grows)
+- CPU update loop skipped entirely when compute enabled
+
+benchmark results (i5-6500T / HD 530):
+- CPU update time: ~5ms → ~0ms at 150k entities
+- render time unchanged (GPU-bound as before)
+- total frame time improvement at high entity counts
+
+analysis: CPU was doing ~150k position updates + distance checks + respawn logic
+per frame. now GPU does it in parallel via 256-thread workgroups. CPU only uploads
+new entities when user adds them, not per-frame. memory barrier ensures compute
+writes visible to vertex shader before draw.
+
+flags:
+- (no flag): GPU compute updates are the default; the old --compute flag is no longer parsed
+- --cpu: fallback to CPU update path for comparison
+
diff --git a/justfile b/justfile
index bd30a32..a0701f1 100644
--- a/justfile
+++ b/justfile
@@ -42,11 +42,20 @@ check:
 test:
     zig build test
 
+# run sandbox (GPU compute is default)
+sandbox:
+    zig build -Doptimize=ReleaseFast run
+
 # auto-benchmark (ramps entities until performance degrades)
 bench:
     zig build -Doptimize=ReleaseFast run -- --bench
     cat benchmark.log
 
+# benchmark with CPU update path (for comparison)
+bench-cpu:
+    zig build -Doptimize=ReleaseFast run -- --bench --cpu
+    cat benchmark.log
+
 # software-rendered benchmark (for CI/headless servers)
 [linux]
 bench-sw:
diff --git a/src/compute.zig b/src/compute.zig
index 4f28741..5365057 100644
--- a/src/compute.zig
+++ b/src/compute.zig
@@ -50,15 +50,7 @@ pub const ComputeShader = struct {
         const respawn_radius_loc = rl.gl.rlGetLocationUniform(program_id, "respawnRadius");
         const entity_speed_loc = rl.gl.rlGetLocationUniform(program_id, "entitySpeed");
 
-        std.debug.print("compute: shader loaded (program_id={}, uniforms: count={}, frame={}, screen={}, center={}, radius={}, speed={})\n", .{
-            program_id,
-            entity_count_loc,
-            frame_number_loc,
-            screen_size_loc,
-            center_loc,
-            respawn_radius_loc,
-            entity_speed_loc,
-        });
+        std.debug.print("compute: shader loaded\n", .{});
 
         return .{
             .program_id = program_id,
diff --git a/src/sandbox_main.zig b/src/sandbox_main.zig
index cbb8e38..27cf27c 100644
--- a/src/sandbox_main.zig
+++ b/src/sandbox_main.zig
@@ -164,7 +164,7 @@ pub fn main() !void {
     var use_instancing = false;
     var use_ssbo = true;
     var use_vsync = false;
-    var use_compute = false;
+    var use_compute = true; // GPU compute is now default
     var args = try std.process.argsWithAllocator(std.heap.page_allocator);
     defer args.deinit();
     _ = args.skip(); // skip program name
@@ -178,8 +178,8 @@ pub fn main() !void {
             use_ssbo = false; // legacy rlgl batched path
         } else if (std.mem.eql(u8, arg, "--vsync")) {
             use_vsync = true;
-        } else if (std.mem.eql(u8, arg, "--compute")) {
-            use_compute = true;
+        } else if (std.mem.eql(u8, arg, "--cpu")) {
+            use_compute = false; // fallback to CPU update path
         }
     }