v0.4.10: preallocate decoder direct buffer, fix 5fps video

0.4.9 allocated a fresh w*h*4 direct ByteBuffer on every grab() — at 1080p × 24fps that's ~192 MB/s of direct-memory churn (page zero-fill + Cleaner enqueue). The decoder thread spent most of its frame budget on memory bookkeeping instead of decoding and fell behind real time; the single-slot AtomicReference was then refilled in bursts, so the render thread could only sample new frames at ~5fps. The game thread was fine; only the video looked like 5fps.

Replace the per-frame allocation with one preallocated direct buffer per backend instance, filled under a short-held lock on the decoder side. Swap the ByteBuffer-returning pollFrame() API for consumeFrame(dstAddr, maxBytes) so the render thread memcpys straight from the staging buffer to the GPU texture's pixel pointer under the same lock — no allocation, no race window between "got buffer" and "decoder overwrote it".

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-15 22:55:04 +09:00
parent 3d4843dd0d
commit cee01bd448
6 changed files with 96 additions and 49 deletions
--- a/src/main/java/com/ejclaw/videoplayer/client/playback/JavaCvBackend.java
+++ b/src/main/java/com/ejclaw/videoplayer/client/playback/JavaCvBackend.java
@@ -4,6 +4,8 @@ import com.ejclaw.videoplayer.VideoPlayerMod;
 import net.fabricmc.api.EnvType;
 import net.fabricmc.api.Environment;

+import org.lwjgl.system.MemoryUtil;
+
 import javax.sound.sampled.AudioFormat;
 import javax.sound.sampled.AudioSystem;
 import javax.sound.sampled.SourceDataLine;
@@ -13,7 +15,6 @@ import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.ShortBuffer;
 import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.concurrent.atomic.AtomicReference;

 /**
 * SPEC §5.3 — fallback mp4/http(s) backend driven by JavaCV's FFmpegFrameGrabber.
@@ -38,7 +39,17 @@ public class JavaCvBackend implements VideoBackend {
    private Thread worker;
    private final AtomicBoolean running = new AtomicBoolean(false);
    private final AtomicBoolean paused  = new AtomicBoolean(false);
-    private final AtomicReference<ByteBuffer> latest = new AtomicReference<>();
+    /**
+     * Single preallocated RGBA staging buffer. Decoder thread writes into it under
+     * {@link #frameLock}; render thread reads via {@link #consumeFrame(long, long)} under the
+     * same lock. One allocation for the lifetime of the backend instead of one per frame —
+     * see 0.4.10 changelog for the regression that motivated this. The lock is short-held
+     * (one 8MB memcpy ≈ 1ms at 1080p) so contention is negligible.
+     */
+    private final Object frameLock = new Object();
+    private ByteBuffer frameBuf;
+    private int frameBufBytes = 0;
+    private boolean frameDirty = false;
    private volatile int width = 0;
    private volatile int height = 0;
    private volatile float gain = 1.0F;
@@ -88,14 +99,32 @@ public class JavaCvBackend implements VideoBackend {
    public int videoHeight() { return height; }

    @Override
-    public ByteBuffer pollFrame() {
-        return latest.getAndSet(null);
+    public boolean consumeFrame(long dstAddr, long maxBytes) {
+        synchronized (frameLock) {
+            if (!frameDirty || frameBuf == null || frameBufBytes <= 0) return false;
+            if (frameBufBytes > maxBytes) {
+                // Texture not yet resized for this frame's dimensions — drop and wait for the
+                // caller to ensure capacity next tick. ensureTexture() runs in Entry.tryUpload
+                // before consumeFrame, so this is only hit on the exact tick of a resolution
+                // change.
+                frameDirty = false;
+                return false;
+            }
+            MemoryUtil.memCopy(MemoryUtil.memAddress(frameBuf), dstAddr, frameBufBytes);
+            frameDirty = false;
+            return true;
+        }
    }

    @Override
    public void close() {
        closed = true;
        stopWorker();
+        synchronized (frameLock) {
+            frameBuf = null;
+            frameBufBytes = 0;
+            frameDirty = false;
+        }
    }

    private void stopWorker() {
@@ -214,17 +243,34 @@ public class JavaCvBackend implements VideoBackend {
                Object[] images = (Object[]) imageField.get(frame);
                if (images != null && images.length > 0 && images[0] instanceof ByteBuffer src) {
                    // frame.image[0] is the swscale-converted RGBA plane, reused by the grabber
-                    // across grab() calls. Copy into a fresh direct buffer because the render
-                    // thread reads `latest` asynchronously and would otherwise see a buffer
-                    // already being overwritten by the next grab().
+                    // across grab() calls. Copy into our preallocated staging buffer under
+                    // frameLock so the render thread's consumeFrame() sees a coherent image.
+                    //
+                    // 0.4.9 used `ByteBuffer.allocateDirect(w*h*4)` on every grab — at 1080p ×
+                    // 24fps that's ~192 MB/s of direct memory churn (each allocation zero-fills
+                    // the page, plus the Cleaner enqueues the old buffer for finalization).
+                    // The decoder thread spent so much time on memory bookkeeping that grab()
+                    // fell behind real time, the single-slot `latest` AtomicReference was
+                    // refilled in bursts, and the user saw ~5fps playback even though the
+                    // game/render thread was fine.
+                    //
+                    // Preallocating once eliminates both the zero-fill cost and the Cleaner
+                    // pressure. The decoder thread now spends its budget on the actual decode +
+                    // swscale + a single 8MB memcpy — well within 42ms at 1080p × 24fps.
                    int need = src.remaining();
                    if (need > 0) {
-                        ByteBuffer copy = ByteBuffer.allocateDirect(need).order(ByteOrder.nativeOrder());
-                        int srcPos = src.position();
-                        copy.put(src);
-                        src.position(srcPos); // restore so JavaCV's own bookkeeping isn't disturbed
-                        copy.flip();
-                        latest.set(copy);
+                        synchronized (frameLock) {
+                            if (frameBuf == null || frameBuf.capacity() < need) {
+                                frameBuf = ByteBuffer.allocateDirect(need).order(ByteOrder.nativeOrder());
+                            }
+                            // memAddress(buffer) already resolves to the buffer's current position,
+                            // so src.position() must not be added again (that would over-read src).
+                            // memCopy works on raw addresses and leaves src's position untouched.
+                            long srcAddr = MemoryUtil.memAddress(src);
+                            MemoryUtil.memCopy(srcAddr, MemoryUtil.memAddress(frameBuf), need);
+                            frameBufBytes = need;
+                            frameDirty = true;
+                        }
                    }
                }

--- a/src/main/java/com/ejclaw/videoplayer/client/playback/VideoBackend.java
+++ b/src/main/java/com/ejclaw/videoplayer/client/playback/VideoBackend.java
@@ -3,8 +3,6 @@ package com.ejclaw.videoplayer.client.playback;
 import net.fabricmc.api.EnvType;
 import net.fabricmc.api.Environment;

-import java.nio.ByteBuffer;
-
 /**
 * SPEC §5.3 — minimal playback backend abstraction. Implementations: WatermediaBackend (preferred,
 * when v2 supports the target MC version) and JavaCvBackend (fallback).
@@ -21,10 +19,19 @@ public interface VideoBackend {
    int videoHeight();

    /**
-     * Poll a new decoded RGBA frame if one is ready.
-     * @return the frame buffer (capacity = w*h*4) or {@code null} if no new frame is ready.
+     * If a new RGBA frame is ready and fits in {@code maxBytes}, memcpy it into the native
+     * pixel buffer at {@code dstAddr} (the caller sizes it to {@code w*h*4} bytes) and clear
+     * the dirty flag. Returns {@code true} only when a frame was actually written.
+     *
+     * <p>Replaces the prior {@code pollFrame()} which returned a {@link java.nio.ByteBuffer}.
+     * The old contract forced the decoder to either allocate a fresh direct buffer per frame
+     * (huge memory churn at 1080p — see 0.4.10 changelog) or expose a reused buffer whose
+     * memory the decoder could clobber while the renderer was still reading. Pushing the copy
+     * inside the backend lets the decoder hold a single preallocated buffer under its own
+     * lock and copy out to the GPU pointer in one synchronized block — zero allocation, no
+     * race window.
     */
-    ByteBuffer pollFrame();
+    boolean consumeFrame(long dstAddr, long maxBytes);

    void close();
 }
--- a/src/main/java/com/ejclaw/videoplayer/client/playback/VideoPlayback.java
+++ b/src/main/java/com/ejclaw/videoplayer/client/playback/VideoPlayback.java
@@ -9,9 +9,7 @@ import net.minecraft.client.Minecraft;
 import net.minecraft.client.renderer.texture.DynamicTexture;
 import net.minecraft.core.BlockPos;
 import net.minecraft.resources.Identifier;
-import org.lwjgl.system.MemoryUtil;

-import java.nio.ByteBuffer;
 import java.nio.file.Path;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -113,10 +111,8 @@ public final class VideoPlayback {
                continue;
            }
            if (!e.backend.isReady()) continue;
-            ByteBuffer buf = e.backend.pollFrame();
-            if (buf == null) continue;
            try {
-                e.upload(buf);
+                e.tryUpload();
            } catch (Throwable t) {
                VideoPlayerMod.LOG.warn("[{}] texture upload failed: {}", VideoPlayerMod.MOD_ID, t.toString());
                e.close();
@@ -188,23 +184,23 @@ public final class VideoPlayback {
            }
        }

-        /** Copy an incoming RGBA byte buffer into the texture, resizing if dimensions changed. */
-        void upload(ByteBuffer rgba) {
+        /**
+         * If the backend has a new RGBA frame, copy it straight into the texture's native
+         * pixel buffer and re-upload to GPU. The backend does the memcpy under its own lock
+         * so we never read a half-written frame. RGBA bytes already match NativeImage's
+         * ABGR-int layout in little-endian byte order (byte 0 = R = low byte of the int).
+         */
+        void tryUpload() {
            int w = backend.videoWidth();
            int h = backend.videoHeight();
            if (w <= 0 || h <= 0) return;
            ensureTexture(w, h, false);
            NativeImage img = texture.getPixels();
            if (img == null) return;
-
-            // RGBA bytes from the backend already match NativeImage's ABGR-int layout when
-            // viewed as little-endian bytes: byte 0 = R (low byte of ABGR int), byte 1 = G,
-            // byte 2 = B, byte 3 = A. So a flat memcpy works — no per-pixel swap needed.
-            // This replaces a 2M-iteration Java loop with one native memcpy for 1080p frames,
-            // cutting upload time from ~20ms to <1ms and removing the main stutter source.
-            long bytes = (long) w * h * 4L;
-            MemoryUtil.memCopy(MemoryUtil.memAddress(rgba), img.getPointer(), bytes);
-            texture.upload();
+            long maxBytes = (long) w * h * 4L;
+            if (backend.consumeFrame(img.getPointer(), maxBytes)) {
+                texture.upload();
+            }
        }

        void close() {
--- a/src/main/java/com/ejclaw/videoplayer/client/playback/WatermediaBackend.java
+++ b/src/main/java/com/ejclaw/videoplayer/client/playback/WatermediaBackend.java
@@ -4,8 +4,6 @@ import com.ejclaw.videoplayer.VideoPlayerMod;
 import net.fabricmc.api.EnvType;
 import net.fabricmc.api.Environment;

-import java.nio.ByteBuffer;
-
 /**
 * SPEC §5.3 / §5.4 — WaterMedia v2 backend. Reflection-only so the mod jar stays clean of
 * compile-time WaterMedia dependencies. Until a v2 build supports 1.21.6+ this returns
@@ -38,8 +36,8 @@ public class WatermediaBackend implements VideoBackend {
    @Override public int videoHeight() { return height; }

    @Override
-    public ByteBuffer pollFrame() {
-        return null; // no frames until v2 is wired up
+    public boolean consumeFrame(long dstAddr, long maxBytes) {
+        return false; // no frames until v2 is wired up
    }

    @Override