v0.4.10: preallocate decoder direct buffer, fix 5fps video
Some checks failed
build / build (push) Has been cancelled
Some checks failed
build / build (push) Has been cancelled
0.4.9 allocated a fresh w*h*4 direct ByteBuffer on every grab() — at 1080p × 24fps that's ~192 MB/s of direct memory churn (page zero-fill + Cleaner enqueue). The decoder thread spent most of its frame budget on memory bookkeeping instead of decoding, fell behind real time, and the single-slot AtomicReference saw bursty refills that the render thread could only sample at ~5fps. The game thread was fine; only the video looked like 5fps.

Replace the per-frame allocation with one preallocated direct buffer per backend instance, filled under a short-held lock on the decoder side. Swap the ByteBuffer-returning pollFrame() API for consumeFrame(dstAddr, maxBytes) so the render thread memcpys straight from staging buffer → GPU texture pointer under the same lock — no allocation, and no race window between "got buffer" and "decoder overwrote it".

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -4,6 +4,8 @@ import com.ejclaw.videoplayer.VideoPlayerMod;
|
||||
import net.fabricmc.api.EnvType;
|
||||
import net.fabricmc.api.Environment;
|
||||
|
||||
import org.lwjgl.system.MemoryUtil;
|
||||
|
||||
import javax.sound.sampled.AudioFormat;
|
||||
import javax.sound.sampled.AudioSystem;
|
||||
import javax.sound.sampled.SourceDataLine;
|
||||
@@ -13,7 +15,6 @@ import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.ShortBuffer;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
/**
|
||||
* SPEC §5.3 — fallback mp4/http(s) backend driven by JavaCV's FFmpegFrameGrabber.
|
||||
@@ -38,7 +39,17 @@ public class JavaCvBackend implements VideoBackend {
|
||||
private Thread worker;
|
||||
private final AtomicBoolean running = new AtomicBoolean(false);
|
||||
private final AtomicBoolean paused = new AtomicBoolean(false);
|
||||
private final AtomicReference<ByteBuffer> latest = new AtomicReference<>();
|
||||
/**
|
||||
* Single preallocated RGBA staging buffer. Decoder thread writes into it under
|
||||
* {@link #frameLock}; render thread reads via {@link #consumeFrame(long, long)} under the
|
||||
* same lock. One allocation for the lifetime of the backend instead of one per frame —
|
||||
* see 0.4.10 changelog for the regression that motivated this. The lock is short-held
|
||||
* (one 8MB memcpy ≈ 1ms at 1080p) so contention is negligible.
|
||||
*/
|
||||
private final Object frameLock = new Object();
|
||||
private ByteBuffer frameBuf;
|
||||
private int frameBufBytes = 0;
|
||||
private boolean frameDirty = false;
|
||||
private volatile int width = 0;
|
||||
private volatile int height = 0;
|
||||
private volatile float gain = 1.0F;
|
||||
@@ -88,14 +99,32 @@ public class JavaCvBackend implements VideoBackend {
|
||||
public int videoHeight() { return height; }
|
||||
|
||||
@Override
|
||||
public ByteBuffer pollFrame() {
|
||||
return latest.getAndSet(null);
|
||||
public boolean consumeFrame(long dstAddr, long maxBytes) {
|
||||
synchronized (frameLock) {
|
||||
if (!frameDirty || frameBuf == null || frameBufBytes <= 0) return false;
|
||||
if (frameBufBytes > maxBytes) {
|
||||
// Texture not yet resized for this frame's dimensions — drop and wait for the
|
||||
// caller to ensure capacity next tick. ensureTexture() runs in Entry.tryUpload
|
||||
// before consumeFrame, so this is only hit on the exact tick of a resolution
|
||||
// change.
|
||||
frameDirty = false;
|
||||
return false;
|
||||
}
|
||||
MemoryUtil.memCopy(MemoryUtil.memAddress(frameBuf), dstAddr, frameBufBytes);
|
||||
frameDirty = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
closed = true;
|
||||
stopWorker();
|
||||
synchronized (frameLock) {
|
||||
frameBuf = null;
|
||||
frameBufBytes = 0;
|
||||
frameDirty = false;
|
||||
}
|
||||
}
|
||||
|
||||
private void stopWorker() {
|
||||
@@ -214,17 +243,34 @@ public class JavaCvBackend implements VideoBackend {
|
||||
Object[] images = (Object[]) imageField.get(frame);
|
||||
if (images != null && images.length > 0 && images[0] instanceof ByteBuffer src) {
|
||||
// frame.image[0] is the swscale-converted RGBA plane, reused by the grabber
|
||||
// across grab() calls. Copy into a fresh direct buffer because the render
|
||||
// thread reads `latest` asynchronously and would otherwise see a buffer
|
||||
// already being overwritten by the next grab().
|
||||
// across grab() calls. Copy into our preallocated staging buffer under
|
||||
// frameLock so the render thread's consumeFrame() sees a coherent image.
|
||||
//
|
||||
// 0.4.9 used `ByteBuffer.allocateDirect(w*h*4)` on every grab — at 1080p ×
|
||||
// 24fps that's ~192 MB/s of direct memory churn (each allocation zero-fills
|
||||
// the page, plus the Cleaner enqueues the old buffer for finalization).
|
||||
// The decoder thread spent so much time on memory bookkeeping that grab()
|
||||
// fell behind real time, the single-slot `latest` AtomicReference was
|
||||
// refilled in bursts, and the user saw ~5fps playback even though the
|
||||
// game/render thread was fine.
|
||||
//
|
||||
// Preallocating once eliminates both the zero-fill cost and the Cleaner
|
||||
// pressure. The decoder thread now spends its budget on the actual decode +
|
||||
// swscale + a single 8MB memcpy — well within 42ms at 1080p × 24fps.
|
||||
int need = src.remaining();
|
||||
if (need > 0) {
|
||||
ByteBuffer copy = ByteBuffer.allocateDirect(need).order(ByteOrder.nativeOrder());
|
||||
int srcPos = src.position();
|
||||
copy.put(src);
|
||||
src.position(srcPos); // restore so JavaCV's own bookkeeping isn't disturbed
|
||||
copy.flip();
|
||||
latest.set(copy);
|
||||
synchronized (frameLock) {
|
||||
if (frameBuf == null || frameBuf.capacity() < need) {
|
||||
frameBuf = ByteBuffer.allocateDirect(need).order(ByteOrder.nativeOrder());
|
||||
}
|
||||
int srcPos = src.position();
|
||||
long dstAddr = MemoryUtil.memAddress(frameBuf);
|
||||
long srcAddr = MemoryUtil.memAddress(src) + srcPos;
|
||||
MemoryUtil.memCopy(srcAddr, dstAddr, need);
|
||||
src.position(srcPos); // unchanged, but explicit — JavaCV reads it too
|
||||
frameBufBytes = need;
|
||||
frameDirty = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,8 +3,6 @@ package com.ejclaw.videoplayer.client.playback;
|
||||
import net.fabricmc.api.EnvType;
|
||||
import net.fabricmc.api.Environment;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
/**
|
||||
* SPEC §5.3 — minimal playback backend abstraction. Implementations: WatermediaBackend (preferred,
|
||||
* when v2 supports the target MC version) and JavaCvBackend (fallback).
|
||||
@@ -21,10 +19,19 @@ public interface VideoBackend {
|
||||
int videoHeight();
|
||||
|
||||
/**
|
||||
* Poll a new decoded RGBA frame if one is ready.
|
||||
* @return the frame buffer (capacity = w*h*4) or {@code null} if no new frame is ready.
|
||||
* If a new RGBA frame is ready, memcpy it directly into the texture's native pixel buffer at
|
||||
* {@code dstAddr} (must have room for at least {@code w*h*4} bytes) and clear the dirty
|
||||
* flag. Returns {@code true} when a frame was written.
|
||||
*
|
||||
* <p>Replaces the prior {@code pollFrame()} which returned a {@link java.nio.ByteBuffer}.
|
||||
* The old contract forced the decoder to either allocate a fresh direct buffer per frame
|
||||
* (huge memory churn at 1080p — see 0.4.10 changelog) or expose a reused buffer whose
|
||||
* memory the decoder could clobber while the renderer was still reading. Pushing the copy
|
||||
* inside the backend lets the decoder hold a single preallocated buffer under its own
|
||||
* lock and copy out to the GPU pointer in one synchronized block — zero allocation, no
|
||||
* race window.
|
||||
*/
|
||||
ByteBuffer pollFrame();
|
||||
boolean consumeFrame(long dstAddr, long maxBytes);
|
||||
|
||||
void close();
|
||||
}
|
||||
|
||||
@@ -9,9 +9,7 @@ import net.minecraft.client.Minecraft;
|
||||
import net.minecraft.client.renderer.texture.DynamicTexture;
|
||||
import net.minecraft.core.BlockPos;
|
||||
import net.minecraft.resources.Identifier;
|
||||
import org.lwjgl.system.MemoryUtil;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
@@ -113,10 +111,8 @@ public final class VideoPlayback {
|
||||
continue;
|
||||
}
|
||||
if (!e.backend.isReady()) continue;
|
||||
ByteBuffer buf = e.backend.pollFrame();
|
||||
if (buf == null) continue;
|
||||
try {
|
||||
e.upload(buf);
|
||||
e.tryUpload();
|
||||
} catch (Throwable t) {
|
||||
VideoPlayerMod.LOG.warn("[{}] texture upload failed: {}", VideoPlayerMod.MOD_ID, t.toString());
|
||||
e.close();
|
||||
@@ -188,23 +184,23 @@ public final class VideoPlayback {
|
||||
}
|
||||
}
|
||||
|
||||
/** Copy an incoming RGBA byte buffer into the texture, resizing if dimensions changed. */
|
||||
void upload(ByteBuffer rgba) {
|
||||
/**
|
||||
* If the backend has a new RGBA frame, copy it straight into the texture's native
|
||||
* pixel buffer and re-upload to GPU. The backend does the memcpy under its own lock
|
||||
* so we never read a half-written frame. RGBA bytes already match NativeImage's
|
||||
* ABGR-int layout in little-endian byte order (byte 0 = R = low byte of the int).
|
||||
*/
|
||||
void tryUpload() {
|
||||
int w = backend.videoWidth();
|
||||
int h = backend.videoHeight();
|
||||
if (w <= 0 || h <= 0) return;
|
||||
ensureTexture(w, h, false);
|
||||
NativeImage img = texture.getPixels();
|
||||
if (img == null) return;
|
||||
|
||||
// RGBA bytes from the backend already match NativeImage's ABGR-int layout when
|
||||
// viewed as little-endian bytes: byte 0 = R (low byte of ABGR int), byte 1 = G,
|
||||
// byte 2 = B, byte 3 = A. So a flat memcpy works — no per-pixel swap needed.
|
||||
// This replaces a 2M-iteration Java loop with one native memcpy for 1080p frames,
|
||||
// cutting upload time from ~20ms to <1ms and removing the main stutter source.
|
||||
long bytes = (long) w * h * 4L;
|
||||
MemoryUtil.memCopy(MemoryUtil.memAddress(rgba), img.getPointer(), bytes);
|
||||
texture.upload();
|
||||
long maxBytes = (long) w * h * 4L;
|
||||
if (backend.consumeFrame(img.getPointer(), maxBytes)) {
|
||||
texture.upload();
|
||||
}
|
||||
}
|
||||
|
||||
void close() {
|
||||
|
||||
@@ -4,8 +4,6 @@ import com.ejclaw.videoplayer.VideoPlayerMod;
|
||||
import net.fabricmc.api.EnvType;
|
||||
import net.fabricmc.api.Environment;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
/**
|
||||
* SPEC §5.3 / §5.4 — WaterMedia v2 backend. Reflection-only so the mod jar stays clean of
|
||||
* compile-time WaterMedia dependencies. Until a v2 build supports 1.21.6+ this returns
|
||||
@@ -38,8 +36,8 @@ public class WatermediaBackend implements VideoBackend {
|
||||
@Override public int videoHeight() { return height; }
|
||||
|
||||
@Override
|
||||
public ByteBuffer pollFrame() {
|
||||
return null; // no frames until v2 is wired up
|
||||
public boolean consumeFrame(long dstAddr, long maxBytes) {
|
||||
return false; // no frames until v2 is wired up
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
Reference in New Issue
Block a user