v0.4.11: video frame ring buffer + decoder stats + 0.1s audio buffer
Some checks failed
build / build (push) Has been cancelled

0.4.10 still played at ~2-5 fps even though the decoder buffer was
preallocated. Root cause: the single-slot staging buffer was paced by
SourceDataLine backpressure at the audio buffer's granularity (~0.5 s),
so the decoder burst-produced ~12 video frames into the slot while audio
drained, the consumer saw only the last frame of each burst, then the
decoder stalled until audio drained again. Net visible rate ~ source_fps
/ frames_per_burst.

Fix:
- Replace single staging slot with a 4-slot ring (preallocated, FIFO).
  Decoder writes to ringTail; if full, overwrites oldest and bumps
  droppedFrames so we can see overflow in the log. Render thread drains
  oldest under the same lock — no allocation, no race.
- Shrink audio driver buffer 0.5 s → 0.1 s so the decoder is paced more
  tightly. Burst size collapses from ~12 frames to 2-3, which fits
  inside the ring.
- Log decoder spec on start (WxH @ fps, audio Hz x ch, ring depth) and
  produced/consumed/dropped counters every ~10 s. Lets the user's log
  confirm whether the decoder is keeping real-time pace and whether the
  ring is overflowing.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
tkrmagid
2026-05-16 02:10:47 +09:00
parent cee01bd448
commit 9b99283b70
3 changed files with 135 additions and 55 deletions

View File

@@ -15,6 +15,7 @@ import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.ShortBuffer;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
/**
* SPEC §5.3 — fallback mp4/http(s) backend driven by JavaCV's FFmpegFrameGrabber.
@@ -40,16 +41,35 @@ public class JavaCvBackend implements VideoBackend {
private final AtomicBoolean running = new AtomicBoolean(false);
private final AtomicBoolean paused = new AtomicBoolean(false);
/**
* Single preallocated RGBA staging buffer. Decoder thread writes into it under
* {@link #frameLock}; render thread reads via {@link #consumeFrame(long, long)} under the
* same lock. One allocation for the lifetime of the backend instead of one per frame —
* see 0.4.10 changelog for the regression that motivated this. The lock is short-held
* (one 8MB memcpy ≈ 1ms at 1080p) so contention is negligible.
* Ring buffer of preallocated RGBA staging slots. Decoder thread writes to {@code ringTail}
* under {@link #frameLock}; render thread drains the oldest slot via
* {@link #consumeFrame(long, long)} under the same lock.
*
* <p>0.4.10 used a single staging slot and relied on {@link SourceDataLine#write}
* backpressure to pace the decoder. That paced only at audio-buffer granularity (~0.5 s):
* the decoder burst-produced ~12 video frames into the slot while the audio line drained,
* the consumer (60+ Hz polling) saw only the last frame of each burst, then the decoder
* stalled until audio drained again — net effect ~2 fps of visible video despite the
* decoder producing at the source's 24 fps. The ring absorbs the burst; combined with the
* smaller audio buffer (~0.1 s) below, the burst collapses to 2-3 frames, which fits in
* {@link #FRAME_RING_SLOTS}.
*
* <p>If the ring still fills, the decoder overwrites the oldest slot and increments
* {@link #droppedFrames}. Memory cost: {@code 4 × w × h × 4} bytes (32 MB at 1080p,
* ~130 MB at 4K).
*/
private static final int FRAME_RING_SLOTS = 4;
private final Object frameLock = new Object();
private ByteBuffer frameBuf;
private int frameBufBytes = 0;
private boolean frameDirty = false;
private final ByteBuffer[] ringBufs = new ByteBuffer[FRAME_RING_SLOTS];
private final int[] ringBytes = new int[FRAME_RING_SLOTS];
private int ringHead = 0; // next slot to consume
private int ringTail = 0; // next slot to produce into
private int ringCount = 0;
/** Decoder telemetry (cumulative). Logged ~every 10 s from the decode thread. */
private final AtomicLong producedFrames = new AtomicLong();
private final AtomicLong consumedFrames = new AtomicLong();
private final AtomicLong droppedFrames = new AtomicLong();
private volatile int width = 0;
private volatile int height = 0;
private volatile float gain = 1.0F;
@@ -101,17 +121,22 @@ public class JavaCvBackend implements VideoBackend {
@Override
public boolean consumeFrame(long dstAddr, long maxBytes) {
synchronized (frameLock) {
if (!frameDirty || frameBuf == null || frameBufBytes <= 0) return false;
if (frameBufBytes > maxBytes) {
// Texture not yet resized for this frame's dimensions — drop and wait for the
// caller to ensure capacity next tick. ensureTexture() runs in Entry.upload
// before consumeFrame, so this is only hit on the exact tick of a resolution
// change.
frameDirty = false;
if (ringCount <= 0) return false;
int idx = ringHead;
int n = ringBytes[idx];
ByteBuffer buf = ringBufs[idx];
// Always advance head regardless of memcpy outcome — otherwise a single oversize
// frame (e.g. mid-resize) would jam the ring forever.
ringHead = (idx + 1) % FRAME_RING_SLOTS;
ringCount--;
if (buf == null || n <= 0 || n > maxBytes) {
// Texture not yet sized for this frame, or empty slot — skip. ensureTexture()
// runs in Entry.tryUpload() before consumeFrame, so n > maxBytes only happens
// on the exact tick of a resolution change.
return false;
}
MemoryUtil.memCopy(MemoryUtil.memAddress(frameBuf), dstAddr, frameBufBytes);
frameDirty = false;
MemoryUtil.memCopy(MemoryUtil.memAddress(buf), dstAddr, n);
consumedFrames.incrementAndGet();
return true;
}
}
@@ -121,9 +146,11 @@ public class JavaCvBackend implements VideoBackend {
closed = true;
stopWorker();
synchronized (frameLock) {
frameBuf = null;
frameBufBytes = 0;
frameDirty = false;
for (int i = 0; i < FRAME_RING_SLOTS; i++) {
ringBufs[i] = null;
ringBytes[i] = 0;
}
ringHead = ringTail = ringCount = 0;
}
}
@@ -209,6 +236,18 @@ public class JavaCvBackend implements VideoBackend {
localAudioLine = openLine(sampleRate, audioChannels);
this.audioLine = localAudioLine;
// Decoder spec — printed once per playback so the user log shows what the decoder
// actually sees (resolution / frame rate / sample rate). Used to verify our pacing
// assumptions (e.g. ring depth, audio buffer length) match the source.
double srcFrameRate = 0;
try { srcFrameRate = ((Number) grabberCls.getMethod("getFrameRate").invoke(grabber)).doubleValue(); }
catch (Throwable ignored) {}
VideoPlayerMod.LOG.info(
"[{}] decoder started: {}x{} @ {} fps, audio {} Hz x{}, ring={} slots",
VideoPlayerMod.MOD_ID, width, height,
String.format("%.2f", srcFrameRate),
sampleRate, audioChannels, FRAME_RING_SLOTS);
Class<?> frameCls = Class.forName(FRAME_CLASS);
Field imageField = frameCls.getField("image");
Field samplesField = frameCls.getField("samples");
@@ -216,6 +255,14 @@ public class JavaCvBackend implements VideoBackend {
// but we still resolve its class so a future code path could fall back to it if a
// grabber refuses setPixelFormat. Keep the lookup defensive.
// Stats sampling: every 10 s of wall-clock we log produced/consumed/dropped deltas
// and the implied fps. Lets us tell from the log whether the decoder is keeping
// real-time pace (produced≈source fps) and whether the ring is overflowing
// (dropped>0). All counters are cumulative; we keep the previous sample to compute
// deltas.
long statsLastNs = System.nanoTime();
long lastProd = 0, lastCons = 0, lastDrop = 0;
while (running.get() && !closed) {
if (paused.get()) { Thread.sleep(20); continue; }
Object frame;
@@ -243,37 +290,63 @@ public class JavaCvBackend implements VideoBackend {
Object[] images = (Object[]) imageField.get(frame);
if (images != null && images.length > 0 && images[0] instanceof ByteBuffer src) {
// frame.image[0] is the swscale-converted RGBA plane, reused by the grabber
// across grab() calls. Copy into our preallocated staging buffer under
// frameLock so the render thread's consumeFrame() sees a coherent image.
// across grab() calls. Copy into the next ring slot under frameLock so the
// render thread's consumeFrame() sees coherent frames in FIFO order.
//
// 0.4.9 used `ByteBuffer.allocateDirect(w*h*4)` on every grab — at 1080p ×
// 24fps that's ~192 MB/s of direct memory churn (each allocation zero-fills
// the page, plus the Cleaner enqueues the old buffer for finalization).
// The decoder thread spent so much time on memory bookkeeping that grab()
// fell behind real time, the single-slot `latest` AtomicReference was
// refilled in bursts, and the user saw ~5fps playback even though the
// game/render thread was fine.
//
// Preallocating once eliminates both the zero-fill cost and the Cleaner
// pressure. The decoder thread now spends its budget on the actual decode +
// swscale + a single 8MB memcpy — well within 42ms at 1080p × 24fps.
// Allocation is one-time per slot, lazily on first use (or on a resolution
// upgrade) — never per frame. 0.4.9's per-frame allocateDirect was the
// primary memory-churn problem; 0.4.10 fixed that; 0.4.11 adds the ring on
// top to absorb the burst-then-stall caused by SourceDataLine backpressure
// pacing only at audio-buffer granularity.
int need = src.remaining();
if (need > 0) {
int srcPos = src.position();
long srcAddr = MemoryUtil.memAddress(src) + srcPos;
synchronized (frameLock) {
if (frameBuf == null || frameBuf.capacity() < need) {
frameBuf = ByteBuffer.allocateDirect(need).order(ByteOrder.nativeOrder());
int idx = ringTail;
if (ringBufs[idx] == null || ringBufs[idx].capacity() < need) {
ringBufs[idx] = ByteBuffer.allocateDirect(need).order(ByteOrder.nativeOrder());
}
int srcPos = src.position();
long dstAddr = MemoryUtil.memAddress(frameBuf);
long srcAddr = MemoryUtil.memAddress(src) + srcPos;
long dstAddr = MemoryUtil.memAddress(ringBufs[idx]);
MemoryUtil.memCopy(srcAddr, dstAddr, need);
src.position(srcPos); // unchanged, but explicit — JavaCV reads it too
frameBufBytes = need;
frameDirty = true;
ringBytes[idx] = need;
ringTail = (idx + 1) % FRAME_RING_SLOTS;
if (ringCount < FRAME_RING_SLOTS) {
ringCount++;
} else {
// Ring was full — we overwrote the oldest frame. Advance head
// to point at the next-oldest so consume order stays FIFO.
ringHead = (ringHead + 1) % FRAME_RING_SLOTS;
droppedFrames.incrementAndGet();
}
producedFrames.incrementAndGet();
}
src.position(srcPos); // restore — JavaCV reads it on subsequent grabs
}
}
// Periodic stats — once per ~10 s of wall-clock. Includes ring depth so we can
// see whether the consumer is keeping up.
long now = System.nanoTime();
if (now - statsLastNs > 10_000_000_000L) {
long prod = producedFrames.get();
long cons = consumedFrames.get();
long drop = droppedFrames.get();
double elapsedS = (now - statsLastNs) / 1e9;
int depth;
synchronized (frameLock) { depth = ringCount; }
VideoPlayerMod.LOG.info(
"[{}] decoder stats: produced={} ({} fps), consumed={} ({} fps), dropped={} (+{}) over {}s, ring={}/{}",
VideoPlayerMod.MOD_ID,
prod, String.format("%.1f", (prod - lastProd) / elapsedS),
cons, String.format("%.1f", (cons - lastCons) / elapsedS),
drop, (drop - lastDrop),
String.format("%.1f", elapsedS),
depth, FRAME_RING_SLOTS);
statsLastNs = now;
lastProd = prod; lastCons = cons; lastDrop = drop;
}
// If we have an open audio line, SourceDataLine.write() blocks for backpressure
// and provides natural A/V pacing; otherwise tick ~60fps so we don't busy-loop.
if (localAudioLine == null) Thread.sleep(15);
@@ -308,10 +381,17 @@ public class JavaCvBackend implements VideoBackend {
try {
AudioFormat fmt = new AudioFormat(sampleRate, 16, channels, true, false); // signed 16-bit LE
SourceDataLine line = AudioSystem.getSourceDataLine(fmt);
// ~0.5 s of audio buffered in the driver. Smooths over upstream hiccups without
// delaying close() — stopWorker() calls line.stop() / line.flush() to dump it.
// ~0.1 s of audio buffered in the driver. 0.4.10 used 0.5 s, which let the decoder
// burst ~12 video frames between backpressure stalls — way past the video ring's
// capacity and the visible cause of the "2-5 fps" stutter the user saw. With 0.1 s
// the audio line refills more often, so the decoder is paced more tightly and
// bursts collapse to 2-3 frames (well inside FRAME_RING_SLOTS).
//
// Floor at frameSizeBytes * 256 keeps the buffer above the typical OS / driver
// minimum so line.open() doesn't reject the request (LineUnavailableException /
// IllegalArgumentException) on exotic sample rates.
int frameSizeBytes = 2 * channels;
int bufferBytes = Math.max(sampleRate * frameSizeBytes / 2, frameSizeBytes * 1024);
int bufferBytes = Math.max(sampleRate * frameSizeBytes / 10, frameSizeBytes * 256);
line.open(fmt, bufferBytes);
line.start();
return line;