Add realtime loopback STT prototype

This commit is contained in:
2026-05-02 20:20:54 +09:00
parent 10e0dd75db
commit 5775c4809a
17 changed files with 1034 additions and 0 deletions

View File

@@ -0,0 +1,112 @@
/** Construction options accepted by the RealtimeSegmenter constructor. */
interface RealtimeSegmenterOptions {
/**
 * Callback invoked synchronously with each completed segment.
 * The buffer holds signed 16-bit little-endian PCM samples.
 */
onSegment: (pcm16: Buffer) => void;
}
/**
 * Cuts a continuous stream of signed 16-bit little-endian PCM into speech
 * segments using a peak-amplitude gate with hysteresis.
 *
 * Audio is analysed in fixed frames of `frameSamples` samples:
 * - While idle, the most recent `preRollSamples` samples are retained. A
 *   segment opens once `speechStartFrames` consecutive frames have a peak of
 *   at least `speechStartThreshold`; the retained pre-roll (which ends with
 *   the frames that triggered the opening) seeds the segment.
 * - While active, every frame is appended. The segment closes after
 *   `speechEndFrames` consecutive frames whose peak is below
 *   `speechContinueThreshold`; those trailing quiet frames are part of the
 *   emitted audio.
 * - Segments shorter than `minSpeechSamples` samples are dropped silently.
 *
 * NOTE(review): the sample rate is not visible in this file; the constants
 * look tuned for 16 kHz mono (320 samples = 20 ms) — confirm with the caller.
 */
export class RealtimeSegmenter {
  /** Decoded samples that do not yet fill a whole frame. */
  private readonly pendingSamples: number[] = [];
  /** Rolling window of the latest samples seen while no segment is open. */
  private readonly preRoll: number[] = [];
  /** Samples of the segment currently being collected. */
  private readonly speech: number[] = [];
  private readonly frameSamples = 320;
  private readonly preRollSamples = 3200;
  private readonly speechStartThreshold = 900;
  private readonly speechContinueThreshold = 450;
  private readonly speechStartFrames = 2;
  private readonly speechEndFrames = 18;
  private readonly minSpeechSamples = 6400;
  private speechActive = false;
  private speechCandidateFrames = 0;
  private silenceFrames = 0;
  /** Odd trailing byte of the previous chunk, i.e. half of a split sample. */
  private carry: Buffer | null = null;

  constructor(private readonly options: RealtimeSegmenterOptions) {}

  /**
   * Feeds raw PCM bytes into the segmenter and processes every frame that
   * becomes complete. `options.onSegment` may be invoked synchronously, once
   * per segment that closes inside this call.
   *
   * Chunks may have any length: a sample split across two chunks is
   * reassembled instead of being dropped.
   *
   * @param chunk Signed 16-bit little-endian PCM bytes.
   */
  pushChunk(chunk: Buffer): void {
    const data = this.carry === null ? chunk : Buffer.concat([this.carry, chunk]);
    const usableBytes = data.length - (data.length % 2);
    for (let offset = 0; offset < usableBytes; offset += 2) {
      this.pendingSamples.push(data.readInt16LE(offset));
    }
    // Copy the leftover byte: the caller may reuse the chunk's memory.
    this.carry = usableBytes < data.length ? Buffer.from(data.subarray(usableBytes)) : null;

    let frame = takeFrame(this.pendingSamples, this.frameSamples);
    while (frame !== null) {
      this.processFrame(frame);
      frame = takeFrame(this.pendingSamples, this.frameSamples);
    }
  }

  /** Advances the idle/active state machine by one full frame. */
  private processFrame(frame: Int16Array): void {
    let peak = 0;
    for (const sample of frame) {
      const abs = Math.abs(sample);
      if (abs > peak) {
        peak = abs;
      }
    }

    if (!this.speechActive) {
      appendWithCap(this.preRoll, frame, this.preRollSamples);
      if (peak >= this.speechStartThreshold) {
        this.speechCandidateFrames += 1;
      } else {
        this.speechCandidateFrames = 0;
      }
      if (this.speechCandidateFrames < this.speechStartFrames) {
        return;
      }
      this.speechActive = true;
      this.silenceFrames = 0;
      this.speech.length = 0;
      this.speech.push(...this.preRoll);
      this.preRoll.length = 0;
      // The pre-roll already ends with this frame, so stop here; falling
      // through would append the onset frame a second time.
      return;
    }

    this.speech.push(...frame);
    if (peak >= this.speechContinueThreshold) {
      this.silenceFrames = 0;
    } else {
      this.silenceFrames += 1;
    }
    if (this.silenceFrames < this.speechEndFrames) {
      return;
    }

    // Reset all state before notifying, so the segmenter is already idle
    // again by the time the callback runs.
    const speechPcm = int16ArrayToBuffer(Int16Array.from(this.speech));
    this.speechActive = false;
    this.speech.length = 0;
    this.silenceFrames = 0;
    this.speechCandidateFrames = 0;
    if (speechPcm.length < this.minSpeechSamples * 2) {
      return;
    }
    this.options.onSegment(speechPcm);
  }
}
/**
 * Removes the first `size` samples from `source` and returns them as a frame.
 *
 * @param source Sample queue; mutated only when a full frame is available.
 * @param size Number of samples per frame.
 * @returns The dequeued frame, or `null` (leaving `source` untouched) when
 *   fewer than `size` samples are queued.
 */
function takeFrame(source: number[], size: number): Int16Array | null {
  return source.length < size ? null : new Int16Array(source.splice(0, size));
}
/**
 * Appends `samples` to `target`, then trims `target` from the front so that
 * it holds at most `cap` elements (the newest ones are kept).
 *
 * @param target Array extended and trimmed in place.
 * @param samples Samples to append.
 * @param cap Maximum number of elements `target` may hold afterwards.
 */
function appendWithCap(target: number[], samples: Int16Array, cap: number): void {
  for (const value of samples) {
    target.push(value);
  }
  const overflow = target.length - cap;
  if (overflow > 0) {
    target.splice(0, overflow);
  }
}
/**
 * Serialises 16-bit samples into a new Buffer, two bytes per sample in
 * little-endian order regardless of host endianness.
 *
 * @param input Samples to serialise.
 * @returns A freshly allocated Buffer of `input.length * 2` bytes.
 */
function int16ArrayToBuffer(input: Int16Array): Buffer {
  const bytesPerSample = 2;
  // allocUnsafe is fine here: every byte is overwritten below.
  const output = Buffer.allocUnsafe(input.length * bytesPerSample);
  input.forEach((sample, position) => {
    output.writeInt16LE(sample, position * bytesPerSample);
  });
  return output;
}