Files
realtime_voice_bot/src/audio/realtime-segmenter.ts
2026-05-03 01:56:09 +09:00

149 lines
4.3 KiB
TypeScript

/**
 * Callbacks and tuning knobs accepted by `RealtimeSegmenter`.
 *
 * Thresholds are compared against the peak absolute sample value of a frame;
 * frame counts refer to the segmenter's fixed 320-sample frames.
 */
interface RealtimeSegmenterOptions {
  /** Receives each accepted segment as 16-bit little-endian PCM bytes. */
  onSegment: (pcm16: Buffer) => void;

  /** Called when a segment opens, with the peak of the frame that opened it. */
  onSpeechStart?: (peak: number) => void;
  /** Called right before `onSegment`, with the segment length in samples. */
  onSpeechReady?: (samples: number) => void;
  /** Called instead of `onSegment` when a closed segment is shorter than `minSpeechSamples`. */
  onSpeechDiscarded?: (samples: number) => void;
  /** Called for every processed frame with that frame's peak absolute sample value. */
  onLevel?: (peak: number) => void;

  /** Peak a frame must reach to count towards opening a segment (default 900). */
  speechStartThreshold?: number;
  /** Peak a frame must reach to keep an open segment from counting as quiet (default 450). */
  speechContinueThreshold?: number;

  /** Consecutive frames at or above the start threshold needed to open a segment (default 2). */
  speechStartFrames?: number;
  /** Consecutive frames below the continue threshold that close a segment (default 24). */
  speechEndFrames?: number;

  /** Maximum number of samples buffered while idle and carried into the start of a segment (default 3200). */
  preRollSamples?: number;
  /** Segments with fewer samples than this are discarded (default 7200). */
  minSpeechSamples?: number;
  /** A segment is closed as soon as it holds at least this many samples (default 160000). */
  maxSpeechSamples?: number;
}
/**
 * Cuts a continuous stream of signed 16-bit little-endian PCM into speech
 * segments using a peak-amplitude gate evaluated on 320-sample frames.
 *
 * While idle, recent frames are kept in a bounded pre-roll buffer. Once
 * `speechStartFrames` consecutive frames peak at or above
 * `speechStartThreshold`, a segment opens with the pre-roll followed by the
 * triggering frame. The segment closes after `speechEndFrames` consecutive
 * frames peak below `speechContinueThreshold`, or as soon as it holds at
 * least `maxSpeechSamples` samples. Closed segments shorter than
 * `minSpeechSamples` are reported through `onSpeechDiscarded`; all others
 * through `onSpeechReady` and then `onSegment`.
 *
 * NOTE(review): the length compared against `minSpeechSamples` includes the
 * pre-roll and the trailing quiet frames, so with the default settings a
 * segment closed by silence is always longer than the default minimum.
 */
export class RealtimeSegmenter {
  /** Decoded samples waiting to be grouped into complete frames. */
  private readonly pendingSamples: number[] = [];
  /** Most recent idle samples, capped at `preRollSamples`. */
  private readonly preRoll: number[] = [];
  /** Samples of the segment currently being recorded. */
  private readonly speech: number[] = [];
  private readonly frameSamples = 320;
  private readonly preRollSamples: number;
  private readonly speechStartThreshold: number;
  private readonly speechContinueThreshold: number;
  private readonly speechStartFrames: number;
  private readonly speechEndFrames: number;
  private readonly minSpeechSamples: number;
  private readonly maxSpeechSamples: number;
  private speechActive = false;
  private speechCandidateFrames = 0;
  private silenceFrames = 0;
  /** Low byte of a sample that was split across two chunks, or `null` when aligned. */
  private danglingByte: number | null = null;

  /**
   * @param options Callbacks plus optional tuning values; every omitted
   *   tuning value falls back to the default assigned below.
   */
  constructor(private readonly options: RealtimeSegmenterOptions) {
    this.preRollSamples = options.preRollSamples ?? 3200;
    this.speechStartThreshold = options.speechStartThreshold ?? 900;
    this.speechContinueThreshold = options.speechContinueThreshold ?? 450;
    this.speechStartFrames = options.speechStartFrames ?? 2;
    this.speechEndFrames = options.speechEndFrames ?? 24;
    this.minSpeechSamples = options.minSpeechSamples ?? 7200;
    this.maxSpeechSamples = options.maxSpeechSamples ?? 160000;
  }

  /**
   * Feeds raw PCM bytes into the segmenter and processes every frame that
   * becomes complete. Callbacks fire synchronously from within this call.
   *
   * Chunks may end in the middle of a sample: the stray byte is held back and
   * joined with the first byte of the next chunk so later samples stay aligned.
   *
   * @param chunk Signed 16-bit little-endian PCM bytes; may be empty.
   */
  pushChunk(chunk: Buffer): void {
    let offset = 0;
    if (this.danglingByte !== null && chunk.length > 0) {
      // Rebuild the sample whose low byte arrived at the end of the previous chunk.
      const raw = this.danglingByte | (chunk.readUInt8(0) << 8);
      this.pendingSamples.push(raw >= 0x8000 ? raw - 0x10000 : raw);
      this.danglingByte = null;
      offset = 1;
    }
    for (; offset + 1 < chunk.length; offset += 2) {
      this.pendingSamples.push(chunk.readInt16LE(offset));
    }
    if (offset < chunk.length) {
      // Exactly one byte is left over; keep it for the next chunk.
      this.danglingByte = chunk.readUInt8(offset);
    }

    for (
      let frame = takeFrame(this.pendingSamples, this.frameSamples);
      frame !== null;
      frame = takeFrame(this.pendingSamples, this.frameSamples)
    ) {
      this.processFrame(frame);
    }
  }

  /** Drops all buffered audio and returns the gate to the idle state without firing callbacks. */
  reset(): void {
    this.pendingSamples.length = 0;
    this.preRoll.length = 0;
    this.speech.length = 0;
    this.danglingByte = null;
    this.speechActive = false;
    this.speechCandidateFrames = 0;
    this.silenceFrames = 0;
  }

  /**
   * Runs the gate on one complete frame: reports its level, opens a segment
   * when enough loud frames have accumulated, and closes/emits the segment
   * when it has gone quiet for long enough or has reached the maximum length.
   */
  private processFrame(frame: Int16Array): void {
    let peak = 0;
    for (const sample of frame) {
      const magnitude = Math.abs(sample);
      if (magnitude > peak) {
        peak = magnitude;
      }
    }
    this.options.onLevel?.(peak);

    if (!this.speechActive) {
      if (peak >= this.speechStartThreshold) {
        this.speechCandidateFrames += 1;
      } else {
        this.speechCandidateFrames = 0;
      }
      if (this.speechCandidateFrames < this.speechStartFrames) {
        // Still idle: remember the frame so it can lead a later segment.
        appendWithCap(this.preRoll, frame, this.preRollSamples);
        return;
      }

      // Open a segment seeded with the pre-roll. The triggering frame is
      // deliberately NOT part of the pre-roll here; it is appended exactly
      // once below, so it no longer appears twice in the emitted audio.
      this.speechActive = true;
      this.silenceFrames = 0;
      this.speech.length = 0;
      for (const sample of this.preRoll) {
        this.speech.push(sample);
      }
      this.preRoll.length = 0;
      this.options.onSpeechStart?.(peak);
    }

    for (const sample of frame) {
      this.speech.push(sample);
    }
    if (peak >= this.speechContinueThreshold) {
      this.silenceFrames = 0;
    } else {
      this.silenceFrames += 1;
    }

    const stillSpeaking = this.silenceFrames < this.speechEndFrames;
    const belowMaxLength = this.speech.length < this.maxSpeechSamples;
    if (stillSpeaking && belowMaxLength) {
      return;
    }

    // Close the segment and return to idle before notifying listeners.
    const speechPcm = int16ArrayToBuffer(Int16Array.from(this.speech));
    this.speechActive = false;
    this.speech.length = 0;
    this.silenceFrames = 0;
    this.speechCandidateFrames = 0;

    const sampleCount = speechPcm.length / 2;
    if (sampleCount < this.minSpeechSamples) {
      this.options.onSpeechDiscarded?.(sampleCount);
      return;
    }
    this.options.onSpeechReady?.(sampleCount);
    this.options.onSegment(speechPcm);
  }
}
/**
 * Removes the first `size` values from `source` and returns them as a frame.
 *
 * @param source Queue of pending samples; mutated only when a frame is returned.
 * @param size Number of samples per frame.
 * @returns The removed samples converted to `Int16Array`, or `null` (leaving
 *   `source` untouched) when fewer than `size` samples are queued.
 */
function takeFrame(source: number[], size: number): Int16Array | null {
  return source.length < size ? null : new Int16Array(source.splice(0, size));
}
/**
 * Appends `samples` to `target`, then trims the oldest entries from the front
 * so that `target` holds at most `cap` values.
 *
 * @param target Buffer to extend in place.
 * @param samples Values to append.
 * @param cap Maximum length `target` may have afterwards.
 */
function appendWithCap(target: number[], samples: Int16Array, cap: number): void {
  target.push(...samples);
  const overflow = target.length - cap;
  if (overflow > 0) {
    target.splice(0, overflow);
  }
}
/**
 * Serialises samples into a new `Buffer` as signed 16-bit little-endian values.
 *
 * @param input Samples to encode.
 * @returns A buffer of `input.length * 2` bytes, independent of `input`'s memory.
 */
function int16ArrayToBuffer(input: Int16Array): Buffer {
  const bytesPerSample = Int16Array.BYTES_PER_ELEMENT;
  // allocUnsafe is fine here: every byte is overwritten by the loop below.
  const output = Buffer.allocUnsafe(input.length * bytesPerSample);
  input.forEach((sample, position) => {
    output.writeInt16LE(sample, position * bytesPerSample);
  });
  return output;
}