/**
 * Callbacks and tuning parameters accepted by {@link RealtimeSegmenter}.
 *
 * All thresholds are compared against the peak absolute sample value of a
 * frame; all `*Samples` values count 16-bit samples, not bytes.
 */
interface RealtimeSegmenterOptions {
  /** Receives every accepted speech segment as little-endian 16-bit PCM. */
  onSegment: (pcm16: Buffer) => void;
  /** Called once per analysed frame with that frame's peak absolute sample value. */
  onLevel?: (peak: number) => void;
  /** Called when speech is confirmed, with the peak of the confirming frame. */
  onSpeechStart?: (peak: number) => void;
  /** Called with the sample count of a finished segment shorter than `minSpeechSamples`. */
  onSpeechDiscarded?: (samples: number) => void;
  /** Called with the sample count of an accepted segment, immediately before `onSegment`. */
  onSpeechReady?: (samples: number) => void;
  /** Maximum number of samples of audio kept from before the confirming frame (default 3200). */
  preRollSamples?: number;
  /** Peak a frame must reach to count towards starting speech (default 900). */
  speechStartThreshold?: number;
  /** Peak a frame must reach to count as non-silent once speech is active (default 450). */
  speechContinueThreshold?: number;
  /** Consecutive frames at or above the start threshold needed to start speech (default 2). */
  speechStartFrames?: number;
  /** Consecutive frames below the continue threshold that end a segment (default 24). */
  speechEndFrames?: number;
  /** Segments shorter than this many samples are discarded (default 7200). */
  minSpeechSamples?: number;
  /** A segment is closed once it holds at least this many samples (default 160000). */
  maxSpeechSamples?: number;
}

/**
 * Splits a stream of little-endian 16-bit PCM into speech segments using a
 * peak-level detector with separate start and continue thresholds.
 *
 * Audio is analysed in fixed frames of 320 samples. While idle, recent audio
 * is retained as pre-roll so that a segment includes the audio leading up to
 * the frame that confirmed speech.
 */
export class RealtimeSegmenter {
  private readonly pendingSamples: number[] = [];
  private readonly preRoll: number[] = [];
  private readonly speech: number[] = [];
  private readonly frameSamples = 320;
  private readonly preRollSamples: number;
  private readonly speechStartThreshold: number;
  private readonly speechContinueThreshold: number;
  private readonly speechStartFrames: number;
  private readonly speechEndFrames: number;
  private readonly minSpeechSamples: number;
  private readonly maxSpeechSamples: number;
  private speechActive = false;
  private speechCandidateFrames = 0;
  private silenceFrames = 0;
  /** Low byte of a sample that was split across two chunks, if any. */
  private danglingByte: number | null = null;

  constructor(private readonly options: RealtimeSegmenterOptions) {
    this.preRollSamples = options.preRollSamples ?? 3200;
    this.speechStartThreshold = options.speechStartThreshold ?? 900;
    this.speechContinueThreshold = options.speechContinueThreshold ?? 450;
    this.speechStartFrames = options.speechStartFrames ?? 2;
    this.speechEndFrames = options.speechEndFrames ?? 24;
    this.minSpeechSamples = options.minSpeechSamples ?? 7200;
    this.maxSpeechSamples = options.maxSpeechSamples ?? 160000;
  }

  /**
   * Feeds raw little-endian 16-bit PCM bytes into the segmenter and processes
   * every complete frame that becomes available.
   *
   * Chunks may have any length: a trailing odd byte is kept and joined with
   * the first byte of the next chunk so the sample stream stays aligned.
   *
   * @param chunk Raw PCM bytes; callbacks from the options may fire synchronously.
   */
  pushChunk(chunk: Buffer): void {
    let offset = 0;

    if (this.danglingByte !== null && chunk.length > 0) {
      // Rebuild the sample that straddled the previous chunk boundary.
      const unsigned = this.danglingByte | (chunk.readUInt8(0) << 8);
      this.pendingSamples.push(unsigned >= 0x8000 ? unsigned - 0x10000 : unsigned);
      this.danglingByte = null;
      offset = 1;
    }

    for (; offset + 1 < chunk.length; offset += 2) {
      this.pendingSamples.push(chunk.readInt16LE(offset));
    }

    if (offset < chunk.length) {
      // One byte left over: hold it until the next chunk supplies the high byte.
      this.danglingByte = chunk.readUInt8(offset);
    }

    while (true) {
      const frame = takeFrame(this.pendingSamples, this.frameSamples);
      if (!frame) {
        return;
      }
      this.processFrame(frame);
    }
  }

  /** Drops all buffered audio (including any carried partial sample) and returns to the idle state. */
  reset(): void {
    this.pendingSamples.length = 0;
    this.preRoll.length = 0;
    this.speech.length = 0;
    this.danglingByte = null;
    this.speechActive = false;
    this.speechCandidateFrames = 0;
    this.silenceFrames = 0;
  }

  /** Runs the start/continue/end state machine on one frame of samples. */
  private processFrame(frame: Int16Array): void {
    let peak = 0;
    for (const sample of frame) {
      const magnitude = Math.abs(sample);
      if (magnitude > peak) {
        peak = magnitude;
      }
    }
    this.options.onLevel?.(peak);

    if (!this.speechActive) {
      if (peak >= this.speechStartThreshold) {
        this.speechCandidateFrames += 1;
      } else {
        this.speechCandidateFrames = 0;
      }

      if (this.speechCandidateFrames < this.speechStartFrames) {
        // Still idle: remember this frame as pre-roll for a later segment.
        appendWithCap(this.preRoll, frame, this.preRollSamples);
        return;
      }

      // Speech confirmed. Seed the segment with the pre-roll only; the
      // confirming frame is appended exactly once below, so it is not
      // duplicated in the emitted audio.
      this.speechActive = true;
      this.silenceFrames = 0;
      this.speech.length = 0;
      appendSamples(this.speech, this.preRoll);
      this.preRoll.length = 0;
      this.options.onSpeechStart?.(peak);
    }

    appendSamples(this.speech, frame);

    if (peak >= this.speechContinueThreshold) {
      this.silenceFrames = 0;
    } else {
      this.silenceFrames += 1;
    }

    // Keep collecting until enough trailing silence has been seen or the
    // segment has reached its maximum size (checked per whole frame, so the
    // emitted segment can exceed maxSpeechSamples by part of a frame plus pre-roll).
    const stillSpeaking = this.silenceFrames < this.speechEndFrames;
    if (stillSpeaking && this.speech.length < this.maxSpeechSamples) {
      return;
    }

    const speechPcm = int16ArrayToBuffer(Int16Array.from(this.speech));
    this.speechActive = false;
    this.speech.length = 0;
    this.silenceFrames = 0;
    this.speechCandidateFrames = 0;

    const sampleCount = speechPcm.length / 2;
    if (speechPcm.length < this.minSpeechSamples * 2) {
      this.options.onSpeechDiscarded?.(sampleCount);
      return;
    }

    this.options.onSpeechReady?.(sampleCount);
    this.options.onSegment(speechPcm);
  }
}

/**
 * Removes the first `size` samples from `source` and returns them as a frame,
 * or returns `null` (leaving `source` untouched) when fewer are available.
 */
function takeFrame(source: number[], size: number): Int16Array | null {
  if (source.length < size) {
    return null;
  }
  const samples = source.splice(0, size);
  return Int16Array.from(samples);
}

/** Appends `samples` to `target`, then trims from the front so at most `cap` samples remain. */
function appendWithCap(target: number[], samples: Int16Array, cap: number): void {
  appendSamples(target, samples);
  if (target.length > cap) {
    target.splice(0, target.length - cap);
  }
}

/**
 * Appends every sample of `source` to `target` one at a time.
 * Avoids `push(...source)`, which passes each element as a call argument and
 * can throw a RangeError for very large inputs.
 */
function appendSamples(target: number[], source: Iterable<number>): void {
  for (const sample of source) {
    target.push(sample);
  }
}

/** Serialises samples into a new Buffer as little-endian 16-bit integers. */
function int16ArrayToBuffer(input: Int16Array): Buffer {
  const output = Buffer.allocUnsafe(input.length * 2);
  input.forEach((sample, index) => {
    output.writeInt16LE(sample, index * 2);
  });
  return output;
}