149 lines
4.3 KiB
TypeScript
149 lines
4.3 KiB
TypeScript
/**
 * Callbacks and tuning knobs accepted by `RealtimeSegmenter`.
 *
 * Only `onSegment` is required. Every numeric option is optional; the defaults
 * quoted below are the ones applied by the `RealtimeSegmenter` constructor.
 * Thresholds are compared against the largest absolute sample value of a
 * frame.
 */
interface RealtimeSegmenterOptions {
  /** Receives each accepted segment as 16-bit little-endian PCM bytes. */
  onSegment: (pcm16: Buffer) => void;

  /** Called for every processed frame with that frame's peak absolute sample. */
  onLevel?: (peak: number) => void;

  /** Called when a segment is opened, with the peak of the frame that opened it. */
  onSpeechStart?: (peak: number) => void;

  /** Called with the sample count of a finished segment that was too short to keep. */
  onSpeechDiscarded?: (samples: number) => void;

  /** Called with the sample count of a finished segment just before `onSegment`. */
  onSpeechReady?: (samples: number) => void;

  /** Maximum number of idle samples retained to prepend to a segment (default 3200). */
  preRollSamples?: number;

  /** Peak at or above which an idle frame counts towards opening a segment (default 900). */
  speechStartThreshold?: number;

  /** Peak below which a frame inside a segment counts as quiet (default 450). */
  speechContinueThreshold?: number;

  /** Consecutive frames at or above the start threshold needed to open a segment (default 2). */
  speechStartFrames?: number;

  /** Consecutive quiet frames that close an open segment (default 24). */
  speechEndFrames?: number;

  /** Segments with fewer samples than this are discarded (default 7200). */
  minSpeechSamples?: number;

  /** A segment is closed once it holds at least this many samples (default 160000). */
  maxSpeechSamples?: number;
}
|
|
|
|
/**
 * Cuts a continuous stream of signed 16-bit little-endian PCM into speech
 * segments using a per-frame peak gate with separate start and continue
 * thresholds.
 *
 * Input is consumed in fixed frames of `frameSamples` samples. While idle, a
 * bounded pre-roll of the most recent frames is retained. Once
 * `speechStartFrames` consecutive frames peak at or above
 * `speechStartThreshold`, a segment is opened and seeded with that pre-roll.
 * The segment is closed after `speechEndFrames` consecutive frames peak below
 * `speechContinueThreshold`, or as soon as it holds at least
 * `maxSpeechSamples` samples.
 *
 * A closed segment shorter than `minSpeechSamples` is reported through
 * `onSpeechDiscarded`; otherwise `onSpeechReady` and then `onSegment` fire.
 * The length that is checked covers the whole buffered segment, i.e. it
 * includes the pre-roll and the trailing quiet frames.
 */
export class RealtimeSegmenter {
  /** Decoded samples waiting to be grouped into a full frame. */
  private readonly pendingSamples: number[] = [];
  /** Most recent idle samples, capped at `preRollSamples`. */
  private readonly preRoll: number[] = [];
  /** Samples of the segment currently being recorded. */
  private readonly speech: number[] = [];

  // NOTE(review): 320 samples is presumably 20 ms at 16 kHz; the sample rate is
  // not visible here, so confirm against the capture side.
  private readonly frameSamples = 320;
  private readonly preRollSamples: number;
  private readonly speechStartThreshold: number;
  private readonly speechContinueThreshold: number;
  private readonly speechStartFrames: number;
  private readonly speechEndFrames: number;
  private readonly minSpeechSamples: number;
  private readonly maxSpeechSamples: number;

  private speechActive = false;
  /** Consecutive idle frames that reached the start threshold. */
  private speechCandidateFrames = 0;
  /** Consecutive in-segment frames that stayed below the continue threshold. */
  private silenceFrames = 0;
  /** Low byte of a sample that was split across two chunks, if any. */
  private carryByte: number | null = null;

  /**
   * @param options Callbacks plus optional tuning values; any tuning value
   *   left undefined falls back to the default assigned below.
   */
  constructor(private readonly options: RealtimeSegmenterOptions) {
    this.preRollSamples = options.preRollSamples ?? 3200;
    this.speechStartThreshold = options.speechStartThreshold ?? 900;
    this.speechContinueThreshold = options.speechContinueThreshold ?? 450;
    this.speechStartFrames = options.speechStartFrames ?? 2;
    this.speechEndFrames = options.speechEndFrames ?? 24;
    this.minSpeechSamples = options.minSpeechSamples ?? 7200;
    this.maxSpeechSamples = options.maxSpeechSamples ?? 160000;
  }

  /**
   * Feeds raw PCM bytes into the segmenter and processes every complete frame
   * that becomes available. Callbacks fire synchronously from this call.
   *
   * Chunks are treated as consecutive slices of one byte stream: when a chunk
   * ends in the middle of a sample, the dangling byte is kept and joined with
   * the first byte of the next chunk, so sample alignment is never lost.
   *
   * @param chunk Signed 16-bit little-endian PCM bytes of any length.
   */
  pushChunk(chunk: Buffer): void {
    let offset = 0;

    if (this.carryByte !== null && chunk.length > 0) {
      // Re-assemble the sample whose low byte arrived with the previous chunk,
      // then sign-extend the 16-bit value.
      const unsigned = (chunk.readUInt8(0) << 8) | this.carryByte;
      this.pendingSamples.push((unsigned << 16) >> 16);
      this.carryByte = null;
      offset = 1;
    }

    for (; offset + 1 < chunk.length; offset += 2) {
      this.pendingSamples.push(chunk.readInt16LE(offset));
    }

    if (offset < chunk.length) {
      this.carryByte = chunk.readUInt8(offset);
    }

    for (
      let frame = takeFrame(this.pendingSamples, this.frameSamples);
      frame !== null;
      frame = takeFrame(this.pendingSamples, this.frameSamples)
    ) {
      this.processFrame(frame);
    }
  }

  /** Drops all buffered audio and returns the detector to the idle state. */
  reset(): void {
    this.pendingSamples.length = 0;
    this.preRoll.length = 0;
    this.speech.length = 0;
    this.carryByte = null;
    this.speechActive = false;
    this.speechCandidateFrames = 0;
    this.silenceFrames = 0;
  }

  /** Runs the gate on one full frame and advances the idle/speech state. */
  private processFrame(frame: Int16Array): void {
    let peak = 0;
    for (const sample of frame) {
      const magnitude = Math.abs(sample);
      if (magnitude > peak) {
        peak = magnitude;
      }
    }

    this.options.onLevel?.(peak);

    if (!this.speechActive) {
      if (peak >= this.speechStartThreshold) {
        this.speechCandidateFrames += 1;
      } else {
        this.speechCandidateFrames = 0;
      }

      if (this.speechCandidateFrames < this.speechStartFrames) {
        // Still idle: remember the frame as pre-roll. Frames are added to the
        // pre-roll only on this path so that the frame which opens a segment
        // is not stored twice (once via pre-roll, once via the push below).
        appendWithCap(this.preRoll, frame, this.preRollSamples);
        return;
      }

      this.speechActive = true;
      this.silenceFrames = 0;
      // Seed the segment with the retained pre-roll. A loop is used instead of
      // spreading into a call because the pre-roll size is caller-controlled.
      this.speech.length = 0;
      for (const sample of this.preRoll) {
        this.speech.push(sample);
      }
      this.preRoll.length = 0;
      this.options.onSpeechStart?.(peak);
    }

    for (const sample of frame) {
      this.speech.push(sample);
    }

    if (peak >= this.speechContinueThreshold) {
      this.silenceFrames = 0;
    } else {
      this.silenceFrames += 1;
    }

    const stillSpeaking = this.silenceFrames < this.speechEndFrames;
    const underLengthLimit = this.speech.length < this.maxSpeechSamples;
    if (stillSpeaking && underLengthLimit) {
      return;
    }

    this.finishSegment();
  }

  /** Closes the open segment, resets detector state and notifies the caller. */
  private finishSegment(): void {
    const speechPcm = int16ArrayToBuffer(Int16Array.from(this.speech));

    // State is reset before any callback runs so a callback observes (and may
    // safely feed) an idle segmenter.
    this.speechActive = false;
    this.speech.length = 0;
    this.silenceFrames = 0;
    this.speechCandidateFrames = 0;

    const sampleCount = speechPcm.length / 2;
    if (speechPcm.length < this.minSpeechSamples * 2) {
      this.options.onSpeechDiscarded?.(sampleCount);
      return;
    }

    this.options.onSpeechReady?.(sampleCount);
    this.options.onSegment(speechPcm);
  }
}
|
|
|
|
/**
 * Removes the first `size` values from `source` and returns them as an
 * `Int16Array`.
 *
 * @param source Queue of samples; mutated in place when a frame is taken.
 * @param size Number of samples that make up one frame.
 * @returns The extracted frame, or `null` (leaving `source` untouched) when
 *   fewer than `size` samples are queued.
 */
function takeFrame(source: number[], size: number): Int16Array | null {
  return source.length < size ? null : Int16Array.from(source.splice(0, size));
}
|
|
|
|
/**
 * Appends `samples` to `target`, then drops values from the front so that at
 * most `cap` of the newest values remain.
 *
 * @param target Array that is extended and trimmed in place.
 * @param samples Values to append, in order.
 * @param cap Maximum length `target` may have afterwards.
 */
function appendWithCap(target: number[], samples: Int16Array, cap: number): void {
  for (const sample of samples) {
    target.push(sample);
  }

  const overflow = target.length - cap;
  if (overflow > 0) {
    target.splice(0, overflow);
  }
}
|
|
|
|
/**
 * Serialises 16-bit samples into a new `Buffer`, two bytes per sample in
 * little-endian order.
 *
 * @param input Samples to encode.
 * @returns A buffer of `input.length * 2` bytes that does not share memory
 *   with `input`.
 */
function int16ArrayToBuffer(input: Int16Array): Buffer {
  // allocUnsafe is fine here: every byte is overwritten by the loop below.
  const encoded = Buffer.allocUnsafe(input.length * 2);
  input.forEach((sample, position) => {
    encoded.writeInt16LE(sample, position * 2);
  });
  return encoded;
}
|