Add realtime loopback STT prototype
This commit is contained in:
112
src/audio/realtime-segmenter.ts
Normal file
112
src/audio/realtime-segmenter.ts
Normal file
@@ -0,0 +1,112 @@
|
||||
/**
 * Options accepted by the `RealtimeSegmenter` constructor.
 *
 * Exported so that code constructing the (exported) segmenter from another
 * module can name this type.
 */
export interface RealtimeSegmenterOptions {
  /**
   * Invoked once per completed speech segment. The buffer holds the segment
   * as signed 16-bit little-endian PCM samples.
   */
  onSegment: (pcm16: Buffer) => void;
}
|
||||
|
||||
/**
 * Cuts a continuous stream of signed 16-bit little-endian PCM into speech
 * segments using a per-frame peak-amplitude gate with hysteresis.
 *
 * Feed raw bytes through {@link RealtimeSegmenter.pushChunk}; each finished
 * segment is handed to `options.onSegment`, synchronously, from inside
 * `pushChunk`.
 */
export class RealtimeSegmenter {
  /** Decoded samples that have not yet filled a complete frame. */
  private readonly pendingSamples: number[] = [];

  /** Most recent samples seen while idle; becomes the head of the next segment. */
  private readonly preRoll: number[] = [];

  /** Samples of the segment currently being captured. */
  private readonly speech: number[] = [];

  /** Number of samples analysed per frame. */
  private readonly frameSamples = 320;

  /** Maximum number of idle samples retained in `preRoll`. */
  private readonly preRollSamples = 3200;

  /** Frame peak (absolute sample value) at or above which a frame counts toward starting speech. */
  private readonly speechStartThreshold = 900;

  /** Frame peak at or above which an active segment is considered still voiced. */
  private readonly speechContinueThreshold = 450;

  /** Consecutive loud frames required to open a segment. */
  private readonly speechStartFrames = 2;

  /** Consecutive quiet frames required to close a segment. */
  private readonly speechEndFrames = 18;

  /**
   * Segments shorter than this many samples are discarded.
   *
   * NOTE(review): the measured length includes the pre-roll and the trailing
   * quiet frames, so the shortest possible segment is
   * `(speechStartFrames + speechEndFrames) * frameSamples` = 6400 samples and
   * this filter appears never to reject anything — confirm the intent.
   */
  private readonly minSpeechSamples = 6400;

  /** True while a segment is being captured. */
  private speechActive = false;

  /** Run length of consecutive loud frames seen while idle. */
  private speechCandidateFrames = 0;

  /** Run length of consecutive quiet frames seen while capturing. */
  private silenceFrames = 0;

  /**
   * Low byte of a sample whose high byte has not arrived yet, or `null` when
   * the byte stream is currently sample-aligned.
   */
  private carryByte: number | null = null;

  constructor(private readonly options: RealtimeSegmenterOptions) {}

  /**
   * Appends raw PCM bytes to the stream and processes every frame that
   * becomes complete.
   *
   * Chunks may be split at any byte boundary: a trailing odd byte is kept and
   * joined with the first byte of the next chunk instead of being dropped,
   * which would shift every later sample by one byte.
   *
   * @param chunk Signed 16-bit little-endian PCM bytes.
   */
  pushChunk(chunk: Buffer): void {
    let offset = 0;

    if (this.carryByte !== null && chunk.length > 0) {
      const combined = (chunk.readUInt8(0) << 8) | this.carryByte;
      // Sign-extend the 16-bit value to match readInt16LE.
      this.pendingSamples.push((combined << 16) >> 16);
      this.carryByte = null;
      offset = 1;
    }

    for (; offset + 1 < chunk.length; offset += 2) {
      this.pendingSamples.push(chunk.readInt16LE(offset));
    }

    if (offset < chunk.length) {
      this.carryByte = chunk.readUInt8(offset);
    }

    let frame = takeFrame(this.pendingSamples, this.frameSamples);
    while (frame !== null) {
      this.processFrame(frame);
      frame = takeFrame(this.pendingSamples, this.frameSamples);
    }
  }

  /**
   * Advances the gate by one frame: updates the idle pre-roll or the active
   * segment, and emits the segment once enough quiet frames have followed it.
   */
  private processFrame(frame: Int16Array): void {
    let peak = 0;
    for (const sample of frame) {
      const magnitude = Math.abs(sample);
      if (magnitude > peak) {
        peak = magnitude;
      }
    }

    if (this.speechActive) {
      this.speech.push(...frame);
    } else {
      appendWithCap(this.preRoll, frame, this.preRollSamples);

      if (peak >= this.speechStartThreshold) {
        this.speechCandidateFrames += 1;
      } else {
        this.speechCandidateFrames = 0;
      }

      if (this.speechCandidateFrames < this.speechStartFrames) {
        return;
      }

      this.speechActive = true;
      this.silenceFrames = 0;
      // The pre-roll already ends with the current frame, so seeding the
      // segment from it is enough; pushing the frame again would duplicate
      // the onset audio.
      this.speech.splice(0, this.speech.length, ...this.preRoll);
      this.preRoll.length = 0;
    }

    if (peak >= this.speechContinueThreshold) {
      this.silenceFrames = 0;
    } else {
      this.silenceFrames += 1;
    }

    if (this.silenceFrames < this.speechEndFrames) {
      return;
    }

    const speechPcm = int16ArrayToBuffer(Int16Array.from(this.speech));

    // Reset to idle before invoking the callback.
    this.speechActive = false;
    this.speech.length = 0;
    this.silenceFrames = 0;
    this.speechCandidateFrames = 0;

    // Two bytes per sample.
    if (speechPcm.length < this.minSpeechSamples * 2) {
      return;
    }

    this.options.onSegment(speechPcm);
  }
}
|
||||
|
||||
/**
 * Removes the first `size` samples from `source` and returns them as a frame.
 *
 * @param source Sample queue; mutated in place when a frame is taken.
 * @param size Number of samples per frame; must be greater than zero.
 * @returns The frame, or `null` (leaving `source` untouched) when fewer than
 *   `size` samples are queued.
 * @throws RangeError If `size` is not a positive number. Without this guard
 *   an empty, non-null frame would be returned without consuming anything,
 *   and a caller that loops until `null` would never terminate.
 */
function takeFrame(source: number[], size: number): Int16Array | null {
  if (!(size > 0)) {
    throw new RangeError(`frame size must be positive, got ${size}`);
  }
  if (source.length < size) {
    return null;
  }
  return Int16Array.from(source.splice(0, size));
}
|
||||
|
||||
/**
 * Appends `samples` to `target`, then drops the oldest entries so that at
 * most `cap` remain.
 *
 * Samples are pushed one at a time rather than via `push(...samples)`: spread
 * passes every element as a call argument, which throws a `RangeError` once
 * the input exceeds the engine's argument limit.
 *
 * @param target Array to extend; mutated in place.
 * @param samples Samples to append, in order.
 * @param cap Maximum length of `target` after the call.
 */
function appendWithCap(target: number[], samples: Int16Array, cap: number): void {
  for (const sample of samples) {
    target.push(sample);
  }

  const overflow = target.length - cap;
  if (overflow > 0) {
    target.splice(0, overflow);
  }
}
|
||||
|
||||
/**
 * Serialises samples to a new Buffer as signed 16-bit little-endian values.
 *
 * @param input Samples to encode.
 * @returns A Buffer of `input.length * 2` bytes.
 */
function int16ArrayToBuffer(input: Int16Array): Buffer {
  // allocUnsafe is fine here: every byte is overwritten below.
  const output = Buffer.allocUnsafe(input.length * 2);
  // forEach yields each sample as a definite number, so no non-null
  // assertion on an indexed read is needed.
  input.forEach((sample, index) => {
    output.writeInt16LE(sample, index * 2);
  });
  return output;
}
|
||||
Reference in New Issue
Block a user