Use simpler speech detection on Windows local mode
This commit is contained in:
@@ -34,6 +34,13 @@ export class LocalVoiceSession {
|
||||
// Speech jobs held by this session. NOTE(review): producers/consumers of this
// queue are not visible in this excerpt — confirm ordering semantics there.
private readonly queue: SpeechJob[] = [];
|
||||
// Captured PCM samples waiting to be cut into fixed-size frames; both detection
// paths drain this array through takeFrame().
private readonly pendingSamples: number[] = [];
|
||||
// NOTE(review): presumably the amplitude above which incoming audio counts as
// non-silent (see lastNonSilentAudioAt) — its use is outside this excerpt, confirm.
private readonly silenceThreshold = 900;
|
||||
// Frame size, in samples, used by the Windows amplitude detector
// (20 ms if the recorder runs at 16 kHz — TODO confirm recorder sample rate).
private readonly windowsFrameSamples = 320;
|
||||
// Maximum number of samples retained in the pre-roll buffer while idle, so the
// audio just before the detected onset is included in the segment.
private readonly windowsPreRollSamples = 3_200;
|
||||
// Peak absolute amplitude a frame must reach to count towards speech start.
private readonly windowsSpeechStartThreshold = 520;
|
||||
// Peak absolute amplitude a frame must reach, once speech is active, to reset
// the trailing-silence counter. Lower than the start threshold (hysteresis).
private readonly windowsSpeechContinueThreshold = 260;
|
||||
// Number of consecutive frames at/above the start threshold needed to enter
// the speech-active state.
private readonly windowsSpeechStartFrames = 2;
|
||||
// Number of consecutive frames below the continue threshold that ends a segment.
private readonly windowsSpeechEndFrames = 18;
|
||||
// Segments with fewer samples than this are discarded as too short.
private readonly windowsMinSpeechSamples = 7_200;
|
||||
|
||||
// Model-based voice activity detector. start() only creates it when the
// platform is not win32, so it stays null on Windows.
private vad: RealTimeVAD | null = null;
|
||||
// Recorder child process; assigned from spawnRecorder() in start().
private recorder: ChildProcessByStdio<null, Readable, Readable> | null = null;
|
||||
@@ -49,12 +56,18 @@ export class LocalVoiceSession {
|
||||
// Timestamp (Date.now()) recorded by pushPcm16Chunk; 0 until first set.
// NOTE(review): the condition guarding the update is outside this excerpt.
private lastNonSilentAudioAt = 0;
|
||||
// NOTE(review): presumably one-shot guards so each warning is logged only
// once — their use is not visible in this excerpt, confirm.
private warnedNoPcm = false;
|
||||
private warnedSilent = false;
|
||||
// Samples of the speech segment currently being collected by the Windows
// amplitude detector; reset to empty when a segment ends.
private windowsSpeechBuffer: number[] = [];
|
||||
// Rolling window of the most recent samples seen while no speech is active,
// capped at windowsPreRollSamples; seeds windowsSpeechBuffer on speech start.
private windowsPreRollBuffer: number[] = [];
|
||||
// True while the Windows detector is inside a speech segment.
private windowsSpeechActive = false;
|
||||
// Count of consecutive idle-state frames at/above the start threshold.
private windowsSpeechCandidateFrames = 0;
|
||||
// Count of consecutive active-state frames below the continue threshold.
private windowsSilenceFrames = 0;
|
||||
|
||||
/**
 * Builds a session around the supplied options and sizes the conversation
 * memory from the configured maximum number of turns.
 *
 * @param options Session options; retained as `this.options`.
 */
constructor(private readonly options: LocalVoiceSessionOptions) {
  const { MAX_CONVERSATION_TURNS: maxTurns } = options.config;
  this.memory = new ConversationMemory(maxTurns);
}
|
||||
|
||||
async start(): Promise<void> {
|
||||
if (process.platform !== "win32") {
|
||||
this.vad = await RealTimeVAD.new({
|
||||
model: "v5",
|
||||
sampleRate: 16000,
|
||||
@@ -74,6 +87,9 @@ export class LocalVoiceSession {
|
||||
void this.handleSpeechEnd(audio);
|
||||
},
|
||||
});
|
||||
} else {
|
||||
this.options.logger.info("Windows local mode uses amplitude-based speech detection");
|
||||
}
|
||||
|
||||
this.recorder = this.spawnRecorder();
|
||||
this.recorderStartedAt = Date.now();
|
||||
@@ -212,7 +228,7 @@ export class LocalVoiceSession {
|
||||
}
|
||||
|
||||
private pushPcm16Chunk(chunk: Buffer): void {
|
||||
if (this.destroyed || !this.vad) {
|
||||
if (this.destroyed) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -232,6 +248,15 @@ export class LocalVoiceSession {
|
||||
this.lastNonSilentAudioAt = Date.now();
|
||||
}
|
||||
|
||||
if (process.platform === "win32") {
|
||||
this.processWindowsSpeechFrames();
|
||||
return;
|
||||
}
|
||||
|
||||
if (!this.vad) {
|
||||
return;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
const frame = takeFrame(this.pendingSamples, 1536);
|
||||
if (!frame) {
|
||||
@@ -247,6 +272,70 @@ export class LocalVoiceSession {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Amplitude-based speech segmentation used on the Windows code path.
 *
 * Drains `pendingSamples` in frames of `windowsFrameSamples` samples and runs a
 * two-state detector keyed on each frame's peak absolute amplitude:
 *
 * - Idle: every frame is appended to a capped pre-roll buffer. Once
 *   `windowsSpeechStartFrames` consecutive frames reach
 *   `windowsSpeechStartThreshold`, speech becomes active, playback is
 *   interrupted (barge-in), and the pre-roll seeds the speech buffer.
 * - Active: frames are appended to the speech buffer. After
 *   `windowsSpeechEndFrames` consecutive frames below
 *   `windowsSpeechContinueThreshold`, the segment is closed and, unless it is
 *   shorter than `windowsMinSpeechSamples`, handed to `handleSpeechEnd`.
 *
 * Returns when fewer than one full frame of samples remains pending.
 */
private processWindowsSpeechFrames(): void {
  while (true) {
    const frame = takeFrame(this.pendingSamples, this.windowsFrameSamples);
    if (!frame) {
      return;
    }

    // Peak absolute amplitude of this frame.
    let peak = 0;
    for (const sample of frame) {
      const abs = Math.abs(sample);
      if (abs > peak) {
        peak = abs;
      }
    }

    if (!this.windowsSpeechActive) {
      this.appendWithCap(this.windowsPreRollBuffer, frame, this.windowsPreRollSamples);

      if (peak >= this.windowsSpeechStartThreshold) {
        this.windowsSpeechCandidateFrames += 1;
      } else {
        this.windowsSpeechCandidateFrames = 0;
      }

      if (this.windowsSpeechCandidateFrames < this.windowsSpeechStartFrames) {
        continue;
      }

      this.windowsSpeechActive = true;
      this.windowsSilenceFrames = 0;
      // The pre-roll already ends with the current frame (appended above), so
      // it becomes the speech buffer as-is. The frame must NOT be pushed again
      // below, otherwise the onset frame is duplicated in the emitted audio.
      this.windowsSpeechBuffer = this.windowsPreRollBuffer;
      this.windowsPreRollBuffer = [];
      this.interruptPlayback("local-barge-in");
      this.options.logger.debug("Windows speech start detected", { peak });
    } else {
      this.windowsSpeechBuffer.push(...frame);
    }

    // Hysteresis: a lower threshold keeps speech active than was needed to start it.
    if (peak >= this.windowsSpeechContinueThreshold) {
      this.windowsSilenceFrames = 0;
    } else {
      this.windowsSilenceFrames += 1;
    }

    if (this.windowsSilenceFrames < this.windowsSpeechEndFrames) {
      continue;
    }

    // Enough trailing silence: close the segment and reset detector state
    // before dispatching, so subsequent frames start from idle.
    const speech = Int16Array.from(this.windowsSpeechBuffer);
    this.windowsSpeechActive = false;
    this.windowsSpeechBuffer = [];
    this.windowsSilenceFrames = 0;
    this.windowsSpeechCandidateFrames = 0;

    if (speech.length < this.windowsMinSpeechSamples) {
      this.options.logger.debug("Ignored short Windows speech segment", { samples: speech.length });
      continue;
    }

    this.options.logger.debug("Windows speech end detected", { samples: speech.length });
    // Fire-and-forget, matching how the VAD path dispatches completed segments.
    void this.handleSpeechEnd(int16ArrayToFloat32(speech));
  }
}
|
||||
|
||||
private async handleSpeechEnd(audio: Float32Array): Promise<void> {
|
||||
if (audio.length < 16000 * 0.25) {
|
||||
this.options.logger.debug("Ignored short local speech segment", { samples: audio.length });
|
||||
@@ -537,6 +626,13 @@ export class LocalVoiceSession {
|
||||
}
|
||||
return this.options.config.LOCAL_AUDIO_SINK ?? "default";
|
||||
}
|
||||
|
||||
/**
 * Appends `samples` to `target` in place, then drops the oldest entries so
 * that `target` holds at most `cap` elements.
 *
 * @param target Array mutated in place.
 * @param samples Samples to append to the end of `target`.
 * @param cap Maximum length `target` may have on return.
 */
private appendWithCap(target: number[], samples: Int16Array, cap: number): void {
  target.push(...samples);

  const overflow = target.length - cap;
  if (overflow > 0) {
    // Trim from the front so the newest samples are the ones retained.
    target.splice(0, overflow);
  }
}
|
||||
}
|
||||
|
||||
function createWaveFileBuffer(
|
||||
|
||||
Reference in New Issue
Block a user