diff --git a/src/audio/local-voice-session.ts b/src/audio/local-voice-session.ts
index c81965d..b010f85 100644
--- a/src/audio/local-voice-session.ts
+++ b/src/audio/local-voice-session.ts
@@ -34,6 +34,13 @@ export class LocalVoiceSession {
   private readonly queue: SpeechJob[] = [];
   private readonly pendingSamples: number[] = [];
   private readonly silenceThreshold = 900;
+  private readonly windowsFrameSamples = 320;
+  private readonly windowsPreRollSamples = 3_200;
+  private readonly windowsSpeechStartThreshold = 520;
+  private readonly windowsSpeechContinueThreshold = 260;
+  private readonly windowsSpeechStartFrames = 2;
+  private readonly windowsSpeechEndFrames = 18;
+  private readonly windowsMinSpeechSamples = 7_200;
 
   private vad: RealTimeVAD | null = null;
   private recorder: ChildProcessByStdio | null = null;
@@ -49,31 +56,40 @@ export class LocalVoiceSession {
   private lastNonSilentAudioAt = 0;
   private warnedNoPcm = false;
   private warnedSilent = false;
+  private windowsSpeechBuffer: number[] = [];
+  private windowsPreRollBuffer: number[] = [];
+  private windowsSpeechActive = false;
+  private windowsSpeechCandidateFrames = 0;
+  private windowsSilenceFrames = 0;
 
   constructor(private readonly options: LocalVoiceSessionOptions) {
     this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
   }
 
   async start(): Promise {
-    this.vad = await RealTimeVAD.new({
-      model: "v5",
-      sampleRate: 16000,
-      frameSamples: 1536,
-      positiveSpeechThreshold: 0.55,
-      negativeSpeechThreshold: 0.35,
-      redemptionFrames: 8,
-      preSpeechPadFrames: 2,
-      minSpeechFrames: 3,
-      onFrameProcessed: () => undefined,
-      onVADMisfire: () => undefined,
-      onSpeechStart: () => {
-        this.interruptPlayback("local-barge-in");
-      },
-      onSpeechRealStart: () => undefined,
-      onSpeechEnd: (audio: Float32Array) => {
-        void this.handleSpeechEnd(audio);
-      },
-    });
+    if (process.platform !== "win32") {
+      this.vad = await RealTimeVAD.new({
+        model: "v5",
+        sampleRate: 16000,
+        frameSamples: 1536,
+        positiveSpeechThreshold: 0.55,
+        negativeSpeechThreshold: 0.35,
+        redemptionFrames: 8,
+        preSpeechPadFrames: 2,
+        minSpeechFrames: 3,
+        onFrameProcessed: () => undefined,
+        onVADMisfire: () => undefined,
+        onSpeechStart: () => {
+          this.interruptPlayback("local-barge-in");
+        },
+        onSpeechRealStart: () => undefined,
+        onSpeechEnd: (audio: Float32Array) => {
+          void this.handleSpeechEnd(audio);
+        },
+      });
+    } else {
+      this.options.logger.info("Windows local mode uses amplitude-based speech detection");
+    }
 
     this.recorder = this.spawnRecorder();
     this.recorderStartedAt = Date.now();
@@ -212,7 +228,7 @@ export class LocalVoiceSession {
   }
 
   private pushPcm16Chunk(chunk: Buffer): void {
-    if (this.destroyed || !this.vad) {
+    if (this.destroyed) {
       return;
     }
 
@@ -232,6 +248,15 @@ export class LocalVoiceSession {
       this.lastNonSilentAudioAt = Date.now();
     }
 
+    if (process.platform === "win32") {
+      this.processWindowsSpeechFrames();
+      return;
+    }
+
+    if (!this.vad) {
+      return;
+    }
+
     while (true) {
       const frame = takeFrame(this.pendingSamples, 1536);
       if (!frame) {
@@ -247,6 +272,86 @@ export class LocalVoiceSession {
     }
   }
 
+  /**
+   * Amplitude-based speech segmentation used on Windows in place of the VAD.
+   *
+   * Pulls frames of `windowsFrameSamples` samples from `pendingSamples` via
+   * `takeFrame` until none is available, classifying each frame by its peak
+   * absolute sample value. A segment starts after `windowsSpeechStartFrames`
+   * consecutive frames at or above the start threshold and ends after
+   * `windowsSpeechEndFrames` consecutive frames below the continue threshold;
+   * a finished segment that is long enough is handed to `handleSpeechEnd`.
+   */
+  private processWindowsSpeechFrames(): void {
+    while (true) {
+      const frame = takeFrame(this.pendingSamples, this.windowsFrameSamples);
+      if (!frame) {
+        return;
+      }
+
+      let peak = 0;
+      for (const sample of frame) {
+        const abs = Math.abs(sample);
+        if (abs > peak) {
+          peak = abs;
+        }
+      }
+
+      if (!this.windowsSpeechActive) {
+        // Rolling pre-roll: audio from just before the trigger is kept for the segment.
+        this.appendWithCap(this.windowsPreRollBuffer, frame, this.windowsPreRollSamples);
+
+        if (peak >= this.windowsSpeechStartThreshold) {
+          this.windowsSpeechCandidateFrames += 1;
+        } else {
+          this.windowsSpeechCandidateFrames = 0;
+        }
+
+        if (this.windowsSpeechCandidateFrames >= this.windowsSpeechStartFrames) {
+          this.windowsSpeechActive = true;
+          this.windowsSilenceFrames = 0;
+          this.windowsSpeechBuffer = [...this.windowsPreRollBuffer];
+          this.windowsPreRollBuffer = [];
+          this.interruptPlayback("local-barge-in");
+          this.options.logger.debug("Windows speech start detected", { peak });
+        }
+
+        // The current frame is already part of the pre-roll (and therefore of the
+        // speech buffer when a segment just started), so never fall through to
+        // the append below; doing so would duplicate the triggering frame.
+        continue;
+      }
+
+      this.windowsSpeechBuffer.push(...frame);
+
+      if (peak >= this.windowsSpeechContinueThreshold) {
+        this.windowsSilenceFrames = 0;
+      } else {
+        this.windowsSilenceFrames += 1;
+      }
+
+      if (this.windowsSilenceFrames < this.windowsSpeechEndFrames) {
+        continue;
+      }
+
+      const speech = Int16Array.from(this.windowsSpeechBuffer);
+      this.windowsSpeechActive = false;
+      this.windowsSpeechBuffer = [];
+      this.windowsSilenceFrames = 0;
+      this.windowsSpeechCandidateFrames = 0;
+
+      // NOTE(review): the length below includes pre-roll and the trailing
+      // low-amplitude frames, so this is looser than a speech-duration check.
+      if (speech.length < this.windowsMinSpeechSamples) {
+        this.options.logger.debug("Ignored short Windows speech segment", { samples: speech.length });
+        continue;
+      }
+
+      this.options.logger.debug("Windows speech end detected", { samples: speech.length });
+      void this.handleSpeechEnd(int16ArrayToFloat32(speech));
+    }
+  }
+
   private async handleSpeechEnd(audio: Float32Array): Promise {
     if (audio.length < 16000 * 0.25) {
       this.options.logger.debug("Ignored short local speech segment", { samples: audio.length });
@@ -537,6 +642,17 @@ export class LocalVoiceSession {
     }
     return this.options.config.LOCAL_AUDIO_SINK ?? "default";
   }
+
+  /**
+   * Appends `samples` to `target`, then drops the oldest entries so that
+   * `target` holds at most `cap` values.
+   */
+  private appendWithCap(target: number[], samples: Int16Array, cap: number): void {
+    target.push(...samples);
+    if (target.length > cap) {
+      target.splice(0, target.length - cap);
+    }
+  }
 }
 
 function createWaveFileBuffer(