Use simpler speech detection on Windows local mode

This commit is contained in:
2026-04-30 17:50:03 +09:00
parent e74f71e45b
commit ab4e0b38b0

View File

@@ -34,6 +34,13 @@ export class LocalVoiceSession {
// Speech jobs waiting to be handled.
private readonly queue: SpeechJob[] = [];
// PCM samples received from the recorder that have not yet been cut into frames.
private readonly pendingSamples: number[] = [];
// NOTE(review): presumably the amplitude below which a chunk counts as silent — usage is not visible here; confirm.
private readonly silenceThreshold = 900;
// --- Tuning for the amplitude-based speech detector used on Windows ---
// Samples per analysis frame (20 ms if capture is 16 kHz — TODO confirm sample rate).
private readonly windowsFrameSamples = 320;
// Maximum number of samples retained before speech start so the onset is not clipped.
private readonly windowsPreRollSamples = 3_200;
// Peak absolute sample value a frame must reach to count towards speech start.
private readonly windowsSpeechStartThreshold = 520;
// Peak absolute sample value a frame must reach to keep an active segment from counting as silence.
private readonly windowsSpeechContinueThreshold = 260;
// Consecutive frames at or above the start threshold required to open a segment.
private readonly windowsSpeechStartFrames = 2;
// Consecutive frames below the continue threshold required to close a segment.
private readonly windowsSpeechEndFrames = 18;
// Segments shorter than this many samples are discarded as noise.
private readonly windowsMinSpeechSamples = 7_200;
// Model-based VAD; only created in start() when not running on Windows, otherwise stays null.
private vad: RealTimeVAD | null = null;
// Recorder child process, assigned from spawnRecorder() in start().
private recorder: ChildProcessByStdio<null, Readable, Readable> | null = null;
@@ -49,12 +56,18 @@ export class LocalVoiceSession {
// Date.now() timestamp, refreshed in pushPcm16Chunk.
// NOTE(review): presumably only when the chunk is not silent — the guarding condition is not visible here.
private lastNonSilentAudioAt = 0;
// NOTE(review): presumably one-shot flags so each diagnostic warning is logged once — confirm at usage site.
private warnedNoPcm = false;
private warnedSilent = false;
// --- Mutable state of the Windows amplitude-based speech detector ---
// Samples of the segment currently being captured (pre-roll plus frames since speech start).
private windowsSpeechBuffer: number[] = [];
// Most recent samples seen while idle, capped at windowsPreRollSamples.
private windowsPreRollBuffer: number[] = [];
// True while a speech segment is open.
private windowsSpeechActive = false;
// Consecutive idle frames whose peak reached windowsSpeechStartThreshold.
private windowsSpeechCandidateFrames = 0;
// Consecutive active frames whose peak stayed below windowsSpeechContinueThreshold.
private windowsSilenceFrames = 0;
/**
 * Creates a session bound to the given options and initialises the
 * conversation memory with the configured turn limit.
 *
 * @param options Session options; `options.config.MAX_CONVERSATION_TURNS`
 *   is passed to the conversation memory.
 */
constructor(private readonly options: LocalVoiceSessionOptions) {
  const { MAX_CONVERSATION_TURNS: maxTurns } = options.config;
  this.memory = new ConversationMemory(maxTurns);
}
async start(): Promise<void> {
if (process.platform !== "win32") {
this.vad = await RealTimeVAD.new({
model: "v5",
sampleRate: 16000,
@@ -74,6 +87,9 @@ export class LocalVoiceSession {
void this.handleSpeechEnd(audio);
},
});
} else {
this.options.logger.info("Windows local mode uses amplitude-based speech detection");
}
this.recorder = this.spawnRecorder();
this.recorderStartedAt = Date.now();
@@ -212,7 +228,7 @@ export class LocalVoiceSession {
}
private pushPcm16Chunk(chunk: Buffer): void {
if (this.destroyed || !this.vad) {
if (this.destroyed) {
return;
}
@@ -232,6 +248,15 @@ export class LocalVoiceSession {
this.lastNonSilentAudioAt = Date.now();
}
if (process.platform === "win32") {
this.processWindowsSpeechFrames();
return;
}
if (!this.vad) {
return;
}
while (true) {
const frame = takeFrame(this.pendingSamples, 1536);
if (!frame) {
@@ -247,6 +272,70 @@ export class LocalVoiceSession {
}
}
/**
 * Amplitude-based speech segmentation used on Windows.
 *
 * Drains `pendingSamples` in frames of `windowsFrameSamples` samples and runs
 * a small two-state machine on the peak absolute amplitude of each frame:
 *
 * - Idle: frames are kept in a capped pre-roll buffer. Once
 *   `windowsSpeechStartFrames` consecutive frames reach
 *   `windowsSpeechStartThreshold`, a segment is opened, seeded with the
 *   pre-roll, and current playback is interrupted (barge-in).
 * - Active: frames are appended to the segment. After
 *   `windowsSpeechEndFrames` consecutive frames below
 *   `windowsSpeechContinueThreshold` the segment is closed and, unless shorter
 *   than `windowsMinSpeechSamples`, handed to `handleSpeechEnd`.
 *
 * Returns when fewer than one full frame of samples remains pending.
 */
private processWindowsSpeechFrames(): void {
  while (true) {
    const frame = takeFrame(this.pendingSamples, this.windowsFrameSamples);
    if (!frame) {
      return;
    }

    // Peak absolute amplitude of this frame.
    let peak = 0;
    for (const sample of frame) {
      const abs = Math.abs(sample);
      if (abs > peak) {
        peak = abs;
      }
    }

    // Set when the current frame has already reached the speech buffer via
    // the pre-roll, so it must not be appended a second time below.
    let frameAlreadyBuffered = false;

    if (!this.windowsSpeechActive) {
      this.appendWithCap(this.windowsPreRollBuffer, frame, this.windowsPreRollSamples);

      if (peak >= this.windowsSpeechStartThreshold) {
        this.windowsSpeechCandidateFrames += 1;
      } else {
        this.windowsSpeechCandidateFrames = 0;
      }

      if (this.windowsSpeechCandidateFrames < this.windowsSpeechStartFrames) {
        continue;
      }

      // Speech start: seed the segment with the pre-roll so the onset is kept.
      // The pre-roll already ends with the current frame.
      this.windowsSpeechActive = true;
      this.windowsSilenceFrames = 0;
      this.windowsSpeechBuffer = [...this.windowsPreRollBuffer];
      this.windowsPreRollBuffer = [];
      frameAlreadyBuffered = true;
      this.interruptPlayback("local-barge-in");
      this.options.logger.debug("Windows speech start detected", { peak });
    }

    if (!frameAlreadyBuffered) {
      // Append element-wise rather than via spread to stay independent of
      // the engine's argument-count limit.
      for (const sample of frame) {
        this.windowsSpeechBuffer.push(sample);
      }
    }

    if (peak >= this.windowsSpeechContinueThreshold) {
      this.windowsSilenceFrames = 0;
    } else {
      this.windowsSilenceFrames += 1;
    }

    if (this.windowsSilenceFrames < this.windowsSpeechEndFrames) {
      continue;
    }

    // Enough trailing silence: close the segment and reset detector state.
    const speech = Int16Array.from(this.windowsSpeechBuffer);
    this.windowsSpeechActive = false;
    this.windowsSpeechBuffer = [];
    this.windowsSilenceFrames = 0;
    this.windowsSpeechCandidateFrames = 0;

    if (speech.length < this.windowsMinSpeechSamples) {
      this.options.logger.debug("Ignored short Windows speech segment", { samples: speech.length });
      continue;
    }

    this.options.logger.debug("Windows speech end detected", { samples: speech.length });
    // Fire-and-forget: transcription must not block audio ingestion.
    void this.handleSpeechEnd(int16ArrayToFloat32(speech));
  }
}
private async handleSpeechEnd(audio: Float32Array): Promise<void> {
if (audio.length < 16000 * 0.25) {
this.options.logger.debug("Ignored short local speech segment", { samples: audio.length });
@@ -537,6 +626,13 @@ export class LocalVoiceSession {
}
return this.options.config.LOCAL_AUDIO_SINK ?? "default";
}
/**
 * Appends `samples` to `target` in place, then drops the oldest entries so
 * that `target` holds at most `cap` elements.
 *
 * @param target Array to extend; mutated in place.
 * @param samples Samples to append, in order.
 * @param cap Maximum number of elements `target` may hold afterwards.
 */
private appendWithCap(target: number[], samples: Int16Array, cap: number): void {
  // Push element-wise: spreading a large typed array into push() can exceed
  // the engine's argument-count limit and throw a RangeError.
  for (const sample of samples) {
    target.push(sample);
  }

  const overflow = target.length - cap;
  if (overflow > 0) {
    target.splice(0, overflow);
  }
}
}
function createWaveFileBuffer(