Use simpler speech detection on Windows local mode
This commit is contained in:
@@ -34,6 +34,13 @@ export class LocalVoiceSession {
|
|||||||
private readonly queue: SpeechJob[] = [];
|
private readonly queue: SpeechJob[] = [];
|
||||||
private readonly pendingSamples: number[] = [];
|
private readonly pendingSamples: number[] = [];
|
||||||
private readonly silenceThreshold = 900;
|
private readonly silenceThreshold = 900;
|
||||||
|
private readonly windowsFrameSamples = 320;
|
||||||
|
private readonly windowsPreRollSamples = 3_200;
|
||||||
|
private readonly windowsSpeechStartThreshold = 520;
|
||||||
|
private readonly windowsSpeechContinueThreshold = 260;
|
||||||
|
private readonly windowsSpeechStartFrames = 2;
|
||||||
|
private readonly windowsSpeechEndFrames = 18;
|
||||||
|
private readonly windowsMinSpeechSamples = 7_200;
|
||||||
|
|
||||||
private vad: RealTimeVAD | null = null;
|
private vad: RealTimeVAD | null = null;
|
||||||
private recorder: ChildProcessByStdio<null, Readable, Readable> | null = null;
|
private recorder: ChildProcessByStdio<null, Readable, Readable> | null = null;
|
||||||
@@ -49,31 +56,40 @@ export class LocalVoiceSession {
|
|||||||
private lastNonSilentAudioAt = 0;
|
private lastNonSilentAudioAt = 0;
|
||||||
private warnedNoPcm = false;
|
private warnedNoPcm = false;
|
||||||
private warnedSilent = false;
|
private warnedSilent = false;
|
||||||
|
private windowsSpeechBuffer: number[] = [];
|
||||||
|
private windowsPreRollBuffer: number[] = [];
|
||||||
|
private windowsSpeechActive = false;
|
||||||
|
private windowsSpeechCandidateFrames = 0;
|
||||||
|
private windowsSilenceFrames = 0;
|
||||||
|
|
||||||
constructor(private readonly options: LocalVoiceSessionOptions) {
|
constructor(private readonly options: LocalVoiceSessionOptions) {
|
||||||
this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
|
this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
|
||||||
}
|
}
|
||||||
|
|
||||||
async start(): Promise<void> {
|
async start(): Promise<void> {
|
||||||
this.vad = await RealTimeVAD.new({
|
if (process.platform !== "win32") {
|
||||||
model: "v5",
|
this.vad = await RealTimeVAD.new({
|
||||||
sampleRate: 16000,
|
model: "v5",
|
||||||
frameSamples: 1536,
|
sampleRate: 16000,
|
||||||
positiveSpeechThreshold: 0.55,
|
frameSamples: 1536,
|
||||||
negativeSpeechThreshold: 0.35,
|
positiveSpeechThreshold: 0.55,
|
||||||
redemptionFrames: 8,
|
negativeSpeechThreshold: 0.35,
|
||||||
preSpeechPadFrames: 2,
|
redemptionFrames: 8,
|
||||||
minSpeechFrames: 3,
|
preSpeechPadFrames: 2,
|
||||||
onFrameProcessed: () => undefined,
|
minSpeechFrames: 3,
|
||||||
onVADMisfire: () => undefined,
|
onFrameProcessed: () => undefined,
|
||||||
onSpeechStart: () => {
|
onVADMisfire: () => undefined,
|
||||||
this.interruptPlayback("local-barge-in");
|
onSpeechStart: () => {
|
||||||
},
|
this.interruptPlayback("local-barge-in");
|
||||||
onSpeechRealStart: () => undefined,
|
},
|
||||||
onSpeechEnd: (audio: Float32Array) => {
|
onSpeechRealStart: () => undefined,
|
||||||
void this.handleSpeechEnd(audio);
|
onSpeechEnd: (audio: Float32Array) => {
|
||||||
},
|
void this.handleSpeechEnd(audio);
|
||||||
});
|
},
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
this.options.logger.info("Windows local mode uses amplitude-based speech detection");
|
||||||
|
}
|
||||||
|
|
||||||
this.recorder = this.spawnRecorder();
|
this.recorder = this.spawnRecorder();
|
||||||
this.recorderStartedAt = Date.now();
|
this.recorderStartedAt = Date.now();
|
||||||
@@ -212,7 +228,7 @@ export class LocalVoiceSession {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private pushPcm16Chunk(chunk: Buffer): void {
|
private pushPcm16Chunk(chunk: Buffer): void {
|
||||||
if (this.destroyed || !this.vad) {
|
if (this.destroyed) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -232,6 +248,15 @@ export class LocalVoiceSession {
|
|||||||
this.lastNonSilentAudioAt = Date.now();
|
this.lastNonSilentAudioAt = Date.now();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (process.platform === "win32") {
|
||||||
|
this.processWindowsSpeechFrames();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!this.vad) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
const frame = takeFrame(this.pendingSamples, 1536);
|
const frame = takeFrame(this.pendingSamples, 1536);
|
||||||
if (!frame) {
|
if (!frame) {
|
||||||
@@ -247,6 +272,70 @@ export class LocalVoiceSession {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Amplitude-based speech segmentation for the Windows code path.
 *
 * Drains `pendingSamples` in frames of `windowsFrameSamples` and classifies
 * each frame by its peak absolute sample value:
 *
 * - Idle: every frame is kept in a capped pre-roll buffer. Once
 *   `windowsSpeechStartFrames` consecutive frames reach
 *   `windowsSpeechStartThreshold`, playback is interrupted and the pre-roll
 *   becomes the beginning of the speech segment.
 * - Active: every frame is appended to the segment. The segment ends after
 *   `windowsSpeechEndFrames` consecutive frames below
 *   `windowsSpeechContinueThreshold`.
 *
 * A finished segment shorter than `windowsMinSpeechSamples` is dropped;
 * otherwise it is converted with `int16ArrayToFloat32` and passed to
 * `handleSpeechEnd`. Returns when no complete frame is left in
 * `pendingSamples`.
 */
private processWindowsSpeechFrames(): void {
  while (true) {
    const frame = takeFrame(this.pendingSamples, this.windowsFrameSamples);
    if (!frame) {
      return;
    }

    // Peak absolute amplitude of this frame.
    let peak = 0;
    for (const sample of frame) {
      const abs = Math.abs(sample);
      if (abs > peak) {
        peak = abs;
      }
    }

    if (!this.windowsSpeechActive) {
      this.appendWithCap(this.windowsPreRollBuffer, frame, this.windowsPreRollSamples);

      if (peak >= this.windowsSpeechStartThreshold) {
        this.windowsSpeechCandidateFrames += 1;
      } else {
        this.windowsSpeechCandidateFrames = 0;
      }

      if (this.windowsSpeechCandidateFrames >= this.windowsSpeechStartFrames) {
        this.windowsSpeechActive = true;
        this.windowsSilenceFrames = 0;
        // The pre-roll already ends with the current frame (appended above),
        // so it is handed over as the start of the segment as-is.
        this.windowsSpeechBuffer = this.windowsPreRollBuffer;
        this.windowsPreRollBuffer = [];
        this.interruptPlayback("local-barge-in");
        this.options.logger.debug("Windows speech start detected", { peak });
      }

      // Always move on to the next frame here. Falling through after a start
      // transition would append the current frame a second time and duplicate
      // it in the captured audio.
      continue;
    }

    this.windowsSpeechBuffer.push(...frame);

    if (peak >= this.windowsSpeechContinueThreshold) {
      this.windowsSilenceFrames = 0;
    } else {
      this.windowsSilenceFrames += 1;
    }

    if (this.windowsSilenceFrames < this.windowsSpeechEndFrames) {
      continue;
    }

    // Enough consecutive quiet frames: close the segment and reset the detector.
    const speech = Int16Array.from(this.windowsSpeechBuffer);
    this.windowsSpeechActive = false;
    this.windowsSpeechBuffer = [];
    this.windowsSilenceFrames = 0;
    this.windowsSpeechCandidateFrames = 0;

    // NOTE(review): `speech.length` counts the pre-roll and the trailing quiet
    // frames too. With a full pre-roll (3_200) plus 18 end frames of 320
    // samples, a segment is already 8_960 samples, so this 7_200 minimum looks
    // like it can only reject segments that start before the pre-roll has
    // filled. Confirm whether the limit should apply to voiced samples only.
    if (speech.length < this.windowsMinSpeechSamples) {
      this.options.logger.debug("Ignored short Windows speech segment", { samples: speech.length });
      continue;
    }

    this.options.logger.debug("Windows speech end detected", { samples: speech.length });
    // Fire-and-forget: the returned promise is intentionally not awaited.
    void this.handleSpeechEnd(int16ArrayToFloat32(speech));
  }
}
|
||||||
|
|
||||||
private async handleSpeechEnd(audio: Float32Array): Promise<void> {
|
private async handleSpeechEnd(audio: Float32Array): Promise<void> {
|
||||||
if (audio.length < 16000 * 0.25) {
|
if (audio.length < 16000 * 0.25) {
|
||||||
this.options.logger.debug("Ignored short local speech segment", { samples: audio.length });
|
this.options.logger.debug("Ignored short local speech segment", { samples: audio.length });
|
||||||
@@ -537,6 +626,13 @@ export class LocalVoiceSession {
|
|||||||
}
|
}
|
||||||
return this.options.config.LOCAL_AUDIO_SINK ?? "default";
|
return this.options.config.LOCAL_AUDIO_SINK ?? "default";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Appends `samples` to the end of `target`, then removes entries from the
 * front so that `target` holds at most `cap` values.
 *
 * @param target Array that is extended and trimmed in place.
 * @param samples Values to append, in order.
 * @param cap Maximum length of `target` after the call.
 */
private appendWithCap(target: number[], samples: Int16Array, cap: number): void {
  // Push element by element rather than `target.push(...samples)`: spreading
  // passes every sample as a call argument and can throw a RangeError when
  // `samples` is large.
  for (const sample of samples) {
    target.push(sample);
  }

  const overflow = target.length - cap;
  if (overflow > 0) {
    target.splice(0, overflow);
  }
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function createWaveFileBuffer(
|
function createWaveFileBuffer(
|
||||||
|
|||||||
Reference in New Issue
Block a user