feat: add local audio test mode
This commit is contained in:
339
src/audio/local-voice-session.ts
Normal file
339
src/audio/local-voice-session.ts
Normal file
@@ -0,0 +1,339 @@
|
||||
import { spawn, type ChildProcessByStdio } from "node:child_process";
|
||||
import { once } from "node:events";
|
||||
import type { Readable, Writable } from "node:stream";
|
||||
|
||||
import { RealTimeVAD } from "avr-vad";
|
||||
|
||||
import type { AssistantRuntimeConfig } from "../config.js";
|
||||
import { Logger } from "../logger.js";
|
||||
import { takeFrame, int16ArrayToFloat32, float32ToPcm16Buffer } from "./pcm.js";
|
||||
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
|
||||
import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
|
||||
import { OpenAiLlmService } from "../services/openai-llm.js";
|
||||
|
||||
/**
 * Collaborators required to construct a {@link LocalVoiceSession}.
 *
 * The session only reads these values; it never replaces them.
 */
interface LocalVoiceSessionOptions {
  /** Runtime configuration (conversation length, local audio source/sink, debug flags). */
  config: AssistantRuntimeConfig;
  /** Logger used for diagnostics from the session and its child processes. */
  logger: Logger;
  /** Speech-to-text service used to transcribe each detected utterance. */
  stt: ElevenLabsSttService;
  /** Text-to-speech service used to prepare playable audio for replies. */
  tts: ElevenLabsTtsService;
  /** Language-model service used to generate the assistant's reply. */
  llm: OpenAiLlmService;
}
|
||||
|
||||
/** One unit of text waiting to be synthesized and played on the local sink. */
interface SpeechJob {
  /** Text to synthesize. */
  text: string;
  /** Who produced the text: an LLM reply ("assistant") or a direct `speakText` call ("manual"). */
  source: "assistant" | "manual";
}
|
||||
|
||||
/**
 * Voice-assistant session that runs against the local machine's audio devices.
 *
 * Microphone audio is captured by a `pw-record` child process, split into
 * fixed-size frames and fed to a voice-activity detector (VAD). Each detected
 * utterance is transcribed (STT), answered (LLM), synthesized (TTS) and played
 * through a `pw-play` child process. Speech detected while audio is playing
 * interrupts playback (barge-in).
 */
export class LocalVoiceSession {
  /** Sample rate (Hz) requested from `pw-record` and given to the VAD. */
  private static readonly SAMPLE_RATE = 16000;
  /** Number of samples passed to the VAD per `processAudio` call. */
  private static readonly FRAME_SAMPLES = 1536;
  /** Utterances shorter than this many samples (0.25 s) are ignored. */
  private static readonly MIN_UTTERANCE_SAMPLES = LocalVoiceSession.SAMPLE_RATE * 0.25;

  private readonly memory: ConversationMemory;
  /** Speech jobs waiting to be synthesized and played, in FIFO order. */
  private readonly queue: SpeechJob[] = [];
  /** Decoded 16-bit samples that have not yet filled a complete VAD frame. */
  private readonly pendingSamples: number[] = [];

  /** Trailing odd byte of the previous recorder chunk, kept so samples stay aligned. */
  private danglingByte: Buffer | null = null;
  private vad: RealTimeVAD | null = null;
  private recorder: ChildProcessByStdio<null, Readable, Readable> | null = null;
  private currentPlayer: ChildProcessByStdio<Writable, null, Readable> | null = null;
  private currentAbortController: AbortController | null = null;
  private currentPlayback: PreparedSpeechAudio | null = null;
  /** Promise chain that serializes VAD frame processing; it never rejects. */
  private processing: Promise<void> = Promise.resolve();
  private draining = false;
  private destroyed = false;

  /**
   * @param options Configuration and service collaborators used by the session.
   */
  constructor(private readonly options: LocalVoiceSessionOptions) {
    this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
  }

  /**
   * Creates the VAD and starts capturing microphone audio with `pw-record`.
   *
   * @throws If the VAD cannot be created. A recorder that fails to spawn is
   *   reported through the logger instead of throwing.
   */
  async start(): Promise<void> {
    this.vad = await RealTimeVAD.new({
      model: "v5",
      sampleRate: LocalVoiceSession.SAMPLE_RATE,
      frameSamples: LocalVoiceSession.FRAME_SAMPLES,
      positiveSpeechThreshold: 0.55,
      negativeSpeechThreshold: 0.35,
      redemptionFrames: 8,
      preSpeechPadFrames: 2,
      minSpeechFrames: 3,
      onFrameProcessed: () => undefined,
      onVADMisfire: () => undefined,
      onSpeechStart: () => {
        // Barge-in: stop whatever is being spoken as soon as speech is detected.
        this.interruptPlayback("local-barge-in");
      },
      onSpeechRealStart: () => undefined,
      onSpeechEnd: (audio: Float32Array) => {
        // Fire-and-forget, but never leave a rejection unhandled.
        void this.handleSpeechEnd(audio).catch((error: unknown) => {
          this.options.logger.warn("Local speech handling failed", error);
        });
      },
    });

    const recorder = this.spawnRecorder();
    this.recorder = recorder;

    // Without an "error" listener a spawn failure (e.g. pw-record not
    // installed) is an unhandled 'error' event and terminates the process.
    recorder.on("error", (error) => {
      this.options.logger.warn("pw-record failed", error);
    });
    recorder.stdout.on("data", (chunk: Buffer) => {
      this.pushPcm16Chunk(chunk);
    });
    recorder.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.options.logger.debug("[pw-record]", text);
      }
    });
    recorder.on("exit", (code, signal) => {
      if (!this.destroyed) {
        this.options.logger.warn("pw-record exited unexpectedly", { code, signal });
      }
    });
  }

  /**
   * Stops playback, terminates the recorder and releases the VAD.
   *
   * Safe to call when the recorder has already exited or never started.
   */
  async destroy(): Promise<void> {
    this.destroyed = true;
    this.interruptPlayback("local-shutdown");

    const recorder = this.recorder;
    this.recorder = null;
    // `killed` only says whether kill() was called, not whether the process is
    // still alive. Waiting for "exit" on a process that already exited would
    // hang forever, so check the exit status instead.
    if (recorder && recorder.exitCode === null && recorder.signalCode === null) {
      const exited = once(recorder, "exit").catch(() => null);
      if (recorder.kill("SIGTERM")) {
        await exited;
      }
    }

    this.pendingSamples.length = 0;
    this.danglingByte = null;

    const vad = this.vad;
    // Detach first so frames still queued on `processing` become no-ops.
    this.vad = null;
    if (vad) {
      // Let a frame that is already being processed finish before teardown.
      await this.processing;
      await vad.destroy().catch((error: unknown) => {
        this.options.logger.warn("Local VAD destroy failed", error);
      });
    }
  }

  /** Forgets the conversation history and stops any queued or active speech. */
  clearConversation(): void {
    this.memory.clear();
    this.interruptPlayback("local-reset");
  }

  /**
   * Queues `text` for synthesis and playback.
   *
   * Resolves when the queue has drained, or immediately if another caller is
   * already draining it (that caller will play this job too).
   *
   * @param text Text to speak.
   */
  async speakText(text: string): Promise<void> {
    this.queue.push({
      text,
      source: "manual",
    });
    await this.drainQueue();
  }

  /**
   * @returns A multi-line, human-readable summary of the session state.
   */
  statusSummary(): string {
    return [
      "모드: local",
      `입력 source: ${this.options.config.LOCAL_AUDIO_SOURCE ?? "default"}`,
      `출력 sink: ${this.options.config.LOCAL_AUDIO_SINK ?? "default"}`,
      `대기열: ${this.queue.length}`,
      `최근 대화 턴: ${this.memory.recentTurns().length}`,
    ].join("\n");
  }

  /**
   * Spawns `pw-record` writing raw mono signed 16-bit PCM to its stdout.
   *
   * @returns The recorder child process (stdout = audio, stderr = diagnostics).
   */
  private spawnRecorder(): ChildProcessByStdio<null, Readable, Readable> {
    const args = [
      "--rate",
      String(LocalVoiceSession.SAMPLE_RATE),
      "--channels",
      "1",
      "--format",
      "s16",
      "--raw",
    ];

    if (this.options.config.LOCAL_AUDIO_SOURCE) {
      args.push("--target", this.options.config.LOCAL_AUDIO_SOURCE);
    }

    // "-" makes pw-record write to stdout instead of a file.
    args.push("-");

    this.options.logger.info("Starting local recorder", {
      source: this.options.config.LOCAL_AUDIO_SOURCE ?? "default",
    });

    return spawn("pw-record", args, {
      stdio: ["ignore", "pipe", "pipe"],
    });
  }

  /**
   * Decodes a chunk of little-endian 16-bit PCM and feeds every complete frame
   * to the VAD, in order.
   *
   * Pipe reads are not guaranteed to end on a sample boundary, so an odd
   * trailing byte is carried over and prepended to the next chunk.
   *
   * @param chunk Raw bytes read from the recorder's stdout.
   */
  private pushPcm16Chunk(chunk: Buffer): void {
    if (this.destroyed || !this.vad) {
      return;
    }

    const bytes = this.danglingByte ? Buffer.concat([this.danglingByte, chunk]) : chunk;
    const evenLength = bytes.length - (bytes.length % 2);

    for (let offset = 0; offset < evenLength; offset += 2) {
      this.pendingSamples.push(bytes.readInt16LE(offset));
    }

    // Copy the leftover byte so the (possibly large) chunk is not retained.
    this.danglingByte = evenLength < bytes.length ? Buffer.from(bytes.subarray(evenLength)) : null;

    while (true) {
      const frame = takeFrame(this.pendingSamples, LocalVoiceSession.FRAME_SAMPLES);
      if (!frame) {
        return;
      }

      const floatFrame = int16ArrayToFloat32(frame);
      // Chain onto the previous frame so the VAD sees frames strictly in order.
      this.processing = this.processing
        .then(async () => {
          await this.vad?.processAudio(floatFrame);
        })
        .catch((error: unknown) => {
          this.options.logger.warn("Local VAD processing failed", error);
        });
    }
  }

  /**
   * Runs one detected utterance through STT → LLM → TTS playback.
   *
   * Too-short utterances and empty transcripts are ignored. An LLM failure is
   * answered with a spoken apology rather than silence.
   *
   * @param audio Utterance samples as delivered by the VAD's `onSpeechEnd`.
   */
  private async handleSpeechEnd(audio: Float32Array): Promise<void> {
    if (this.destroyed || audio.length < LocalVoiceSession.MIN_UTTERANCE_SAMPLES) {
      return;
    }

    let transcript: string | null;
    try {
      transcript = await this.options.stt.transcribePcm16(float32ToPcm16Buffer(audio));
    } catch (error) {
      this.options.logger.warn("Local STT failed", error);
      return;
    }

    const text = transcript?.trim() ?? "";
    // The session may have been destroyed while the transcription was pending.
    if (text.length === 0 || this.destroyed) {
      return;
    }

    const utterance: UserUtterance = {
      speakerId: "local-user",
      speakerName: this.options.config.LOCAL_SPEAKER_NAME,
      text,
    };

    this.memory.addUserTurn(utterance);
    this.options.logger.info("Local transcript", utterance.text);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      console.log(`\n[you] ${utterance.text}`);
    }

    let reply: string;
    try {
      reply = await this.options.llm.generateReply(this.memory, utterance);
    } catch (error) {
      this.options.logger.warn("Local LLM failed", error);
      reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요.";
    }

    if (this.destroyed) {
      return;
    }

    this.memory.addAssistantTurn(reply);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      console.log(`[bot] ${reply}\n`);
    }

    this.queue.push({
      text: reply,
      source: "assistant",
    });
    await this.drainQueue();
  }

  /**
   * Drops all queued speech and stops the job that is currently being
   * synthesized or played.
   *
   * @param reason Short tag recorded in the log when something was interrupted.
   */
  private interruptPlayback(reason: string): void {
    if (this.queue.length > 0 || this.currentPlayer) {
      this.options.logger.info("Interrupting local playback", reason);
    }

    this.queue.length = 0;

    this.currentAbortController?.abort();
    this.currentAbortController = null;

    this.currentPlayback?.dispose();
    this.currentPlayback = null;

    if (this.currentPlayer && !this.currentPlayer.killed) {
      this.currentPlayer.kill("SIGKILL");
    }
    this.currentPlayer = null;
  }

  /**
   * Plays queued jobs one at a time until the queue is empty.
   *
   * Only one drain loop runs at a time; concurrent callers return immediately
   * and their jobs are picked up by the running loop.
   */
  private async drainQueue(): Promise<void> {
    if (this.draining || this.destroyed) {
      return;
    }

    this.draining = true;
    try {
      while (this.queue.length > 0 && !this.destroyed) {
        const job = this.queue.shift();
        if (job) {
          await this.playJob(job);
        }
      }
    } finally {
      this.draining = false;
    }
  }

  /**
   * Synthesizes and plays a single job. Failures are logged, never thrown,
   * unless they were caused by an interruption, in which case they are silent.
   */
  private async playJob(job: SpeechJob): Promise<void> {
    const abortController = new AbortController();
    const { signal } = abortController;
    this.currentAbortController = abortController;

    try {
      let playback: PreparedSpeechAudio;
      try {
        playback = await this.options.tts.preparePlayback(job.text, signal);
      } catch (error) {
        if (!signal.aborted) {
          this.options.logger.warn("Local TTS synthesis failed", error);
        }
        return;
      }

      // Interrupted while synthesizing but the TTS call still resolved:
      // release the audio instead of playing something the user cut off.
      if (signal.aborted) {
        playback.dispose();
        return;
      }

      this.currentPlayback = playback;
      try {
        await this.playToSink(playback, signal);
      } catch (error) {
        if (!signal.aborted) {
          this.options.logger.warn("Local playback failed", error);
        }
      } finally {
        // interruptPlayback() may already have disposed and cleared it.
        if (this.currentPlayback === playback) {
          playback.dispose();
          this.currentPlayback = null;
        }
      }
    } finally {
      if (this.currentAbortController === abortController) {
        this.currentAbortController = null;
      }
    }
  }

  /**
   * Streams prepared audio into a `pw-play` child process and waits for it to
   * exit.
   *
   * @param playback Prepared audio whose `stream` is piped to the player.
   * @param signal Aborting it destroys the stream and kills the player.
   * @throws If the player cannot be spawned or exits with a non-zero code
   *   without having been aborted.
   */
  private async playToSink(playback: PreparedSpeechAudio, signal: AbortSignal): Promise<void> {
    // An "abort" listener added to an already-aborted signal never fires.
    if (signal.aborted) {
      return;
    }

    const args = [
      "--rate",
      "48000",
      "--channels",
      "2",
      "--format",
      "s16",
      "--raw",
    ];

    if (this.options.config.LOCAL_AUDIO_SINK) {
      args.push("--target", this.options.config.LOCAL_AUDIO_SINK);
    }

    // "-" makes pw-play read from stdin instead of a file.
    args.push("-");

    const player = spawn("pw-play", args, {
      stdio: ["pipe", "ignore", "pipe"],
    });
    this.currentPlayer = player;

    const onAbort = (): void => {
      playback.stream.destroy();
      if (!player.killed) {
        player.kill("SIGKILL");
      }
    };

    try {
      player.stderr.on("data", (chunk: Buffer) => {
        const text = chunk.toString().trim();
        if (text.length > 0) {
          this.options.logger.debug("[pw-play]", text);
        }
      });

      // Killing the player while data is still being written surfaces as an
      // 'error' (EPIPE) on stdin; pipe() does not handle it, and an unhandled
      // 'error' event would crash the process.
      player.stdin.on("error", (error: Error) => {
        if (!signal.aborted) {
          this.options.logger.debug("[pw-play] stdin error", error);
        }
      });

      // pipe() does not end the destination when the source fails, which would
      // leave pw-play waiting for input forever.
      // NOTE(review): assumes `playback.stream` is a Node Readable — confirm.
      playback.stream.once("error", (error: unknown) => {
        if (!signal.aborted) {
          this.options.logger.warn("Local playback stream failed", error);
        }
        player.stdin.end();
      });

      signal.addEventListener("abort", onAbort, { once: true });
      playback.stream.pipe(player.stdin);

      // Rejects if the child emits 'error' (e.g. pw-play is not installed).
      const [code, playSignal] = (await once(player, "exit")) as [number | null, NodeJS.Signals | null];

      if (signal.aborted) {
        return;
      }

      if (code !== 0) {
        throw new Error(`pw-play exited with code=${code ?? "null"} signal=${playSignal ?? "null"}`);
      }
    } finally {
      signal.removeEventListener("abort", onAbort);
      if (this.currentPlayer === player) {
        this.currentPlayer = null;
      }
    }
  }
}
|
||||
Reference in New Issue
Block a user