diff --git a/.env.example b/.env.example index 2fed58f..13b45c6 100644 --- a/.env.example +++ b/.env.example @@ -14,6 +14,7 @@ TTS_LANGUAGE=KR TTS_SPEAKER=KR TTS_DEVICE=cpu TTS_SPEED=1.18 +TTS_PLAYBACK_RATE=3 TTS_SDP_RATIO=0.22 TTS_NOISE_SCALE=0.55 TTS_NOISE_SCALE_W=0.75 diff --git a/README.md b/README.md index 5c94fd3..668e81b 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,9 @@ bun run test:tts -- "안녕하세요. 로컬 티티에스 테스트입니다." - Docker GPU passthrough를 쓸 때만 `cuda`로 바꿉니다 - `TTS_SPEED` - 기본값 `1.18` +- `TTS_PLAYBACK_RATE` + - 기본값 `3` + - 생성된 WAV를 `ffmpeg`로 더 빠르게 재생합니다 - `TTS_SDP_RATIO` - 기본값 `0.22` - `TTS_NOISE_SCALE` @@ -187,6 +190,7 @@ TTS_LANGUAGE=KR TTS_SPEAKER=KR TTS_DEVICE=cpu TTS_SPEED=1.18 +TTS_PLAYBACK_RATE=3 TTS_SDP_RATIO=0.22 TTS_NOISE_SCALE=0.55 TTS_NOISE_SCALE_W=0.75 diff --git a/src/config.ts b/src/config.ts index a8fe226..fefa28c 100644 --- a/src/config.ts +++ b/src/config.ts @@ -25,6 +25,7 @@ const envSchema = z.object({ TTS_SPEAKER: z.string().min(1).default("KR"), TTS_DEVICE: z.string().min(1).default("cpu"), TTS_SPEED: z.coerce.number().min(0.5).max(2).default(1.18), + TTS_PLAYBACK_RATE: z.coerce.number().min(0.5).max(4).default(3), TTS_SDP_RATIO: z.coerce.number().min(0).max(1).default(0.22), TTS_NOISE_SCALE: z.coerce.number().min(0).max(2).default(0.55), TTS_NOISE_SCALE_W: z.coerce.number().min(0).max(2).default(0.75), diff --git a/src/services/audio-playback.ts b/src/services/audio-playback.ts index 180f150..03ffab2 100644 --- a/src/services/audio-playback.ts +++ b/src/services/audio-playback.ts @@ -1,5 +1,9 @@ import { spawn } from "node:child_process"; +import { rm } from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; import process from "node:process"; +import { randomUUID } from "node:crypto"; async function run(command: string, args: string[], env?: NodeJS.ProcessEnv): Promise { await new Promise((resolve, reject) => { @@ -20,7 +24,48 @@ async function run(command: string, args: string[], env?: 
/**
 * Builds an ffmpeg `-filter:a` expression that changes tempo by `rate`.
 *
 * The factor is split into a chain of `atempo` stages so that every stage
 * stays within [0.5, 2]; e.g. a rate of 3 becomes `atempo=2.0,atempo=1.500`.
 *
 * @param rate Desired tempo multiplier (1 = unchanged). Must be finite and > 0.
 * @returns Comma-separated `atempo=...` stages.
 * @throws RangeError when `rate` is not a positive finite number. Without this
 *   guard the decomposition loops below never terminate for 0, negative or
 *   infinite input, and NaN would be forwarded to ffmpeg as `atempo=NaN`.
 */
function buildAtempoFilter(rate: number): string {
  if (!Number.isFinite(rate) || rate <= 0) {
    throw new RangeError(`재생 속도는 0보다 큰 유한한 수여야 합니다: ${rate}`);
  }

  const stages: string[] = [];
  let remaining = rate;

  // Peel off full 2x stages until the remainder fits in a single stage.
  while (remaining > 2) {
    stages.push("atempo=2.0");
    remaining /= 2;
  }

  // Same idea for slow-down: peel off 0.5x stages.
  while (remaining < 0.5) {
    stages.push("atempo=0.5");
    remaining /= 0.5;
  }

  stages.push(`atempo=${remaining.toFixed(3)}`);
  return stages.join(",");
}

/**
 * Produces a tempo-adjusted copy of `filePath` via ffmpeg.
 *
 * @param filePath Source WAV file.
 * @param playbackRate Tempo multiplier; values within 0.01 of 1 are treated as
 *   "no change".
 * @returns `filePath` itself when no conversion is needed, otherwise the path
 *   of a newly created file in the OS temp directory. The caller owns that
 *   file and must delete it.
 * @throws RangeError for an invalid `playbackRate`; rethrows whatever `run`
 *   rejects with when ffmpeg fails.
 */
async function applyPlaybackRate(filePath: string, playbackRate: number): Promise<string> {
  if (Math.abs(playbackRate - 1) < 0.01) {
    return filePath;
  }

  // Validate and build the filter first so a bad rate fails before any I/O.
  const filter = buildAtempoFilter(playbackRate);
  const targetPath = path.join(os.tmpdir(), `realtime-voice-bot-tts-${randomUUID()}.wav`);

  try {
    await run("ffmpeg", [
      "-y",
      "-hide_banner",
      "-loglevel",
      "error",
      "-i",
      filePath,
      "-filter:a",
      filter,
      targetPath,
    ]);
  } catch (error: unknown) {
    // ffmpeg may have left a partial output behind; the caller never learns
    // this path on failure, so it has to be cleaned up here.
    await rm(targetPath, { force: true }).catch(() => undefined);
    throw error;
  }

  return targetPath;
}

/**
 * Plays a WAV file, optionally re-timed to `playbackRate` first.
 *
 * @param filePath WAV file to play. It is never modified or deleted here.
 * @param playbackRate Tempo multiplier, default 1 (play as-is).
 * @throws RangeError when `playbackRate` is not a positive finite number;
 *   otherwise propagates conversion or playback failures.
 */
export async function playWavFile(filePath: string, playbackRate = 1): Promise<void> {
  const playablePath = await applyPlaybackRate(filePath, playbackRate);

  try {
    await playNativeWavFile(playablePath);
  } finally {
    // Only remove the file if it is the temporary copy created above.
    if (playablePath !== filePath) {
      await rm(playablePath, { force: true }).catch(() => undefined);
    }
  }
}
{ language: this.config.TTS_LANGUAGE, speaker: this.config.TTS_SPEAKER, speed: this.config.TTS_SPEED, + playback_rate: this.config.TTS_PLAYBACK_RATE, sdp_ratio: this.config.TTS_SDP_RATIO, noise_scale: this.config.TTS_NOISE_SCALE, noise_scale_w: this.config.TTS_NOISE_SCALE_W,