Speed up TTS playback by default
This commit is contained in:
@@ -14,6 +14,7 @@ TTS_LANGUAGE=KR
|
|||||||
TTS_SPEAKER=KR
|
TTS_SPEAKER=KR
|
||||||
TTS_DEVICE=cpu
|
TTS_DEVICE=cpu
|
||||||
TTS_SPEED=1.18
|
TTS_SPEED=1.18
|
||||||
|
TTS_PLAYBACK_RATE=3
|
||||||
TTS_SDP_RATIO=0.22
|
TTS_SDP_RATIO=0.22
|
||||||
TTS_NOISE_SCALE=0.55
|
TTS_NOISE_SCALE=0.55
|
||||||
TTS_NOISE_SCALE_W=0.75
|
TTS_NOISE_SCALE_W=0.75
|
||||||
|
|||||||
@@ -85,6 +85,9 @@ bun run test:tts -- "안녕하세요. 로컬 티티에스 테스트입니다."
|
|||||||
- Docker GPU passthrough를 쓸 때만 `cuda`로 바꿉니다
|
- Docker GPU passthrough를 쓸 때만 `cuda`로 바꿉니다
|
||||||
- `TTS_SPEED`
|
- `TTS_SPEED`
|
||||||
- 기본값 `1.18`
|
- 기본값 `1.18`
|
||||||
|
- `TTS_PLAYBACK_RATE`
|
||||||
|
- 기본값 `3`
|
||||||
|
- 생성된 WAV의 재생 속도 배율입니다. `1`이 아니면 `ffmpeg`의 `atempo` 필터로 변환한 뒤 재생합니다 (허용 범위 `0.5`~`4`)
|
||||||
- `TTS_SDP_RATIO`
|
- `TTS_SDP_RATIO`
|
||||||
- 기본값 `0.22`
|
- 기본값 `0.22`
|
||||||
- `TTS_NOISE_SCALE`
|
- `TTS_NOISE_SCALE`
|
||||||
@@ -187,6 +190,7 @@ TTS_LANGUAGE=KR
|
|||||||
TTS_SPEAKER=KR
|
TTS_SPEAKER=KR
|
||||||
TTS_DEVICE=cpu
|
TTS_DEVICE=cpu
|
||||||
TTS_SPEED=1.18
|
TTS_SPEED=1.18
|
||||||
|
TTS_PLAYBACK_RATE=3
|
||||||
TTS_SDP_RATIO=0.22
|
TTS_SDP_RATIO=0.22
|
||||||
TTS_NOISE_SCALE=0.55
|
TTS_NOISE_SCALE=0.55
|
||||||
TTS_NOISE_SCALE_W=0.75
|
TTS_NOISE_SCALE_W=0.75
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ const envSchema = z.object({
|
|||||||
TTS_SPEAKER: z.string().min(1).default("KR"),
|
TTS_SPEAKER: z.string().min(1).default("KR"),
|
||||||
TTS_DEVICE: z.string().min(1).default("cpu"),
|
TTS_DEVICE: z.string().min(1).default("cpu"),
|
||||||
TTS_SPEED: z.coerce.number().min(0.5).max(2).default(1.18),
|
TTS_SPEED: z.coerce.number().min(0.5).max(2).default(1.18),
|
||||||
|
TTS_PLAYBACK_RATE: z.coerce.number().min(0.5).max(4).default(3),
|
||||||
TTS_SDP_RATIO: z.coerce.number().min(0).max(1).default(0.22),
|
TTS_SDP_RATIO: z.coerce.number().min(0).max(1).default(0.22),
|
||||||
TTS_NOISE_SCALE: z.coerce.number().min(0).max(2).default(0.55),
|
TTS_NOISE_SCALE: z.coerce.number().min(0).max(2).default(0.55),
|
||||||
TTS_NOISE_SCALE_W: z.coerce.number().min(0).max(2).default(0.75),
|
TTS_NOISE_SCALE_W: z.coerce.number().min(0).max(2).default(0.75),
|
||||||
|
|||||||
@@ -1,5 +1,9 @@
|
|||||||
import { spawn } from "node:child_process";
|
import { spawn } from "node:child_process";
|
||||||
|
import { rm } from "node:fs/promises";
|
||||||
|
import os from "node:os";
|
||||||
|
import path from "node:path";
|
||||||
import process from "node:process";
|
import process from "node:process";
|
||||||
|
import { randomUUID } from "node:crypto";
|
||||||
|
|
||||||
async function run(command: string, args: string[], env?: NodeJS.ProcessEnv): Promise<void> {
|
async function run(command: string, args: string[], env?: NodeJS.ProcessEnv): Promise<void> {
|
||||||
await new Promise<void>((resolve, reject) => {
|
await new Promise<void>((resolve, reject) => {
|
||||||
@@ -20,7 +24,48 @@ async function run(command: string, args: string[], env?: NodeJS.ProcessEnv): Pr
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function playWavFile(filePath: string): Promise<void> {
|
/**
 * Builds the value for ffmpeg's `-filter:a` option that changes audio tempo
 * by `rate`.
 *
 * Every emitted `atempo` stage is kept within the 0.5–2.0 window, so a rate
 * outside that window is expressed as a comma-separated chain of stages whose
 * product equals `rate` (for example `3` becomes `atempo=2.0,atempo=1.500`).
 *
 * @param rate - Tempo multiplier; must be a finite number greater than zero.
 * @returns The comma-separated `atempo` filter chain.
 * @throws RangeError If `rate` is not finite or is not positive. Without this
 *   guard the reduction loops below would never terminate for `0`, negative
 *   values or `Infinity`, and `NaN` would produce an invalid filter string.
 */
function buildAtempoFilter(rate: number): string {
  if (!Number.isFinite(rate) || rate <= 0) {
    throw new RangeError(`playback rate must be a finite number greater than 0, got ${rate}`);
  }

  const filters: string[] = [];
  let remaining = rate;

  // Peel off full 2x stages until the remainder fits into a single stage.
  while (remaining > 2) {
    filters.push("atempo=2.0");
    remaining /= 2;
  }

  // Same idea for slow-down: peel off 0.5x stages.
  while (remaining < 0.5) {
    filters.push("atempo=0.5");
    remaining /= 0.5;
  }

  // The final stage carries whatever factor is left, formatted to 3 decimals.
  filters.push(`atempo=${remaining.toFixed(3)}`);
  return filters.join(",");
}
|
||||||
|
|
||||||
|
/**
 * Produces a tempo-adjusted copy of a WAV file by running `ffmpeg`.
 *
 * When `playbackRate` is within 0.01 of 1 no conversion is done and the
 * original `filePath` is returned unchanged. Otherwise a new file is written
 * to the OS temp directory and its path is returned; the caller owns that
 * file and is responsible for deleting it.
 *
 * @param filePath - Path of the source WAV file.
 * @param playbackRate - Tempo multiplier applied through the `atempo` filter.
 * @returns `filePath` itself when no conversion was needed, otherwise the
 *   path of the newly created temporary WAV file.
 * @throws Whatever `run` rejects with when the `ffmpeg` invocation fails. A
 *   partially written output file is removed before the error propagates.
 */
async function applyPlaybackRate(filePath: string, playbackRate: number): Promise<string> {
  // Treat rates that are effectively 1x as "no conversion needed".
  if (Math.abs(playbackRate - 1) < 0.01) {
    return filePath;
  }

  // A random suffix keeps concurrent conversions from colliding.
  const targetPath = path.join(os.tmpdir(), `realtime-voice-bot-tts-${randomUUID()}.wav`);
  const filter = buildAtempoFilter(playbackRate);

  try {
    await run("ffmpeg", [
      "-y",
      "-hide_banner",
      "-loglevel",
      "error",
      "-i",
      filePath,
      "-filter:a",
      filter,
      targetPath,
    ]);
  } catch (error: unknown) {
    // The caller never learns targetPath on failure, so clean up here to
    // avoid leaving a partial file behind in the temp directory.
    await rm(targetPath, { force: true }).catch(() => undefined);
    throw error;
  }

  return targetPath;
}
|
||||||
|
|
||||||
|
async function playNativeWavFile(filePath: string): Promise<void> {
|
||||||
if (process.platform === "win32") {
|
if (process.platform === "win32") {
|
||||||
const env = {
|
const env = {
|
||||||
...process.env,
|
...process.env,
|
||||||
@@ -45,3 +90,15 @@ export async function playWavFile(filePath: string): Promise<void> {
|
|||||||
|
|
||||||
throw new Error(`지원하지 않는 플랫폼입니다: ${process.platform}`);
|
throw new Error(`지원하지 않는 플랫폼입니다: ${process.platform}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Plays a WAV file through the platform's native player, optionally at a
 * different tempo.
 *
 * The file is first passed through `applyPlaybackRate`; if that step returned
 * a different path, that file is treated as temporary and is removed after
 * playback finishes, whether playback succeeded or failed.
 *
 * @param filePath - Path of the WAV file to play. This file is never deleted
 *   by this function.
 * @param playbackRate - Tempo multiplier; defaults to `1`.
 */
export async function playWavFile(filePath: string, playbackRate = 1): Promise<void> {
  const resolvedPath = await applyPlaybackRate(filePath, playbackRate);
  const isTemporaryCopy = resolvedPath !== filePath;

  try {
    await playNativeWavFile(resolvedPath);
  } finally {
    if (isTemporaryCopy) {
      try {
        await rm(resolvedPath, { force: true });
      } catch {
        // Best-effort cleanup: a failed delete must not mask the playback result.
      }
    }
  }
}
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ export class MeloTtsService {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
await this.synthesizeToFile(trimmed, targetPath);
|
await this.synthesizeToFile(trimmed, targetPath);
|
||||||
await playWavFile(targetPath);
|
await playWavFile(targetPath, this.config.TTS_PLAYBACK_RATE);
|
||||||
} finally {
|
} finally {
|
||||||
await rm(targetPath, { force: true }).catch(() => undefined);
|
await rm(targetPath, { force: true }).catch(() => undefined);
|
||||||
}
|
}
|
||||||
@@ -128,6 +128,7 @@ export class MeloTtsService {
|
|||||||
language: this.config.TTS_LANGUAGE,
|
language: this.config.TTS_LANGUAGE,
|
||||||
speaker: this.config.TTS_SPEAKER,
|
speaker: this.config.TTS_SPEAKER,
|
||||||
speed: this.config.TTS_SPEED,
|
speed: this.config.TTS_SPEED,
|
||||||
|
playback_rate: this.config.TTS_PLAYBACK_RATE,
|
||||||
sdp_ratio: this.config.TTS_SDP_RATIO,
|
sdp_ratio: this.config.TTS_SDP_RATIO,
|
||||||
noise_scale: this.config.TTS_NOISE_SCALE,
|
noise_scale: this.config.TTS_NOISE_SCALE,
|
||||||
noise_scale_w: this.config.TTS_NOISE_SCALE_W,
|
noise_scale_w: this.config.TTS_NOISE_SCALE_W,
|
||||||
|
|||||||
Reference in New Issue
Block a user