Files
realtime_voice_bot/python/local_tts_worker.py

126 lines
3.6 KiB
Python

import base64
import json
import os
import sys
import tempfile
import traceback
# Default PYTHONIOENCODING to UTF-8 unless the launching process already set it.
# NOTE(review): CPython consults this variable at interpreter startup, so setting
# it here presumably only affects child processes and not this process's
# already-created sys.stdin/sys.stdout/sys.stderr -- confirm whether
# sys.stdout.reconfigure(encoding="utf-8") (and the stdin equivalent) was intended.
os.environ.setdefault("PYTHONIOENCODING", "utf-8")
def log(message: str) -> None:
    """Print one diagnostic line to stderr and flush it immediately.

    Args:
        message: Text to emit; a trailing newline is appended by ``print``.
    """
    stream = sys.stderr
    print(message, flush=True, file=stream)
def write_response(request_id: int, ok: bool, result=None, error: str | None = None) -> None:
payload = {
"id": request_id,
"ok": ok,
}
if ok:
payload["result"] = result
else:
payload["error"] = error or "unknown error"
sys.stdout.write(json.dumps(payload, ensure_ascii=False) + "\n")
sys.stdout.flush()
class TtsWorker:
    """Hold a loaded MeloTTS model and turn text into audio file bytes.

    Settings are read from environment variables once, in the constructor:

    * LOCAL_TTS_LANGUAGE -- language passed to the model (default "KR")
    * LOCAL_TTS_SPEAKER  -- key looked up in the model's speaker table (default "KR")
    * LOCAL_TTS_DEVICE   -- device passed to the model (default "auto")
    * LOCAL_TTS_SPEED    -- speed passed to every synthesis call (default 1.12)

    A variable that is unset, empty or whitespace-only falls back to its default.
    """

    def __init__(self) -> None:
        """Read the configuration, load the model and resolve the speaker id.

        Raises:
            ImportError: If the ``melo`` package cannot be imported.
            ValueError: If LOCAL_TTS_SPEED is not a valid float, or the model
                exposes no speakers at all.
        """
        # Imported here rather than at module level so that a missing
        # dependency is raised from the constructor instead of at import time.
        from melo.api import TTS

        self.language = self._env_setting("LOCAL_TTS_LANGUAGE", "KR")
        self.speaker_key = self._env_setting("LOCAL_TTS_SPEAKER", "KR")
        self.device = self._env_setting("LOCAL_TTS_DEVICE", "auto")
        # Same blank-means-default rule as the other settings; previously an
        # empty LOCAL_TTS_SPEED reached float("") and raised ValueError.
        self.speed = float(self._env_setting("LOCAL_TTS_SPEED", "1.12"))
        self.model = TTS(language=self.language, device=self.device)
        self.speaker_id = self._resolve_speaker_id(
            self.model.hps.data.spk2id, self.speaker_key
        )
        log(
            f"local-tts ready language={self.language} speaker={self.speaker_key} device={self.device} speed={self.speed}"
        )

    @staticmethod
    def _env_setting(name: str, default: str) -> str:
        """Return the stripped value of env var *name*, or *default* if unset or blank."""
        return os.environ.get(name, default).strip() or default

    @staticmethod
    def _resolve_speaker_id(speaker_ids, speaker_key: str):
        """Pick a speaker id: exact key, then upper-cased key, then the first entry.

        Uses ``in`` plus indexing instead of ``.get()`` so the lookup works for
        a plain dict and for mapping-like containers that only implement
        ``__contains__``/``__getitem__``/``values`` (presumably the case for
        melo's HParams object -- TODO confirm).

        Raises:
            ValueError: If *speaker_ids* has no entries.
        """
        for candidate in (speaker_key, speaker_key.upper()):
            if candidate in speaker_ids:
                speaker_id = speaker_ids[candidate]
                # A key mapped to None counts as missing, as it did with .get().
                if speaker_id is not None:
                    return speaker_id
        try:
            return next(iter(speaker_ids.values()))
        except StopIteration:
            # Surface a descriptive error instead of a bare StopIteration.
            raise ValueError("TTS model defines no speakers") from None

    def synthesize(self, text: str) -> bytes:
        """Synthesize *text* and return the bytes of the audio file the model wrote.

        The model writes to a temporary ``.wav``-suffixed path, which is read
        back in full and then removed, whether or not synthesis succeeded.

        Args:
            text: Text handed to the model unchanged.

        Returns:
            The complete contents of the file produced by ``tts_to_file``.
        """
        temp_path = ""
        try:
            # Only the name is needed: the handle is closed on leaving the
            # ``with`` block, before the model opens the path itself.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
                temp_path = handle.name
            self.model.tts_to_file(
                text,
                self.speaker_id,
                temp_path,
                speed=self.speed,
                quiet=True,
            )
            with open(temp_path, "rb") as handle:
                return handle.read()
        finally:
            if temp_path:
                try:
                    os.unlink(temp_path)
                except OSError:
                    # Best-effort cleanup: a leftover temp file must not mask
                    # the synthesis result or the original exception.
                    pass
def main() -> int:
    """Serve line-delimited JSON requests from stdin until it is exhausted.

    Each non-blank input line is parsed as a JSON object with an "id"
    (convertible to int), a "method" and optional "params". Supported methods:

    * "ping"       -- answered with ``{"ready": true}``
    * "synthesize" -- synthesizes ``params["text"]`` and answers with
      ``{"wav_base64": ...}``, the base64-encoded audio bytes

    Any failure while handling a request is reported as an error response
    carrying that request's id. A line whose id cannot be determined is
    logged to stderr and dropped, because there is no id to address a
    response to.

    Returns:
        1 if the TTS worker could not be initialized, otherwise 0.
    """
    try:
        worker = TtsWorker()
    except Exception as exc:
        log("failed to initialize local TTS worker")
        log("run `bun run setup:local-ai` first if dependencies are missing")
        log("".join(traceback.format_exception(exc)))
        return 1

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        # Reset for every line: if parsing fails before the id is known, the
        # handler below must neither hit an unbound local (first line) nor
        # reuse the id of the previous request.
        request_id = None
        try:
            request = json.loads(line)
            request_id = int(request["id"])
            method = request["method"]
            # Treat an explicit ``"params": null`` like absent params.
            params = request.get("params") or {}

            if method == "ping":
                write_response(request_id, True, {"ready": True})
                continue
            if method != "synthesize":
                raise ValueError(f"unsupported method: {method}")

            text = str(params.get("text", "")).strip()
            if not text:
                raise ValueError("text is empty")

            audio = worker.synthesize(text)
            write_response(
                request_id,
                True,
                {
                    "wav_base64": base64.b64encode(audio).decode("ascii"),
                },
            )
        except Exception as exc:
            # Top-level boundary: one bad request must not stop the loop.
            error_text = "".join(traceback.format_exception_only(type(exc), exc)).strip()
            if request_id is None:
                log(f"dropping request without a usable id: {error_text}")
                continue
            write_response(request_id, False, error=error_text)
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())