import base64 import json import os import sys import tempfile import traceback os.environ.setdefault("PYTHONIOENCODING", "utf-8") def log(message: str) -> None: print(message, file=sys.stderr, flush=True) def write_response(request_id: int, ok: bool, result=None, error: str | None = None) -> None: payload = { "id": request_id, "ok": ok, } if ok: payload["result"] = result else: payload["error"] = error or "unknown error" sys.stdout.write(json.dumps(payload, ensure_ascii=False) + "\n") sys.stdout.flush() class TtsWorker: def __init__(self) -> None: from melo.api import TTS self.language = os.environ.get("LOCAL_TTS_LANGUAGE", "KR").strip() or "KR" self.speaker_key = os.environ.get("LOCAL_TTS_SPEAKER", "KR").strip() or "KR" self.device = os.environ.get("LOCAL_TTS_DEVICE", "auto").strip() or "auto" self.speed = float(os.environ.get("LOCAL_TTS_SPEED", "1.12")) self.model = TTS(language=self.language, device=self.device) speaker_ids = self.model.hps.data.spk2id self.speaker_id = speaker_ids.get(self.speaker_key) if self.speaker_id is None: normalized = self.speaker_key.upper() self.speaker_id = speaker_ids.get(normalized) if self.speaker_id is None: self.speaker_id = next(iter(speaker_ids.values())) log( f"local-tts ready language={self.language} speaker={self.speaker_key} device={self.device} speed={self.speed}" ) def synthesize(self, text: str) -> bytes: temp_path = "" try: with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle: temp_path = handle.name self.model.tts_to_file( text, self.speaker_id, temp_path, speed=self.speed, quiet=True, ) with open(temp_path, "rb") as handle: return handle.read() finally: if temp_path: try: os.unlink(temp_path) except OSError: pass def main() -> int: try: worker = TtsWorker() except Exception as exc: log("failed to initialize local TTS worker") log("run `bun run setup:local-ai` first if dependencies are missing") log("".join(traceback.format_exception(exc))) return 1 for line in sys.stdin: line = line.strip() if not line: continue try: request = json.loads(line) request_id = int(request["id"]) method = request["method"] params = request.get("params", {}) if method == "ping": write_response(request_id, True, {"ready": True}) continue if method != "synthesize": raise ValueError(f"unsupported method: {method}") text = str(params.get("text", "")).strip() if not text: raise ValueError("text is empty") audio = worker.synthesize(text) write_response( request_id, True, { "wav_base64": base64.b64encode(audio).decode("ascii"), }, ) except Exception as exc: error_text = "".join(traceback.format_exception_only(type(exc), exc)).strip() write_response(request_id, False, error=error_text) return 0 if __name__ == "__main__": raise SystemExit(main())