import base64 import io import json import os import sys import traceback import wave import numpy as np os.environ.setdefault("PYTHONIOENCODING", "utf-8") def log(message: str) -> None: print(message, file=sys.stderr, flush=True) def write_response(request_id: int, ok: bool, result=None, error: str | None = None) -> None: payload = { "id": request_id, "ok": ok, } if ok: payload["result"] = result else: payload["error"] = error or "unknown error" sys.stdout.write(json.dumps(payload, ensure_ascii=False) + "\n") sys.stdout.flush() def normalize_lang(raw: str) -> str: lowered = raw.strip().lower() if lowered in {"kr", "ko-kr"}: return "ko" return lowered or "ko" def normalize_voice(raw: str) -> str: value = raw.strip() if value.upper() in {"KR", "KO"} or not value: return "af_heart" return value class TtsWorker: def __init__(self) -> None: from kokoro_onnx import Kokoro from misaki import ko self.model_path = os.environ["LOCAL_TTS_MODEL_PATH"] self.voices_path = os.environ["LOCAL_TTS_VOICES_PATH"] self.language = normalize_lang(os.environ.get("LOCAL_TTS_LANGUAGE", "ko")) self.voice = normalize_voice(os.environ.get("LOCAL_TTS_SPEAKER", "af_heart")) self.speed = float(os.environ.get("LOCAL_TTS_SPEED", "1.12")) self.g2p = ko.KOG2P() self.model = Kokoro(self.model_path, self.voices_path) log( f"local-tts ready model={os.path.basename(self.model_path)} voice={self.voice} language={self.language} speed={self.speed}" ) def synthesize(self, text: str) -> bytes: phonemes, _tokens = self.g2p(text) samples, sample_rate = self.model.create( phonemes, voice=self.voice, speed=self.speed, lang="en-us", is_phonemes=True, ) return build_wav_bytes(samples, sample_rate) def build_wav_bytes(samples: np.ndarray, sample_rate: int) -> bytes: clipped = np.clip(samples, -1.0, 1.0) pcm = (clipped * 32767.0).astype(np.int16) buffer = io.BytesIO() with wave.open(buffer, "wb") as wav_file: wav_file.setnchannels(1) wav_file.setsampwidth(2) wav_file.setframerate(sample_rate) wav_file.writeframes(pcm.tobytes()) return buffer.getvalue() def main() -> int: try: worker = TtsWorker() except Exception as exc: log("failed to initialize local TTS worker") log("run `bun run setup:local-ai` first if dependencies are missing") log("".join(traceback.format_exception(exc))) return 1 for line in sys.stdin: line = line.strip() if not line: continue try: request = json.loads(line) request_id = int(request["id"]) method = request["method"] params = request.get("params", {}) if method == "ping": write_response(request_id, True, {"ready": True}) continue if method != "synthesize": raise ValueError(f"unsupported method: {method}") text = str(params.get("text", "")).strip() if not text: raise ValueError("text is empty") audio = worker.synthesize(text) write_response( request_id, True, { "wav_base64": base64.b64encode(audio).decode("ascii"), }, ) except Exception as exc: error_text = "".join(traceback.format_exception_only(type(exc), exc)).strip() write_response(request_id, False, error=error_text) return 0 if __name__ == "__main__": raise SystemExit(main())