137 lines
3.8 KiB
Python
137 lines
3.8 KiB
Python
import base64
|
|
import io
|
|
import json
|
|
import os
|
|
import sys
|
|
import traceback
|
|
import wave
|
|
|
|
import numpy as np
|
|
|
|
|
|
# PYTHONIOENCODING is only consulted when an interpreter starts, so setting it
# here influences child processes but NOT this process's already-open stdio.
# Responses are emitted with ensure_ascii=False and requests may carry
# non-ASCII text, so when the caller has not chosen an encoding we also switch
# the live streams to UTF-8. An explicit caller-provided value is left alone.
if "PYTHONIOENCODING" not in os.environ:
    os.environ["PYTHONIOENCODING"] = "utf-8"
    for _stream in (sys.stdin, sys.stdout, sys.stderr):
        # Streams may be None or replaced by objects without reconfigure()
        # (e.g. under a test harness); skip those instead of failing at import.
        _reconfigure = getattr(_stream, "reconfigure", None)
        if callable(_reconfigure):
            _reconfigure(encoding="utf-8")
|
|
|
|
|
|
def log(message: str) -> None:
    """Print *message* as one line on stderr and flush it immediately."""
    print(message, flush=True, file=sys.stderr)
|
|
|
|
|
|
def write_response(request_id: int, ok: bool, result=None, error: str | None = None) -> None:
|
|
payload = {
|
|
"id": request_id,
|
|
"ok": ok,
|
|
}
|
|
if ok:
|
|
payload["result"] = result
|
|
else:
|
|
payload["error"] = error or "unknown error"
|
|
|
|
sys.stdout.write(json.dumps(payload, ensure_ascii=False) + "\n")
|
|
sys.stdout.flush()
|
|
|
|
|
|
def normalize_lang(raw: str) -> str:
    """Return a lowercase language code, defaulting to ``"ko"``.

    Surrounding whitespace is ignored. The aliases ``kr`` and ``ko-kr``
    (any letter case) and blank input all map to ``"ko"``; every other
    value is returned trimmed and lowercased.
    """
    code = raw.strip().lower()
    if not code or code in ("kr", "ko-kr"):
        return "ko"
    return code
|
|
|
|
|
|
def normalize_voice(raw: str) -> str:
    """Return the voice name to use, defaulting to ``"af_heart"``.

    Blank input and the language-code placeholders ``KR`` / ``KO`` (any
    letter case) fall back to ``"af_heart"``. Any other value is returned
    with surrounding whitespace removed and its case preserved.
    """
    voice = raw.strip()
    is_language_code = voice.upper() in ("KR", "KO")
    return "af_heart" if (not voice or is_language_code) else voice
|
|
|
|
|
|
class TtsWorker:
    """Text-to-speech worker combining misaki's Korean G2P with a Kokoro ONNX model.

    Configuration is read from environment variables when the worker is
    constructed; ``synthesize`` then turns text into WAV bytes.
    """

    def __init__(self) -> None:
        """Read configuration from the environment and load the G2P and model.

        Required variables: ``LOCAL_TTS_MODEL_PATH``, ``LOCAL_TTS_VOICES_PATH``
        (a missing one raises ``KeyError``). Optional variables:
        ``LOCAL_TTS_LANGUAGE`` (default ``ko``), ``LOCAL_TTS_SPEAKER``
        (default ``af_heart``) and ``LOCAL_TTS_SPEED`` (default ``1.12``; a
        non-numeric value raises ``ValueError``).
        """
        # Imported here rather than at module level, so a missing package
        # raises from the constructor instead of at import time.
        from kokoro_onnx import Kokoro
        from misaki import ko

        env = os.environ
        self.model_path = env["LOCAL_TTS_MODEL_PATH"]
        self.voices_path = env["LOCAL_TTS_VOICES_PATH"]
        self.language = normalize_lang(env.get("LOCAL_TTS_LANGUAGE", "ko"))
        self.voice = normalize_voice(env.get("LOCAL_TTS_SPEAKER", "af_heart"))
        self.speed = float(env.get("LOCAL_TTS_SPEED", "1.12"))

        self.g2p = ko.KOG2P()
        self.model = Kokoro(self.model_path, self.voices_path)

        ready_message = f"local-tts ready model={os.path.basename(self.model_path)} voice={self.voice} language={self.language} speed={self.speed}"
        log(ready_message)

    def synthesize(self, text: str) -> bytes:
        """Convert *text* to phonemes, run the model, and return WAV bytes."""
        # The G2P call returns a pair; only the phoneme string is used.
        phoneme_text, _ = self.g2p(text)

        # NOTE(review): lang is fixed to "en-us" while is_phonemes=True;
        # presumably the model ignores lang for pre-phonemized input --
        # confirm against kokoro_onnx. self.language is not consulted here.
        options = {
            "voice": self.voice,
            "speed": self.speed,
            "lang": "en-us",
            "is_phonemes": True,
        }
        waveform, rate = self.model.create(phoneme_text, **options)
        return build_wav_bytes(waveform, rate)
|
|
|
|
|
|
def build_wav_bytes(samples: np.ndarray, sample_rate: int) -> bytes:
    """Encode float samples as an in-memory mono 16-bit PCM WAV file.

    Samples are clipped to [-1.0, 1.0], scaled by 32767 and cast to int16
    (the cast truncates toward zero). Returns the complete WAV file
    contents, header included.
    """
    pcm16 = (np.clip(samples, -1.0, 1.0) * 32767.0).astype(np.int16)

    sink = io.BytesIO()
    writer = wave.open(sink, "wb")
    try:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname):
        # mono, 2 bytes per sample, uncompressed. The header's frame count
        # is fixed up by the wave module when the writer is closed.
        writer.setparams((1, 2, sample_rate, 0, "NONE", "not compressed"))
        writer.writeframes(pcm16.tobytes())
    finally:
        writer.close()

    return sink.getvalue()
|
|
|
|
|
|
def _handle_request(worker, request) -> dict:
    """Run one decoded request against *worker* and return its result payload.

    Supports ``ping`` and ``synthesize``; raises ``ValueError`` for any other
    method or for empty text. Other exceptions from malformed requests or
    from synthesis propagate to the caller.
    """
    method = request["method"]
    params = request.get("params", {})

    if method == "ping":
        return {"ready": True}
    if method != "synthesize":
        raise ValueError(f"unsupported method: {method}")

    text = str(params.get("text", "")).strip()
    if not text:
        raise ValueError("text is empty")

    audio = worker.synthesize(text)
    return {"wav_base64": base64.b64encode(audio).decode("ascii")}


def main() -> int:
    """Serve newline-delimited JSON requests from stdin until EOF.

    Each non-blank input line is a JSON object with ``id``, ``method`` and
    optional ``params``. Exactly one response line is written per request via
    ``write_response``. Failures while handling a request are reported as
    error responses and do not stop the loop.

    Returns:
        1 if the worker could not be initialized, otherwise 0 once stdin is
        exhausted.
    """
    try:
        worker = TtsWorker()
    except Exception as exc:
        log("failed to initialize local TTS worker")
        log("run `bun run setup:local-ai` first if dependencies are missing")
        log("".join(traceback.format_exception(exc)))
        return 1

    for raw_line in sys.stdin:
        line = raw_line.strip()
        if not line:
            continue

        # Reset per request: if the line is not valid JSON or has no usable
        # "id", the error response must not reuse the previous request's id
        # (or hit an unbound name on the very first line). Such failures are
        # reported with a null id.
        request_id: int | None = None
        try:
            request = json.loads(line)
            request_id = int(request["id"])
            result = _handle_request(worker, request)
        except Exception as exc:
            # Top-level request boundary: report the failure and keep serving.
            error_text = "".join(traceback.format_exception_only(type(exc), exc)).strip()
            write_response(request_id, False, error=error_text)
        else:
            write_response(request_id, True, result)

    return 0
|
|
|
|
|
|
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    sys.exit(main())
|