realtime_voice_bot/python/loopback_stt_worker.py

import base64
import json
import os
import sys
import site
import traceback
from typing import Any
from pathlib import Path

import numpy as np
from faster_whisper import WhisperModel


def configure_windows_dll_search_paths() -> list[str]:
    if sys.platform != "win32":
        return []

    candidates: list[Path] = []
    executable_dir = Path(sys.executable).resolve().parent
    venv_root = executable_dir.parent
    candidates.extend(
        [
            venv_root / "Lib" / "site-packages" / "nvidia" / "cublas" / "bin",
            venv_root / "Lib" / "site-packages" / "nvidia" / "cudnn" / "bin",
        ]
    )

    for package_path in site.getsitepackages():
        base = Path(package_path)
        candidates.extend(
            [
                base / "nvidia" / "cublas" / "bin",
                base / "nvidia" / "cudnn" / "bin",
            ]
        )

    added: list[str] = []
    seen: set[str] = set()
    for candidate in candidates:
        normalized = str(candidate.resolve())
        if normalized in seen:
            continue
        seen.add(normalized)
        if not candidate.exists():
            continue

        os.add_dll_directory(normalized)
        if normalized not in os.environ.get("PATH", ""):
            os.environ["PATH"] = normalized + os.pathsep + os.environ.get("PATH", "")
        added.append(normalized)

    return added


os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS_WARNING", "1")
CONFIGURED_DLL_PATHS = configure_windows_dll_search_paths()
if CONFIGURED_DLL_PATHS:
    print(
        f"configured CUDA DLL search paths: {', '.join(CONFIGURED_DLL_PATHS)}",
        file=sys.stderr,
        flush=True,
    )


def resolve_model() -> WhisperModel:
    model_name = os.environ.get("WHISPER_MODEL", "large-v3-turbo")
    requested_device = os.environ.get("WHISPER_DEVICE", "auto")
    requested_compute = os.environ.get("WHISPER_COMPUTE_TYPE", "auto")

    attempts: list[tuple[str, str]] = []
    if requested_device == "auto":
        if requested_compute == "auto":
            attempts.extend(
                [
                    ("cuda", "float16"),
                    ("cuda", "int8_float16"),
                    ("cpu", "int8"),
                    ("cpu", "float32"),
                ]
            )
        else:
            attempts.extend(
                [
                    ("cuda", requested_compute),
                    ("cpu", requested_compute),
                ]
            )
    else:
        if requested_compute == "auto":
            compute = "float16" if requested_device == "cuda" else "int8"
        else:
            compute = requested_compute
        attempts.append((requested_device, compute))

    last_error: Exception | None = None
    for device, compute_type in attempts:
        try:
            model = WhisperModel(model_name, device=device, compute_type=compute_type)
            setattr(model, "_resolved_device", device)
            setattr(model, "_resolved_compute_type", compute_type)
            return model
        except Exception as error:  # noqa: BLE001
            last_error = error

    assert last_error is not None
    raise last_error


MODEL = resolve_model()
LANGUAGE = os.environ.get("WHISPER_LANGUAGE", "ko")
BEAM_SIZE = int(os.environ.get("WHISPER_BEAM_SIZE", "1"))


def write(payload: dict[str, Any]) -> None:
    sys.stdout.write(json.dumps(payload, ensure_ascii=False) + "\n")
    sys.stdout.flush()


def transcribe_pcm16_base64(pcm16_base64: str) -> str:
    audio_bytes = base64.b64decode(pcm16_base64)
    audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0

    segments, _info = MODEL.transcribe(
        audio,
        language=LANGUAGE,
        task="transcribe",
        beam_size=BEAM_SIZE,
        condition_on_previous_text=False,
        vad_filter=False,
        without_timestamps=True,
        word_timestamps=False,
        temperature=0.0,
    )

    text_parts: list[str] = []
    for segment in segments:
        if segment.text:
            text_parts.append(segment.text.strip())
    return " ".join(part for part in text_parts if part).strip()


for raw_line in sys.stdin:
    line = raw_line.strip()
    if not line:
        continue

    request = json.loads(line)
    request_id = request["id"]
    method = request["method"]
    params = request.get("params", {})

    try:
        if method == "ping":
            write(
                {
                    "id": request_id,
                    "result": {
                        "model": os.environ.get("WHISPER_MODEL", "large-v3-turbo"),
                        "device": getattr(MODEL, "_resolved_device", "unknown"),
                        "compute_type": getattr(MODEL, "_resolved_compute_type", "unknown"),
                    },
                }
            )
            continue

        if method == "transcribe":
            text = transcribe_pcm16_base64(params["pcm16_base64"])
            write(
                {
                    "id": request_id,
                    "result": {
                        "text": text,
                    },
                }
            )
            continue

        raise RuntimeError(f"unknown method: {method}")
    except Exception as error:  # noqa: BLE001
        traceback.print_exc(file=sys.stderr)
        write(
            {
                "id": request_id,
                "error": f"{type(error).__name__}: {error}",
            }
        )