Tune realtime STT defaults
This commit is contained in:
10
.env.example
10
.env.example
@@ -10,7 +10,15 @@ WHISPER_MODEL=large-v3-turbo
|
|||||||
WHISPER_LANGUAGE=ko
|
WHISPER_LANGUAGE=ko
|
||||||
WHISPER_DEVICE=auto
|
WHISPER_DEVICE=auto
|
||||||
WHISPER_COMPUTE_TYPE=auto
|
WHISPER_COMPUTE_TYPE=auto
|
||||||
WHISPER_BEAM_SIZE=1
|
WHISPER_BEAM_SIZE=2
|
||||||
|
|
||||||
|
SEGMENT_START_THRESHOLD=900
|
||||||
|
SEGMENT_CONTINUE_THRESHOLD=450
|
||||||
|
SEGMENT_START_FRAMES=2
|
||||||
|
SEGMENT_END_FRAMES=24
|
||||||
|
SEGMENT_PREROLL_SAMPLES=3200
|
||||||
|
SEGMENT_MIN_SPEECH_SAMPLES=7200
|
||||||
|
SEGMENT_MAX_SPEECH_SAMPLES=160000
|
||||||
|
|
||||||
DEBUG_TRANSCRIPTS=true
|
DEBUG_TRANSCRIPTS=true
|
||||||
LOG_LEVEL=info
|
LOG_LEVEL=info
|
||||||
|
|||||||
12
README.md
12
README.md
@@ -46,7 +46,13 @@ bun run start:loopback
|
|||||||
- `WHISPER_COMPUTE_TYPE`
|
- `WHISPER_COMPUTE_TYPE`
|
||||||
- `auto`, `float16`, `int8_float16`, `int8`, `float32`
|
- `auto`, `float16`, `int8_float16`, `int8`, `float32`
|
||||||
- `WHISPER_BEAM_SIZE`
|
- `WHISPER_BEAM_SIZE`
|
||||||
- 기본값 `1`
|
- 기본값 `2`
|
||||||
|
- `SEGMENT_END_FRAMES`
|
||||||
|
- 기본값 `24`
|
||||||
|
- 끝을 조금 더 늦게 잘라서 문장이 잘게 끊기는 현상을 줄입니다 (프레임당 320샘플, 16kHz 기준 20ms이므로 24프레임은 약 480ms)
|
||||||
|
- `SEGMENT_MAX_SPEECH_SAMPLES`
|
||||||
|
- 기본값 `160000`
|
||||||
|
- 너무 긴 구간은 강제로 끊어서 지연이 과하게 커지는 걸 막습니다 (16kHz 기준 160000샘플은 10초)
|
||||||
|
|
||||||
## 메모
|
## 메모
|
||||||
|
|
||||||
@@ -77,7 +83,9 @@ WHISPER_MODEL=large-v3-turbo
|
|||||||
WHISPER_LANGUAGE=ko
|
WHISPER_LANGUAGE=ko
|
||||||
WHISPER_DEVICE=auto
|
WHISPER_DEVICE=auto
|
||||||
WHISPER_COMPUTE_TYPE=auto
|
WHISPER_COMPUTE_TYPE=auto
|
||||||
WHISPER_BEAM_SIZE=1
|
WHISPER_BEAM_SIZE=2
|
||||||
|
SEGMENT_END_FRAMES=24
|
||||||
|
SEGMENT_MAX_SPEECH_SAMPLES=160000
|
||||||
DEBUG_TRANSCRIPTS=true
|
DEBUG_TRANSCRIPTS=true
|
||||||
LOG_LEVEL=info
|
LOG_LEVEL=info
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -4,6 +4,13 @@ interface RealtimeSegmenterOptions {
|
|||||||
onSpeechStart?: (peak: number) => void;
|
onSpeechStart?: (peak: number) => void;
|
||||||
onSpeechDiscarded?: (samples: number) => void;
|
onSpeechDiscarded?: (samples: number) => void;
|
||||||
onSpeechReady?: (samples: number) => void;
|
onSpeechReady?: (samples: number) => void;
|
||||||
|
preRollSamples?: number;
|
||||||
|
speechStartThreshold?: number;
|
||||||
|
speechContinueThreshold?: number;
|
||||||
|
speechStartFrames?: number;
|
||||||
|
speechEndFrames?: number;
|
||||||
|
minSpeechSamples?: number;
|
||||||
|
maxSpeechSamples?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export class RealtimeSegmenter {
|
export class RealtimeSegmenter {
|
||||||
@@ -12,18 +19,27 @@ export class RealtimeSegmenter {
|
|||||||
private readonly speech: number[] = [];
|
private readonly speech: number[] = [];
|
||||||
|
|
||||||
private readonly frameSamples = 320;
|
private readonly frameSamples = 320;
|
||||||
private readonly preRollSamples = 3200;
|
private readonly preRollSamples: number;
|
||||||
private readonly speechStartThreshold = 900;
|
private readonly speechStartThreshold: number;
|
||||||
private readonly speechContinueThreshold = 450;
|
private readonly speechContinueThreshold: number;
|
||||||
private readonly speechStartFrames = 2;
|
private readonly speechStartFrames: number;
|
||||||
private readonly speechEndFrames = 18;
|
private readonly speechEndFrames: number;
|
||||||
private readonly minSpeechSamples = 6400;
|
private readonly minSpeechSamples: number;
|
||||||
|
private readonly maxSpeechSamples: number;
|
||||||
|
|
||||||
private speechActive = false;
|
private speechActive = false;
|
||||||
private speechCandidateFrames = 0;
|
private speechCandidateFrames = 0;
|
||||||
private silenceFrames = 0;
|
private silenceFrames = 0;
|
||||||
|
|
||||||
constructor(private readonly options: RealtimeSegmenterOptions) {}
|
constructor(private readonly options: RealtimeSegmenterOptions) {
|
||||||
|
this.preRollSamples = options.preRollSamples ?? 3200;
|
||||||
|
this.speechStartThreshold = options.speechStartThreshold ?? 900;
|
||||||
|
this.speechContinueThreshold = options.speechContinueThreshold ?? 450;
|
||||||
|
this.speechStartFrames = options.speechStartFrames ?? 2;
|
||||||
|
this.speechEndFrames = options.speechEndFrames ?? 24;
|
||||||
|
this.minSpeechSamples = options.minSpeechSamples ?? 7200;
|
||||||
|
this.maxSpeechSamples = options.maxSpeechSamples ?? 160000;
|
||||||
|
}
|
||||||
|
|
||||||
pushChunk(chunk: Buffer): void {
|
pushChunk(chunk: Buffer): void {
|
||||||
for (let offset = 0; offset + 1 < chunk.length; offset += 2) {
|
for (let offset = 0; offset + 1 < chunk.length; offset += 2) {
|
||||||
@@ -78,8 +94,10 @@ export class RealtimeSegmenter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (this.silenceFrames < this.speechEndFrames) {
|
if (this.silenceFrames < this.speechEndFrames) {
|
||||||
|
if (this.speech.length < this.maxSpeechSamples) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const speechPcm = int16ArrayToBuffer(Int16Array.from(this.speech));
|
const speechPcm = int16ArrayToBuffer(Int16Array.from(this.speech));
|
||||||
this.speechActive = false;
|
this.speechActive = false;
|
||||||
|
|||||||
@@ -19,7 +19,14 @@ const envSchema = z.object({
|
|||||||
WHISPER_LANGUAGE: z.string().min(1).default("ko"),
|
WHISPER_LANGUAGE: z.string().min(1).default("ko"),
|
||||||
WHISPER_DEVICE: z.enum(["auto", "cuda", "cpu"]).default("auto"),
|
WHISPER_DEVICE: z.enum(["auto", "cuda", "cpu"]).default("auto"),
|
||||||
WHISPER_COMPUTE_TYPE: z.string().min(1).default("auto"),
|
WHISPER_COMPUTE_TYPE: z.string().min(1).default("auto"),
|
||||||
WHISPER_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(1),
|
WHISPER_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(2),
|
||||||
|
SEGMENT_START_THRESHOLD: z.coerce.number().int().min(100).max(10000).default(900),
|
||||||
|
SEGMENT_CONTINUE_THRESHOLD: z.coerce.number().int().min(50).max(10000).default(450),
|
||||||
|
SEGMENT_START_FRAMES: z.coerce.number().int().min(1).max(10).default(2),
|
||||||
|
SEGMENT_END_FRAMES: z.coerce.number().int().min(4).max(60).default(24),
|
||||||
|
SEGMENT_PREROLL_SAMPLES: z.coerce.number().int().min(320).max(16000).default(3200),
|
||||||
|
SEGMENT_MIN_SPEECH_SAMPLES: z.coerce.number().int().min(1600).max(64000).default(7200),
|
||||||
|
SEGMENT_MAX_SPEECH_SAMPLES: z.coerce.number().int().min(16000).max(320000).default(160000),
|
||||||
DEBUG_TRANSCRIPTS: z
|
DEBUG_TRANSCRIPTS: z
|
||||||
.string()
|
.string()
|
||||||
.optional()
|
.optional()
|
||||||
|
|||||||
31
src/index.ts
31
src/index.ts
@@ -69,8 +69,9 @@ async function runLoopback(): Promise<void> {
|
|||||||
await stt.warmup();
|
await stt.warmup();
|
||||||
logger.info("STT warmup finished");
|
logger.info("STT warmup finished");
|
||||||
|
|
||||||
const transcriptionQueue: Buffer[] = [];
|
const transcriptionQueue: Array<{ pcm16: Buffer; queuedAt: number; index: number }> = [];
|
||||||
let transcribing = false;
|
let transcribing = false;
|
||||||
|
let nextSegmentIndex = 1;
|
||||||
|
|
||||||
const runNext = async (): Promise<void> => {
|
const runNext = async (): Promise<void> => {
|
||||||
if (transcribing) {
|
if (transcribing) {
|
||||||
@@ -83,11 +84,17 @@ async function runLoopback(): Promise<void> {
|
|||||||
|
|
||||||
transcribing = true;
|
transcribing = true;
|
||||||
try {
|
try {
|
||||||
const text = await stt.transcribePcm16(next);
|
const startedAt = Date.now();
|
||||||
|
const text = await stt.transcribePcm16(next.pcm16);
|
||||||
|
logger.info("STT latency", {
|
||||||
|
index: next.index,
|
||||||
|
wait_ms: startedAt - next.queuedAt,
|
||||||
|
transcribe_ms: Date.now() - startedAt,
|
||||||
|
});
|
||||||
if (!text) {
|
if (!text) {
|
||||||
logger.info("빈 전사 결과");
|
logger.info("빈 전사 결과");
|
||||||
} else {
|
} else {
|
||||||
logger.info("Transcript", text);
|
logger.info("Transcript", { index: next.index, text });
|
||||||
if (config.DEBUG_TRANSCRIPTS) {
|
if (config.DEBUG_TRANSCRIPTS) {
|
||||||
console.log(`\n[text] ${text}\n`);
|
console.log(`\n[text] ${text}\n`);
|
||||||
}
|
}
|
||||||
@@ -101,6 +108,13 @@ async function runLoopback(): Promise<void> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
const segmenter = new RealtimeSegmenter({
|
const segmenter = new RealtimeSegmenter({
|
||||||
|
preRollSamples: config.SEGMENT_PREROLL_SAMPLES,
|
||||||
|
speechStartThreshold: config.SEGMENT_START_THRESHOLD,
|
||||||
|
speechContinueThreshold: config.SEGMENT_CONTINUE_THRESHOLD,
|
||||||
|
speechStartFrames: config.SEGMENT_START_FRAMES,
|
||||||
|
speechEndFrames: config.SEGMENT_END_FRAMES,
|
||||||
|
minSpeechSamples: config.SEGMENT_MIN_SPEECH_SAMPLES,
|
||||||
|
maxSpeechSamples: config.SEGMENT_MAX_SPEECH_SAMPLES,
|
||||||
onLevel: (peak) => {
|
onLevel: (peak) => {
|
||||||
if (peak > maxPeak) {
|
if (peak > maxPeak) {
|
||||||
maxPeak = peak;
|
maxPeak = peak;
|
||||||
@@ -128,11 +142,17 @@ async function runLoopback(): Promise<void> {
|
|||||||
},
|
},
|
||||||
onSpeechReady: (samples) => {
|
onSpeechReady: (samples) => {
|
||||||
emittedSegmentCount += 1;
|
emittedSegmentCount += 1;
|
||||||
logger.info("Speech segment ready", { index: emittedSegmentCount, samples });
|
logger.info("Speech segment ready", { index: emittedSegmentCount, samples, ms: Math.round((samples / 16000) * 1000) });
|
||||||
},
|
},
|
||||||
onSegment: (pcm16) => {
|
onSegment: (pcm16) => {
|
||||||
transcriptionQueue.push(pcm16);
|
const index = nextSegmentIndex++;
|
||||||
|
transcriptionQueue.push({
|
||||||
|
pcm16,
|
||||||
|
queuedAt: Date.now(),
|
||||||
|
index,
|
||||||
|
});
|
||||||
logger.info("Queued segment for STT", {
|
logger.info("Queued segment for STT", {
|
||||||
|
index,
|
||||||
queue: transcriptionQueue.length,
|
queue: transcriptionQueue.length,
|
||||||
bytes: pcm16.length,
|
bytes: pcm16.length,
|
||||||
});
|
});
|
||||||
@@ -167,6 +187,7 @@ async function runLoopback(): Promise<void> {
|
|||||||
console.log(`source: ${config.AUDIO_SOURCE ?? "unset"}`);
|
console.log(`source: ${config.AUDIO_SOURCE ?? "unset"}`);
|
||||||
console.log(`model: ${config.WHISPER_MODEL}`);
|
console.log(`model: ${config.WHISPER_MODEL}`);
|
||||||
console.log(`language: ${config.WHISPER_LANGUAGE}`);
|
console.log(`language: ${config.WHISPER_LANGUAGE}`);
|
||||||
|
console.log(`beam: ${config.WHISPER_BEAM_SIZE}`);
|
||||||
|
|
||||||
setInterval(() => {
|
setInterval(() => {
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
|
|||||||
Reference in New Issue
Block a user