Figured out how to reduce hallucinations a bunch: filter out Whisper segments with a high no_speech_prob.

This commit is contained in:
Kiri 2025-08-17 13:52:41 -07:00
parent 9a16c371f8
commit 4dc7385239

View File

@ -6,6 +6,8 @@ recent_phrase_count = 8
# Seconds of silence before we start a new phrase. # Seconds of silence before we start a new phrase.
phrase_timeout = 3 phrase_timeout = 3
# Segments with a no_speech_prob above this are dropped, so lower is more restrictive on what it lets pass through.
no_speech_prob_threshold = 0.2
import numpy as np import numpy as np
import speech_recognition import speech_recognition
@ -33,6 +35,7 @@ class Transcriber:
self.phrases = [""] self.phrases = [""]
# Time at which the last data came in for the current phrase.
self._phrase_time = datetime.utcnow() self._phrase_time = datetime.utcnow()
def set_source(self, source): def set_source(self, source):
@ -40,7 +43,7 @@ class Transcriber:
def phrase_probably_silent(self): def phrase_probably_silent(self):
"""Whisper hallucinates a LOT on silence, so let's just ignore stuff """Whisper hallucinates a LOT on silence, so let's just ignore stuff
that's mostly silence.""" that's mostly silence. First line of defense here."""
threshold = 100 threshold = 100
threshold_pass = 0 threshold_pass = 0
@ -55,8 +58,6 @@ class Transcriber:
avg = avg / len(self._current_data) avg = avg / len(self._current_data)
threshold_pct = threshold_pass / len(self._current_data) threshold_pct = threshold_pass / len(self._current_data)
print("threshold_pct: ", threshold_pct)
print("avg: ", avg)
if threshold_pct < 0.1: if threshold_pct < 0.1:
return True return True
@ -77,10 +78,12 @@ class Transcriber:
# last phrase complete and start a new one. Clear the current # last phrase complete and start a new one. Clear the current
# working audio buffer to start over with the new data. # working audio buffer to start over with the new data.
if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout): if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout):
# TODO: Append stats to the end for debugging so we can keep
# tracking down the hallucinations. # Only add a new phrase if we actually have data in the last
# one.
if self.phrases[-1] != "": if self.phrases[-1] != "":
self.phrases.append("") self.phrases.append("")
self._current_data = b'' self._current_data = b''
self._phrase_time = now self._phrase_time = now
@ -114,7 +117,13 @@ class Transcriber:
result = _audio_model.transcribe( result = _audio_model.transcribe(
audio_np, fp16=torch.cuda.is_available()) audio_np, fp16=torch.cuda.is_available())
text = result['text'].strip() # Filter out text segments with a high no_speech_prob.
combined_text = ""
for seg in result["segments"]:
if seg["no_speech_prob"] <= no_speech_prob_threshold:
combined_text += seg["text"]
text = combined_text.strip()
self.phrases[-1] = text self.phrases[-1] = text