Figured out how to reduce hallucinations a bunch.

2025-08-17 13:52:41 -07:00 · 2025-08-17 13:52:41 -07:00 · 4dc7385239
commit 4dc7385239
parent 9a16c371f8
1 changed file with 15 additions and 6 deletions
--- a/transcriber.py
+++ b/transcriber.py
@@ -6,6 +6,8 @@ recent_phrase_count = 8
 # Seconds of silence before we start a new phrase.
 phrase_timeout = 3
 # Lower is more restrictive: segments whose no_speech_prob exceeds this are dropped.
 no_speech_prob_threshold = 0.2
 import numpy as np
 import speech_recognition
@@ -33,6 +35,7 @@ class Transcriber:
        self.phrases = [""]
        # Time since the last data came in for the current phrase.
        self._phrase_time = datetime.utcnow()
    def set_source(self, source):
@@ -40,7 +43,7 @@ class Transcriber:
    def phrase_probably_silent(self):
        """Whisper hallucinates a LOT on silence, so let's just ignore stuff
-        that's mostly silence."""
+        that's mostly silence. First line of defense here."""
        threshold = 100
        threshold_pass = 0
@@ -55,8 +58,6 @@ class Transcriber:
        avg = avg / len(self._current_data)
        threshold_pct = threshold_pass / len(self._current_data)
        print("threshold_pct: ", threshold_pct)
        print("avg: ", avg)
        if threshold_pct < 0.1:
            return True
@@ -77,10 +78,12 @@ class Transcriber:
                # last phrase complete and start a new one. Clear the current
                # working audio buffer to start over with the new data.
                if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout):
-                    # TODO: Append stats to the end for debugging so we can keep
+
-                    # tracking down the hallucinations.
+                    # Only add a new phrase if we actually have data in the last
                    # one.
                    if self.phrases[-1] != "":
                        self.phrases.append("")
                    self._current_data = b''
                self._phrase_time = now
@@ -114,7 +117,13 @@ class Transcriber:
                    result = _audio_model.transcribe(
                        audio_np, fp16=torch.cuda.is_available())
-                    text = result['text'].strip()
+                    # Filter out text segments with a high no_speech_prob.
                    combined_text = ""
                    for seg in result["segments"]:
                        if seg["no_speech_prob"] <= no_speech_prob_threshold:
                            combined_text += seg["text"]
                    text = combined_text.strip()
                    self.phrases[-1] = text