Figured out how to reduce hallucinations a bunch.

2025-08-17 13:52:41 -07:00 · 2025-08-17 13:52:41 -07:00 · 4dc7385239
commit 4dc7385239
parent 9a16c371f8
1 changed files with 15 additions and 6 deletions
--- a/transcriber.py
+++ b/transcriber.py
@ -6,6 +6,8 @@ recent_phrase_count = 8
 # Seconds of silence before we start a new phrase.
 phrase_timeout = 3

+# Segments with no_speech_prob above this are dropped; lower is more restrictive.
+no_speech_prob_threshold = 0.2

 import numpy as np
 import speech_recognition
@ -33,6 +35,7 @@ class Transcriber:

        self.phrases = [""]

+        # Timestamp of when the last data came in for the current phrase.
        self._phrase_time = datetime.utcnow()

    def set_source(self, source):
@ -40,7 +43,7 @@ class Transcriber:

    def phrase_probably_silent(self):
        """Whisper hallucinates a LOT on silence, so let's just ignore stuff
-        that's mostly silence."""
+        that's mostly silence. First line of defense here."""

        threshold = 100
        threshold_pass = 0
@ -55,8 +58,6 @@ class Transcriber:
        
        avg = avg / len(self._current_data)
        threshold_pct = threshold_pass / len(self._current_data)
-        print("threshold_pct: ", threshold_pct)
-        print("avg: ", avg)

        if threshold_pct < 0.1:
            return True
@ -77,10 +78,12 @@ class Transcriber:
                # last phrase complete and start a new one. Clear the current
                # working audio buffer to start over with the new data.
                if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout):
-                    # TODO: Append stats to the end for debugging so we can keep
-                    # tracking down the hallucinations.
+
+                    # Only add a new phrase if we actually have data in the last
+                    # one.
                    if self.phrases[-1] != "":
                        self.phrases.append("")
+
                    self._current_data = b''

                self._phrase_time = now
@ -114,7 +117,13 @@ class Transcriber:
                    result = _audio_model.transcribe(
                        audio_np, fp16=torch.cuda.is_available())

-                    text = result['text'].strip()
+                    # Filter out text segments with a high no_speech_prob.
+                    combined_text = ""
+                    for seg in result["segments"]:
+                        if seg["no_speech_prob"] <= no_speech_prob_threshold:
+                            combined_text += seg["text"]
+
+                    text = combined_text.strip()

                    self.phrases[-1] = text