From 4dc738523911e9104a6c64433ff8c0402df36b0d Mon Sep 17 00:00:00 2001
From: Kiri
Date: Sun, 17 Aug 2025 13:52:41 -0700
Subject: [PATCH] Figured out how to reduce hallucinations a bunch.

---
 transcriber.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/transcriber.py b/transcriber.py
index 65fb553..63fa42c 100644
--- a/transcriber.py
+++ b/transcriber.py
@@ -6,6 +6,8 @@ recent_phrase_count = 8
 
 # Seconds of silence before we start a new phrase.
 phrase_timeout = 3
+# Lower is more restrictive on what it lets pass through.
+no_speech_prob_threshold = 0.2
 
 import numpy as np
 import speech_recognition
@@ -33,6 +35,7 @@ class Transcriber:
 
         self.phrases = [""]
 
+        # Time the last data came in for the current phrase.
         self._phrase_time = datetime.utcnow()
 
     def set_source(self, source):
@@ -40,7 +43,7 @@ class Transcriber:
 
     def phrase_probably_silent(self):
         """Whisper hallucinates a LOT on silence, so let's just ignore stuff
-        that's mostly silence."""
+        that's mostly silence. First line of defense here."""
         threshold = 100
         threshold_pass = 0
 
@@ -55,8 +58,6 @@ class Transcriber:
 
         avg = avg / len(self._current_data)
         threshold_pct = threshold_pass / len(self._current_data)
-        print("threshold_pct: ", threshold_pct)
-        print("avg: ", avg)
 
         if threshold_pct < 0.1:
             return True
@@ -77,10 +78,12 @@ class Transcriber:
         # last phrase complete and start a new one. Clear the current
         # working audio buffer to start over with the new data.
         if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout):
-            # TODO: Append stats to the end for debugging so we can keep
-            # tracking down the hallucinations.
+
+            # Only add a new phrase if we actually have data in the last
+            # one.
             if self.phrases[-1] != "":
                 self.phrases.append("")
+            self._current_data = b''
 
             self._phrase_time = now
 
@@ -114,7 +117,13 @@ class Transcriber:
 
         result = _audio_model.transcribe(
            audio_np, fp16=torch.cuda.is_available())
-        text = result['text'].strip()
+        # Filter out text segments with a high no_speech_prob.
+        combined_text = ""
+        for seg in result["segments"]:
+            if seg["no_speech_prob"] <= no_speech_prob_threshold:
+                combined_text += seg["text"]
+
+        text = combined_text.strip()
 
         self.phrases[-1] = text
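
Note, outside the patch: a minimal standalone sketch of the same segment-level
filter, assuming the openai-whisper package. The "base.en" model name and the
"sample.wav" path are illustrative placeholders; only the 0.2 threshold matches
the value the patch adds.

    import whisper

    # Lower is more restrictive; same value as no_speech_prob_threshold above.
    no_speech_prob_threshold = 0.2

    model = whisper.load_model("base.en")                 # placeholder model size
    result = model.transcribe("sample.wav", fp16=False)   # placeholder input file

    # Keep only the segments Whisper thinks actually contain speech.
    kept = [seg["text"] for seg in result["segments"]
            if seg["no_speech_prob"] <= no_speech_prob_threshold]
    print("".join(kept).strip())

Whisper segment text usually carries a leading space, so plain concatenation
of the surviving segments keeps word boundaries intact, same as the
combined_text loop in the patch.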