From 4dc738523911e9104a6c64433ff8c0402df36b0d Mon Sep 17 00:00:00 2001
From: Kiri
Date: Sun, 17 Aug 2025 13:52:41 -0700
Subject: [PATCH] Figured out how to reduce hallucinations a bunch.

---
 transcriber.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/transcriber.py b/transcriber.py
index 65fb553..63fa42c 100644
--- a/transcriber.py
+++ b/transcriber.py
@@ -6,6 +6,8 @@ recent_phrase_count = 8
 
 # Seconds of silence before we start a new phrase.
 phrase_timeout = 3
+# Lower is more restrictive on what it lets pass through.
+no_speech_prob_threshold = 0.2
 
 import numpy as np
 import speech_recognition
@@ -33,6 +35,7 @@ class Transcriber:
 
         self.phrases = [""]
 
+        # Time the last data came in for the current phrase.
         self._phrase_time = datetime.utcnow()
 
     def set_source(self, source):
@@ -40,7 +43,7 @@ class Transcriber:
 
     def phrase_probably_silent(self):
         """Whisper hallucinates a LOT on silence, so let's just ignore stuff
-        that's mostly silence."""
+        that's mostly silence. First line of defense here."""
         threshold = 100
         threshold_pass = 0
 
@@ -55,8 +58,6 @@ class Transcriber:
 
         avg = avg / len(self._current_data)
         threshold_pct = threshold_pass / len(self._current_data)
-        print("threshold_pct: ", threshold_pct)
-        print("avg: ", avg)
 
         if threshold_pct < 0.1:
             return True
@@ -77,10 +78,12 @@ class Transcriber:
         # last phrase complete and start a new one. Clear the current
         # working audio buffer to start over with the new data.
         if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout):
-            # TODO: Append stats to the end for debugging so we can keep
-            # tracking down the hallucinations.
+
+            # Only add a new phrase if we actually have data in the last
+            # one.
             if self.phrases[-1] != "":
                 self.phrases.append("")
+            self._current_data = b''
 
             self._phrase_time = now
 
@@ -114,7 +117,13 @@ class Transcriber:
 
         result = _audio_model.transcribe(
            audio_np, fp16=torch.cuda.is_available())
-        text = result['text'].strip()
+        # Filter out text segments with a high no_speech_prob.
+        combined_text = ""
+        for seg in result["segments"]:
+            if seg["no_speech_prob"] <= no_speech_prob_threshold:
+                combined_text += seg["text"]
+
+        text = combined_text.strip()
 
         self.phrases[-1] = text
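
Note, outside the patch: a minimal standalone sketch of the same segment-level
filter, assuming the openai-whisper package. The "base.en" model name and the
"sample.wav" path are illustrative placeholders; only the 0.2 threshold matches
the value the patch adds.

    import whisper

    # Lower is more restrictive; same value as no_speech_prob_threshold above.
    no_speech_prob_threshold = 0.2

    model = whisper.load_model("base.en")                 # placeholder model size
    result = model.transcribe("sample.wav", fp16=False)   # placeholder input file

    # Keep only the segments Whisper thinks actually contain speech.
    kept = [seg["text"] for seg in result["segments"]
            if seg["no_speech_prob"] <= no_speech_prob_threshold]
    print("".join(kept).strip())

Whisper segment text usually carries a leading space, so plain concatenation
of the surviving segments keeps word boundaries intact, same as the
combined_text loop in the patch.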