From 457732e2ff7889a7c7b14a248948895d2375dac2 Mon Sep 17 00:00:00 2001
From: Kiri
Date: Fri, 22 Aug 2025 08:06:45 -0700
Subject: [PATCH] Tweaked the speech recognition energy threshold and other
 settings.

---
 audiosource.py | 26 ++++++++++++++++++++++---
 transcriber.py | 53 ++++++++++++++++++++++++++++++--------------------
 2 files changed, 55 insertions(+), 24 deletions(-)

diff --git a/audiosource.py b/audiosource.py
index 73e6855..848b487 100644
--- a/audiosource.py
+++ b/audiosource.py
@@ -8,9 +8,15 @@ import json
 import threading
 import speech_recognition
 import wave
+from datetime import datetime, timedelta
 
 from pyogg.opus_decoder import OpusDecoder
 
+#wave_out = wave.open("tmp/mic.wav", "wb")
+#wave_out.setnchannels(1)
+#wave_out.setframerate(16000)
+#wave_out.setsampwidth(2)
+
 class AudioSource:
 
     def __init__(self):
@@ -18,6 +24,17 @@ class AudioSource:
         # callback.
         self.data_queue = Queue()
 
+        self.time_of_last_input = datetime.utcnow()
+
+        self._data_mutex = threading.Lock()
+
+    def add_data(self, data):
+        with self._data_mutex:
+            self.time_of_last_input = datetime.utcnow()
+            self.data_queue.put(bytearray(data))
+            #wave_out.writeframes(data)
+
+
     def is_done(self):
         return True
 
@@ -34,7 +51,7 @@ class MicrophoneAudioSource(AudioSource):
         super().__init__()
 
         self._recorder = speech_recognition.Recognizer()
-        self._recorder.energy_threshold = 1200
+        self._recorder.energy_threshold = 50
 
         # Definitely do this, dynamic energy compensation lowers the energy
         # threshold dramatically to a point where the SpeechRecognizer
@@ -55,7 +72,9 @@ class MicrophoneAudioSource(AudioSource):
 
             # Grab the raw bytes and push it into the thread safe queue.
             data = audio.get_raw_data()
-            self.data_queue.put(bytearray(data))
+            self.time_of_last_input = datetime.utcnow()
+            #self.data_queue.put(bytearray(data))
+            self.add_data(data)
 
         # Create a background thread that will pass us raw audio bytes.
         # We could do this manually but SpeechRecognizer provides a nice helper.
@@ -151,7 +170,8 @@ class OpusStreamAudioSource(AudioSource):
                     # We need to copy decoded_data here or we end up with
                     # recycled buffers in our queue, which leads to broken
                     # audio.
-                    self.data_queue.put(bytearray(decoded_data))
+                    #self.data_queue.put(bytearray(decoded_data))
+                    self.add_data(decoded_data)
 
                 else:
                     break
diff --git a/transcriber.py b/transcriber.py
index 116924f..92cf61a 100644
--- a/transcriber.py
+++ b/transcriber.py
@@ -4,10 +4,10 @@ recent_phrase_count = 8
 
 # Seconds of silence before we start a new phrase.
-phrase_timeout = 3
+phrase_timeout = 1.0
 
 # Higher is less restrictive on what it lets pass through.
-no_speech_prob_threshold = 0.25 # 0.15
+no_speech_prob_threshold = 0.15 # 0.15
 
 # Minimum number of seconds before we fire off the model again.
 min_time_between_updates = 2
@@ -109,24 +109,12 @@ class Transcriber:
 
             # We got some new data. Let's process it!
 
-            # If enough time has passed between recordings, consider the
-            # last phrase complete and start a new one. Clear the current
-            # working audio buffer to start over with the new data.
-            if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout):
-
-                # Only add a new phrase if we actually have data in the last
-                # one.
-                with self._phrases_list_mutex:
-                    if self.phrases[-1] != "":
-                        self.phrases.append("")
-
-                self._current_data = b''
-
-            # Get all the new data since last tick,
+            # Get all the new data since last tick.
             new_data = []
-            while not self._audio_source.data_queue.empty():
-                new_packet = self._audio_source.data_queue.get()
-                new_data.append(new_packet)
+            with self._audio_source._data_mutex:
+                while not self._audio_source.data_queue.empty():
+                    new_packet = self._audio_source.data_queue.get()
+                    new_data.append(new_packet)
             new_data_joined = b''.join(new_data)
 
             # For debugging...
@@ -154,10 +142,20 @@
             with _audio_model_mutex:
                 #print("Transcribe start ", len(self._current_data))
                 result = _audio_model.transcribe(
-                    audio_np, fp16=torch.cuda.is_available())
+                    audio_np, fp16=torch.cuda.is_available(),
+                    word_timestamps=True,
+                    hallucination_silence_threshold=2)
                 #print("Transcribe end")
                 self._last_model_time = now
 
+            with self._phrases_list_mutex:
+                wave_out = wave.open("tmp/wave%0.4d.wav" % len(self.phrases), "wb")
+                wave_out.setnchannels(1)
+                wave_out.setframerate(16000)
+                wave_out.setsampwidth(2)
+                wave_out.writeframes(self._current_data)
+
+
             # Filter out text segments with a high no_speech_prob.
             combined_text = ""
             for seg in result["segments"]:
@@ -166,7 +164,7 @@
 
             text = combined_text.strip()
 
-            # FIXME:
+            # # FIXME:
             text = result["text"]
 
             with self._phrases_list_mutex:
@@ -177,6 +175,19 @@
                 # cause us to split phrases.
                 self._phrase_time = now
 
+            # If enough time has passed between recordings, consider the
+            # last phrase complete and start a new one. Clear the current
+            # working audio buffer to start over with the new data.
+            if now - self._audio_source.time_of_last_input > timedelta(seconds=phrase_timeout):
+
+                # Only add a new phrase if we actually have data in the last
+                # one.
+                with self._phrases_list_mutex:
+                    if self.phrases[-1] != "":
+                        self.phrases.append("")
+
+                self._current_data = b''
+
             # Automatically drop audio sources when we're finished with them.
             if self._audio_source.is_done():
                 self._audio_source = None
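
Review notes:

The per-phrase debug dump added to transcriber.py opens a new wave file on
every transcription pass but never closes it, so file handles accumulate for
the life of the process and each WAV header is only finalized at interpreter
exit. It also assumes tmp/ exists and that transcriber.py imports wave. A
minimal sketch of the same dump with an explicit close (the helper name
dump_debug_wave is made up for illustration):

    import os
    import wave

    def dump_debug_wave(phrase_index, data, channels=1, rate=16000, sample_width=2):
        # Create the output directory rather than assuming tmp/ exists.
        os.makedirs("tmp", exist_ok=True)
        path = "tmp/wave%0.4d.wav" % phrase_index
        # wave.open() works as a context manager, so the file is closed
        # (and its header finalized) even if writeframes() raises.
        with wave.open(path, "wb") as wave_out:
            wave_out.setnchannels(channels)
            wave_out.setframerate(rate)
            wave_out.setsampwidth(sample_width)
            wave_out.writeframes(data)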
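
The drain loop in Transcriber now reaches into self._audio_source._data_mutex,
a private member of AudioSource. Note that Queue is already thread safe; the
lock's real job is keeping time_of_last_input consistent with the queue
contents. One way to keep that detail encapsulated is a drain() method on
AudioSource itself (hypothetical, not part of this patch):

    def drain(self):
        # Atomically hand back everything queued so far, so callers never
        # have to touch _data_mutex directly.
        with self._data_mutex:
            chunks = []
            while not self.data_queue.empty():
                chunks.append(self.data_queue.get())
        return b''.join(chunks)

Transcriber's tick would then reduce to
new_data_joined = self._audio_source.drain(). Relatedly, the microphone
callback stamps time_of_last_input without the lock immediately before
add_data() stamps it again under the lock; the first assignment looks like a
leftover that could be dropped.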
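
On the transcribe() change: in recent openai-whisper releases,
hallucination_silence_threshold only takes effect when word_timestamps=True,
so passing the two together, as this patch does, is the consistent
combination.

Separately, the no_speech_prob filter remains dead code: the line under
"# # FIXME:" overwrites text with the unfiltered result["text"]. If the
filter is meant to be live again, the segment loop can produce the final text
directly (a sketch, assuming the usual openai-whisper result layout where
each segment dict carries "text" and "no_speech_prob" keys):

    kept = [seg["text"] for seg in result["segments"]
            if seg["no_speech_prob"] < no_speech_prob_threshold]
    text = "".join(kept).strip()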
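
Dropping energy_threshold from 1200 to 50 makes the recognizer trigger on
very quiet input, and with dynamic_energy_threshold disabled (as the
surrounding comment block recommends) that fixed value is all that separates
speech from room noise. An alternative is a one-time calibration at startup
(a sketch; the duration and sample_rate values are guesses, not from this
patch):

    recorder = speech_recognition.Recognizer()
    recorder.dynamic_energy_threshold = False
    with speech_recognition.Microphone(sample_rate=16000) as source:
        # Samples ambient noise and sets recorder.energy_threshold just
        # above it.
        recorder.adjust_for_ambient_noise(source, duration=1.0)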