Tweaked the speech recognition energy threshold, along with other tweaks.

2025-08-22 08:06:45 -07:00 · 2025-08-22 08:06:45 -07:00 · 457732e2ff
commit 457732e2ff
parent 2fcbd16bd9
2 changed files with 55 additions and 24 deletions
--- a/audiosource.py
+++ b/audiosource.py
@ -8,9 +8,15 @@ import json
 import threading
 import speech_recognition
 import wave
+from datetime import datetime, timedelta

 from pyogg.opus_decoder import OpusDecoder

+#wave_out = wave.open("tmp/mic.wav", "wb")
+#wave_out.setnchannels(1)
+#wave_out.setframerate(16000)
+#wave_out.setsampwidth(2)
+

 class AudioSource:
    def __init__(self):
@ -18,6 +24,17 @@ class AudioSource:
        # callback.
        self.data_queue = Queue()

+        self.time_of_last_input = datetime.utcnow()
+
+        self._data_mutex = threading.Lock()
+
+    def add_data(self, data):
+        with self._data_mutex:
+            self.time_of_last_input = datetime.utcnow()
+            self.data_queue.put(bytearray(data))
+            #wave_out.writeframes(data)
+
+
    def is_done(self):
        return True

@ -34,7 +51,7 @@ class MicrophoneAudioSource(AudioSource):
        super().__init__()
        
        self._recorder = speech_recognition.Recognizer()
-        self._recorder.energy_threshold = 1200
+        self._recorder.energy_threshold = 50

        # Definitely do this, dynamic energy compensation lowers the energy
        # threshold dramatically to a point where the SpeechRecognizer
@ -55,7 +72,9 @@ class MicrophoneAudioSource(AudioSource):

            # Grab the raw bytes and push it into the thread safe queue.
            data = audio.get_raw_data()
-            self.data_queue.put(bytearray(data))
+            self.time_of_last_input = datetime.utcnow()
+            #self.data_queue.put(bytearray(data))
+            self.add_data(data)

        # Create a background thread that will pass us raw audio bytes.
        # We could do this manually but SpeechRecognizer provides a nice helper.
@ -151,7 +170,8 @@ class OpusStreamAudioSource(AudioSource):
                    # We need to copy decoded_data here or we end up with
                    # recycled buffers in our queue, which leads to broken
                    # audio.
-                    self.data_queue.put(bytearray(decoded_data))
+                    #self.data_queue.put(bytearray(decoded_data))
+                    self.add_data(decoded_data)

                else:
                    break
--- a/transcriber.py
+++ b/transcriber.py
@ -4,10 +4,10 @@
 recent_phrase_count = 8

 # Seconds of silence before we start a new phrase.
-phrase_timeout = 3
+phrase_timeout = 1.0

 # Higher is less restrictive on what it lets pass through.
-no_speech_prob_threshold = 0.25 # 0.15
+no_speech_prob_threshold = 0.15 # 0.15

 # Minimum number of seconds before we fire off the model again.
 min_time_between_updates = 2
@ -109,21 +109,9 @@ class Transcriber:

            # We got some new data. Let's process it!

-            # If enough time has passed between recordings, consider the
-            # last phrase complete and start a new one. Clear the current
-            # working audio buffer to start over with the new data.
-            if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout):
-
-                # Only add a new phrase if we actually have data in the last
-                # one.
-                with self._phrases_list_mutex:
-                    if self.phrases[-1] != "":
-                        self.phrases.append("")
-
-                self._current_data = b''
-
-            # Get all the new data since last tick,
+            # Get all the new data since last tick.
            new_data = []
+            with self._audio_source._data_mutex:
                while not self._audio_source.data_queue.empty():
                    new_packet = self._audio_source.data_queue.get()
                    new_data.append(new_packet)
@ -154,10 +142,20 @@ class Transcriber:
            with _audio_model_mutex:
                #print("Transcribe start ", len(self._current_data))
                result = _audio_model.transcribe(
-                    audio_np, fp16=torch.cuda.is_available())
+                    audio_np, fp16=torch.cuda.is_available(),
+                    word_timestamps=True,
+                    hallucination_silence_threshold=2)
                #print("Transcribe end")
                self._last_model_time = now

+            with self._phrases_list_mutex:
+                wave_out = wave.open("tmp/wave%0.4d.wav" % len(self.phrases), "wb")
+                wave_out.setnchannels(1)
+                wave_out.setframerate(16000)
+                wave_out.setsampwidth(2)
+                wave_out.writeframes(self._current_data)
+
+
            # Filter out text segments with a high no_speech_prob.
            combined_text = ""
            for seg in result["segments"]:
@ -166,7 +164,7 @@ class Transcriber:

            text = combined_text.strip()

-            # FIXME:
+            # # FIXME:
            text = result["text"]

            with self._phrases_list_mutex:
@ -177,6 +175,19 @@ class Transcriber:
            # cause us to split phrases.
            self._phrase_time = now

+        # If enough time has passed between recordings, consider the
+        # last phrase complete and start a new one. Clear the current
+        # working audio buffer to start over with the new data.
+        if now - self._audio_source.time_of_last_input > timedelta(seconds=phrase_timeout):
+
+            # Only add a new phrase if we actually have data in the last
+            # one.
+            with self._phrases_list_mutex:
+                if self.phrases[-1] != "":
+                    self.phrases.append("")
+
+            self._current_data = b''
+
        # Automatically drop audio sources when we're finished with them.
        if self._audio_source.is_done():
            self._audio_source = None