Tweaked the speech recognition energy threshold and made other transcription adjustments.

Kiri 2025-08-22 08:06:45 -07:00
parent 2fcbd16bd9
commit 457732e2ff
2 changed files with 55 additions and 24 deletions

View File

@@ -8,9 +8,15 @@ import json
 import threading
 import speech_recognition
 import wave
+from datetime import datetime, timedelta
 from pyogg.opus_decoder import OpusDecoder
 
+#wave_out = wave.open("tmp/mic.wav", "wb")
+#wave_out.setnchannels(1)
+#wave_out.setframerate(16000)
+#wave_out.setsampwidth(2)
+
 class AudioSource:
 
     def __init__(self):
@@ -18,6 +24,17 @@ class AudioSource:
         # callback.
         self.data_queue = Queue()
+        self.time_of_last_input = datetime.utcnow()
+        self._data_mutex = threading.Lock()
+
+    def add_data(self, data):
+        with self._data_mutex:
+            self.time_of_last_input = datetime.utcnow()
+            self.data_queue.put(bytearray(data))
+            #wave_out.writeframes(data)
 
     def is_done(self):
         return True
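
For reference, a minimal standalone sketch of the pattern this hunk introduces: every producer pushes audio through add_data(), so the queue write and the time_of_last_input stamp always happen together under one lock. Names mirror the diff; the class is trimmed to just this mechanism.

import threading
from datetime import datetime
from queue import Queue

class AudioSource:
    def __init__(self):
        # Raw PCM chunks pushed from a producer thread (mic callback, Opus decoder, ...).
        self.data_queue = Queue()
        self.time_of_last_input = datetime.utcnow()
        self._data_mutex = threading.Lock()

    def add_data(self, data):
        # Copy the buffer and stamp the arrival time atomically with the enqueue,
        # so consumers can measure silence gaps without racing the producer.
        with self._data_mutex:
            self.time_of_last_input = datetime.utcnow()
            self.data_queue.put(bytearray(data))
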
@@ -34,7 +51,7 @@ class MicrophoneAudioSource(AudioSource):
         super().__init__()
 
         self._recorder = speech_recognition.Recognizer()
-        self._recorder.energy_threshold = 1200
+        self._recorder.energy_threshold = 50
 
         # Definitely do this, dynamic energy compensation lowers the energy
         # threshold dramatically to a point where the SpeechRecognizer
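
The threshold drop from 1200 to 50 makes the recognizer far more eager to treat input as speech; the surrounding comment explains why dynamic energy compensation stays off. A minimal sketch of that configuration with the speech_recognition package; the 16 kHz microphone rate is an assumption, not something this hunk shows.

import speech_recognition as sr

recorder = sr.Recognizer()
# Low fixed threshold: almost anything above the noise floor counts as speech.
recorder.energy_threshold = 50
# Leave dynamic energy compensation off; otherwise the library keeps lowering the
# threshold until background noise triggers endless phantom recordings.
recorder.dynamic_energy_threshold = False

# Assumed capture format for the rest of the pipeline: 16 kHz mono, 16-bit.
mic = sr.Microphone(sample_rate=16000)
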
@@ -55,7 +72,9 @@
             # Grab the raw bytes and push it into the thread safe queue.
             data = audio.get_raw_data()
-            self.data_queue.put(bytearray(data))
+            self.time_of_last_input = datetime.utcnow()
+            #self.data_queue.put(bytearray(data))
+            self.add_data(data)
 
         # Create a background thread that will pass us raw audio bytes.
         # We could do this manually but SpeechRecognizer provides a nice helper.
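
The callback itself now funnels into add_data() instead of touching data_queue directly. A trimmed sketch of how that background wiring looks with speech_recognition's listen_in_background(); audio_source stands for the AudioSource sketched earlier, and the redundant direct timestamp update from the diff is omitted.

import speech_recognition as sr

audio_source = AudioSource()

def record_callback(_recognizer, audio):
    # Runs on the library's capture thread once per detected chunk of speech;
    # get_raw_data() returns raw 16-bit PCM at the microphone's sample rate.
    audio_source.add_data(audio.get_raw_data())

recorder = sr.Recognizer()
recorder.energy_threshold = 50
recorder.dynamic_energy_threshold = False

# Spawns the background capture thread and returns a function that stops it.
stop_listening = recorder.listen_in_background(
    sr.Microphone(sample_rate=16000), record_callback)
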
@@ -151,7 +170,8 @@ class OpusStreamAudioSource(AudioSource):
                     # We need to copy decoded_data here or we end up with
                     # recycled buffers in our queue, which leads to broken
                     # audio.
-                    self.data_queue.put(bytearray(decoded_data))
+                    #self.data_queue.put(bytearray(decoded_data))
+                    self.add_data(decoded_data)
                 else:
                     break
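
The bytearray() copy that survives inside add_data() is load-bearing here, as the comment above notes: the decoder hands back a recycled buffer. A small self-contained illustration of the failure mode, with a plain reused buffer standing in for the decoder's output (this is not the pyogg API):

from queue import Queue

queue = Queue()
scratch = bytearray(4)                 # stands in for a buffer the decoder reuses

for value in (1, 2, 3):
    scratch[:] = bytes([value]) * 4

    # Wrong: queue.put(memoryview(scratch)) would enqueue a view of the recycled
    # buffer, so every queued item ends up showing whatever was decoded *last*.

    # Right: copy the bytes at enqueue time, exactly as add_data() does.
    queue.put(bytearray(scratch))

print([bytes(queue.get()) for _ in range(3)])
# -> [b'\x01\x01\x01\x01', b'\x02\x02\x02\x02', b'\x03\x03\x03\x03']
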

View File

@@ -4,10 +4,10 @@
 recent_phrase_count = 8
 
 # Seconds of silence before we start a new phrase.
-phrase_timeout = 3
+phrase_timeout = 1.0
 
 # Higher is less restrictive on what it lets pass through.
-no_speech_prob_threshold = 0.25 # 0.15
+no_speech_prob_threshold = 0.15 # 0.15
 
 # Minimum number of seconds before we fire off the model again.
 min_time_between_updates = 2
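
Operationally: a phrase now closes after 1.0 s of source silence instead of 3 s, and segments need no_speech_prob at or below 0.15 (previously 0.25) to survive the filter. A sketch of how that threshold is presumably applied, matching the "Filter out text segments with a high no_speech_prob" comment further down; the segment dicts mimic Whisper's output shape and are made up for the example.

no_speech_prob_threshold = 0.15

segments = [
    {"text": " Hello there.", "no_speech_prob": 0.02},      # kept
    {"text": " [background hiss]", "no_speech_prob": 0.61}, # dropped as probable non-speech
]

combined_text = ""
for seg in segments:
    if seg["no_speech_prob"] <= no_speech_prob_threshold:
        combined_text += seg["text"]

print(combined_text.strip())   # -> Hello there.
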
@@ -109,21 +109,9 @@ class Transcriber:
             # We got some new data. Let's process it!
 
-            # If enough time has passed between recordings, consider the
-            # last phrase complete and start a new one. Clear the current
-            # working audio buffer to start over with the new data.
-            if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout):
-                # Only add a new phrase if we actually have data in the last
-                # one.
-                with self._phrases_list_mutex:
-                    if self.phrases[-1] != "":
-                        self.phrases.append("")
-                self._current_data = b''
-
-            # Get all the new data since last tick,
+            # Get all the new data since last tick.
             new_data = []
-            while not self._audio_source.data_queue.empty():
-                new_packet = self._audio_source.data_queue.get()
-                new_data.append(new_packet)
+            with self._audio_source._data_mutex:
+                while not self._audio_source.data_queue.empty():
+                    new_packet = self._audio_source.data_queue.get()
+                    new_data.append(new_packet)
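
This is the consumer side of the new _data_mutex: the drain happens under the producer's lock so the byte snapshot and time_of_last_input describe the same instant. The same loop, lifted into a standalone helper (audio_source is an AudioSource as sketched earlier):

def drain_new_data(audio_source):
    new_data = []
    with audio_source._data_mutex:
        while not audio_source.data_queue.empty():
            new_data.append(audio_source.data_queue.get())
    # Concatenate the chunks into one raw PCM buffer for the transcriber.
    return b"".join(new_data)
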
@@ -154,10 +142,20 @@ class Transcriber:
             with _audio_model_mutex:
                 #print("Transcribe start ", len(self._current_data))
                 result = _audio_model.transcribe(
-                    audio_np, fp16=torch.cuda.is_available())
+                    audio_np, fp16=torch.cuda.is_available(),
+                    word_timestamps=True,
+                    hallucination_silence_threshold=2)
                 #print("Transcribe end")
 
             self._last_model_time = now
 
+            with self._phrases_list_mutex:
+                wave_out = wave.open("tmp/wave%0.4d.wav" % len(self.phrases), "wb")
+                wave_out.setnchannels(1)
+                wave_out.setframerate(16000)
+                wave_out.setsampwidth(2)
+                wave_out.writeframes(self._current_data)
+
             # Filter out text segments with a high no_speech_prob.
             combined_text = ""
             for seg in result["segments"]:
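
Assuming the openai-whisper package (the transcribe() keywords and no_speech_prob fields match it), a simplified sketch of the updated call plus the per-phrase debug WAV dump; the model choice, function wrapper, and 16-bit/16 kHz buffer format are assumptions, and the phrase-list lock from the diff is omitted.

import wave
import numpy as np
import torch
import whisper

_audio_model = whisper.load_model("base.en")   # model choice is an assumption

def transcribe_chunk(current_data, phrase_index):
    # Whisper wants mono float32 samples in [-1, 1]; the queue holds 16-bit PCM.
    audio_np = np.frombuffer(current_data, dtype=np.int16).astype(np.float32) / 32768.0

    result = _audio_model.transcribe(
        audio_np, fp16=torch.cuda.is_available(),
        word_timestamps=True,                # required for the hallucination heuristic
        hallucination_silence_threshold=2)   # skip silent gaps >2 s when a hallucination is suspected

    # Debug aid from the diff: dump the exact audio that produced this phrase.
    wave_out = wave.open("tmp/wave%0.4d.wav" % phrase_index, "wb")
    wave_out.setnchannels(1)
    wave_out.setframerate(16000)
    wave_out.setsampwidth(2)
    wave_out.writeframes(current_data)
    wave_out.close()

    return result
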
@@ -166,7 +164,7 @@
             text = combined_text.strip()
 
-            # FIXME:
+            # # FIXME:
             text = result["text"]
 
             with self._phrases_list_mutex:
@@ -177,6 +175,19 @@
                 # cause us to split phrases.
                 self._phrase_time = now
 
+            # If enough time has passed between recordings, consider the
+            # last phrase complete and start a new one. Clear the current
+            # working audio buffer to start over with the new data.
+            if now - self._audio_source.time_of_last_input > timedelta(seconds=phrase_timeout):
+                # Only add a new phrase if we actually have data in the last
+                # one.
+                with self._phrases_list_mutex:
+                    if self.phrases[-1] != "":
+                        self.phrases.append("")
+                self._current_data = b''
+
         # Automatically drop audio sources when we're finished with them.
         if self._audio_source.is_done():
             self._audio_source = None
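
The relocated silence check in isolation: a phrase now closes only once the audio source has been quiet for phrase_timeout seconds, instead of measuring from the last transcription pass. A self-contained sketch of the gate; phrases and current_data stand in for the Transcriber's state.

from datetime import datetime, timedelta

phrase_timeout = 1.0              # seconds of source silence that closes a phrase
phrases = ["so far so good"]      # running transcript, one entry per phrase
current_data = b"\x00\x00" * 160  # audio accumulated for the phrase in progress

# Pretend the audio source last produced data 2.5 s ago.
time_of_last_input = datetime.utcnow() - timedelta(seconds=2.5)

now = datetime.utcnow()
if now - time_of_last_input > timedelta(seconds=phrase_timeout):
    if phrases[-1] != "":         # only split when the last phrase has content
        phrases.append("")
    current_data = b""            # start the next phrase from fresh audio

print(phrases)                    # -> ['so far so good', '']
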