From 457732e2ff7889a7c7b14a248948895d2375dac2 Mon Sep 17 00:00:00 2001
From: Kiri
Date: Fri, 22 Aug 2025 08:06:45 -0700
Subject: [PATCH] Tweaked the speech recognition energy threshold and other
 settings.

---
 audiosource.py | 26 ++++++++++++++++++++++---
 transcriber.py | 53 ++++++++++++++++++++++++++++++--------------------
 2 files changed, 55 insertions(+), 24 deletions(-)

diff --git a/audiosource.py b/audiosource.py
index 73e6855..848b487 100644
--- a/audiosource.py
+++ b/audiosource.py
@@ -8,9 +8,15 @@ import json
 import threading
 import speech_recognition
 import wave
+from datetime import datetime, timedelta
 
 from pyogg.opus_decoder import OpusDecoder
 
+#wave_out = wave.open("tmp/mic.wav", "wb")
+#wave_out.setnchannels(1)
+#wave_out.setframerate(16000)
+#wave_out.setsampwidth(2)
+
 class AudioSource:
 
     def __init__(self):
@@ -18,6 +24,17 @@ class AudioSource:
         # callback.
         self.data_queue = Queue()
 
+        self.time_of_last_input = datetime.utcnow()
+
+        self._data_mutex = threading.Lock()
+
+    def add_data(self, data):
+        with self._data_mutex:
+            self.time_of_last_input = datetime.utcnow()
+            self.data_queue.put(bytearray(data))
+            #wave_out.writeframes(data)
+
+
     def is_done(self):
         return True
 
@@ -34,7 +51,7 @@ class MicrophoneAudioSource(AudioSource):
         super().__init__()
 
         self._recorder = speech_recognition.Recognizer()
-        self._recorder.energy_threshold = 1200
+        self._recorder.energy_threshold = 50
 
         # Definitely do this, dynamic energy compensation lowers the energy
         # threshold dramatically to a point where the SpeechRecognizer
@@ -55,7 +72,9 @@ class MicrophoneAudioSource(AudioSource):
 
             # Grab the raw bytes and push it into the thread safe queue.
             data = audio.get_raw_data()
-            self.data_queue.put(bytearray(data))
+            self.time_of_last_input = datetime.utcnow()
+            #self.data_queue.put(bytearray(data))
+            self.add_data(data)
 
         # Create a background thread that will pass us raw audio bytes.
         # We could do this manually but SpeechRecognizer provides a nice helper.
@@ -151,7 +170,8 @@ class OpusStreamAudioSource(AudioSource):
                     # We need to copy decoded_data here or we end up with
                     # recycled buffers in our queue, which leads to broken
                     # audio.
-                    self.data_queue.put(bytearray(decoded_data))
+                    #self.data_queue.put(bytearray(decoded_data))
+                    self.add_data(decoded_data)
 
                 else:
                     break
diff --git a/transcriber.py b/transcriber.py
index 116924f..92cf61a 100644
--- a/transcriber.py
+++ b/transcriber.py
@@ -4,10 +4,10 @@ recent_phrase_count = 8
 
 # Seconds of silence before we start a new phrase.
-phrase_timeout = 3
+phrase_timeout = 1.0
 
 # Higher is less restrictive on what it lets pass through.
-no_speech_prob_threshold = 0.25 # 0.15
+no_speech_prob_threshold = 0.15 # 0.15
 
 # Minimum number of seconds before we fire off the model again.
 min_time_between_updates = 2
@@ -109,24 +109,12 @@ class Transcriber:
 
             # We got some new data. Let's process it!
 
-            # If enough time has passed between recordings, consider the
-            # last phrase complete and start a new one. Clear the current
-            # working audio buffer to start over with the new data.
-            if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout):
-
-                # Only add a new phrase if we actually have data in the last
-                # one.
-                with self._phrases_list_mutex:
-                    if self.phrases[-1] != "":
-                        self.phrases.append("")
-
-                self._current_data = b''
-
-            # Get all the new data since last tick,
+            # Get all the new data since last tick.
             new_data = []
-            while not self._audio_source.data_queue.empty():
-                new_packet = self._audio_source.data_queue.get()
-                new_data.append(new_packet)
+            with self._audio_source._data_mutex:
+                while not self._audio_source.data_queue.empty():
+                    new_packet = self._audio_source.data_queue.get()
+                    new_data.append(new_packet)
             new_data_joined = b''.join(new_data)
 
             # For debugging...
@@ -154,10 +142,20 @@
             with _audio_model_mutex:
                 #print("Transcribe start ", len(self._current_data))
                 result = _audio_model.transcribe(
-                    audio_np, fp16=torch.cuda.is_available())
+                    audio_np, fp16=torch.cuda.is_available(),
+                    word_timestamps=True,
+                    hallucination_silence_threshold=2)
                 #print("Transcribe end")
                 self._last_model_time = now
 
+            with self._phrases_list_mutex:
+                wave_out = wave.open("tmp/wave%0.4d.wav" % len(self.phrases), "wb")
+                wave_out.setnchannels(1)
+                wave_out.setframerate(16000)
+                wave_out.setsampwidth(2)
+                wave_out.writeframes(self._current_data)
+
+
             # Filter out text segments with a high no_speech_prob.
             combined_text = ""
             for seg in result["segments"]:
@@ -166,7 +164,7 @@
 
             text = combined_text.strip()
 
-            # FIXME:
+            # # FIXME:
             text = result["text"]
 
             with self._phrases_list_mutex:
@@ -177,6 +175,19 @@
                 # cause us to split phrases.
                 self._phrase_time = now
 
+            # If enough time has passed between recordings, consider the
+            # last phrase complete and start a new one. Clear the current
+            # working audio buffer to start over with the new data.
+            if now - self._audio_source.time_of_last_input > timedelta(seconds=phrase_timeout):
+
+                # Only add a new phrase if we actually have data in the last
+                # one.
+                with self._phrases_list_mutex:
+                    if self.phrases[-1] != "":
+                        self.phrases.append("")
+
+                self._current_data = b''
+
             # Automatically drop audio sources when we're finished with them.
             if self._audio_source.is_done():
                 self._audio_source = None
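
Review notes:

The per-phrase debug dump added to transcriber.py opens a new wave file on
every transcription pass but never closes it, so file handles accumulate for
the life of the process and each WAV header is only finalized at interpreter
exit. It also assumes tmp/ exists and that transcriber.py imports wave. A
minimal sketch of the same dump with an explicit close (the helper name
dump_debug_wave is made up for illustration):

    import os
    import wave

    def dump_debug_wave(phrase_index, data, channels=1, rate=16000, sample_width=2):
        # Create the output directory rather than assuming tmp/ exists.
        os.makedirs("tmp", exist_ok=True)
        path = "tmp/wave%0.4d.wav" % phrase_index
        # wave.open() works as a context manager, so the file is closed
        # (and its header finalized) even if writeframes() raises.
        with wave.open(path, "wb") as wave_out:
            wave_out.setnchannels(channels)
            wave_out.setframerate(rate)
            wave_out.setsampwidth(sample_width)
            wave_out.writeframes(data)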
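
The drain loop in Transcriber now reaches into self._audio_source._data_mutex,
a private member of AudioSource. Note that Queue is already thread safe; the
lock's real job is keeping time_of_last_input consistent with the queue
contents. One way to keep that detail encapsulated is a drain() method on
AudioSource itself (hypothetical, not part of this patch):

    def drain(self):
        # Atomically hand back everything queued so far, so callers never
        # have to touch _data_mutex directly.
        with self._data_mutex:
            chunks = []
            while not self.data_queue.empty():
                chunks.append(self.data_queue.get())
        return b''.join(chunks)

Transcriber's tick would then reduce to
new_data_joined = self._audio_source.drain(). Relatedly, the microphone
callback stamps time_of_last_input without the lock immediately before
add_data() stamps it again under the lock; the first assignment looks like a
leftover that could be dropped.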
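
On the transcribe() change: in recent openai-whisper releases,
hallucination_silence_threshold only takes effect when word_timestamps=True,
so passing the two together, as this patch does, is the consistent
combination.

Separately, the no_speech_prob filter remains dead code: the line under
"# # FIXME:" overwrites text with the unfiltered result["text"]. If the
filter is meant to be live again, the segment loop can produce the final text
directly (a sketch, assuming the usual openai-whisper result layout where
each segment dict carries "text" and "no_speech_prob" keys):

    kept = [seg["text"] for seg in result["segments"]
            if seg["no_speech_prob"] < no_speech_prob_threshold]
    text = "".join(kept).strip()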
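
Dropping energy_threshold from 1200 to 50 makes the recognizer trigger on
very quiet input, and with dynamic_energy_threshold disabled (as the
surrounding comment block recommends) that fixed value is all that separates
speech from room noise. An alternative is a one-time calibration at startup
(a sketch; the duration and sample_rate values are guesses, not from this
patch):

    recorder = speech_recognition.Recognizer()
    recorder.dynamic_energy_threshold = False
    with speech_recognition.Microphone(sample_rate=16000) as source:
        # Samples ambient noise and sets recorder.energy_threshold just
        # above it.
        recorder.adjust_for_ambient_noise(source, duration=1.0)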