Tweaked the speech recognition energy threshold and made other transcription adjustments.

Kiri 2025-08-22 08:06:45 -07:00
parent 2fcbd16bd9
commit 457732e2ff
2 changed files with 55 additions and 24 deletions

View File

@@ -8,9 +8,15 @@ import json
 import threading
 import speech_recognition
 import wave
+from datetime import datetime, timedelta
 from pyogg.opus_decoder import OpusDecoder
 
+#wave_out = wave.open("tmp/mic.wav", "wb")
+#wave_out.setnchannels(1)
+#wave_out.setframerate(16000)
+#wave_out.setsampwidth(2)
+
 class AudioSource:
 
     def __init__(self):
@@ -18,6 +24,17 @@ class AudioSource:
         # callback.
         self.data_queue = Queue()
+        self.time_of_last_input = datetime.utcnow()
+        self._data_mutex = threading.Lock()
+
+    def add_data(self, data):
+        with self._data_mutex:
+            self.time_of_last_input = datetime.utcnow()
+            self.data_queue.put(bytearray(data))
+            #wave_out.writeframes(data)
 
     def is_done(self):
         return True
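
For reference, a minimal standalone sketch of the pattern this hunk introduces: every producer pushes audio through add_data(), so the queue write and the time_of_last_input stamp always happen together under one lock. Names mirror the diff; the class is trimmed to just this mechanism.

import threading
from datetime import datetime
from queue import Queue

class AudioSource:
    def __init__(self):
        # Raw PCM chunks pushed from a producer thread (mic callback, Opus decoder, ...).
        self.data_queue = Queue()
        self.time_of_last_input = datetime.utcnow()
        self._data_mutex = threading.Lock()

    def add_data(self, data):
        # Copy the buffer and stamp the arrival time atomically with the enqueue,
        # so consumers can measure silence gaps without racing the producer.
        with self._data_mutex:
            self.time_of_last_input = datetime.utcnow()
            self.data_queue.put(bytearray(data))
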
@@ -34,7 +51,7 @@ class MicrophoneAudioSource(AudioSource):
         super().__init__()
 
         self._recorder = speech_recognition.Recognizer()
-        self._recorder.energy_threshold = 1200
+        self._recorder.energy_threshold = 50
 
         # Definitely do this, dynamic energy compensation lowers the energy
         # threshold dramatically to a point where the SpeechRecognizer
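
The threshold drop from 1200 to 50 makes the recognizer far more eager to treat input as speech; the surrounding comment explains why dynamic energy compensation stays off. A minimal sketch of that configuration with the speech_recognition package; the 16 kHz microphone rate is an assumption, not something this hunk shows.

import speech_recognition as sr

recorder = sr.Recognizer()
# Low fixed threshold: almost anything above the noise floor counts as speech.
recorder.energy_threshold = 50
# Leave dynamic energy compensation off; otherwise the library keeps lowering the
# threshold until background noise triggers endless phantom recordings.
recorder.dynamic_energy_threshold = False

# Assumed capture format for the rest of the pipeline: 16 kHz mono, 16-bit.
mic = sr.Microphone(sample_rate=16000)
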
@@ -55,7 +72,9 @@
             # Grab the raw bytes and push it into the thread safe queue.
             data = audio.get_raw_data()
-            self.data_queue.put(bytearray(data))
+            self.time_of_last_input = datetime.utcnow()
+            #self.data_queue.put(bytearray(data))
+            self.add_data(data)
 
         # Create a background thread that will pass us raw audio bytes.
         # We could do this manually but SpeechRecognizer provides a nice helper.
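
The callback itself now funnels into add_data() instead of touching data_queue directly. A trimmed sketch of how that background wiring looks with speech_recognition's listen_in_background(); audio_source stands for the AudioSource sketched earlier, and the redundant direct timestamp update from the diff is omitted.

import speech_recognition as sr

audio_source = AudioSource()

def record_callback(_recognizer, audio):
    # Runs on the library's capture thread once per detected chunk of speech;
    # get_raw_data() returns raw 16-bit PCM at the microphone's sample rate.
    audio_source.add_data(audio.get_raw_data())

recorder = sr.Recognizer()
recorder.energy_threshold = 50
recorder.dynamic_energy_threshold = False

# Spawns the background capture thread and returns a function that stops it.
stop_listening = recorder.listen_in_background(
    sr.Microphone(sample_rate=16000), record_callback)
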
@@ -151,7 +170,8 @@ class OpusStreamAudioSource(AudioSource):
                     # We need to copy decoded_data here or we end up with
                     # recycled buffers in our queue, which leads to broken
                     # audio.
-                    self.data_queue.put(bytearray(decoded_data))
+                    #self.data_queue.put(bytearray(decoded_data))
+                    self.add_data(decoded_data)
                 else:
                     break
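
The bytearray() copy that survives inside add_data() is load-bearing here, as the comment above notes: the decoder hands back a recycled buffer. A small self-contained illustration of the failure mode, with a plain reused buffer standing in for the decoder's output (this is not the pyogg API):

from queue import Queue

queue = Queue()
scratch = bytearray(4)                 # stands in for a buffer the decoder reuses

for value in (1, 2, 3):
    scratch[:] = bytes([value]) * 4

    # Wrong: queue.put(memoryview(scratch)) would enqueue a view of the recycled
    # buffer, so every queued item ends up showing whatever was decoded *last*.

    # Right: copy the bytes at enqueue time, exactly as add_data() does.
    queue.put(bytearray(scratch))

print([bytes(queue.get()) for _ in range(3)])
# -> [b'\x01\x01\x01\x01', b'\x02\x02\x02\x02', b'\x03\x03\x03\x03']
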

View File

@@ -4,10 +4,10 @@
 recent_phrase_count = 8
 
 # Seconds of silence before we start a new phrase.
-phrase_timeout = 3
+phrase_timeout = 1.0
 
 # Higher is less restrictive on what it lets pass through.
-no_speech_prob_threshold = 0.25 # 0.15
+no_speech_prob_threshold = 0.15 # 0.15
 
 # Minimum number of seconds before we fire off the model again.
 min_time_between_updates = 2
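
Operationally: a phrase now closes after 1.0 s of source silence instead of 3 s, and segments need no_speech_prob at or below 0.15 (previously 0.25) to survive the filter. A sketch of how that threshold is presumably applied, matching the "Filter out text segments with a high no_speech_prob" comment further down; the segment dicts mimic Whisper's output shape and are made up for the example.

no_speech_prob_threshold = 0.15

segments = [
    {"text": " Hello there.", "no_speech_prob": 0.02},      # kept
    {"text": " [background hiss]", "no_speech_prob": 0.61}, # dropped as probable non-speech
]

combined_text = ""
for seg in segments:
    if seg["no_speech_prob"] <= no_speech_prob_threshold:
        combined_text += seg["text"]

print(combined_text.strip())   # -> Hello there.
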
@@ -109,21 +109,9 @@ class Transcriber:
             # We got some new data. Let's process it!
 
-            # If enough time has passed between recordings, consider the
-            # last phrase complete and start a new one. Clear the current
-            # working audio buffer to start over with the new data.
-            if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout):
-                # Only add a new phrase if we actually have data in the last
-                # one.
-                with self._phrases_list_mutex:
-                    if self.phrases[-1] != "":
-                        self.phrases.append("")
-                self._current_data = b''
-
-            # Get all the new data since last tick,
+            # Get all the new data since last tick.
             new_data = []
-            while not self._audio_source.data_queue.empty():
-                new_packet = self._audio_source.data_queue.get()
-                new_data.append(new_packet)
+            with self._audio_source._data_mutex:
+                while not self._audio_source.data_queue.empty():
+                    new_packet = self._audio_source.data_queue.get()
+                    new_data.append(new_packet)
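
This is the consumer side of the new _data_mutex: the drain happens under the producer's lock so the byte snapshot and time_of_last_input describe the same instant. The same loop, lifted into a standalone helper (audio_source is an AudioSource as sketched earlier):

def drain_new_data(audio_source):
    new_data = []
    with audio_source._data_mutex:
        while not audio_source.data_queue.empty():
            new_data.append(audio_source.data_queue.get())
    # Concatenate the chunks into one raw PCM buffer for the transcriber.
    return b"".join(new_data)
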
@@ -154,10 +142,20 @@ class Transcriber:
             with _audio_model_mutex:
                 #print("Transcribe start ", len(self._current_data))
                 result = _audio_model.transcribe(
-                    audio_np, fp16=torch.cuda.is_available())
+                    audio_np, fp16=torch.cuda.is_available(),
+                    word_timestamps=True,
+                    hallucination_silence_threshold=2)
                 #print("Transcribe end")
 
             self._last_model_time = now
 
+            with self._phrases_list_mutex:
+                wave_out = wave.open("tmp/wave%0.4d.wav" % len(self.phrases), "wb")
+                wave_out.setnchannels(1)
+                wave_out.setframerate(16000)
+                wave_out.setsampwidth(2)
+                wave_out.writeframes(self._current_data)
+
             # Filter out text segments with a high no_speech_prob.
             combined_text = ""
             for seg in result["segments"]:
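
Assuming the openai-whisper package (the transcribe() keywords and no_speech_prob fields match it), a simplified sketch of the updated call plus the per-phrase debug WAV dump; the model choice, function wrapper, and 16-bit/16 kHz buffer format are assumptions, and the phrase-list lock from the diff is omitted.

import wave
import numpy as np
import torch
import whisper

_audio_model = whisper.load_model("base.en")   # model choice is an assumption

def transcribe_chunk(current_data, phrase_index):
    # Whisper wants mono float32 samples in [-1, 1]; the queue holds 16-bit PCM.
    audio_np = np.frombuffer(current_data, dtype=np.int16).astype(np.float32) / 32768.0

    result = _audio_model.transcribe(
        audio_np, fp16=torch.cuda.is_available(),
        word_timestamps=True,                # required for the hallucination heuristic
        hallucination_silence_threshold=2)   # skip silent gaps >2 s when a hallucination is suspected

    # Debug aid from the diff: dump the exact audio that produced this phrase.
    wave_out = wave.open("tmp/wave%0.4d.wav" % phrase_index, "wb")
    wave_out.setnchannels(1)
    wave_out.setframerate(16000)
    wave_out.setsampwidth(2)
    wave_out.writeframes(current_data)
    wave_out.close()

    return result
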
@@ -166,7 +164,7 @@
             text = combined_text.strip()
 
-            # FIXME:
+            # # FIXME:
             text = result["text"]
 
             with self._phrases_list_mutex:
@@ -177,6 +175,19 @@
                 # cause us to split phrases.
                 self._phrase_time = now
 
+            # If enough time has passed between recordings, consider the
+            # last phrase complete and start a new one. Clear the current
+            # working audio buffer to start over with the new data.
+            if now - self._audio_source.time_of_last_input > timedelta(seconds=phrase_timeout):
+                # Only add a new phrase if we actually have data in the last
+                # one.
+                with self._phrases_list_mutex:
+                    if self.phrases[-1] != "":
+                        self.phrases.append("")
+                self._current_data = b''
+
         # Automatically drop audio sources when we're finished with them.
         if self._audio_source.is_done():
             self._audio_source = None
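
The relocated silence check in isolation: a phrase now closes only once the audio source has been quiet for phrase_timeout seconds, instead of measuring from the last transcription pass. A self-contained sketch of the gate; phrases and current_data stand in for the Transcriber's state.

from datetime import datetime, timedelta

phrase_timeout = 1.0              # seconds of source silence that closes a phrase
phrases = ["so far so good"]      # running transcript, one entry per phrase
current_data = b"\x00\x00" * 160  # audio accumulated for the phrase in progress

# Pretend the audio source last produced data 2.5 s ago.
time_of_last_input = datetime.utcnow() - timedelta(seconds=2.5)

now = datetime.utcnow()
if now - time_of_last_input > timedelta(seconds=phrase_timeout):
    if phrases[-1] != "":         # only split when the last phrase has content
        phrases.append("")
    current_data = b""            # start the next phrase from fresh audio

print(phrases)                    # -> ['so far so good', '']
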