More refactoring. Opus now talking to Whisper.

This commit is contained in:
Kiri 2025-08-17 10:28:42 -07:00
parent 25bc5d6663
commit aed7f70815
5 changed files with 257 additions and 153 deletions

176
audiosource.py Normal file
View File

@ -0,0 +1,176 @@
#!/usr/bin/python3
import socket
import select
import time
from queue import Queue
import json
import threading
import speech_recognition
import wave
from pyogg.opus_decoder import OpusDecoder
class AudioSource:
    """Base class for objects that produce raw PCM audio chunks.

    Subclasses push byte chunks into ``data_queue`` from their own worker
    threads; a consumer (e.g. the transcriber) drains the queue on its
    own schedule.
    """

    def __init__(self):
        # Thread-safe queue used to hand data from the threaded recording
        # callback over to the consumer.
        self.data_queue = Queue()

    def is_done(self):
        """Return True once this source will never produce more data."""
        # The bare base class has no backing stream, so it is always done.
        return True
# -----------------------------------------------------------------------------
# Microphone
# Maximum phrase length in seconds handed to listen_in_background() as
# phrase_time_limit; smaller values deliver chunks closer to real time.
record_timeout = 2
class MicrophoneAudioSource(AudioSource):
    """Audio source backed by the local microphone via SpeechRecognition.

    Recording starts immediately in __init__ on a background thread; raw
    16 kHz PCM chunks are pushed into ``data_queue``. Call stop() to end
    recording, after which is_done() returns True.
    """

    def __init__(self):
        super().__init__()
        self._recorder = speech_recognition.Recognizer()
        self._recorder.energy_threshold = 1200
        # Definitely do this: dynamic energy compensation lowers the energy
        # threshold dramatically to a point where the SpeechRecognizer
        # never stops recording.
        self._recorder.dynamic_energy_threshold = False
        self._source = speech_recognition.Microphone(sample_rate=16000)
        with self._source:
            self._recorder.adjust_for_ambient_noise(self._source)

        def record_callback(_, audio: speech_recognition.AudioData) -> None:
            """
            Threaded callback function to receive audio data when recordings finish.

            audio: An AudioData containing the recorded bytes.
            """
            # Grab the raw bytes and push a copy into the thread-safe queue.
            data = audio.get_raw_data()
            self.data_queue.put(bytearray(data))

        # Create a background thread that will pass us raw audio bytes.
        # We could do this manually but SpeechRecognizer provides a nice helper.
        self._stopper = self._recorder.listen_in_background(
            self._source, record_callback,
            phrase_time_limit=record_timeout)

    def stop(self):
        """Stop background recording and release the microphone.

        Idempotent: a second call is a no-op instead of tripping an
        assert (the original ``assert(self._stopper)`` also vanished
        under ``python -O``).
        """
        if self._stopper is None:
            return
        self._stopper()
        self._recorder = None
        self._stopper = None
        self._source = None

    def is_done(self):
        # Done once stop() has torn the recorder down. Use "is None"
        # (identity) rather than "== None".
        return self._recorder is None
# -----------------------------------------------------------------------------
# Opus stream
# For debugging
# wave_out = wave.open("wave2.wav", "wb")
# wave_out.setnchannels(1)
# wave_out.setframerate(16000)
# wave_out.setsampwidth(2)
class OpusStreamAudioSource(AudioSource):
    """Audio source that decodes an Opus stream arriving over a socket.

    Wire format: every packet is a 4-byte little-endian length prefix
    followed by that many payload bytes. The first packet is UTF-8 JSON
    user info; each subsequent packet is an Opus frame, which is decoded
    to 16 kHz mono PCM and pushed into ``data_queue``.
    """

    def __init__(self, sock):
        super().__init__()
        self._socket = sock

        # Decoder configured to match the sender: mono, 16 kHz.
        self._opus_decoder = OpusDecoder()
        self._opus_decoder.set_channels(1)
        self._opus_decoder.set_sampling_frequency(16000)

        # Fetch user info (always the first packet on the stream).
        user_info_tmp = self._read_packet(self._socket)
        self._user_info = json.loads(user_info_tmp.decode("utf-8"))
        print("User connection...")
        print(json.dumps(self._user_info, indent=4))

        self._is_done = False

        # Start input thread. Daemonized so a recv() blocked forever
        # cannot keep the process alive after the main thread exits.
        self._input_thread = threading.Thread(
            target=self._input_thread_func, daemon=True)
        self._input_thread.start()

    def _read_exactly(self, sock, count):
        """Read exactly ``count`` bytes from ``sock``.

        Raises ConnectionError if the peer closes the stream early.
        recv() returning b'' means EOF; the old one-byte-at-a-time loop
        never noticed mid-packet EOF and spun forever.
        """
        buf = b''
        while len(buf) < count:
            chunk = sock.recv(count - len(buf))
            if not chunk:
                raise ConnectionError("Socket closed mid-packet.")
            buf += chunk
        return buf

    def _read_packet(self, sock):
        """Read one length-prefixed packet and return its payload bytes.

        Returns None on any socket-level failure or disconnect, matching
        the original best-effort contract.
        """
        try:
            size_bytes = self._read_exactly(sock, 4)
            packet_size = int.from_bytes(size_bytes, "little")
            #print("Packet size: ", packet_size)
            return self._read_exactly(sock, packet_size)
        except OSError:
            # Socket-specific exception family (addresses the old FIXME):
            # ConnectionError is an OSError subclass.
            return None

    def _input_thread_func(self):
        """Pull packets off the socket, decode them, and enqueue PCM."""
        print("input thread start")
        try:
            while not self._is_done:
                next_packet = self._read_packet(self._socket)
                if not next_packet:
                    break
                # If we don't use bytearray here to copy, we run into a weird
                # exception about the memory not being writeable.
                decoded_data = self._opus_decoder.decode(bytearray(next_packet))
                # For debugging.
                #wave_out.writeframes(decoded_data)
                # We need to copy decoded_data here or we end up with
                # recycled buffers in our queue, which leads to broken
                # audio.
                self.data_queue.put(bytearray(decoded_data))
        except Exception:
            # Probably disconnected; decoder failures land here too.
            # Either way, just fall through to cleanup.
            pass
        print("input thread done")
        self._is_done = True

    def stop(self):
        """Request shutdown; the input thread notices and exits."""
        self._is_done = True
        # We won't join() the input thread because we don't want to sit around
        # and wait for a packet. It's a daemon thread; it dies on its own.

    def is_done(self):
        return self._is_done

0
diffstuff.py Normal file
View File

View File

@ -1,9 +1,8 @@
setuptools setuptools
pyaudio pyaudio
SpeechRecognition SpeechRecognition
--extra-index-url https://download.pytorch.org/whl/cu116 --extra-index-url https://download.pytorch.org/whl/rocm6.2.4
torch torch
numpy numpy
git+https://github.com/openai/whisper.git git+https://github.com/openai/whisper.git
git+https://github.com/TeamPyOgg/PyOgg.git@4118fc40067eb475468726c6bccf1242abfc24fc
pygame

View File

@ -26,12 +26,17 @@ import pygame
import wave import wave
#from pyogg.opus import OpusEncoder #from pyogg.opus import OpusEncoder
from pyogg.opus_decoder import OpusDecoder
import socket import socket
import select import select
import time import time
import json import json
import threading
import diffstuff
import audiosource
from transcriber import Transcriber
pygame_font_height = 16 pygame_font_height = 16
pygame.init() pygame.init()
@ -42,86 +47,6 @@ pygame_font = pygame.font.Font("/home/kiri/.fonts/Sigmar-Regular.ttf", pygame_fo
class AudioSource:
def __init__(self):
# Thread safe Queue for passing data from the threaded recording callback.
self.data_queue = Queue()
def is_done(self):
return True
class MicrophoneAudioSource(AudioSource):
def __init__(self):
super().__init__()
self._recorder = None
self._stopper = None
def start(self):
assert(self._recorder == None)
self._recorder = speech_recognition.Recognizer()
self._recorder.energy_threshold = 1200
# Definitely do this, dynamic energy compensation lowers the energy
# threshold dramatically to a point where the SpeechRecognizer
# never stops recording.
self._recorder.dynamic_energy_threshold = False
self._source = speech_recognition.Microphone(sample_rate=16000)
with self._source:
self._recorder.adjust_for_ambient_noise(self._source)
def record_callback(_, audio:speech_recognition.AudioData) -> None:
"""
Threaded callback function to receive audio data when recordings finish.
audio: An AudioData containing the recorded bytes.
"""
# Grab the raw bytes and push it into the thread safe queue.
data = audio.get_raw_data()
self.data_queue.put(data)
# Create a background thread that will pass us raw audio bytes.
# We could do this manually but SpeechRecognizer provides a nice helper.
self._stopper = self._recorder.listen_in_background(
self._source, record_callback,
phrase_time_limit=record_timeout)
def stop(self):
assert(self._stopper)
self._stopper()
self._recorder = None
self._stopper = None
self._source = None
def is_done(self):
return False
class OpusStreamAudioSource(AudioSource):
def __init__(self):
super().__init__()
def start(self):
self.opus_decoder = OpusDecoder()
self.opus_decoder.set_channels(1)
self.opus_decoder.set_sampling_frequency(16000)
def stop(self):
pass
def is_done(self):
return False
opus_server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) opus_server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
print("binding") print("binding")
opus_server_socket.bind(("127.0.0.1", 9967)) opus_server_socket.bind(("127.0.0.1", 9967))
@ -131,35 +56,14 @@ print("listening")
opus_server_socket.listen() opus_server_socket.listen()
def read_packet(sock):
try:
input_buffer = b''
print("Reading packet size...")
while len(input_buffer) < 4:
input_buffer = input_buffer + sock.recv(1)
if not input_buffer:
raise Exception("Failed to read size of packet.")
packet_size = int.from_bytes(input_buffer, "little")
print("Packet size: ", packet_size)
input_buffer = b''
while len(input_buffer) < packet_size:
input_buffer = input_buffer + sock.recv(1)
if not input_buffer:
raise Exception("Failed to read packet.")
return input_buffer
except Exception as e: # FIXME: Use socket-specific exception type.
return None
wave_out = wave.open("wave.wav", "wb") wave_out = wave.open("wave.wav", "wb")
wave_out.setnchannels(1) wave_out.setnchannels(1)
wave_out.setframerate(16000) wave_out.setframerate(16000)
wave_out.setsampwidth(2) wave_out.setsampwidth(2)
transcriber = Transcriber()
mic_source = audiosource.MicrophoneAudioSource()
while True: while True:
print("looping") print("looping")
@ -172,58 +76,18 @@ while True:
print(accepted_socket) print(accepted_socket)
break break
# Fetch user info. opusSource = audiosource.OpusStreamAudioSource(accepted_socket)
user_info = read_packet(accepted_socket)
user_info = json.loads(user_info.decode("utf-8"))
print("User connection...")
print(json.dumps(user_info, indent=4))
# Fire up decoder. transcriber.set_source(opusSource)
opus_decoder = OpusDecoder() while not opusSource.is_done():
print(dir(opus_decoder)) time.sleep(0.1)
opus_decoder.set_channels(1) transcriber.update()
opus_decoder.set_sampling_frequency(16000)
# Receive data.
while True:
next_packet = read_packet(accepted_socket)
#print("got packet: ", next_packet)
if next_packet:
# If we don't use bytearray here to copy, we run into a weird
# exception about the memory not being writeable.
decoded_data = opus_decoder.decode(bytearray(next_packet))
print(decoded_data)
wave_out.writeframes(decoded_data)
else:
print("DONE I THINK")
if not next_packet:
break
#b = accepted_socket.recv(32)
#print(b)
#if not b:
# break
exit(0) exit(0)
# while True:
# pygame_text_surface = pygame_font.render("Test test test", (0, 0, 0), (255, 255, 255))
# pygame_text_rect = pygame_text_surface.get_rect()
# pygame_text_rect.center = (640, 32)
# pygame_display_surface.fill((0, 0, 0))
# pygame_display_surface.blit(pygame_text_surface, pygame_text_rect)
# for event in pygame.event.get():
# if event.type == pygame.QUIT:
# pygame.quit()
# pygame.display.update()
# exit(0)
def onestepchange(start, dest): def onestepchange(start, dest):

65
transcriber.py Normal file
View File

@ -0,0 +1,65 @@
#!/usr/bin/python3
import numpy as np
import speech_recognition
import whisper
import torch
import wave
# Module-level Whisper model, loaded once at import time; "large" is the
# bigger/slower alternative noted by the original author.
_audio_model = whisper.load_model("medium.en") # "large"
# For debugging...
# wave_out = wave.open("wave.wav", "wb")
# wave_out.setnchannels(1)
# wave_out.setframerate(16000)
# wave_out.setsampwidth(2)
class Transcriber:
    """Feeds audio from an AudioSource-like object through Whisper.

    Attach a source with set_source() — anything exposing ``data_queue``
    and ``is_done()`` — then call update() periodically to drain new
    audio and re-transcribe the accumulated phrase.
    """

    def __init__(self):
        # Active audio source (None when idle).
        self._audio_source = None
        # Raw 16-bit PCM accumulated for the current phrase.
        self._current_data = b''

    def set_source(self, source):
        """Attach the audio source to transcribe from."""
        self._audio_source = source

    def update(self):
        """Drain queued audio, re-run transcription, drop finished sources."""
        source = self._audio_source
        if source is None:
            return
        if not source.data_queue.empty():
            # We got some new data. Let's process it!
            chunks = []
            while not source.data_queue.empty():
                chunks.append(source.data_queue.get())
            new_data_joined = b''.join(chunks)
            # For debugging...
            #wave_out.writeframes(new_data_joined)
            self._current_data += new_data_joined
            # Convert the in-RAM buffer to something the model can use
            # directly without needing a temp file: 16-bit wide integers
            # become float32 scaled by the PCM full-scale value 32768.
            audio_np = np.frombuffer(
                self._current_data, dtype=np.int16).astype(np.float32) / 32768.0
            # Run the transcription model, and extract the text.
            result = _audio_model.transcribe(
                audio_np, fp16=torch.cuda.is_available())
            print("text now: ", result['text'].strip())
        # Automatically drop audio sources when we're finished with them.
        if source.is_done():
            self._audio_source = None