More refactoring. Opus now talking to Whisper.
This commit is contained in:
parent
25bc5d6663
commit
aed7f70815
176
audiosource.py
Normal file
176
audiosource.py
Normal file
@ -0,0 +1,176 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import socket
|
||||
import select
|
||||
import time
|
||||
from queue import Queue
|
||||
import json
|
||||
import threading
|
||||
import speech_recognition
|
||||
import wave
|
||||
|
||||
from pyogg.opus_decoder import OpusDecoder
|
||||
|
||||
|
||||
class AudioSource:
    """Base class for audio inputs that feed raw PCM byte buffers to a consumer."""

    def __init__(self):
        # Thread-safe queue used to hand audio buffers from a producer
        # thread (recording callback, network reader, ...) to whoever
        # consumes this source.
        self.data_queue = Queue()

    def is_done(self):
        """Return True when this source will produce no further audio."""
        # The base source never produces anything, so it is always finished.
        return True
||||
|
||||
# -----------------------------------------------------------------------------
# Microphone

# Length, in seconds, of each background recording chunk. Shorter chunks
# make the pipeline behave closer to real time.
record_timeout = 2
|
||||
|
||||
class MicrophoneAudioSource(AudioSource):
    """Audio source recording from the local microphone.

    Uses SpeechRecognition's background listener to push raw 16 kHz PCM
    chunks onto ``data_queue`` from a helper thread.
    """

    def __init__(self):
        super().__init__()

        # Initialize up front so stop()/is_done() behave sanely even if
        # setup below raises partway through.
        self._stopper = None

        self._recorder = speech_recognition.Recognizer()
        self._recorder.energy_threshold = 1200

        # Definitely do this, dynamic energy compensation lowers the energy
        # threshold dramatically to a point where the SpeechRecognizer
        # never stops recording.
        self._recorder.dynamic_energy_threshold = False

        self._source = speech_recognition.Microphone(sample_rate=16000)

        with self._source:
            self._recorder.adjust_for_ambient_noise(self._source)

        def record_callback(_, audio: speech_recognition.AudioData) -> None:
            """
            Threaded callback function to receive audio data when recordings finish.
            audio: An AudioData containing the recorded bytes.
            """
            # Grab the raw bytes and push a copy into the thread safe queue.
            data = audio.get_raw_data()
            self.data_queue.put(bytearray(data))

        # Create a background thread that will pass us raw audio bytes.
        # We could do this manually but SpeechRecognizer provides a nice helper.
        self._stopper = self._recorder.listen_in_background(
            self._source, record_callback,
            phrase_time_limit=record_timeout)

    def stop(self):
        """Stop background recording and release the recorder objects."""
        assert self._stopper is not None
        self._stopper()

        self._recorder = None
        self._stopper = None
        self._source = None

    def is_done(self):
        """True once stop() has torn the recorder down."""
        # Was "== None"; identity comparison is the correct idiom for None.
        return self._recorder is None
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Opus stream
|
||||
|
||||
# For debugging
|
||||
# wave_out = wave.open("wave2.wav", "wb")
|
||||
# wave_out.setnchannels(1)
|
||||
# wave_out.setframerate(16000)
|
||||
# wave_out.setsampwidth(2)
|
||||
|
||||
class OpusStreamAudioSource(AudioSource):
    """Audio source fed by a socket carrying length-prefixed Opus packets.

    The first packet on the socket must be a UTF-8 JSON blob describing the
    user. Subsequent packets are Opus frames, decoded to 16 kHz mono PCM on
    a background thread and pushed onto ``data_queue``.
    """

    def __init__(self, sock):
        super().__init__()

        self._socket = sock

        self._opus_decoder = OpusDecoder()
        self._opus_decoder.set_channels(1)
        self._opus_decoder.set_sampling_frequency(16000)

        # Fetch user info.
        user_info_tmp = self._read_packet(self._socket)
        self._user_info = json.loads(user_info_tmp.decode("utf-8"))
        print("User connection...")
        print(json.dumps(self._user_info, indent=4))

        self._is_done = False

        # Start input thread.
        self._input_thread = threading.Thread(
            target=self._input_thread_func, daemon=True)
        self._input_thread.start()

    def _read_exact(self, sock, count):
        """Read exactly count bytes from sock; None on disconnect/error.

        Returns b'' when count is 0, matching the old loop's behavior for
        zero-length packets.
        """
        buf = b''
        try:
            while len(buf) < count:
                chunk = sock.recv(count - len(buf))
                if not chunk:
                    # Peer closed the connection. The previous version
                    # tested the *accumulated* buffer here, so an EOF that
                    # arrived mid-packet spun forever in a busy loop.
                    return None
                buf = buf + chunk
        except OSError:
            # Socket-level failure (reset, timeout, ...). Treat like EOF.
            # (Addresses the old FIXME about using a socket-specific
            # exception type.)
            return None
        return buf

    def _read_packet(self, sock):
        """Read one length-prefixed packet; None on disconnect/error."""
        # 4-byte little-endian size header, then the payload.
        size_bytes = self._read_exact(sock, 4)
        if size_bytes is None:
            return None
        packet_size = int.from_bytes(size_bytes, "little")
        return self._read_exact(sock, packet_size)

    def _input_thread_func(self):
        """Background loop: read packets, decode to PCM, enqueue buffers."""
        print("input thread start")
        try:
            while not self._is_done:

                next_packet = self._read_packet(self._socket)
                if not next_packet:
                    break

                # If we don't use bytearray here to copy, we run into a weird
                # exception about the memory not being writeable.
                decoded_data = self._opus_decoder.decode(bytearray(next_packet))

                # For debugging.
                #wave_out.writeframes(decoded_data)

                # We need to copy decoded_data here or we end up with
                # recycled buffers in our queue, which leads to broken
                # audio.
                self.data_queue.put(bytearray(decoded_data))

        except Exception:
            # Probably disconnected (or the decoder choked). We don't care.
            # Kept broad on purpose so a failure here still falls through to
            # the cleanup below instead of killing the thread uncleanly.
            pass

        print("input thread done")

        self._is_done = True

    def stop(self):
        """Request shutdown; the input thread notices and exits on its own."""
        self._is_done = True

        # We won't join() the input thread because we don't want to sit around
        # and wait for a packet. It'll die on its own, so whatever.

    def is_done(self):
        """True once the stream disconnected or stop() was called."""
        return self._is_done
|
||||
|
0
diffstuff.py
Normal file
0
diffstuff.py
Normal file
@ -1,9 +1,8 @@
|
||||
setuptools
|
||||
pyaudio
|
||||
SpeechRecognition
|
||||
--extra-index-url https://download.pytorch.org/whl/cu116
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
|
||||
torch
|
||||
numpy
|
||||
git+https://github.com/openai/whisper.git
|
||||
|
||||
pygame
|
||||
git+https://github.com/TeamPyOgg/PyOgg.git@4118fc40067eb475468726c6bccf1242abfc24fc
|
@ -26,12 +26,17 @@ import pygame
|
||||
|
||||
import wave
|
||||
#from pyogg.opus import OpusEncoder
|
||||
from pyogg.opus_decoder import OpusDecoder
|
||||
|
||||
import socket
|
||||
import select
|
||||
import time
|
||||
import json
|
||||
import threading
|
||||
|
||||
|
||||
import diffstuff
|
||||
import audiosource
|
||||
from transcriber import Transcriber
|
||||
|
||||
pygame_font_height = 16
|
||||
pygame.init()
|
||||
@ -42,86 +47,6 @@ pygame_font = pygame.font.Font("/home/kiri/.fonts/Sigmar-Regular.ttf", pygame_fo
|
||||
|
||||
|
||||
|
||||
class AudioSource:
|
||||
def __init__(self):
|
||||
# Thread safe Queue for passing data from the threaded recording callback.
|
||||
self.data_queue = Queue()
|
||||
|
||||
def is_done(self):
|
||||
return True
|
||||
|
||||
class MicrophoneAudioSource(AudioSource):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
self._recorder = None
|
||||
self._stopper = None
|
||||
|
||||
def start(self):
|
||||
assert(self._recorder == None)
|
||||
|
||||
self._recorder = speech_recognition.Recognizer()
|
||||
self._recorder.energy_threshold = 1200
|
||||
|
||||
# Definitely do this, dynamic energy compensation lowers the energy
|
||||
# threshold dramatically to a point where the SpeechRecognizer
|
||||
# never stops recording.
|
||||
self._recorder.dynamic_energy_threshold = False
|
||||
|
||||
self._source = speech_recognition.Microphone(sample_rate=16000)
|
||||
|
||||
|
||||
with self._source:
|
||||
self._recorder.adjust_for_ambient_noise(self._source)
|
||||
|
||||
def record_callback(_, audio:speech_recognition.AudioData) -> None:
|
||||
"""
|
||||
Threaded callback function to receive audio data when recordings finish.
|
||||
audio: An AudioData containing the recorded bytes.
|
||||
"""
|
||||
|
||||
# Grab the raw bytes and push it into the thread safe queue.
|
||||
data = audio.get_raw_data()
|
||||
self.data_queue.put(data)
|
||||
|
||||
# Create a background thread that will pass us raw audio bytes.
|
||||
# We could do this manually but SpeechRecognizer provides a nice helper.
|
||||
self._stopper = self._recorder.listen_in_background(
|
||||
self._source, record_callback,
|
||||
phrase_time_limit=record_timeout)
|
||||
|
||||
def stop(self):
|
||||
assert(self._stopper)
|
||||
self._stopper()
|
||||
|
||||
self._recorder = None
|
||||
self._stopper = None
|
||||
self._source = None
|
||||
|
||||
def is_done(self):
|
||||
return False
|
||||
|
||||
class OpusStreamAudioSource(AudioSource):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def start(self):
|
||||
self.opus_decoder = OpusDecoder()
|
||||
self.opus_decoder.set_channels(1)
|
||||
self.opus_decoder.set_sampling_frequency(16000)
|
||||
|
||||
|
||||
|
||||
def stop(self):
|
||||
pass
|
||||
|
||||
|
||||
def is_done(self):
|
||||
return False
|
||||
|
||||
|
||||
opus_server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
print("binding")
|
||||
opus_server_socket.bind(("127.0.0.1", 9967))
|
||||
@ -131,35 +56,14 @@ print("listening")
|
||||
opus_server_socket.listen()
|
||||
|
||||
|
||||
def read_packet(sock):
|
||||
try:
|
||||
input_buffer = b''
|
||||
print("Reading packet size...")
|
||||
while len(input_buffer) < 4:
|
||||
input_buffer = input_buffer + sock.recv(1)
|
||||
if not input_buffer:
|
||||
raise Exception("Failed to read size of packet.")
|
||||
|
||||
packet_size = int.from_bytes(input_buffer, "little")
|
||||
print("Packet size: ", packet_size)
|
||||
|
||||
input_buffer = b''
|
||||
while len(input_buffer) < packet_size:
|
||||
input_buffer = input_buffer + sock.recv(1)
|
||||
if not input_buffer:
|
||||
raise Exception("Failed to read packet.")
|
||||
|
||||
return input_buffer
|
||||
|
||||
except Exception as e: # FIXME: Use socket-specific exception type.
|
||||
return None
|
||||
|
||||
|
||||
wave_out = wave.open("wave.wav", "wb")
|
||||
wave_out.setnchannels(1)
|
||||
wave_out.setframerate(16000)
|
||||
wave_out.setsampwidth(2)
|
||||
|
||||
transcriber = Transcriber()
|
||||
mic_source = audiosource.MicrophoneAudioSource()
|
||||
|
||||
while True:
|
||||
print("looping")
|
||||
|
||||
@ -172,58 +76,18 @@ while True:
|
||||
print(accepted_socket)
|
||||
break
|
||||
|
||||
# Fetch user info.
|
||||
user_info = read_packet(accepted_socket)
|
||||
user_info = json.loads(user_info.decode("utf-8"))
|
||||
print("User connection...")
|
||||
print(json.dumps(user_info, indent=4))
|
||||
opusSource = audiosource.OpusStreamAudioSource(accepted_socket)
|
||||
|
||||
# Fire up decoder.
|
||||
opus_decoder = OpusDecoder()
|
||||
print(dir(opus_decoder))
|
||||
opus_decoder.set_channels(1)
|
||||
opus_decoder.set_sampling_frequency(16000)
|
||||
|
||||
# Receive data.
|
||||
while True:
|
||||
next_packet = read_packet(accepted_socket)
|
||||
#print("got packet: ", next_packet)
|
||||
|
||||
if next_packet:
|
||||
# If we don't use bytearray here to copy, we run into a weird
|
||||
# exception about the memory not being writeable.
|
||||
decoded_data = opus_decoder.decode(bytearray(next_packet))
|
||||
print(decoded_data)
|
||||
wave_out.writeframes(decoded_data)
|
||||
else:
|
||||
print("DONE I THINK")
|
||||
|
||||
if not next_packet:
|
||||
break
|
||||
#b = accepted_socket.recv(32)
|
||||
#print(b)
|
||||
#if not b:
|
||||
# break
|
||||
transcriber.set_source(opusSource)
|
||||
while not opusSource.is_done():
|
||||
time.sleep(0.1)
|
||||
transcriber.update()
|
||||
|
||||
|
||||
exit(0)
|
||||
|
||||
|
||||
|
||||
# while True:
|
||||
# pygame_text_surface = pygame_font.render("Test test test", (0, 0, 0), (255, 255, 255))
|
||||
# pygame_text_rect = pygame_text_surface.get_rect()
|
||||
# pygame_text_rect.center = (640, 32)
|
||||
# pygame_display_surface.fill((0, 0, 0))
|
||||
# pygame_display_surface.blit(pygame_text_surface, pygame_text_rect)
|
||||
|
||||
# for event in pygame.event.get():
|
||||
# if event.type == pygame.QUIT:
|
||||
# pygame.quit()
|
||||
|
||||
# pygame.display.update()
|
||||
|
||||
# exit(0)
|
||||
|
||||
def onestepchange(start, dest):
|
||||
|
||||
|
65
transcriber.py
Normal file
65
transcriber.py
Normal file
@ -0,0 +1,65 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import numpy as np
|
||||
import speech_recognition
|
||||
import whisper
|
||||
import torch
|
||||
import wave
|
||||
|
||||
# Whisper model shared by all Transcriber instances, loaded once at import
# time. ("large" is the heavier alternative.)
_audio_model = whisper.load_model("medium.en")

# For debugging...
# wave_out = wave.open("wave.wav", "wb")
# wave_out.setnchannels(1)
# wave_out.setframerate(16000)
# wave_out.setsampwidth(2)
|
||||
|
||||
class Transcriber:
    """Pulls PCM audio from an AudioSource and transcribes it with Whisper.

    Each update() drains whatever the source has queued, appends it to the
    running phrase buffer, re-transcribes the whole phrase, and prints the
    text so far. Finished sources are dropped automatically.
    """

    def __init__(self):
        self._audio_source = None

        # Raw 16-bit PCM accumulated for the current phrase.
        self._current_data = b''

    def set_source(self, source):
        """Attach the AudioSource that update() should drain."""
        self._audio_source = source

    def update(self):
        """Drain pending audio, retranscribe, and drop finished sources."""
        source = self._audio_source
        if not source:
            return

        if not source.data_queue.empty():
            # We got some new data. Let's process it!
            chunks = []
            while not source.data_queue.empty():
                chunks.append(source.data_queue.get())

            joined = b''.join(chunks)

            # For debugging...
            #wave_out.writeframes(joined)

            self._current_data = self._current_data + joined

            # Convert in-ram buffer to something the model can use
            # directly without needing a temp file. Convert data from 16
            # bit wide integers to floating point with a width of 32
            # bits. Clamp the audio stream frequency to a PCM wavelength
            # compatible default of 32768hz max.
            audio_np = np.frombuffer(
                self._current_data, dtype=np.int16).astype(np.float32) / 32768.0

            # Run the transcription model, and extract the text.
            result = _audio_model.transcribe(
                audio_np, fp16=torch.cuda.is_available())

            text = result['text'].strip()

            print("text now: ", text)

        # Automatically drop audio sources when we're finished with them.
        if source.is_done():
            self._audio_source = None
Loading…
Reference in New Issue
Block a user