From 25bc5d66633bf560cdb51ccf7c924bd83d6be5f7 Mon Sep 17 00:00:00 2001 From: Kiri Date: Sat, 16 Aug 2025 22:51:13 -0700 Subject: [PATCH] More multi-user work. --- requirements2.txt | 1 + transcribe_demo.py | 158 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 146 insertions(+), 13 deletions(-) diff --git a/requirements2.txt b/requirements2.txt index bf909eb..87797c1 100644 --- a/requirements2.txt +++ b/requirements2.txt @@ -5,3 +5,4 @@ SpeechRecognition torch numpy git+https://github.com/openai/whisper.git +git+https://github.com/TeamPyOgg/PyOgg.git@4118fc40067eb475468726c6bccf1242abfc24fc \ No newline at end of file diff --git a/transcribe_demo.py b/transcribe_demo.py index edc824b..546dfef 100644 --- a/transcribe_demo.py +++ b/transcribe_demo.py @@ -24,6 +24,14 @@ import difflib import pygame +import wave +#from pyogg.opus import OpusEncoder +from pyogg.opus_decoder import OpusDecoder + +import socket +import select +import time +import json pygame_font_height = 16 pygame.init() @@ -39,43 +47,166 @@ class AudioSource: # Thread safe Queue for passing data from the threaded recording callback. self.data_queue = Queue() + def is_done(self): + return True + class MicrophoneAudioSource(AudioSource): + def __init__(self): super().__init__() + + self._recorder = None + self._stopper = None - self.recorder = speech_recognition.Recognizer() - self.recorder.energy_threshold = 1000 + def start(self): + assert(self._recorder == None) + + self._recorder = speech_recognition.Recognizer() + self._recorder.energy_threshold = 1200 # Definitely do this, dynamic energy compensation lowers the energy # threshold dramatically to a point where the SpeechRecognizer # never stops recording. - self.recorder.dynamic_energy_threshold = False + self._recorder.dynamic_energy_threshold = False - self.source = speech_recognition.Microphone(sample_rate=16000) + self._source = speech_recognition.Microphone(sample_rate=16000) - with self.source: - self.recorder.adjust_for_ambient_noise(self.source) + with self._source: + self._recorder.adjust_for_ambient_noise(self._source) def record_callback(_, audio:speech_recognition.AudioData) -> None: """ Threaded callback function to receive audio data when recordings finish. audio: An AudioData containing the recorded bytes. """ + # Grab the raw bytes and push it into the thread safe queue. - - print("GOT SOME DATA!!!") - data = audio.get_raw_data() self.data_queue.put(data) # Create a background thread that will pass us raw audio bytes. # We could do this manually but SpeechRecognizer provides a nice helper. - self.recorder.listen_in_background(self.source, record_callback, phrase_time_limit=record_timeout) + self._stopper = self._recorder.listen_in_background( + self._source, record_callback, + phrase_time_limit=record_timeout) - print("--------------------------------------------------------------") - print("Done setting up mic!") - print("--------------------------------------------------------------") + def stop(self): + assert(self._stopper) + self._stopper() + + self._recorder = None + self._stopper = None + self._source = None + + def is_done(self): + return False + +class OpusStreamAudioSource(AudioSource): + + def __init__(self): + super().__init__() + + def start(self): + self.opus_decoder = OpusDecoder() + self.opus_decoder.set_channels(1) + self.opus_decoder.set_sampling_frequency(16000) + + + + def stop(self): + pass + + + def is_done(self): + return False + + +opus_server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +print("binding") +opus_server_socket.bind(("127.0.0.1", 9967)) +print("set non blocking") +#opus_server_socket.setblocking(False) +print("listening") +opus_server_socket.listen() + + +def read_packet(sock): + try: + input_buffer = b'' + print("Reading packet size...") + while len(input_buffer) < 4: + input_buffer = input_buffer + sock.recv(1) + if not input_buffer: + raise Exception("Failed to read size of packet.") + + packet_size = int.from_bytes(input_buffer, "little") + print("Packet size: ", packet_size) + + input_buffer = b'' + while len(input_buffer) < packet_size: + input_buffer = input_buffer + sock.recv(1) + if not input_buffer: + raise Exception("Failed to read packet.") + + return input_buffer + + except Exception as e: # FIXME: Use socket-specific exception type. + return None + + +wave_out = wave.open("wave.wav", "wb") +wave_out.setnchannels(1) +wave_out.setframerate(16000) +wave_out.setsampwidth(2) + +while True: + print("looping") + + # Wait for a connection. + while True: + s = select.select([opus_server_socket], [], [], 0) + time.sleep(0.01) + if len(s[0]): + accepted_socket, addr = opus_server_socket.accept() + print(accepted_socket) + break + + # Fetch user info. + user_info = read_packet(accepted_socket) + user_info = json.loads(user_info.decode("utf-8")) + print("User connection...") + print(json.dumps(user_info, indent=4)) + + # Fire up decoder. + opus_decoder = OpusDecoder() + print(dir(opus_decoder)) + opus_decoder.set_channels(1) + opus_decoder.set_sampling_frequency(16000) + + # Receive data. + while True: + next_packet = read_packet(accepted_socket) + #print("got packet: ", next_packet) + + if next_packet: + # If we don't use bytearray here to copy, we run into a weird + # exception about the memory not being writeable. + decoded_data = opus_decoder.decode(bytearray(next_packet)) + print(decoded_data) + wave_out.writeframes(decoded_data) + else: + print("DONE I THINK") + + if not next_packet: + break + #b = accepted_socket.recv(32) + #print(b) + #if not b: + # break + + +exit(0) @@ -169,6 +300,7 @@ def main(): output_text = [""] mic_audio_source = MicrophoneAudioSource() + mic_audio_source.start() data_queue = mic_audio_source.data_queue # Rolling audio input buffer.