diff --git a/audiosource.py b/audiosource.py
new file mode 100644
index 0000000..05fe474
--- /dev/null
+++ b/audiosource.py
@@ -0,0 +1,176 @@
+#!/usr/bin/python3
+
+import socket
+import select
+import time
+from queue import Queue
+import json
+import threading
+import speech_recognition
+import wave
+
+from pyogg.opus_decoder import OpusDecoder
+
+
+class AudioSource:
+    def __init__(self):
+        # Thread safe Queue for passing data from the threaded recording
+        # callback.
+        self.data_queue = Queue()
+
+    def is_done(self):
+        return True
+
+
+# -----------------------------------------------------------------------------
+# Microphone
+
+# How real time the recording is in seconds.
+record_timeout = 2
+
+class MicrophoneAudioSource(AudioSource):
+
+    def __init__(self):
+        super().__init__()
+
+        self._recorder = speech_recognition.Recognizer()
+        self._recorder.energy_threshold = 1200
+
+        # Definitely do this, dynamic energy compensation lowers the energy
+        # threshold dramatically to a point where the SpeechRecognizer
+        # never stops recording.
+        self._recorder.dynamic_energy_threshold = False
+
+        self._source = speech_recognition.Microphone(sample_rate=16000)
+
+
+        with self._source:
+            self._recorder.adjust_for_ambient_noise(self._source)
+
+        def record_callback(_, audio:speech_recognition.AudioData) -> None:
+            """
+            Threaded callback function to receive audio data when recordings finish.
+            audio: An AudioData containing the recorded bytes.
+            """
+
+            # Grab the raw bytes and push it into the thread safe queue.
+            data = audio.get_raw_data()
+            self.data_queue.put(bytearray(data))
+
+        # Create a background thread that will pass us raw audio bytes.
+        # We could do this manually but SpeechRecognizer provides a nice helper. 
+        self._stopper = self._recorder.listen_in_background(
+            self._source, record_callback,
+            phrase_time_limit=record_timeout)
+
+    def stop(self):
+        assert(self._stopper)
+        self._stopper()
+
+        self._recorder = None
+        self._stopper = None
+        self._source = None
+
+    def is_done(self):
+        return self._recorder == None
+
+# -----------------------------------------------------------------------------
+# Opus stream
+
+# For debugging
+# wave_out = wave.open("wave2.wav", "wb")
+# wave_out.setnchannels(1)
+# wave_out.setframerate(16000)
+# wave_out.setsampwidth(2)
+
+class OpusStreamAudioSource(AudioSource):
+
+    def __init__(self, sock):
+        super().__init__()
+
+        self._socket = sock
+
+        self._opus_decoder = OpusDecoder()
+        self._opus_decoder.set_channels(1)
+        self._opus_decoder.set_sampling_frequency(16000)
+
+        # Fetch user info.
+        user_info_tmp = self._read_packet(self._socket)
+        self._user_info = json.loads(user_info_tmp.decode("utf-8"))
+        print("User connection...")
+        print(json.dumps(self._user_info, indent=4))
+
+        self._is_done = False
+
+        # Start input thread.
+        self._input_thread = threading.Thread(
+            target=self._input_thread_func, daemon=True)
+        self._input_thread.start()
+
+    def _read_packet(self, sock):
+        try:
+            input_buffer = b''
+            #print("Reading packet size...")
+            while len(input_buffer) < 4:
+                input_buffer = input_buffer + sock.recv(1)
+                if not input_buffer:
+                    raise Exception("Failed to read size of packet.")
+
+            packet_size = int.from_bytes(input_buffer, "little")
+            #print("Packet size: ", packet_size)
+
+            input_buffer = b''
+            while len(input_buffer) < packet_size:
+                input_buffer = input_buffer + sock.recv(1)
+                if not input_buffer:
+                    raise Exception("Failed to read packet.")
+
+            return input_buffer
+
+        except Exception as e: # FIXME: Use socket-specific exception type. 
+            return None
+
+    def _input_thread_func(self):
+
+        print("input thread start")
+        try:
+
+            while not self._is_done:
+
+                next_packet = self._read_packet(self._socket)
+
+                if next_packet:
+
+                    # If we don't use bytearray here to copy, we run into a weird
+                    # exception about the memory not being writeable.
+                    decoded_data = self._opus_decoder.decode(bytearray(next_packet))
+
+                    # For debugging.
+                    #wave_out.writeframes(decoded_data)
+
+                    # We need to copy decoded_data here or we end up with
+                    # recycled buffers in our queue, which leads to broken
+                    # audio.
+                    self.data_queue.put(bytearray(decoded_data))
+
+                else:
+                    break
+
+        except Exception as e:
+            # Probably disconnected. We don't care. Just clean up.
+            # FIXME: Limit exception to socket errors.
+            pass
+
+        print("input thread done")
+
+        self._is_done = True
+
+    def stop(self):
+        self._is_done = True
+
+        # We won't join() the input thread because we don't want to sit around
+        # and wait for a packet. It'll die on its own, so whatever.
+
+    def is_done(self):
+        return self._is_done
+
diff --git a/diffstuff.py b/diffstuff.py
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
index 18c62c5..87797c1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,8 @@
 setuptools
 pyaudio
 SpeechRecognition
---extra-index-url https://download.pytorch.org/whl/cu116
+--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
 torch
 numpy
 git+https://github.com/openai/whisper.git
-
-pygame
+git+https://github.com/TeamPyOgg/PyOgg.git@4118fc40067eb475468726c6bccf1242abfc24fc
\ No newline at end of file
diff --git a/transcribe_demo.py b/transcribe_demo.py
index 546dfef..24490a8 100644
--- a/transcribe_demo.py
+++ b/transcribe_demo.py
@@ -26,12 +26,17 @@
 import pygame
 import wave
 #from pyogg.opus import OpusEncoder
-from pyogg.opus_decoder import OpusDecoder
 import socket
 import select
 import time
 import json
+import threading
+
+
+import diffstuff
+import audiosource
+from transcriber import Transcriber
 
 
 pygame_font_height = 16
 pygame.init()
@@ -42,86 +47,6 @@ pygame_font = pygame.font.Font("/home/kiri/.fonts/Sigmar-Regular.ttf", pygame_fo
-
-
-class AudioSource:
-    def __init__(self):
-        # Thread safe Queue for passing data from the threaded recording callback.
-        self.data_queue = Queue()
-
-    def is_done(self):
-        return True
-
-class MicrophoneAudioSource(AudioSource):
-
-    def __init__(self):
-        super().__init__()
-
-        self._recorder = None
-        self._stopper = None
-
-    def start(self):
-        assert(self._recorder == None)
-
-        self._recorder = speech_recognition.Recognizer()
-        self._recorder.energy_threshold = 1200
-
-        # Definitely do this, dynamic energy compensation lowers the energy
-        # threshold dramatically to a point where the SpeechRecognizer
-        # never stops recording.
-        self._recorder.dynamic_energy_threshold = False
-
-        self._source = speech_recognition.Microphone(sample_rate=16000)
-
-
-        with self._source:
-            self._recorder.adjust_for_ambient_noise(self._source)
-
-        def record_callback(_, audio:speech_recognition.AudioData) -> None:
-            """
-            Threaded callback function to receive audio data when recordings finish.
-            audio: An AudioData containing the recorded bytes.
-            """
-
-            # Grab the raw bytes and push it into the thread safe queue.
-            data = audio.get_raw_data()
-            self.data_queue.put(data)
-
-        # Create a background thread that will pass us raw audio bytes.
-        # We could do this manually but SpeechRecognizer provides a nice helper. 
-        self._stopper = self._recorder.listen_in_background(
-            self._source, record_callback,
-            phrase_time_limit=record_timeout)
-
-    def stop(self):
-        assert(self._stopper)
-        self._stopper()
-
-        self._recorder = None
-        self._stopper = None
-        self._source = None
-
-    def is_done(self):
-        return False
-
-class OpusStreamAudioSource(AudioSource):
-
-    def __init__(self):
-        super().__init__()
-
-    def start(self):
-        self.opus_decoder = OpusDecoder()
-        self.opus_decoder.set_channels(1)
-        self.opus_decoder.set_sampling_frequency(16000)
-
-
-
-    def stop(self):
-        pass
-
-
-    def is_done(self):
-        return False
-
-
 opus_server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
 print("binding")
 opus_server_socket.bind(("127.0.0.1", 9967))
@@ -131,35 +56,14 @@ print("listening")
 opus_server_socket.listen()
 
-
-def read_packet(sock):
-    try:
-        input_buffer = b''
-        print("Reading packet size...")
-        while len(input_buffer) < 4:
-            input_buffer = input_buffer + sock.recv(1)
-            if not input_buffer:
-                raise Exception("Failed to read size of packet.")
-
-        packet_size = int.from_bytes(input_buffer, "little")
-        print("Packet size: ", packet_size)
-
-        input_buffer = b''
-        while len(input_buffer) < packet_size:
-            input_buffer = input_buffer + sock.recv(1)
-            if not input_buffer:
-                raise Exception("Failed to read packet.")
-
-        return input_buffer
-
-    except Exception as e: # FIXME: Use socket-specific exception type.
-        return None
-
 
 wave_out = wave.open("wave.wav", "wb")
 wave_out.setnchannels(1)
 wave_out.setframerate(16000)
 wave_out.setsampwidth(2)
+transcriber = Transcriber()
+mic_source = audiosource.MicrophoneAudioSource()
+
 while True:
 
     print("looping")
 
@@ -172,58 +76,18 @@ while True:
         print(accepted_socket)
         break
 
-    # Fetch user info.
-    user_info = read_packet(accepted_socket)
-    user_info = json.loads(user_info.decode("utf-8"))
-    print("User connection...")
-    print(json.dumps(user_info, indent=4))
+    opusSource = audiosource.OpusStreamAudioSource(accepted_socket)
 
-    # Fire up decoder. 
-    opus_decoder = OpusDecoder()
-    print(dir(opus_decoder))
-    opus_decoder.set_channels(1)
-    opus_decoder.set_sampling_frequency(16000)
-
-    # Receive data.
-    while True:
-        next_packet = read_packet(accepted_socket)
-        #print("got packet: ", next_packet)
-
-        if next_packet:
-            # If we don't use bytearray here to copy, we run into a weird
-            # exception about the memory not being writeable.
-            decoded_data = opus_decoder.decode(bytearray(next_packet))
-            print(decoded_data)
-            wave_out.writeframes(decoded_data)
-        else:
-            print("DONE I THINK")
-
-        if not next_packet:
-            break
-        #b = accepted_socket.recv(32)
-        #print(b)
-        #if not b:
-        #    break
+    transcriber.set_source(opusSource)
+    while not opusSource.is_done():
+        time.sleep(0.1)
+        transcriber.update()
 
 exit(0)
 
-# while True:
-#     pygame_text_surface = pygame_font.render("Test test test", (0, 0, 0), (255, 255, 255))
-#     pygame_text_rect = pygame_text_surface.get_rect()
-#     pygame_text_rect.center = (640, 32)
-#     pygame_display_surface.fill((0, 0, 0))
-#     pygame_display_surface.blit(pygame_text_surface, pygame_text_rect)
-
-#     for event in pygame.event.get():
-#         if event.type == pygame.QUIT:
-#             pygame.quit()
-
-#     pygame.display.update()
-
-# exit(0)
 
 
 def onestepchange(start, dest):
diff --git a/transcriber.py b/transcriber.py
new file mode 100644
index 0000000..9120c8e
--- /dev/null
+++ b/transcriber.py
@@ -0,0 +1,65 @@
+#!/usr/bin/python3
+
+import numpy as np
+import speech_recognition
+import whisper
+import torch
+import wave
+
+_audio_model = whisper.load_model("medium.en") # "large"
+
+# For debugging...
+# wave_out = wave.open("wave.wav", "wb")
+# wave_out.setnchannels(1)
+# wave_out.setframerate(16000)
+# wave_out.setsampwidth(2)
+
+class Transcriber:
+
+    def __init__(self):
+        self._audio_source = None
+
+        # Audio data for the current phrase. 
+        self._current_data = b''
+
+    def set_source(self, source):
+        self._audio_source = source
+
+    def update(self):
+
+        if self._audio_source:
+
+            if not self._audio_source.data_queue.empty():
+
+                # We got some new data. Let's process it!
+                new_data = []
+                while not self._audio_source.data_queue.empty():
+                    new_packet = self._audio_source.data_queue.get()
+                    new_data.append(new_packet)
+
+                new_data_joined = b''.join(new_data)
+
+                # For debugging...
+                #wave_out.writeframes(new_data_joined)
+
+                self._current_data = self._current_data + new_data_joined
+
+                # Convert in-ram buffer to something the model can use
+                # directly without needing a temp file. Convert data from 16
+                # bit wide integers to floating point with a width of 32
+                # bits. Clamp the audio stream frequency to a PCM wavelength
+                # compatible default of 32768hz max.
+                audio_np = np.frombuffer(
+                    self._current_data, dtype=np.int16).astype(np.float32) / 32768.0
+
+                # Run the transcription model, and extract the text.
+                result = _audio_model.transcribe(
+                    audio_np, fp16=torch.cuda.is_available())
+
+                text = result['text'].strip()
+
+                print("text now: ", text)
+
+        # Automatically drop audio sources when we're finished with them.
+        if self._audio_source.is_done():
+            self._audio_source = None
\ No newline at end of file