More refactoring. Opus now talking to Whisper.
This commit is contained in:
parent
25bc5d6663
commit
aed7f70815
176
audiosource.py
Normal file
176
audiosource.py
Normal file
@ -0,0 +1,176 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import socket
|
||||||
|
import select
|
||||||
|
import time
|
||||||
|
from queue import Queue
|
||||||
|
import json
|
||||||
|
import threading
|
||||||
|
import speech_recognition
|
||||||
|
import wave
|
||||||
|
|
||||||
|
from pyogg.opus_decoder import OpusDecoder
|
||||||
|
|
||||||
|
|
||||||
|
class AudioSource:
    """Base class for audio producers.

    Subclasses feed chunks of raw audio bytes into ``data_queue`` from
    their own capture threads; a consumer drains the queue elsewhere.
    """

    def __init__(self):
        # Queue() is thread-safe, so capture callbacks running on other
        # threads can hand buffers over without any extra locking.
        self.data_queue = Queue()

    def is_done(self):
        """Return True when this source will produce no more audio.

        The inert base class is always done; subclasses override this.
        """
        return True
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
# Microphone

# How real time the recording is in seconds. Passed as phrase_time_limit to
# Recognizer.listen_in_background below, so each queued audio chunk covers at
# most this many seconds.
record_timeout = 2
|
||||||
|
|
||||||
|
class MicrophoneAudioSource(AudioSource):
    """Captures audio from the local microphone via SpeechRecognition.

    A background listener records phrases of at most record_timeout
    seconds each and pushes their raw bytes into self.data_queue.
    """

    def __init__(self):
        super().__init__()

        self._recorder = speech_recognition.Recognizer()
        self._recorder.energy_threshold = 1200

        # Definitely do this, dynamic energy compensation lowers the energy
        # threshold dramatically to a point where the SpeechRecognizer
        # never stops recording.
        self._recorder.dynamic_energy_threshold = False

        self._source = speech_recognition.Microphone(sample_rate=16000)

        with self._source:
            self._recorder.adjust_for_ambient_noise(self._source)

        def record_callback(_, audio: speech_recognition.AudioData) -> None:
            """
            Threaded callback function to receive audio data when recordings
            finish.
            audio: An AudioData containing the recorded bytes.
            """
            # Grab the raw bytes and push it into the thread safe queue.
            # bytearray() copies, so the queue owns its own buffer.
            data = audio.get_raw_data()
            self.data_queue.put(bytearray(data))

        # Create a background thread that will pass us raw audio bytes.
        # We could do this manually but SpeechRecognizer provides a nice helper.
        self._stopper = self._recorder.listen_in_background(
            self._source, record_callback,
            phrase_time_limit=record_timeout)

    def stop(self):
        """Stop the background listener and release the microphone."""
        # NOTE: assert is a debugging aid only (stripped under -O); stop()
        # must not be called twice.
        assert self._stopper
        self._stopper()

        self._recorder = None
        self._stopper = None
        self._source = None

    def is_done(self):
        """True once stop() has torn the recorder down."""
        # Fixed: identity comparison with None (was "== None").
        return self._recorder is None
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Opus stream
|
||||||
|
|
||||||
|
# For debugging
|
||||||
|
# wave_out = wave.open("wave2.wav", "wb")
|
||||||
|
# wave_out.setnchannels(1)
|
||||||
|
# wave_out.setframerate(16000)
|
||||||
|
# wave_out.setsampwidth(2)
|
||||||
|
|
||||||
|
class OpusStreamAudioSource(AudioSource):
    """Receives Opus-encoded audio over a socket and decodes it to PCM.

    Wire format: each packet is a 4-byte little-endian length prefix
    followed by that many payload bytes. The first packet is a JSON blob
    of user info; every subsequent packet is one Opus frame.
    """

    def __init__(self, sock):
        super().__init__()

        self._socket = sock

        self._opus_decoder = OpusDecoder()
        self._opus_decoder.set_channels(1)
        self._opus_decoder.set_sampling_frequency(16000)

        # Fetch user info.
        user_info_tmp = self._read_packet(self._socket)
        self._user_info = json.loads(user_info_tmp.decode("utf-8"))
        print("User connection...")
        print(json.dumps(self._user_info, indent=4))

        self._is_done = False

        # Start input thread.
        self._input_thread = threading.Thread(
            target=self._input_thread_func, daemon=True)
        self._input_thread.start()

    def _recv_exact(self, sock, count):
        """Receive exactly count bytes from sock.

        Raises ConnectionError if the peer closes the connection before
        count bytes have arrived.
        """
        buf = b''
        while len(buf) < count:
            # Ask for everything still missing rather than one byte at a
            # time; recv() may still return less, hence the loop.
            chunk = sock.recv(count - len(buf))
            if not chunk:
                # Fixed: the old code tested the *accumulated* buffer here,
                # so a mid-packet disconnect (recv() returning b'') made the
                # loop spin forever appending empty chunks.
                raise ConnectionError("Failed to read packet.")
            buf += chunk
        return buf

    def _read_packet(self, sock):
        """Read one length-prefixed packet.

        Returns the payload bytes, or None on disconnect/error.
        """
        try:
            size_bytes = self._recv_exact(sock, 4)
            packet_size = int.from_bytes(size_bytes, "little")
            return self._recv_exact(sock, packet_size)
        except OSError:
            # Peer went away or another socket error occurred; signal
            # end-of-stream. (Was a blanket "except Exception" — the old
            # FIXME asked for socket-specific types; ConnectionError is an
            # OSError subclass.)
            return None

    def _input_thread_func(self):
        """Thread body: read packets, decode, and queue PCM until EOF."""
        print("input thread start")
        try:
            while not self._is_done:
                next_packet = self._read_packet(self._socket)

                if next_packet:
                    # If we don't use bytearray here to copy, we run into a weird
                    # exception about the memory not being writeable.
                    decoded_data = self._opus_decoder.decode(bytearray(next_packet))

                    # For debugging.
                    #wave_out.writeframes(decoded_data)

                    # We need to copy decoded_data here or we end up with
                    # recycled buffers in our queue, which leads to broken
                    # audio.
                    self.data_queue.put(bytearray(decoded_data))
                else:
                    break
        except Exception:
            # Probably disconnected. We don't care. Just clean up.
            # FIXME: Limit exception to socket errors.
            pass

        print("input thread done")
        self._is_done = True

    def stop(self):
        """Request shutdown of the input thread."""
        self._is_done = True
        # We won't join() the input thread because we don't want to sit around
        # and wait for a packet. It'll die on its own, so whatever.

    def is_done(self):
        """True once the input thread has finished (or stop() was called)."""
        return self._is_done
|
||||||
|
|
0
diffstuff.py
Normal file
0
diffstuff.py
Normal file
@ -1,9 +1,8 @@
|
|||||||
setuptools
|
setuptools
|
||||||
pyaudio
|
pyaudio
|
||||||
SpeechRecognition
|
SpeechRecognition
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu116
|
--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
|
||||||
torch
|
torch
|
||||||
numpy
|
numpy
|
||||||
git+https://github.com/openai/whisper.git
|
git+https://github.com/openai/whisper.git
|
||||||
|
git+https://github.com/TeamPyOgg/PyOgg.git@4118fc40067eb475468726c6bccf1242abfc24fc
|
||||||
pygame
|
|
@ -26,12 +26,17 @@ import pygame
|
|||||||
|
|
||||||
import wave
|
import wave
|
||||||
#from pyogg.opus import OpusEncoder
|
#from pyogg.opus import OpusEncoder
|
||||||
from pyogg.opus_decoder import OpusDecoder
|
|
||||||
|
|
||||||
import socket
|
import socket
|
||||||
import select
|
import select
|
||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
|
import threading
|
||||||
|
|
||||||
|
|
||||||
|
import diffstuff
|
||||||
|
import audiosource
|
||||||
|
from transcriber import Transcriber
|
||||||
|
|
||||||
pygame_font_height = 16
|
pygame_font_height = 16
|
||||||
pygame.init()
|
pygame.init()
|
||||||
@ -42,86 +47,6 @@ pygame_font = pygame.font.Font("/home/kiri/.fonts/Sigmar-Regular.ttf", pygame_fo
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
class AudioSource:
|
|
||||||
def __init__(self):
|
|
||||||
# Thread safe Queue for passing data from the threaded recording callback.
|
|
||||||
self.data_queue = Queue()
|
|
||||||
|
|
||||||
def is_done(self):
|
|
||||||
return True
|
|
||||||
|
|
||||||
class MicrophoneAudioSource(AudioSource):
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
self._recorder = None
|
|
||||||
self._stopper = None
|
|
||||||
|
|
||||||
def start(self):
|
|
||||||
assert(self._recorder == None)
|
|
||||||
|
|
||||||
self._recorder = speech_recognition.Recognizer()
|
|
||||||
self._recorder.energy_threshold = 1200
|
|
||||||
|
|
||||||
# Definitely do this, dynamic energy compensation lowers the energy
|
|
||||||
# threshold dramatically to a point where the SpeechRecognizer
|
|
||||||
# never stops recording.
|
|
||||||
self._recorder.dynamic_energy_threshold = False
|
|
||||||
|
|
||||||
self._source = speech_recognition.Microphone(sample_rate=16000)
|
|
||||||
|
|
||||||
|
|
||||||
with self._source:
|
|
||||||
self._recorder.adjust_for_ambient_noise(self._source)
|
|
||||||
|
|
||||||
def record_callback(_, audio:speech_recognition.AudioData) -> None:
|
|
||||||
"""
|
|
||||||
Threaded callback function to receive audio data when recordings finish.
|
|
||||||
audio: An AudioData containing the recorded bytes.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Grab the raw bytes and push it into the thread safe queue.
|
|
||||||
data = audio.get_raw_data()
|
|
||||||
self.data_queue.put(data)
|
|
||||||
|
|
||||||
# Create a background thread that will pass us raw audio bytes.
|
|
||||||
# We could do this manually but SpeechRecognizer provides a nice helper.
|
|
||||||
self._stopper = self._recorder.listen_in_background(
|
|
||||||
self._source, record_callback,
|
|
||||||
phrase_time_limit=record_timeout)
|
|
||||||
|
|
||||||
def stop(self):
|
|
||||||
assert(self._stopper)
|
|
||||||
self._stopper()
|
|
||||||
|
|
||||||
self._recorder = None
|
|
||||||
self._stopper = None
|
|
||||||
self._source = None
|
|
||||||
|
|
||||||
def is_done(self):
|
|
||||||
return False
|
|
||||||
|
|
||||||
class OpusStreamAudioSource(AudioSource):
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
def start(self):
|
|
||||||
self.opus_decoder = OpusDecoder()
|
|
||||||
self.opus_decoder.set_channels(1)
|
|
||||||
self.opus_decoder.set_sampling_frequency(16000)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def stop(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def is_done(self):
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
opus_server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
opus_server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||||
print("binding")
|
print("binding")
|
||||||
opus_server_socket.bind(("127.0.0.1", 9967))
|
opus_server_socket.bind(("127.0.0.1", 9967))
|
||||||
@ -131,35 +56,14 @@ print("listening")
|
|||||||
opus_server_socket.listen()
|
opus_server_socket.listen()
|
||||||
|
|
||||||
|
|
||||||
def read_packet(sock):
|
|
||||||
try:
|
|
||||||
input_buffer = b''
|
|
||||||
print("Reading packet size...")
|
|
||||||
while len(input_buffer) < 4:
|
|
||||||
input_buffer = input_buffer + sock.recv(1)
|
|
||||||
if not input_buffer:
|
|
||||||
raise Exception("Failed to read size of packet.")
|
|
||||||
|
|
||||||
packet_size = int.from_bytes(input_buffer, "little")
|
|
||||||
print("Packet size: ", packet_size)
|
|
||||||
|
|
||||||
input_buffer = b''
|
|
||||||
while len(input_buffer) < packet_size:
|
|
||||||
input_buffer = input_buffer + sock.recv(1)
|
|
||||||
if not input_buffer:
|
|
||||||
raise Exception("Failed to read packet.")
|
|
||||||
|
|
||||||
return input_buffer
|
|
||||||
|
|
||||||
except Exception as e: # FIXME: Use socket-specific exception type.
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
wave_out = wave.open("wave.wav", "wb")
|
wave_out = wave.open("wave.wav", "wb")
|
||||||
wave_out.setnchannels(1)
|
wave_out.setnchannels(1)
|
||||||
wave_out.setframerate(16000)
|
wave_out.setframerate(16000)
|
||||||
wave_out.setsampwidth(2)
|
wave_out.setsampwidth(2)
|
||||||
|
|
||||||
|
transcriber = Transcriber()
|
||||||
|
mic_source = audiosource.MicrophoneAudioSource()
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
print("looping")
|
print("looping")
|
||||||
|
|
||||||
@ -172,58 +76,18 @@ while True:
|
|||||||
print(accepted_socket)
|
print(accepted_socket)
|
||||||
break
|
break
|
||||||
|
|
||||||
# Fetch user info.
|
opusSource = audiosource.OpusStreamAudioSource(accepted_socket)
|
||||||
user_info = read_packet(accepted_socket)
|
|
||||||
user_info = json.loads(user_info.decode("utf-8"))
|
|
||||||
print("User connection...")
|
|
||||||
print(json.dumps(user_info, indent=4))
|
|
||||||
|
|
||||||
# Fire up decoder.
|
transcriber.set_source(opusSource)
|
||||||
opus_decoder = OpusDecoder()
|
while not opusSource.is_done():
|
||||||
print(dir(opus_decoder))
|
time.sleep(0.1)
|
||||||
opus_decoder.set_channels(1)
|
transcriber.update()
|
||||||
opus_decoder.set_sampling_frequency(16000)
|
|
||||||
|
|
||||||
# Receive data.
|
|
||||||
while True:
|
|
||||||
next_packet = read_packet(accepted_socket)
|
|
||||||
#print("got packet: ", next_packet)
|
|
||||||
|
|
||||||
if next_packet:
|
|
||||||
# If we don't use bytearray here to copy, we run into a weird
|
|
||||||
# exception about the memory not being writeable.
|
|
||||||
decoded_data = opus_decoder.decode(bytearray(next_packet))
|
|
||||||
print(decoded_data)
|
|
||||||
wave_out.writeframes(decoded_data)
|
|
||||||
else:
|
|
||||||
print("DONE I THINK")
|
|
||||||
|
|
||||||
if not next_packet:
|
|
||||||
break
|
|
||||||
#b = accepted_socket.recv(32)
|
|
||||||
#print(b)
|
|
||||||
#if not b:
|
|
||||||
# break
|
|
||||||
|
|
||||||
|
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# while True:
|
|
||||||
# pygame_text_surface = pygame_font.render("Test test test", (0, 0, 0), (255, 255, 255))
|
|
||||||
# pygame_text_rect = pygame_text_surface.get_rect()
|
|
||||||
# pygame_text_rect.center = (640, 32)
|
|
||||||
# pygame_display_surface.fill((0, 0, 0))
|
|
||||||
# pygame_display_surface.blit(pygame_text_surface, pygame_text_rect)
|
|
||||||
|
|
||||||
# for event in pygame.event.get():
|
|
||||||
# if event.type == pygame.QUIT:
|
|
||||||
# pygame.quit()
|
|
||||||
|
|
||||||
# pygame.display.update()
|
|
||||||
|
|
||||||
# exit(0)
|
|
||||||
|
|
||||||
def onestepchange(start, dest):
|
def onestepchange(start, dest):
|
||||||
|
|
||||||
|
65
transcriber.py
Normal file
65
transcriber.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import speech_recognition
|
||||||
|
import whisper
|
||||||
|
import torch
|
||||||
|
import wave
|
||||||
|
|
||||||
|
# The Whisper speech-to-text model, shared by all Transcriber instances.
# NOTE(review): loaded at import time, which blocks until the weights are in
# memory. Alternative model sizes shown in whisper's docs (e.g. "large").
_audio_model = whisper.load_model("medium.en") # "large"

# For debugging...
# wave_out = wave.open("wave.wav", "wb")
# wave_out.setnchannels(1)
# wave_out.setframerate(16000)
# wave_out.setsampwidth(2)
||||||
|
|
||||||
|
class Transcriber:
    """Feeds audio from an AudioSource into the Whisper model.

    update() drains whatever PCM chunks the source has queued, appends
    them to the current phrase, re-runs transcription over the whole
    phrase so far, and prints the result.
    """

    def __init__(self):
        # The AudioSource currently being transcribed, or None.
        self._audio_source = None

        # Accumulated raw audio bytes for the current phrase.
        self._current_data = b''

    def set_source(self, source):
        """Attach the AudioSource whose queue update() should drain."""
        self._audio_source = source

    def update(self):
        """Process pending audio. Call periodically from the main loop.

        When the attached source reports is_done(), it is dropped
        automatically.
        """
        source = self._audio_source
        if not source:
            return

        if not source.data_queue.empty():
            # New data arrived. Pull every queued chunk and append the lot
            # to the phrase buffer.
            chunks = []
            while not source.data_queue.empty():
                chunks.append(source.data_queue.get())
            self._current_data = self._current_data + b''.join(chunks)

            # For debugging...
            #wave_out.writeframes(b''.join(chunks))

            # Convert the in-RAM buffer to something the model can use
            # directly, with no temp file: 16-bit signed integers scaled
            # down to float32 samples in [-1, 1].
            audio_np = np.frombuffer(
                self._current_data, dtype=np.int16).astype(np.float32) / 32768.0

            # Run the transcription model, and extract the text.
            result = _audio_model.transcribe(
                audio_np, fp16=torch.cuda.is_available())
            text = result['text'].strip()

            print("text now: ", text)

        # Automatically drop audio sources when we're finished with them.
        if source.is_done():
            self._audio_source = None
|
Loading…
Reference in New Issue
Block a user