#!/usr/bin/python3
import numpy as np
import speech_recognition
import whisper
import torch
import wave

_audio_model = whisper.load_model("medium.en")  # "large"

# For debugging...
# wave_out = wave.open("wave.wav", "wb")
# wave_out.setnchannels(1)
# wave_out.setframerate(16000)
# wave_out.setsampwidth(2)


class Transcriber:
    def __init__(self):
        self._audio_source = None

        # Audio data for the current phrase.
        self._current_data = b''

    def set_source(self, source):
        self._audio_source = source

    def update(self):
        if self._audio_source:
            if not self._audio_source.data_queue.empty():
                # We got some new data. Let's process it!
                new_data = []
                while not self._audio_source.data_queue.empty():
                    new_packet = self._audio_source.data_queue.get()
                    new_data.append(new_packet)
                new_data_joined = b''.join(new_data)

                # For debugging...
                # wave_out.writeframes(new_data_joined)

                self._current_data = self._current_data + new_data_joined

                # Convert the in-RAM buffer to something the model can use
                # directly without needing a temp file: reinterpret the raw
                # 16-bit integer samples as 32-bit floats and divide by 32768
                # so the amplitudes land in the [-1.0, 1.0] range the model
                # expects.
                audio_np = np.frombuffer(
                    self._current_data,
                    dtype=np.int16).astype(np.float32) / 32768.0

                # Run the transcription model, and extract the text.
                result = _audio_model.transcribe(
                    audio_np, fp16=torch.cuda.is_available())
                text = result['text'].strip()

                print("text now: ", text)

            # Automatically drop audio sources when we're finished with them.
            if self._audio_source.is_done():
                self._audio_source = None
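

# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module): the audio
# source below is a stand-in invented here purely to illustrate the interface
# Transcriber expects -- a `data_queue` of raw 16-bit mono PCM packets at
# 16 kHz and an `is_done()` method. A real source (e.g. a microphone
# listener) would provide the same two members and is assumed to live
# elsewhere in this project.
import queue


class _FakeAudioSource:
    """Minimal stand-in audio source used only for this example."""

    def __init__(self, pcm_bytes):
        self.data_queue = queue.Queue()
        self.data_queue.put(pcm_bytes)  # one packet of 16 kHz mono int16 PCM
        self._done = False

    def is_done(self):
        return self._done


if __name__ == "__main__":
    transcriber = Transcriber()
    # Feed one second of silence (16000 int16 samples) and run a single update.
    transcriber.set_source(_FakeAudioSource(b'\x00\x00' * 16000))
    transcriber.update()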