Really make it real time with no disk IO needed.

This commit is contained in:
Duane F. King 2023-11-21 14:10:31 -08:00
parent f213b229b8
commit 973880412a
2 changed files with 10 additions and 14 deletions

View File

@@ -2,4 +2,5 @@ pyaudio
SpeechRecognition SpeechRecognition
--extra-index-url https://download.pytorch.org/whl/cu116 --extra-index-url https://download.pytorch.org/whl/cu116
torch torch
numpy
git+https://github.com/openai/whisper.git git+https://github.com/openai/whisper.git

View File

@@ -3,6 +3,7 @@
import argparse import argparse
import io import io
import os import os
import numpy as np
import speech_recognition as sr import speech_recognition as sr
import whisper import whisper
import torch import torch
@@ -106,22 +107,16 @@ def main():
phrase_complete = True phrase_complete = True
# This is the last time we received new audio data from the queue. # This is the last time we received new audio data from the queue.
phrase_time = now phrase_time = now
# Concatenate our current audio data with the latest audio data. # Combine audio data from queue
while not data_queue.empty(): audio_data = b''.join(data_queue.queue)
data = data_queue.get() data_queue.queue.clear()
last_sample += data
# Convert in-ram buffer to something the model can use directly without needing a temp file.
# Use AudioData to convert the raw data to wav data. audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
wav_data = io.BytesIO(audio_data.get_wav_data())
# Write wav data to the temporary file as bytes.
with open(temp_file, 'w+b') as f:
f.write(wav_data.read())
# Read the transcription. # Read the transcription.
result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available()) result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
text = result['text'].strip() text = result['text'].strip()
# If we detected a pause between recordings, add a new item to our transcription. # If we detected a pause between recordings, add a new item to our transcription.