Really make it real time with no disk IO needed.
parent f213b229b8
commit 973880412a
@@ -2,4 +2,5 @@ pyaudio
 SpeechRecognition
 --extra-index-url https://download.pytorch.org/whl/cu116
 torch
+numpy
 git+https://github.com/openai/whisper.git
@@ -3,6 +3,7 @@
 import argparse
 import io
 import os
+import numpy as np
 import speech_recognition as sr
 import whisper
 import torch
@@ -106,22 +107,16 @@ def main():
                     phrase_complete = True
                 # This is the last time we received new audio data from the queue.
                 phrase_time = now
 
-                # Concatenate our current audio data with the latest audio data.
-                while not data_queue.empty():
-                    data = data_queue.get()
-                    last_sample += data
-
-                # Use AudioData to convert the raw data to wav data.
-                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
-                wav_data = io.BytesIO(audio_data.get_wav_data())
-
-                # Write wav data to the temporary file as bytes.
-                with open(temp_file, 'w+b') as f:
-                    f.write(wav_data.read())
-
+                # Combine audio data from queue
+                audio_data = b''.join(data_queue.queue)
+                data_queue.queue.clear()
+
+                # Convert in-ram buffer to something the model can use directly without needing a temp file.
+                audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
+
                 # Read the transcription.
-                result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
+                result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
                 text = result['text'].strip()
 
                 # If we detected a pause between recordings, add a new item to our transcription.
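The heart of this change: Whisper's transcribe() accepts a waveform as a NumPy float32 array directly, so the raw 16-bit PCM captured by SpeechRecognition can be normalized in memory instead of being round-tripped through a temporary WAV file. Below is a minimal standalone sketch of that conversion, assuming 16 kHz mono 16-bit audio (the sample rate Whisper expects) and the "base.en" model; both the synthesized buffer and the model choice are illustrative, not taken from this commit.

import numpy as np
import torch
import whisper

# Raw 16-bit little-endian PCM bytes, as a recorder callback would produce.
# One second of silence at 16 kHz stands in for real microphone data here (illustrative only).
raw_pcm = np.zeros(16000, dtype=np.int16).tobytes()

# Reinterpret the bytes as int16 samples and scale to float32 in [-1.0, 1.0),
# the in-memory format that whisper's transcribe() accepts.
audio_np = np.frombuffer(raw_pcm, dtype=np.int16).astype(np.float32) / 32768.0

model = whisper.load_model("base.en")
result = model.transcribe(audio_np, fp16=torch.cuda.is_available())
print(result['text'].strip())

On the queue side, b''.join(data_queue.queue) followed by data_queue.queue.clear() reads the Queue's underlying deque in one pass, replacing the old per-chunk last_sample += data concatenation with a single join; the trade-off is that touching .queue directly bypasses the Queue's own locking.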