Merge pull request #43 from duaneking/master
Really make it real time with no disk IO needed.
commit 828dd92044
@@ -2,4 +2,5 @@ pyaudio
 SpeechRecognition
 --extra-index-url https://download.pytorch.org/whl/cu116
 torch
+numpy
 git+https://github.com/openai/whisper.git
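Not part of the commit, but worth noting: the pinned extra index makes pip resolve torch from the CUDA 11.6 wheel repository, and numpy becomes a direct dependency because the script now feeds raw arrays to the model. A quick sanity check that the CUDA build actually landed:

import torch

# CUDA wheels from the cu116 index carry a "+cu116" local version suffix;
# the plain CPU wheel does not.
print(torch.__version__)
# True only when a CUDA-capable GPU and a matching driver are present.
# The script passes fp16=torch.cuda.is_available(), so it degrades to
# fp16=False on CPU-only machines.
print(torch.cuda.is_available())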
@@ -1,15 +1,14 @@
 #! python3.7
 
 import argparse
-import io
 import os
+import numpy as np
 import speech_recognition as sr
 import whisper
 import torch
 
 from datetime import datetime, timedelta
 from queue import Queue
-from tempfile import NamedTemporaryFile
 from time import sleep
 from sys import platform
 
@@ -35,8 +34,6 @@ def main():
 
     # The last time a recording was retrieved from the queue.
     phrase_time = None
-    # Current raw audio bytes.
-    last_sample = bytes()
     # Thread safe Queue for passing data from the threaded recording callback.
     data_queue = Queue()
     # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
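For context (the producer side is untouched by this PR): the queue is filled by SpeechRecognition's background listener, which invokes a callback each time it decides a stretch of speech has ended. A minimal sketch of that wiring, where the energy threshold, the 16 kHz microphone rate, and the 2-second phrase_time_limit are illustrative values rather than part of this diff:

import speech_recognition as sr
from queue import Queue

data_queue = Queue()
recorder = sr.Recognizer()
recorder.energy_threshold = 1000           # illustrative; tune for your microphone
recorder.dynamic_energy_threshold = False  # keep the threshold fixed once set

source = sr.Microphone(sample_rate=16000)
with source:
    recorder.adjust_for_ambient_noise(source)

def record_callback(_, audio: sr.AudioData) -> None:
    # Runs on the listener thread whenever the recognizer detects end of speech;
    # hand the raw PCM bytes to the main loop through the thread-safe queue.
    data_queue.put(audio.get_raw_data())

# Returns a function that stops the background listener when called.
stop_listening = recorder.listen_in_background(source, record_callback, phrase_time_limit=2)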
@@ -71,7 +68,6 @@ def main():
     record_timeout = args.record_timeout
     phrase_timeout = args.phrase_timeout
 
-    temp_file = NamedTemporaryFile().name
     transcription = ['']
 
     with source:
@@ -102,26 +98,21 @@
                 # If enough time has passed between recordings, consider the phrase complete.
                 # Clear the current working audio buffer to start over with the new data.
                 if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
-                    last_sample = bytes()
                     phrase_complete = True
                 # This is the last time we received new audio data from the queue.
                 phrase_time = now
 
-                # Concatenate our current audio data with the latest audio data.
-                while not data_queue.empty():
-                    data = data_queue.get()
-                    last_sample += data
+                # Combine audio data from queue
+                audio_data = b''.join(data_queue.queue)
+                data_queue.queue.clear()
 
-                # Use AudioData to convert the raw data to wav data.
-                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
-                wav_data = io.BytesIO(audio_data.get_wav_data())
-
-                # Write wav data to the temporary file as bytes.
-                with open(temp_file, 'w+b') as f:
-                    f.write(wav_data.read())
+                # Convert in-ram buffer to something the model can use directly without needing a temp file.
+                # Convert data from 16 bit wide integers to floating point with a width of 32 bits.
+                # Scale the samples into the range [-1.0, 1.0); 32768 is the magnitude of a 16-bit signed integer.
+                audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
 
                 # Read the transcription.
-                result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
+                result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
                 text = result['text'].strip()
 
                 # If we detected a pause between recordings, add a new item to our transcription.
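Putting the new hot path together, here is a self-contained sketch of the in-memory pipeline; the 16 kHz rate, the synthetic dummy_pcm buffer, and the "base" model choice are assumptions for illustration, not part of the commit:

import numpy as np
import whisper
from queue import Queue

data_queue = Queue()

# Stand-in for the recording callback: one second of a 440 Hz tone as 16-bit PCM bytes.
t = np.linspace(0, 1, 16000, endpoint=False)
dummy_pcm = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16).tobytes()
data_queue.put(dummy_pcm)

# Drain every pending chunk in one pass, as the new loop body does.
audio_data = b''.join(data_queue.queue)
data_queue.queue.clear()

# Reinterpret the raw bytes as int16 samples and scale into [-1.0, 1.0):
# 32768 is the magnitude of the most negative 16-bit signed integer.
audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

# Whisper accepts a 16 kHz float32 array directly; no WAV file ever touches disk.
model = whisper.load_model("base")
result = model.transcribe(audio_np, fp16=False)
print(result['text'].strip())

One design note: data_queue.queue reaches into Queue's internal deque rather than the documented API, which is what lets the loop grab all pending chunks in a single join instead of locking once per item; a more conservative variant would drain with repeated get_nowait() calls.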