Merge pull request #43 from duaneking/master

Really make it real time with no disk IO needed.
davabase 2023-11-26 11:51:31 -08:00 committed by GitHub
commit 828dd92044
2 changed files with 12 additions and 20 deletions

requirements.txt

@@ -2,4 +2,5 @@ pyaudio
 SpeechRecognition
 --extra-index-url https://download.pytorch.org/whl/cu116
 torch
+numpy
 git+https://github.com/openai/whisper.git
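
A typical way to install the updated dependencies into a fresh environment (the CUDA 11.6 wheel index comes from the file itself; CPU-only setups can drop that line):

    python3 -m venv venv
    source venv/bin/activate
    pip install -r requirements.txt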

transcribe_demo.py

@@ -1,15 +1,14 @@
 #! python3.7

 import argparse
-import io
 import os
+import numpy as np
 import speech_recognition as sr
 import whisper
 import torch

 from datetime import datetime, timedelta
 from queue import Queue
-from tempfile import NamedTemporaryFile
 from time import sleep
 from sys import platform

@@ -35,8 +34,6 @@ def main():
     # The last time a recording was retrieved from the queue.
     phrase_time = None
-    # Current raw audio bytes.
-    last_sample = bytes()
     # Thread safe Queue for passing data from the threaded recording callback.
     data_queue = Queue()
     # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
     recorder = sr.Recognizer()
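
For context, this queue is fed by SpeechRecognition's background listener. A minimal sketch of that wiring, consistent with the rest of this file (record_callback and record_timeout are the names the script uses elsewhere):

    def record_callback(_, audio: sr.AudioData) -> None:
        # Runs on a worker thread each time the recognizer detects a finished phrase:
        # grab the raw bytes and push them into the thread-safe queue.
        data = audio.get_raw_data()
        data_queue.put(data)

    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)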
@@ -71,7 +68,6 @@ def main():
     record_timeout = args.record_timeout
     phrase_timeout = args.phrase_timeout

-    temp_file = NamedTemporaryFile().name
     transcription = ['']

     with source:
@@ -102,26 +98,21 @@
                 # If enough time has passed between recordings, consider the phrase complete.
                 # Clear the current working audio buffer to start over with the new data.
                 if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
-                    last_sample = bytes()
                     phrase_complete = True
                 # This is the last time we received new audio data from the queue.
                 phrase_time = now

-                # Concatenate our current audio data with the latest audio data.
-                while not data_queue.empty():
-                    data = data_queue.get()
-                    last_sample += data
+                # Combine all audio data waiting in the queue.
+                audio_data = b''.join(data_queue.queue)
+                data_queue.queue.clear()

-                # Use AudioData to convert the raw data to wav data.
-                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
-                wav_data = io.BytesIO(audio_data.get_wav_data())
+                # Convert the in-RAM buffer to something the model can use directly, without a temp file.
+                # Reinterpret the raw 16-bit PCM samples as int16 and scale by 1/32768 to get
+                # float32 values normalized to the [-1.0, 1.0) range that the model expects.
+                audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

-                # Write wav data to the temporary file as bytes.
-                with open(temp_file, 'w+b') as f:
-                    f.write(wav_data.read())

                 # Read the transcription.
-                result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
+                result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
                 text = result['text'].strip()

                 # If we detected a pause between recordings, add a new item to our transcription.
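
The core of the change: Whisper's transcribe() accepts a float32 NumPy array (16 kHz mono) directly, so the old encode-to-WAV-and-write-a-temp-file round trip can be dropped entirely. A self-contained sketch of the technique under that assumption ("base.en" and transcribe_pcm are illustrative names, not part of the script):

    import numpy as np
    import whisper

    model = whisper.load_model("base.en")

    def transcribe_pcm(raw_pcm: bytes) -> str:
        # Reinterpret the 16-bit PCM byte buffer as int16 samples, then
        # normalize to float32 in [-1.0, 1.0) by dividing by 32768.
        audio_np = np.frombuffer(raw_pcm, dtype=np.int16).astype(np.float32) / 32768.0
        # No WAV header or temp file needed; the array is consumed in memory.
        result = model.transcribe(audio_np, fp16=False)
        return result['text'].strip()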