From 973880412aa4ac398543b9e34de116be715886d9 Mon Sep 17 00:00:00 2001 From: "Duane F. King" Date: Tue, 21 Nov 2023 14:10:31 -0800 Subject: [PATCH 1/3] Really make it real time with no disk IO needed. --- requirements.txt | 1 + transcribe_demo.py | 23 +++++++++-------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/requirements.txt b/requirements.txt index ae172ba..dd251a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ pyaudio SpeechRecognition --extra-index-url https://download.pytorch.org/whl/cu116 torch +numpy git+https://github.com/openai/whisper.git \ No newline at end of file diff --git a/transcribe_demo.py b/transcribe_demo.py index 14ec910..3e8b650 100644 --- a/transcribe_demo.py +++ b/transcribe_demo.py @@ -3,6 +3,7 @@ import argparse import io import os +import numpy as np import speech_recognition as sr import whisper import torch @@ -106,22 +107,16 @@ def main(): phrase_complete = True # This is the last time we received new audio data from the queue. phrase_time = now - - # Concatenate our current audio data with the latest audio data. - while not data_queue.empty(): - data = data_queue.get() - last_sample += data - - # Use AudioData to convert the raw data to wav data. - audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH) - wav_data = io.BytesIO(audio_data.get_wav_data()) - - # Write wav data to the temporary file as bytes. - with open(temp_file, 'w+b') as f: - f.write(wav_data.read()) + + # Combine audio data from queue + audio_data = b''.join(data_queue.queue) + data_queue.queue.clear() + + # Convert in-ram buffer to something the model can use directly without needing a temp file. + audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0 # Read the transcription. 
- result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available()) + result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available()) text = result['text'].strip() # If we detected a pause between recordings, add a new item to our transcription. From 9ecc380a339b3b84d911fb35ef4aed654bd8c185 Mon Sep 17 00:00:00 2001 From: Duane King Date: Fri, 24 Nov 2023 14:51:44 -0800 Subject: [PATCH 2/3] remove last_sample as it's now dead code --- transcribe_demo.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/transcribe_demo.py b/transcribe_demo.py index 3e8b650..ee9e3b2 100644 --- a/transcribe_demo.py +++ b/transcribe_demo.py @@ -36,8 +36,6 @@ def main(): # The last time a recording was retrieved from the queue. phrase_time = None - # Current raw audio bytes. - last_sample = bytes() # Thread safe Queue for passing data from the threaded recording callback. data_queue = Queue() # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends. @@ -103,7 +101,6 @@ def main(): # If enough time has passed between recordings, consider the phrase complete. # Clear the current working audio buffer to start over with the new data. if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout): - last_sample = bytes() phrase_complete = True # This is the last time we received new audio data from the queue. phrase_time = now
python3.7 import argparse -import io import os import numpy as np import speech_recognition as sr import whisper import torch from datetime import datetime, timedelta from queue import Queue -from tempfile import NamedTemporaryFile from time import sleep from sys import platform @@ -70,7 +68,6 @@ def main(): record_timeout = args.record_timeout phrase_timeout = args.phrase_timeout - temp_file = NamedTemporaryFile().name transcription = [''] with source: @@ -110,6 +107,8 @@ def main(): data_queue.queue.clear() # Convert in-ram buffer to something the model can use directly without needing a temp file. + # Convert data from 16 bit wide integers to floating point with a width of 32 bits. + # Scale the int16 samples by their maximum magnitude (32768) to get float32 values in the range [-1.0, 1.0], as expected by Whisper. audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0 # Read the transcription.