From 973880412aa4ac398543b9e34de116be715886d9 Mon Sep 17 00:00:00 2001 From: "Duane F. King" Date: Tue, 21 Nov 2023 14:10:31 -0800 Subject: [PATCH 1/3] Really make it real time with no disk IO needed. --- requirements.txt | 1 + transcribe_demo.py | 23 +++++++++-------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/requirements.txt b/requirements.txt index ae172ba..dd251a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ pyaudio SpeechRecognition --extra-index-url https://download.pytorch.org/whl/cu116 torch +numpy git+https://github.com/openai/whisper.git \ No newline at end of file diff --git a/transcribe_demo.py b/transcribe_demo.py index 14ec910..3e8b650 100644 --- a/transcribe_demo.py +++ b/transcribe_demo.py @@ -3,6 +3,7 @@ import argparse import io import os +import numpy as np import speech_recognition as sr import whisper import torch @@ -106,22 +107,16 @@ def main(): phrase_complete = True # This is the last time we received new audio data from the queue. phrase_time = now - - # Concatenate our current audio data with the latest audio data. - while not data_queue.empty(): - data = data_queue.get() - last_sample += data - - # Use AudioData to convert the raw data to wav data. - audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH) - wav_data = io.BytesIO(audio_data.get_wav_data()) - - # Write wav data to the temporary file as bytes. - with open(temp_file, 'w+b') as f: - f.write(wav_data.read()) + + # Combine audio data from queue + audio_data = b''.join(data_queue.queue) + data_queue.queue.clear() + + # Convert in-ram buffer to something the model can use directly without needing a temp file. + audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0 # Read the transcription. 
- result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available()) + result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available()) text = result['text'].strip() # If we detected a pause between recordings, add a new item to our transcription. From 9ecc380a339b3b84d911fb35ef4aed654bd8c185 Mon Sep 17 00:00:00 2001 From: Duane King Date: Fri, 24 Nov 2023 14:51:44 -0800 Subject: [PATCH 2/3] remove last_sample as it's now dead code --- transcribe_demo.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/transcribe_demo.py b/transcribe_demo.py index 3e8b650..ee9e3b2 100644 --- a/transcribe_demo.py +++ b/transcribe_demo.py @@ -36,8 +36,6 @@ def main(): # The last time a recording was retrieved from the queue. phrase_time = None - # Current raw audio bytes. - last_sample = bytes() # Thread safe Queue for passing data from the threaded recording callback. data_queue = Queue() # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends. @@ -103,7 +101,6 @@ def main(): # If enough time has passed between recordings, consider the phrase complete. # Clear the current working audio buffer to start over with the new data. if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout): - last_sample = bytes() phrase_complete = True # This is the last time we received new audio data from the queue. phrase_time = now
python3.7 import argparse -import io import os import numpy as np import speech_recognition as sr import whisper import torch from datetime import datetime, timedelta from queue import Queue -from tempfile import NamedTemporaryFile from time import sleep from sys import platform @@ -70,7 +68,6 @@ def main(): record_timeout = args.record_timeout phrase_timeout = args.phrase_timeout - temp_file = NamedTemporaryFile().name transcription = [''] with source: @@ -110,6 +107,8 @@ def main(): data_queue.queue.clear() # Convert in-ram buffer to something the model can use directly without needing a temp file. + # Convert data from 16 bit wide integers to floating point with a width of 32 bits. + # Scale the int16 samples by their maximum magnitude (32768) to get float32 values in the range [-1.0, 1.0], as expected by Whisper. audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0 # Read the transcription.