2022-11-28 22:27:56 -08:00
#! python3.7
import argparse
import os
2023-11-21 14:10:31 -08:00
import numpy as np
2022-11-28 22:27:56 -08:00
import speech_recognition as sr
import whisper
2023-01-14 20:13:48 -08:00
import torch
2022-11-28 22:27:56 -08:00
from datetime import datetime , timedelta
from queue import Queue
from time import sleep
2023-01-14 20:31:04 -08:00
from sys import platform
2022-11-28 22:27:56 -08:00
2025-03-06 20:27:44 -08:00
import textwrap
2022-11-28 22:27:56 -08:00
def main():
    """Continuously capture microphone audio and transcribe it with Whisper.

    Audio is recorded on a background thread via SpeechRecognition and pushed
    into a thread-safe queue as raw PCM bytes.  The main loop drains the queue,
    transcribes the accumulated buffer, and treats a silence longer than
    ``--phrase_timeout`` seconds as the start of a new transcription line.
    The last few lines are mirrored to ``transcription.txt`` on every update.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="medium", help="Model to use",
                        choices=["tiny", "base", "small", "medium", "large"])
    parser.add_argument("--non_english", action='store_true',
                        help="Don't use the english model.")
    parser.add_argument("--energy_threshold", default=1000,
                        help="Energy level for mic to detect.", type=int)
    parser.add_argument("--record_timeout", default=2,
                        help="How real time the recording is in seconds.", type=float)
    parser.add_argument("--phrase_timeout", default=3,
                        help="How much empty space between recordings before we "
                             "consider it a new line in the transcription.", type=float)
    if 'linux' in platform:
        parser.add_argument("--default_microphone", default='pulse', type=str,
                            help="Default microphone name for SpeechRecognition. "
                                 "Run this with 'list' to view available Microphones.")
    args = parser.parse_args()

    # The last time a recording was retrieved from the queue (None until the
    # first chunk arrives).
    phrase_time = None
    # Thread safe Queue for passing data from the threaded recording callback.
    data_queue = Queue()
    # We use SpeechRecognizer to record our audio because it has a nice feature
    # where it can detect when speech ends.
    recorder = sr.Recognizer()
    recorder.energy_threshold = args.energy_threshold
    # Definitely do this: dynamic energy compensation lowers the energy
    # threshold dramatically to a point where the SpeechRecognizer never stops
    # recording.
    recorder.dynamic_energy_threshold = False

    # Important for linux users.
    # Prevents permanent application hang and crash by using the wrong Microphone.
    if 'linux' in platform:
        mic_name = args.default_microphone
        if not mic_name or mic_name == 'list':
            print("Available microphone devices are: ")
            for name in sr.Microphone.list_microphone_names():
                print(f"Microphone with name \"{name}\" found")
            return
        for index, name in enumerate(sr.Microphone.list_microphone_names()):
            if mic_name in name:
                source = sr.Microphone(sample_rate=16000, device_index=index)
                break
        else:
            # No device matched the requested name; fall back to the system
            # default instead of crashing later on an unbound `source`.
            print(f"No microphone matching \"{mic_name}\" found; using default.")
            source = sr.Microphone(sample_rate=16000)
    else:
        source = sr.Microphone(sample_rate=16000)

    # Load / Download model.  Non-large English models have a dedicated
    # ".en" variant that performs better for English-only audio.
    model = args.model
    if args.model != "large" and not args.non_english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout
    transcription = ['']

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio: sr.AudioData) -> None:
        """
        Threaded callback function to receive audio data when recordings finish.
        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push it into the thread safe queue.
        data = audio.get_raw_data()
        data_queue.put(data)

    # Create a background thread that will pass us raw audio bytes.
    # We could do this manually but SpeechRecognizer provides a nice helper.
    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    # Raw PCM bytes accumulated for the phrase currently being transcribed.
    audio_data = b''

    while True:
        try:
            # NOTE(review): utcnow() returns a naive datetime and is deprecated
            # in newer Pythons; it is only ever compared against phrase_time
            # (same clock), so the arithmetic below stays consistent.
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the
                # phrase complete.  Clear the current working audio buffer so
                # the new data is transcribed on its own line instead of being
                # appended to (and then discarded with) the finished phrase.
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    phrase_complete = True
                    audio_data = b''
                # This is the last time we received new audio data from the queue.
                phrase_time = now

                # Combine audio data from queue.
                audio_data += b''.join(data_queue.queue)
                data_queue.queue.clear()

                # Convert in-ram buffer to something the model can use directly
                # without needing a temp file: 16-bit signed PCM -> float32 in
                # [-1.0, 1.0) by dividing by 32768.
                audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

                # Read the transcription.
                result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
                text = result['text'].strip()
                print(text)

                # Update rolling transcription file with the last few completed
                # lines plus the in-progress text.
                with open("transcription.txt", "w+") as f:
                    output_text = transcription[-4:]
                    output_text.append(text)
                    f.write(" ".join(output_text))

                if phrase_complete:
                    # Append to full transcription.
                    transcription.append(text)
                    print("* Phrase complete.")

                # Reprint the updated transcription.
                for line in transcription:
                    print(line)
                # Flush stdout.
                print('', end='', flush=True)
            else:
                # Infinite loops are bad for processors, must sleep.
                sleep(0.01)
        except KeyboardInterrupt:
            break

    print("\n\nTranscription:")
    for line in transcription:
        print(line)
# Script entry point: run the transcription loop only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()