2022-11-28 22:27:56 -08:00
#! python3.7
import argparse
import io
import os
import speech_recognition as sr
import whisper
2023-01-14 20:13:48 -08:00
import torch
2022-11-28 22:27:56 -08:00
from datetime import datetime , timedelta
from queue import Queue
from tempfile import NamedTemporaryFile
from time import sleep
2023-01-14 20:31:04 -08:00
from sys import platform
2022-11-28 22:27:56 -08:00
def main():
    """Transcribe microphone audio to the console in near real time.

    Parses the command-line options, opens a microphone through
    SpeechRecognition, loads the requested Whisper model and then polls a
    queue of recorded audio chunks. Each time new audio arrives the current
    phrase is re-transcribed and the console is redrawn. A gap longer than
    ``--phrase_timeout`` seconds between chunks starts a new line.

    On Linux, ``--default_microphone list`` prints the available device
    names and returns; a name that matches no device prints an error and
    returns instead of failing later with an unbound ``source``.

    The loop runs until interrupted with Ctrl+C, after which the full
    transcription is printed once more.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="medium", help="Model to use",
                        choices=["tiny", "base", "small", "medium", "large"])
    parser.add_argument("--non_english", action='store_true',
                        help="Don't use the english model.")
    parser.add_argument("--energy_threshold", default=1000,
                        help="Energy level for mic to detect.", type=int)
    parser.add_argument("--record_timeout", default=2,
                        help="How real time the recording is in seconds.", type=float)
    parser.add_argument("--phrase_timeout", default=3,
                        help="How much empty space between recordings before we "
                             "consider it a new line in the transcription.", type=float)
    if 'linux' in platform:
        parser.add_argument("--default_microphone", default='pulse',
                            help="Default microphone name for SpeechRecognition. "
                                 "Run this with 'list' to view available Microphones.", type=str)
    args = parser.parse_args()

    # The last time a recording was retrieved from the queue.
    phrase_time = None
    # Raw audio bytes accumulated for the phrase currently being transcribed.
    last_sample = bytes()
    # Thread safe Queue for passing data from the threaded recording callback.
    data_queue = Queue()

    # SpeechRecognition records the audio for us.
    recorder = sr.Recognizer()
    recorder.energy_threshold = args.energy_threshold
    # Keep the threshold fixed: per the original author's note, dynamic energy
    # compensation lowers it to the point where recording never stops.
    recorder.dynamic_energy_threshold = False

    # Important for linux users: picking the wrong microphone can hang or
    # crash the application, so the device is selected explicitly by name.
    if 'linux' in platform:
        mic_name = args.default_microphone
        if not mic_name or mic_name == 'list':
            print("Available microphone devices are: ")
            for name in sr.Microphone.list_microphone_names():
                print(f"Microphone with name \"{name}\" found")
            return
        for index, name in enumerate(sr.Microphone.list_microphone_names()):
            if mic_name in name:
                source = sr.Microphone(sample_rate=16000, device_index=index)
                break
        else:
            # No device matched; without this guard `source` would be unbound
            # and the `with source` below would raise NameError.
            print(f"No microphone matching \"{mic_name}\" was found. "
                  "Run with --default_microphone list to view available Microphones.")
            return
    else:
        source = sr.Microphone(sample_rate=16000)

    # Load / Download model. Every size except "large" has an English-only
    # ".en" variant, used unless --non_english was given.
    model = args.model
    if args.model != "large" and not args.non_english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout
    transcription = ['']

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio: sr.AudioData) -> None:
        """
        Threaded callback function to receive audio data when recordings finish.
        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push them into the thread safe queue.
        data_queue.put(audio.get_raw_data())

    # Create a background thread that will pass us raw audio bytes.
    # Keep the returned stopper so the microphone can be released on exit.
    stop_listening = recorder.listen_in_background(
        source, record_callback, phrase_time_limit=record_timeout)

    # Scratch file handed to Whisper. It is created closed (delete=False) so
    # it can be reopened by name on every platform, and removed in `finally`.
    temp_handle = NamedTemporaryFile(delete=False)
    temp_file = temp_handle.name
    temp_handle.close()

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    try:
        while True:
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the
                # phrase complete and start over with an empty audio buffer.
                if phrase_time is not None and now - phrase_time > timedelta(seconds=phrase_timeout):
                    last_sample = bytes()
                    phrase_complete = True
                # This is the last time we received new audio data from the queue.
                phrase_time = now

                # Append everything queued so far to the current phrase audio.
                while not data_queue.empty():
                    last_sample += data_queue.get()

                # Use AudioData to convert the raw data to wav data and write
                # it to the scratch file as bytes.
                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
                with open(temp_file, 'w+b') as f:
                    f.write(audio_data.get_wav_data())

                # Read the transcription; fp16 is only requested when CUDA is available.
                result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
                text = result['text'].strip()

                # If we detected a pause between recordings, add a new item to
                # our transcription. Otherwise edit the existing one.
                if phrase_complete:
                    transcription.append(text)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                os.system('cls' if os.name == 'nt' else 'clear')
                for line in transcription:
                    print(line)
                # Flush stdout.
                print('', end='', flush=True)

            # Sleep on every pass so polling an empty queue does not spin the CPU.
            sleep(0.25)
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to end the session.
        pass
    finally:
        # Stop the background recorder without blocking on the worker thread.
        stop_listening(wait_for_stop=False)
        # Best-effort removal of the scratch file.
        try:
            os.remove(temp_file)
        except OSError:
            pass

    print("\n\nTranscription:")
    for line in transcription:
        print(line)
# Run the transcriber only when this file is executed as a script, not when
# it is imported. The literal must be exactly "__main__" (no padding) or the
# comparison can never be true.
if __name__ == "__main__":
    main()