diff --git a/transcribe_demo.py b/transcribe_demo.py index a8f6721..6dd8972 100644 --- a/transcribe_demo.py +++ b/transcribe_demo.py @@ -5,11 +5,13 @@ import io import os import speech_recognition as sr import whisper +import torch from datetime import datetime, timedelta from queue import Queue from tempfile import NamedTemporaryFile from time import sleep +from sys import platform def main(): @@ -24,9 +26,43 @@ def main(): help="How real time the recording is in seconds.", type=float) parser.add_argument("--phrase_timeout", default=3, help="How much empty space between recordings before we " - "consider it a new line in the transcription.", type=float) + "consider it a new line in the transcription.", type=float) + if 'linux' in platform: + parser.add_argument("--default_microphone", default='pulse', + help="Default microphone name for SpeechRecognition. " + "Run this with 'list' to view available Microphones.", type=str) args = parser.parse_args() - + + # The last time a recording was retrieved from the queue. + phrase_time = None + # Current raw audio bytes. + last_sample = bytes() + # Thread safe Queue for passing data from the threaded recording callback. + data_queue = Queue() + # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends. + recorder = sr.Recognizer() + recorder.energy_threshold = args.energy_threshold + # Definitely do this, dynamic energy compensation lowers the energy threshold dramatically to a point where the SpeechRecognizer never stops recording. + recorder.dynamic_energy_threshold = False + + # Important for linux users. 
+ # Prevents permanent application hang and crash by using the wrong Microphone + if 'linux' in platform: + mic_name = args.default_microphone + if not mic_name or mic_name == 'list': + print("Available microphone devices are: ") + for index, name in enumerate(sr.Microphone.list_microphone_names()): + print(f"Microphone with name \"{name}\" found") + return + else: + for index, name in enumerate(sr.Microphone.list_microphone_names()): + if mic_name in name: + source = sr.Microphone(sample_rate=16000, device_index=index) + break + else: + source = sr.Microphone(sample_rate=16000) + + # Load / Download model model = args.model if args.model != "large" and not args.non_english: model = model + ".en" @@ -37,21 +73,7 @@ def main(): temp_file = NamedTemporaryFile().name transcription = [''] - - # The last time a recording was retreived from the queue. - phrase_time = None - # Current raw audio bytes. - last_sample = bytes() - # Thread safe Queue for passing data from the threaded recording callback. - data_queue = Queue() - - # We use SpeechRecognizer to record our audio because it has a nice feauture where it can detect when speech ends. - recorder = sr.Recognizer() - recorder.energy_threshold = args.energy_threshold - # Definitely do this, dynamic energy compensation lowers the energy threshold dramtically to a point where the SpeechRecognizer never stops recording. - recorder.dynamic_energy_threshold = False - - source = sr.Microphone(sample_rate=16000) + with source: recorder.adjust_for_ambient_noise(source) @@ -99,7 +121,7 @@ def main(): f.write(wav_data.read()) # Read the transcription. - result = audio_model.transcribe(temp_file) + result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available()) text = result['text'].strip() # If we detected a pause between recordings, add a new item to our transcripion.