Merge pull request #1 from JohnCiubuc/master

Better Linux support and fixed FP16 Warning
This commit is contained in:
davabase 2023-01-20 12:54:00 -08:00 committed by GitHub
commit a0b5bdbb0a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -5,11 +5,13 @@ import io
import os import os
import speech_recognition as sr import speech_recognition as sr
import whisper import whisper
import torch
from datetime import datetime, timedelta from datetime import datetime, timedelta
from queue import Queue from queue import Queue
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
from time import sleep from time import sleep
from sys import platform
def main(): def main():
@ -24,9 +26,43 @@ def main():
help="How real time the recording is in seconds.", type=float) help="How real time the recording is in seconds.", type=float)
parser.add_argument("--phrase_timeout", default=3, parser.add_argument("--phrase_timeout", default=3,
help="How much empty space between recordings before we " help="How much empty space between recordings before we "
"consider it a new line in the transcription.", type=float) "consider it a new line in the transcription.", type=float)
if 'linux' in platform:
parser.add_argument("--default_microphone", default='pulse',
help="Default microphone name for SpeechRecognition. "
"Run this with 'list' to view available Microphones.", type=str)
args = parser.parse_args() args = parser.parse_args()
# The last time a recording was retrieved from the queue.
phrase_time = None
# Current raw audio bytes.
last_sample = bytes()
# Thread safe Queue for passing data from the threaded recording callback.
data_queue = Queue()
# We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
recorder = sr.Recognizer()
recorder.energy_threshold = args.energy_threshold
# Definitely do this: dynamic energy compensation lowers the energy threshold dramatically, to a point where the SpeechRecognizer never stops recording.
recorder.dynamic_energy_threshold = False
# Important for Linux users.
# Prevents a permanent application hang or crash caused by using the wrong microphone.
if 'linux' in platform:
mic_name = args.default_microphone
if not mic_name or mic_name == 'list':
print("Available microphone devices are: ")
for index, name in enumerate(sr.Microphone.list_microphone_names()):
print(f"Microphone with name \"{name}\" found")
return
else:
for index, name in enumerate(sr.Microphone.list_microphone_names()):
if mic_name in name:
source = sr.Microphone(sample_rate=16000, device_index=index)
break
else:
source = sr.Microphone(sample_rate=16000)
# Load / Download model
model = args.model model = args.model
if args.model != "large" and not args.non_english: if args.model != "large" and not args.non_english:
model = model + ".en" model = model + ".en"
@ -37,21 +73,7 @@ def main():
temp_file = NamedTemporaryFile().name temp_file = NamedTemporaryFile().name
transcription = [''] transcription = ['']
# The last time a recording was retrieved from the queue.
phrase_time = None
# Current raw audio bytes.
last_sample = bytes()
# Thread safe Queue for passing data from the threaded recording callback.
data_queue = Queue()
# We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
recorder = sr.Recognizer()
recorder.energy_threshold = args.energy_threshold
# Definitely do this: dynamic energy compensation lowers the energy threshold dramatically, to a point where the SpeechRecognizer never stops recording.
recorder.dynamic_energy_threshold = False
source = sr.Microphone(sample_rate=16000)
with source: with source:
recorder.adjust_for_ambient_noise(source) recorder.adjust_for_ambient_noise(source)
@ -99,7 +121,7 @@ def main():
f.write(wav_data.read()) f.write(wav_data.read())
# Read the transcription. # Read the transcription.
result = audio_model.transcribe(temp_file) result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
text = result['text'].strip() text = result['text'].strip()
# If we detected a pause between recordings, add a new item to our transcription.