#! python3.7

# Recent phrases to include in the text buffer before the current
# transcription.
recent_phrase_count = 8

# How real-time the recording is, in seconds.
record_timeout = 2

# Delete Discord users after a minute of no activity.
discord_transcriber_timeout = 60

import socket

# Create socket for listening for incoming Opus audio streams from the Discord
# bot.
opus_server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
opus_server_socket.bind(("127.0.0.1", 9967))
opus_server_socket.listen()

import argparse
import os
import numpy as np
import speech_recognition
import whisper
import torch

from datetime import datetime, timedelta
from queue import Queue
from time import sleep
from sys import platform

import pygame
import wave
#from pyogg.opus import OpusEncoder
import select
import time
import json
import threading

import diffstuff
import audiosource
from transcriber import Transcriber

pygame_font_height = 32
pygame.init()
pygame_display_surface = pygame.display.set_mode(
    (960 - 75, int(pygame_font_height * 2 * 2.5)))
pygame.display.set_caption("Transcription")
pygame_font = pygame.font.Font(
    "/home/kiri/.fonts/Sigmar-Regular.ttf", pygame_font_height)

# Debug capture file. Note: nothing in this file writes to it yet.
wave_out = wave.open("wave.wav", "wb")
wave_out.setnchannels(1)
wave_out.setframerate(16000)
wave_out.setsampwidth(2)

transcribers = []

# Local microphone transcriber.
transcriber1 = Transcriber()
mic_source1 = audiosource.MicrophoneAudioSource()
transcriber1.set_source(mic_source1)
transcriber1.username = "Kiri"
transcribers.append(transcriber1)

# transcriber2 = Transcriber()
# mic_source2 = audiosource.MicrophoneAudioSource()
# transcriber2.set_source(mic_source2)
# transcriber2.username = "Kiri2"
# transcribers.append(transcriber2)

discord_transcribers_per_user_id = {}

while True:

    # Check for new Opus connections.
    #print("Checking for new connections...")
    readable, _, _ = select.select([opus_server_socket], [], [], 0)
    if readable:
        accepted_socket, addr = opus_server_socket.accept()
        #print("Accepted new Opus stream: ", accepted_socket)
        new_stream = audiosource.OpusStreamAudioSource(accepted_socket)

        # Re-attach returning users to their existing transcriber so their
        # phrase history survives reconnects.
        if new_stream._user_info["userId"] in discord_transcribers_per_user_id:
            discord_transcribers_per_user_id[
                new_stream._user_info["userId"]].set_source(new_stream)
        else:
            new_transcriber = Transcriber()
            new_transcriber.set_source(new_stream)
            discord_transcribers_per_user_id[
                new_stream._user_info["userId"]] = new_transcriber
            new_transcriber.username = new_stream._user_info["displayName"]
            if new_transcriber not in transcribers:
                transcribers.append(new_transcriber)

    removal_queue = []

    # Run updates.
    print("Running updates...")
    for transcriber in transcribers:
        #print("Running updates for... ", transcriber.username)
        transcriber.update()
        #print("Done running updates for... ", transcriber.username)

        # Queue up transcribers that have been silent past the timeout and
        # have lost their audio source.
        if transcriber._phrase_time + timedelta(seconds=discord_transcriber_timeout) < datetime.utcnow():
            if transcriber._audio_source is None:
                removal_queue.append(transcriber)

    # Note that this will not remove them from
    # discord_transcribers_per_user_id. It's probably fine, though.
    #print("Running removals...")
    for removal in removal_queue:
        #print("Removing inactive user: ", removal.username)
        transcribers.remove(removal)

    # Sleep.
    print("Sleeping...")
    time.sleep(0.05)

    #print("Rendering...")

    # Do rendering.
    pygame_display_surface.fill((0, 0, 0))

    # Render text.
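    # One row per transcriber: the scrolling text is rendered right-aligned
    # across the window, then a black strip is drawn down the left edge and
    # the username is drawn on top of it, so the text appears to slide in
    # from behind the name column.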
    for transcriber in transcribers:
        pygame_text_surface = pygame_font.render(
            transcriber.scrolling_text, True, (255, 255, 255))
        pygame_text_rect = pygame_text_surface.get_rect()
        pygame_text_rect.center = (
            pygame_display_surface.get_width() / 2,
            pygame_font_height * (1 + transcribers.index(transcriber)))
        pygame_text_rect.right = pygame_display_surface.get_width()
        pygame_display_surface.blit(pygame_text_surface, pygame_text_rect)

    # Render a background for the names.
    fill_rect = pygame_display_surface.get_rect()
    fill_rect.width = 220
    fill_rect.left = 0
    pygame_display_surface.fill((0, 0, 0), fill_rect)

    # Render names.
    for transcriber in transcribers:

        # Keep only the ASCII characters of the username so the display font
        # can render it.
        username_for_display = ""
        for c in transcriber.username:
            if ord(c) <= 127:
                username_for_display += c

        pygame_username_surface = pygame_font.render(
            username_for_display, True, (255, 255, 255))
        pygame_text_rect = pygame_username_surface.get_rect()
        pygame_text_rect.center = (
            pygame_display_surface.get_width() / 2,
            pygame_font_height * (1 + transcribers.index(transcriber)))
        pygame_text_rect.left = 16
        pygame_display_surface.blit(pygame_username_surface, pygame_text_rect)

    pygame.display.update()

exit(0)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="medium", help="Model to use",
                        choices=["tiny", "base", "small", "medium", "large"])
    parser.add_argument("--non_english", action='store_true',
                        help="Don't use the English model.")
    parser.add_argument("--energy_threshold", default=1000,
                        help="Energy level for mic to detect.", type=int)
    parser.add_argument("--phrase_timeout", default=3,
                        help="How much empty space between recordings before we "
                             "consider it a new line in the transcription.",
                        type=float)
    if 'linux' in platform:
        parser.add_argument("--default_microphone", default='pulse',
                            help="Default microphone name for SpeechRecognition. "
                                 "Run this with 'list' to view available Microphones.",
                            type=str)
    args = parser.parse_args()

    # The last time a recording was retrieved from the queue.
    phrase_time = None

    #data_queue = Queue()
    # We use SpeechRecognizer to record our audio because it has a nice
    # feature where it can detect when speech ends.

    # Load / Download model.
    model = args.model
    if args.model != "large" and not args.non_english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    phrase_timeout = args.phrase_timeout

    transcription = ['']

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    # Rolling output text buffer.

    # This is the one that animates. Stored as a single string.
    rolling_output_text = ""

    # This is the one that updates in big chunks at lower frequency.
    # Stored as an array of phrases.
    output_text = [""]

    mic_audio_source = audiosource.MicrophoneAudioSource()
    mic_audio_source.start()
    data_queue = mic_audio_source.data_queue

    # Rolling audio input buffer.
    audio_data = b''

    diffsize = 0

    while True:
        try:
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    exit(0)

            rolling_text_target = " ".join(output_text)[-160:]
            if rolling_text_target != rolling_output_text:

                # Chop off the start all at once. It's not needed for the
                # animation to look good.
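                # diffstuff is a local helper module that is not shown here;
                # the assumption is that onestepchange(current, target)
                # returns `current` advanced by one character-level edit
                # toward `target`, and countsteps(current, target) counts the
                # edits remaining. Applying one edit per frame is what
                # produces the typewriter-style scrolling animation.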
                new_rolling_output_text = diffstuff.onestepchange(
                    rolling_output_text, rolling_text_target)
                while rolling_output_text.endswith(new_rolling_output_text):
                    new_rolling_output_text = diffstuff.onestepchange(
                        new_rolling_output_text, rolling_text_target)
                rolling_output_text = new_rolling_output_text

                # If the animation has fallen too far behind the target, just
                # snap to it.
                if diffstuff.countsteps(rolling_output_text, rolling_text_target) > 80:
                    rolling_output_text = rolling_text_target

                print(rolling_output_text)

                pygame_text_surface = pygame_font.render(
                    rolling_output_text, True, (255, 255, 255))
                pygame_text_rect = pygame_text_surface.get_rect()
                pygame_text_rect.center = (640, pygame_font_height)
                pygame_text_rect.right = 1280
                pygame_display_surface.fill((0, 0, 0))
                pygame_display_surface.blit(pygame_text_surface, pygame_text_rect)
                pygame.display.update()

                diffsize = abs(len(rolling_output_text) - len(rolling_text_target))
            else:
                now = datetime.utcnow()

                # Pull raw recorded audio from the queue.
                if not data_queue.empty():
                    phrase_complete = False

                    # If enough time has passed between recordings, consider
                    # the phrase complete. Clear the current working audio
                    # buffer to start over with the new data.
                    #
                    # FIXME: Shouldn't we cut off the phrase here instead of
                    # waiting for later?
                    if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                        phrase_complete = True

                    # This is the last time we received new audio data from
                    # the queue.
                    phrase_time = now

                    # Combine audio data from the queue.
                    audio_data += b''.join(data_queue.queue)
                    data_queue.queue.clear()

                    # Convert the in-RAM buffer to something the model can use
                    # directly, without needing a temp file: reinterpret the
                    # bytes as 16-bit integers, then normalize to 32-bit
                    # floats in the [-1.0, 1.0) range that Whisper expects.
                    audio_np = np.frombuffer(
                        audio_data, dtype=np.int16).astype(np.float32) / 32768.0

                    # Run the transcription model, and extract the text.
                    result = audio_model.transcribe(
                        audio_np, fp16=torch.cuda.is_available())
                    text = result['text'].strip()

                    # Update the rolling transcription.

                    # Start with all our recent-but-complete phrases.
                    output_text = transcription[-recent_phrase_count:]

                    # Append the phrase-in-progress. (TODO: Can we make this a
                    # different color or something?)
                    output_text.append(text)

                    # If we're done with the phrase, we can go ahead and stuff
                    # it into the list and clear out the current audio data
                    # buffer.
                    if phrase_complete:

                        # Append to full transcription.
                        if text != "":
                            transcription.append(text)

                        # Clear audio buffer.
                        audio_data = b''

            # Infinite loops are bad for processors, must sleep. Also, limit
            # the anim speed.
            if diffsize > 30:
                sleep(0.01)
            else:
                sleep(0.05)

        except KeyboardInterrupt:
            break


if __name__ == "__main__":
    main()
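
# NOTE: main() above is the older single-microphone transcription loop, kept
# for reference. As this file stands it is unreachable: the module-level
# while-loop near the top never returns, so execution never reaches the
# `if __name__ == "__main__"` guard.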