diff --git a/README.md b/README.md
deleted file mode 100644
index e8f6acc..0000000
--- a/README.md
+++ /dev/null
@@ -1,34 +0,0 @@
-# Real Time Whisper Transcription
-
-![Demo gif](demo.gif)
-
-This is a demo of real time speech to text with OpenAI's Whisper model. It works by constantly recording audio in a thread and concatenating the raw bytes over multiple recordings.
-
-To install dependencies simply run
-```
-pip install -r requirements.txt
-```
-in an environment of your choosing.
-
-Whisper also requires the command-line tool [`ffmpeg`](https://ffmpeg.org/) to be installed on your system, which is available from most package managers:
-
-```
-# on Ubuntu or Debian
-sudo apt update && sudo apt install ffmpeg
-
-# on Arch Linux
-sudo pacman -S ffmpeg
-
-# on MacOS using Homebrew (https://brew.sh/)
-brew install ffmpeg
-
-# on Windows using Chocolatey (https://chocolatey.org/)
-choco install ffmpeg
-
-# on Windows using Scoop (https://scoop.sh/)
-scoop install ffmpeg
-```
-
-For more information on Whisper please see https://github.com/openai/whisper
-
-The code in this repository is public domain.
\ No newline at end of file
diff --git a/demo.gif b/demo.gif
deleted file mode 100644
index 904ddeb..0000000
Binary files a/demo.gif and /dev/null differ
diff --git a/diffstuff.py b/diffstuff.py
index e69de29..2173c92 100644
--- a/diffstuff.py
+++ b/diffstuff.py
@@ -0,0 +1,35 @@
+import textwrap
+import difflib
+
+def onestepchange(start, dest):
+    """Return start advanced by one single-character edit toward dest."""
+
+    ret = ""
+
+    for i, s in enumerate(difflib.ndiff(start, dest)):
+        # print(i)
+        # print(s)
+
+        if s[0] == '-':
+            return ret + start[i+1:]
+
+        if s[0] == '+':
+            return ret + s[-1] + start[i:]
+
+        ret = ret + s[-1]
+
+        if len(ret) > len(start):
+            return ret
+
+        if ret[i] != start[i]:
+            return ret + start[i:]
+
+    return ret
+
+def countsteps(start, dest):
+    """Count the single-character edits needed to turn start into dest."""
+    step_count = 0
+    while start != dest:
+        start = onestepchange(start, dest)
+        step_count += 1
+    return step_count
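A minimal usage sketch of the new diffstuff.py helpers, assuming only the module as added above (the strings are arbitrary): onestepchange applies exactly one character insertion or deletion per call, and countsteps reports how many such calls separate two strings, which is presumably what the pygame demo animates.

    from diffstuff import onestepchange, countsteps

    current, target = "asdffoo", "asdffooMOO"

    # Each call applies exactly one single-character edit toward the target.
    while current != target:
        current = onestepchange(current, target)
        print(current)  # asdffooM, then asdffooMO, then asdffooMOO

    # countsteps performs the same walk and returns its length.
    assert countsteps("asdffoo", "asdffooMOO") == 3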
diff --git a/difftest.py b/difftest.py
deleted file mode 100644
index ad84eb3..0000000
--- a/difftest.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import difflib
-
-s1 = "1234asdffooMOO"
-s2 = "asdfbarMOOwhatever"
-
-# s1 = "asdffoo"
-# s2 = "asdffooMOO"
-
-def onestepchange(start, dest):
-
-    ret = ""
-
-    for i, s in enumerate(difflib.ndiff(start, dest)):
-        # print(i)
-        # print(s)
-
-        if s[0] == '-':
-            return ret + start[i+1:]
-
-        if s[1] == '+':
-            return ret + s[-1] + start[i:]
-
-        ret = ret + s[-1]
-
-        if len(ret) > len(start):
-            return ret
-
-        if ret[i] != start[i]:
-            return ret + start[i:]
-
-    return ret
-
-
-n = s1
-while n != s2:
-    print(n)
-    n = onestepchange(n, s2)
-
-print(n)
-
-# for i, s in enumerate(difflib.ndiff(s1, s2)):
-#     print(i)
-#     print(s)
-
diff --git a/kiri_reqs.txt b/kiri_reqs.txt
deleted file mode 100644
index cb9519c..0000000
--- a/kiri_reqs.txt
+++ /dev/null
@@ -1 +0,0 @@
-whisper-live tokenizers==0.20.3
diff --git a/requirements2.txt b/requirements2.txt
deleted file mode 100644
index 87797c1..0000000
--- a/requirements2.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-setuptools
-pyaudio
-SpeechRecognition
---extra-index-url https://download.pytorch.org/whl/rocm6.2.4
-torch
-numpy
-git+https://github.com/openai/whisper.git
-git+https://github.com/TeamPyOgg/PyOgg.git@4118fc40067eb475468726c6bccf1242abfc24fc
\ No newline at end of file
diff --git a/transcribe_demo.py b/transcribe_demo.py
index 24490a8..5c96cf6 100644
--- a/transcribe_demo.py
+++ b/transcribe_demo.py
@@ -19,8 +19,6 @@ from queue import Queue
 from time import sleep
 from sys import platform
-import textwrap
-import difflib

 import pygame

@@ -89,36 +87,7 @@
         exit(0)


-def onestepchange(start, dest):
-    ret = ""
-
-    for i, s in enumerate(difflib.ndiff(start, dest)):
-        # print(i)
-        # print(s)
-
-        if s[0] == '-':
-            return ret + start[i+1:]
-
-        if s[1] == '+':
-            return ret + s[-1] + start[i:]
-
-        ret = ret + s[-1]
-
-        if len(ret) > len(start):
-            return ret
-
-        if ret[i] != start[i]:
-            return ret + start[i:]
-
-    return ret
-
-def countsteps(start, dest):
-    step_count = 0
-    while start != dest:
-        start = onestepchange(start, dest)
-        step_count += 1
-    return step_count


 def main():
     parser = argparse.ArgumentParser()
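The transcriber.py changes below gate Whisper behind a crude loudness check over the raw byte buffer (the buffer holds 16-bit PCM, but the loop walks individual unsigned bytes, so abs() there is redundant). For reference, a standalone rendition of that heuristic using the same threshold of 100 and the same 10% cutoff; the function name and parameter names here are illustrative, not part of the change:

    def probably_silent(raw: bytes, threshold: int = 100, cutoff: float = 0.1) -> bool:
        """True when too few bytes of a 16-bit PCM buffer exceed the threshold.
        Iterating bytes yields unsigned ints (0-255), so no abs() is needed."""
        if not raw:
            return True
        loud = sum(1 for b in raw if b > threshold)
        return loud / len(raw) < cutoff

    assert probably_silent(b"\x00\x01" * 1000)      # near-zero samples
    assert not probably_silent(b"\x00\xff" * 1000)  # high bytes throughout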
diff --git a/transcriber.py b/transcriber.py
index 9120c8e..65fb553 100644
--- a/transcriber.py
+++ b/transcriber.py
@@ -1,10 +1,19 @@
 #!/usr/bin/python3

+# Recent phrases to include in the text buffer before the current transcription.
+recent_phrase_count = 8
+
+# Seconds of silence before we start a new phrase.
+phrase_timeout = 3
+
+
 import numpy as np
 import speech_recognition
 import whisper
 import torch
 import wave
+from datetime import datetime, timedelta
+import json


 _audio_model = whisper.load_model("medium.en") # "large"
@@ -22,43 +31,95 @@ class Transcriber:
         # Audio data for the current phrase.
         self._current_data = b''

+        self.phrases = [""]
+
+        self._phrase_time = datetime.utcnow()
+
     def set_source(self, source):
         self._audio_source = source


+    def phrase_probably_silent(self):
+        """Whisper hallucinates a LOT on silence, so let's just ignore stuff
+        that's mostly silence."""
+
+        threshold = 100
+        threshold_pass = 0
+        threshold_fail = 0
+        avg = 0
+        for k in self._current_data:
+            avg += k
+            if abs(k) > threshold:
+                threshold_pass += 1
+            else:
+                threshold_fail += 1
+
+        avg = avg / len(self._current_data)
+        threshold_pct = threshold_pass / len(self._current_data)
+        print("threshold_pct: ", threshold_pct)
+        print("avg: ", avg)
+
+        if threshold_pct < 0.1:
+            return True
+
+        return False
+
     def update(self):
+        now = datetime.utcnow()
+
         if self._audio_source:

             if not self._audio_source.data_queue.empty():
                 # We got some new data. Let's process it!
+
+                # If enough time has passed between recordings, consider the
+                # last phrase complete and start a new one. Clear the current
+                # working audio buffer to start over with the new data.
+                if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout):
+                    # TODO: Append stats to the end for debugging so we can keep
+                    # tracking down the hallucinations.
+                    if self.phrases[-1] != "":
+                        self.phrases.append("")
+                    self._current_data = b''
+
+                self._phrase_time = now
+
+                # Get all the new data since the last tick.
                 new_data = []
                 while not self._audio_source.data_queue.empty():
                     new_packet = self._audio_source.data_queue.get()
                     new_data.append(new_packet)
-
                 new_data_joined = b''.join(new_data)

                 # For debugging...
                 #wave_out.writeframes(new_data_joined)

+                # Append it to the current buffer.
                 self._current_data = self._current_data + new_data_joined

-                # Convert in-ram buffer to something the model can use
-                # directly without needing a temp file. Convert data from 16
-                # bit wide integers to floating point with a width of 32
-                # bits. Clamp the audio stream frequency to a PCM wavelength
-                # compatible default of 32768hz max.
-                audio_np = np.frombuffer(
-                    self._current_data, dtype=np.int16).astype(np.float32) / 32768.0
+                if self.phrase_probably_silent():
+                    self.phrases[-1] = ""
+                else:

-                # Run the transcription model, and extract the text.
-                result = _audio_model.transcribe(
-                    audio_np, fp16=torch.cuda.is_available())
+                    # Convert in-ram buffer to something the model can use
+                    # directly without needing a temp file. Convert data from 16
+                    # bit wide integers to floating point with a width of 32
+                    # bits. Clamp the audio stream frequency to a PCM wavelength
+                    # compatible default of 32768hz max.
+                    audio_np = np.frombuffer(
+                        self._current_data, dtype=np.int16).astype(np.float32) / 32768.0

-                text = result['text'].strip()
+                    # Run the transcription model, and extract the text.
+                    result = _audio_model.transcribe(
+                        audio_np, fp16=torch.cuda.is_available())
+
+                    text = result['text'].strip()
+
+                    self.phrases[-1] = text
+
+                print("phrases: ", json.dumps(self.phrases, indent=4))

-                print("text now: ", text)


             # Automatically drop audio sources when we're finished with them.
             if self._audio_source.is_done():
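recent_phrase_count is introduced above but not yet referenced in the hunks shown; presumably a display layer trims the tail of Transcriber.phrases with it. A sketch of how a caller might do that, under that assumption (the helper name is mine, and wiring up a live Transcriber with an audio source is elided):

    from transcriber import recent_phrase_count

    def display_text(phrases):
        """Join the last few completed phrases plus the in-progress one."""
        recent = phrases[-recent_phrase_count:]
        return " ".join(p for p in recent if p)

    # e.g., with a Transcriber instance t whose update() runs every tick:
    # print(display_text(t.phrases))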