Figured out how to reduce hallucinations a bunch.
This commit is contained in:
parent
9a16c371f8
commit
4dc7385239
@ -6,6 +6,8 @@ recent_phrase_count = 8
|
||||
# Seconds of silence before we start a new phrase.
|
||||
phrase_timeout = 3
|
||||
|
||||
# Lower is more restrictive: segments whose no_speech_prob exceeds this are dropped.
|
||||
no_speech_prob_threshold = 0.2
|
||||
|
||||
import numpy as np
|
||||
import speech_recognition
|
||||
@ -33,6 +35,7 @@ class Transcriber:
|
||||
|
||||
self.phrases = [""]
|
||||
|
||||
# Timestamp of when data last came in for the current phrase.
|
||||
self._phrase_time = datetime.utcnow()
|
||||
|
||||
def set_source(self, source):
|
||||
@ -40,7 +43,7 @@ class Transcriber:
|
||||
|
||||
def phrase_probably_silent(self):
|
||||
"""Whisper hallucinates a LOT on silence, so let's just ignore stuff
|
||||
that's mostly silence."""
|
||||
that's mostly silence. First line of defense here."""
|
||||
|
||||
threshold = 100
|
||||
threshold_pass = 0
|
||||
@ -55,8 +58,6 @@ class Transcriber:
|
||||
|
||||
avg = avg / len(self._current_data)
|
||||
threshold_pct = threshold_pass / len(self._current_data)
|
||||
print("threshold_pct: ", threshold_pct)
|
||||
print("avg: ", avg)
|
||||
|
||||
if threshold_pct < 0.1:
|
||||
return True
|
||||
@ -77,10 +78,12 @@ class Transcriber:
|
||||
# last phrase complete and start a new one. Clear the current
|
||||
# working audio buffer to start over with the new data.
|
||||
if self._phrase_time and now - self._phrase_time > timedelta(seconds=phrase_timeout):
|
||||
# TODO: Append stats to the end for debugging so we can keep
|
||||
# tracking down the hallucinations.
|
||||
|
||||
# Only add a new phrase if we actually have data in the last
|
||||
# one.
|
||||
if self.phrases[-1] != "":
|
||||
self.phrases.append("")
|
||||
|
||||
self._current_data = b''
|
||||
|
||||
self._phrase_time = now
|
||||
@ -114,7 +117,13 @@ class Transcriber:
|
||||
result = _audio_model.transcribe(
|
||||
audio_np, fp16=torch.cuda.is_available())
|
||||
|
||||
text = result['text'].strip()
|
||||
# Filter out text segments with a high no_speech_prob.
|
||||
combined_text = ""
|
||||
for seg in result["segments"]:
|
||||
if seg["no_speech_prob"] <= no_speech_prob_threshold:
|
||||
combined_text += seg["text"]
|
||||
|
||||
text = combined_text.strip()
|
||||
|
||||
self.phrases[-1] = text
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user