From 10e7cdc3a1ada92f189d010ff6a0ca97b0bda459 Mon Sep 17 00:00:00 2001 From: Mike Date: Mon, 25 Nov 2024 18:41:27 +0200 Subject: [PATCH 1/2] add transcribe vad params --- whisper_timestamped/transcribe.py | 32 +++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/whisper_timestamped/transcribe.py b/whisper_timestamped/transcribe.py index ff23405..09b1af9 100755 --- a/whisper_timestamped/transcribe.py +++ b/whisper_timestamped/transcribe.py @@ -117,6 +117,10 @@ def transcribe_timestamped( suppress_tokens="-1", sample_len=None, verbose=False, + avoid_empty_speech=True, + vad_min_speech_duration=0.05, + vad_min_silence_duration=0.1, + vad_dilatation=0, ): """ Transcribe an audio file using Whisper @@ -214,6 +218,18 @@ def transcribe_timestamped( Whether to display the text being decoded to the console. If True, displays all the details, If False, displays minimal details. If None, does not display anything + avoid_empty_speech: bool + Whether to avoid empty speech segments (i.e. segments with no speech detected). + + vad_min_speech_duration: float + Minimum duration of a speech segment, in seconds. If a speech segment is shorter than this, it will be removed. + + vad_min_silence_duration: float + Minimum duration of a silence segment, in seconds. If a silence segment is shorter than this, it will be ignored and the surrounding speech is kept as a single segment. + + vad_dilatation: float + Duration, in seconds, by which each speech segment detected by the VAD is enlarged. Use 0 to keep the detected segments unchanged. 
+ Returns ------- A dictionary containing the resulting text ("text") and segment-level details ("segments"), and @@ -293,7 +309,15 @@ def transcribe_timestamped( if vad is not None: audio = get_audio_tensor(audio) - audio, vad_segments, convert_timestamps = remove_non_speech(audio, method=vad, sample_rate=SAMPLE_RATE, plot=plot_word_alignment, avoid_empty_speech=True) + audio, vad_segments, convert_timestamps = remove_non_speech(audio, + method=vad, + sample_rate=SAMPLE_RATE, + plot=plot_word_alignment, + avoid_empty_speech=avoid_empty_speech, + min_speech_duration=vad_min_speech_duration, + min_silence_duration=vad_min_silence_duration, + dilatation=vad_dilatation, + ) else: vad_segments = None @@ -2084,9 +2108,9 @@ def auditok_segment_to_dict(s): def remove_non_speech(audio, use_sample=False, - min_speech_duration=0.1, - min_silence_duration=1, - dilatation=0.5, + min_speech_duration=0.05, + min_silence_duration=0.1, + dilatation=0, sample_rate=SAMPLE_RATE, method="silero", avoid_empty_speech=False, From 1aae839acccbac3dfe074f25d04bc65b14edb0dc Mon Sep 17 00:00:00 2001 From: Mike Date: Tue, 26 Nov 2024 00:15:38 +0200 Subject: [PATCH 2/2] reset params default values --- whisper_timestamped/transcribe.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/whisper_timestamped/transcribe.py b/whisper_timestamped/transcribe.py index 09b1af9..aca629e 100755 --- a/whisper_timestamped/transcribe.py +++ b/whisper_timestamped/transcribe.py @@ -118,9 +118,9 @@ def transcribe_timestamped( sample_len=None, verbose=False, avoid_empty_speech=True, - vad_min_speech_duration=0.05, - vad_min_silence_duration=0.1, - vad_dilatation=0, + vad_min_speech_duration=0.1, + vad_min_silence_duration=1, + vad_dilatation=0.5, ): """ Transcribe an audio file using Whisper @@ -2108,9 +2108,9 @@ def auditok_segment_to_dict(s): def remove_non_speech(audio, use_sample=False, - min_speech_duration=0.05, - min_silence_duration=0.1, - dilatation=0, + 
min_speech_duration=0.1, + min_silence_duration=1, + dilatation=0.5, sample_rate=SAMPLE_RATE, method="silero", avoid_empty_speech=False,