From 294acf89903c258cb0d0ed28a01a5162f87a5c82 Mon Sep 17 00:00:00 2001 From: Titouan Parcollet/Embedded AI /SRUK/Engineer/Samsung Electronics Date: Thu, 6 Feb 2025 17:17:11 +0000 Subject: [PATCH 1/4] best SB model --- speechbrain/run_conformer.sh | 17 +++++++++++------ speechbrain/run_eval.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/speechbrain/run_conformer.sh b/speechbrain/run_conformer.sh index 3369995..a065bb0 100644 --- a/speechbrain/run_conformer.sh +++ b/speechbrain/run_conformer.sh @@ -2,27 +2,32 @@ export PYTHONPATH="..":$PYTHONPATH -SOURCE="speechbrain/asr-conformer-transformerlm-librispeech" +SOURCE="speechbrain/asr-conformer-largescaleasr" +# Run with CTC+Attn python run_eval.py \ --source=$SOURCE \ --speechbrain_pretrained_class_name="EncoderDecoderASR" \ --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ --dataset="librispeech" \ --split="test.clean" \ - --device=0 \ + --device=7 \ --batch_size=4 \ - --max_eval_samples=-1 + --max_eval_samples=-1 \ + --beam_size=40 \ +# Run with Attn only python run_eval.py \ --source=$SOURCE \ --speechbrain_pretrained_class_name="EncoderDecoderASR" \ --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ --dataset="librispeech" \ - --split="test.other" \ - --device=0 \ + --split="test.clean" \ + --device=7 \ --batch_size=4 \ - --max_eval_samples=-1 + --max_eval_samples=-1 \ + --beam_size=40 \ + --ctc_weight_decode=0 # Evaluate results RUNDIR=`pwd` && \ diff --git a/speechbrain/run_eval.py b/speechbrain/run_eval.py index cf43b03..8719ab6 100644 --- a/speechbrain/run_eval.py +++ b/speechbrain/run_eval.py @@ -18,6 +18,8 @@ def get_model( speechbrain_repository: str, speechbrain_pretrained_class_name: str, + beam_size: int, + ctc_weight_decode: float, **kwargs, ): """Fetch a pretrained SpeechBrain model from the SpeechBrain 🤗 Hub. @@ -29,6 +31,10 @@ def get_model( speechbrain_pretrained_class_name : str The name of the SpeechBrain pretrained class to fetch. E.g. `EncoderASR`. See: https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/pretrained/interfaces.py + beam_size : int + Size of the beam for decoding. + ctc_weight_decode : float + Weight of the CTC prob for decoding with joint CTC/Attn. **kwargs Additional keyword arguments to pass to override the default run options of the pretrained model. @@ -54,10 +60,18 @@ def get_model( run_opts = {**run_opt_defaults, **kwargs} + overrides = {} + if beam_size: + overrides["test_beam_size"] = beam_size + + if ctc_weight_decode: + overrides["ctc_weight_decode"] = ctc_weight_decode + kwargs = { "source": f"{speechbrain_repository}", "savedir": f"pretrained_models/{speechbrain_repository}", "run_opts": run_opts, + "overrides": overrides, } try: @@ -78,7 +92,11 @@ def main(args): device = f"cuda:{args.device}" model = get_model( - args.source, args.speechbrain_pretrained_class_name, device=device + args.source, + args.speechbrain_pretrained_class_name, + args.beam_size, + args.ctc_weight_decode, + device=device ) def benchmark(batch): @@ -232,6 +250,18 @@ def benchmark(batch): default=5, help="Number of warm-up steps to run before launching the timed runs.", ) + parser.add_argument( + "--beam_size", + type=int, + default=None, + help="Beam size for decoding" + ) + parser.add_argument( + "--ctc_weight_decode", + type=int, + default=None, + help="Weight of CTC for joint CTC/Att. decoding" + ) args = parser.parse_args() parser.set_defaults(streaming=True) From 77960f50a005d0919f8c0e281ea7cb9cd3a4f37b Mon Sep 17 00:00:00 2001 From: Titouan Parcollet/Embedded AI /SRUK/Engineer/Samsung Electronics Date: Thu, 6 Feb 2025 19:33:25 +0000 Subject: [PATCH 2/4] fix comments --- speechbrain/run_conformer.sh | 174 ++++++++++++++++++++++++++++++++++- 1 file changed, 170 insertions(+), 4 deletions(-) diff --git a/speechbrain/run_conformer.sh b/speechbrain/run_conformer.sh index a065bb0..fc8aeef 100644 --- a/speechbrain/run_conformer.sh +++ b/speechbrain/run_conformer.sh @@ -3,6 +3,8 @@ export PYTHONPATH="..":$PYTHONPATH SOURCE="speechbrain/asr-conformer-largescaleasr" +BATCH_SIZE=8 +DEVICE_ID=0 # Run with CTC+Attn python run_eval.py \ @@ -11,11 +13,89 @@ python run_eval.py \ --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ --dataset="librispeech" \ --split="test.clean" \ - --device=7 \ - --batch_size=4 \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ --max_eval_samples=-1 \ --beam_size=40 \ +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + + # Run with Attn only python run_eval.py \ --source=$SOURCE \ @@ -23,12 +103,98 @@ python run_eval.py \ --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ --dataset="librispeech" \ --split="test.clean" \ - --device=7 \ - --batch_size=4 \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + --ctc_weight_decode=0 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="librispeech" \ + --split="test.other" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + --ctc_weight_decode=0 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + --ctc_weight_decode=0 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ --max_eval_samples=-1 \ --beam_size=40 \ --ctc_weight_decode=0 +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + --ctc_weight_decode=0 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + --ctc_weight_decode=0 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + --ctc_weight_decode=0 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=40 \ + --ctc_weight_decode=0 + + + # Evaluate results RUNDIR=`pwd` && \ cd ../normalizer && \ From ff8a8051fbb3bfc72e03b346449c45cce0a40749 Mon Sep 17 00:00:00 2001 From: Titouan Parcollet Date: Fri, 7 Feb 2025 11:39:22 +0000 Subject: [PATCH 3/4] minor changes --- speechbrain/run_eval.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/speechbrain/run_eval.py b/speechbrain/run_eval.py index 8719ab6..3519c2d 100644 --- a/speechbrain/run_eval.py +++ b/speechbrain/run_eval.py @@ -56,6 +56,7 @@ def get_model( "distributed_launch": False, "distributed_backend": "nccl", "jit_module_keys": None, + "precision": "fp16", } run_opts = {**run_opt_defaults, **kwargs} @@ -102,21 +103,17 @@ def main(args): def benchmark(batch): # Load audio inputs audios = [torch.from_numpy(sample["array"]) for sample in batch["audio"]] - minibatch_size = len(audios) - # START TIMING - start_time = time.time() audios, audio_lens = batch_pad_right(audios) audios = audios.to(device) audio_lens = audio_lens.to(device) + + start_time = time.time() predictions, _ = model.transcribe_batch(audios, audio_lens) - - # END TIMING runtime = time.time() - start_time - # normalize by minibatch size since we want the per-sample time - batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size] + batch["transcription_time_s"] = runtime # normalize transcriptions with English normalizer batch["predictions"] = [data_utils.normalizer(pred) for pred in predictions] From 8fb9ad9f100de03098122c2a61e0c407e319c517 Mon Sep 17 00:00:00 2001 From: Titouan Parcollet Date: Fri, 7 Feb 2025 17:20:17 +0000 Subject: [PATCH 4/4] fix everything --- speechbrain/run_conformer.sh | 119 ++++------------------------------- speechbrain/run_eval.py | 24 +++---- 2 files changed, 24 insertions(+), 119 deletions(-) diff --git a/speechbrain/run_conformer.sh b/speechbrain/run_conformer.sh index fc8aeef..553ff4e 100644 --- a/speechbrain/run_conformer.sh +++ b/speechbrain/run_conformer.sh @@ -3,7 +3,7 @@ export PYTHONPATH="..":$PYTHONPATH SOURCE="speechbrain/asr-conformer-largescaleasr" -BATCH_SIZE=8 +BATCH_SIZE=32 DEVICE_ID=0 # Run with CTC+Attn @@ -16,97 +16,8 @@ python run_eval.py \ --device=${DEVICE_ID} \ --batch_size=${BATCH_SIZE} \ --max_eval_samples=-1 \ - --beam_size=40 \ - -python run_eval.py \ - --source=$SOURCE \ - --speechbrain_pretrained_class_name="EncoderDecoderASR" \ - --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ - --dataset="librispeech" \ - --split="test.other" \ - --device=${DEVICE_ID} \ - --batch_size=${BATCH_SIZE} \ - --max_eval_samples=-1 \ - --beam_size=40 \ - -python run_eval.py \ - --source=$SOURCE \ - --speechbrain_pretrained_class_name="EncoderDecoderASR" \ - --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ - --dataset="ami" \ - --split="test" \ - --device=${DEVICE_ID} \ - --batch_size=${BATCH_SIZE} \ - --max_eval_samples=-1 \ - --beam_size=40 \ - -python run_eval.py \ - --source=$SOURCE \ - --speechbrain_pretrained_class_name="EncoderDecoderASR" \ - --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ - --dataset="spgispeech" \ - --split="test" \ - --device=${DEVICE_ID} \ - --batch_size=${BATCH_SIZE} \ - --max_eval_samples=-1 \ - --beam_size=40 \ - -python run_eval.py \ - --source=$SOURCE \ - --speechbrain_pretrained_class_name="EncoderDecoderASR" \ - --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ - --dataset="tedlium" \ - --split="test" \ - --device=${DEVICE_ID} \ - --batch_size=${BATCH_SIZE} \ - --max_eval_samples=-1 \ - --beam_size=40 \ - -python run_eval.py \ - --source=$SOURCE \ - --speechbrain_pretrained_class_name="EncoderDecoderASR" \ - --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ - --dataset="earnings22" \ - --split="test" \ - --device=${DEVICE_ID} \ - --batch_size=${BATCH_SIZE} \ - --max_eval_samples=-1 \ - --beam_size=40 \ - -python run_eval.py \ - --source=$SOURCE \ - --speechbrain_pretrained_class_name="EncoderDecoderASR" \ - --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ - --dataset="gigaspeech" \ - --split="test" \ - --device=${DEVICE_ID} \ - --batch_size=${BATCH_SIZE} \ - --max_eval_samples=-1 \ - --beam_size=40 \ - -python run_eval.py \ - --source=$SOURCE \ - --speechbrain_pretrained_class_name="EncoderDecoderASR" \ - --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ - --dataset="voxpopuli" \ - --split="test" \ - --device=${DEVICE_ID} \ - --batch_size=${BATCH_SIZE} \ - --max_eval_samples=-1 \ - --beam_size=40 \ - - -# Run with Attn only -python run_eval.py \ - --source=$SOURCE \ - --speechbrain_pretrained_class_name="EncoderDecoderASR" \ - --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ - --dataset="librispeech" \ - --split="test.clean" \ - --device=${DEVICE_ID} \ - --batch_size=${BATCH_SIZE} \ - --max_eval_samples=-1 \ - --beam_size=40 \ + --no-streaming \ + --beam_size=10 \ --ctc_weight_decode=0 python run_eval.py \ @@ -118,7 +29,7 @@ python run_eval.py \ --device=${DEVICE_ID} \ --batch_size=${BATCH_SIZE} \ --max_eval_samples=-1 \ - --beam_size=40 \ + --beam_size=10 \ --ctc_weight_decode=0 python run_eval.py \ @@ -130,8 +41,7 @@ python run_eval.py \ --device=${DEVICE_ID} \ --batch_size=${BATCH_SIZE} \ --max_eval_samples=-1 \ - --beam_size=40 \ - --ctc_weight_decode=0 + --beam_size=10 \ python run_eval.py \ --source=$SOURCE \ @@ -142,8 +52,7 @@ python run_eval.py \ --device=${DEVICE_ID} \ --batch_size=${BATCH_SIZE} \ --max_eval_samples=-1 \ - --beam_size=40 \ - --ctc_weight_decode=0 + --beam_size=10 \ python run_eval.py \ --source=$SOURCE \ @@ -154,8 +63,7 @@ python run_eval.py \ --device=${DEVICE_ID} \ --batch_size=${BATCH_SIZE} \ --max_eval_samples=-1 \ - --beam_size=40 \ - --ctc_weight_decode=0 + --beam_size=10 \ python run_eval.py \ --source=$SOURCE \ @@ -166,8 +74,7 @@ python run_eval.py \ --device=${DEVICE_ID} \ --batch_size=${BATCH_SIZE} \ --max_eval_samples=-1 \ - --beam_size=40 \ - --ctc_weight_decode=0 + --beam_size=10 \ python run_eval.py \ --source=$SOURCE \ @@ -178,8 +85,7 @@ python run_eval.py \ --device=${DEVICE_ID} \ --batch_size=${BATCH_SIZE} \ --max_eval_samples=-1 \ - --beam_size=40 \ - --ctc_weight_decode=0 + --beam_size=10 \ python run_eval.py \ --source=$SOURCE \ @@ -190,13 +96,10 @@ python run_eval.py \ --device=${DEVICE_ID} \ --batch_size=${BATCH_SIZE} \ --max_eval_samples=-1 \ - --beam_size=40 \ - --ctc_weight_decode=0 - - + --beam_size=10 \ # Evaluate results RUNDIR=`pwd` && \ cd ../normalizer && \ python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ -cd $RUNDIR \ No newline at end of file +cd $RUNDIR diff --git a/speechbrain/run_eval.py b/speechbrain/run_eval.py index 3519c2d..9e840cd 100644 --- a/speechbrain/run_eval.py +++ b/speechbrain/run_eval.py @@ -50,7 +50,7 @@ def get_model( """ run_opt_defaults = { - "device": "cpu", + "device": "cuda", "data_parallel_count": -1, "data_parallel_backend": False, "distributed_launch": False, @@ -59,14 +59,15 @@ def get_model( "precision": "fp16", } - run_opts = {**run_opt_defaults, **kwargs} + run_opts = {**run_opt_defaults} overrides = {} if beam_size: overrides["test_beam_size"] = beam_size - if ctc_weight_decode: - overrides["ctc_weight_decode"] = ctc_weight_decode + if ctc_weight_decode == 0.0: + overrides["scorer"] = None + overrides["ctc_weight_decode"] = ctc_weight_decode kwargs = { "source": f"{speechbrain_repository}", @@ -81,7 +82,7 @@ def get_model( raise AttributeError( f"SpeechBrain Pretrained class: {speechbrain_pretrained_class_name} not found in pretrained.py" ) - + return model_class.from_hparams(**kwargs) @@ -103,17 +104,18 @@ def main(args): def benchmark(batch): # Load audio inputs audios = [torch.from_numpy(sample["array"]) for sample in batch["audio"]] - + minibatch_size = len(audios) audios, audio_lens = batch_pad_right(audios) audios = audios.to(device) audio_lens = audio_lens.to(device) start_time = time.time() - predictions, _ = model.transcribe_batch(audios, audio_lens) + with torch.autocast(device_type="cuda"): + predictions, _ = model.transcribe_batch(audios, audio_lens) runtime = time.time() - start_time - batch["transcription_time_s"] = runtime + batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size] # normalize transcriptions with English normalizer batch["predictions"] = [data_utils.normalizer(pred) for pred in predictions] @@ -244,7 +246,7 @@ def benchmark(batch): parser.add_argument( "--warmup_steps", type=int, - default=5, + default=2, help="Number of warm-up steps to run before launching the timed runs.", ) parser.add_argument( @@ -255,8 +257,8 @@ def benchmark(batch): ) parser.add_argument( "--ctc_weight_decode", - type=int, - default=None, + type=float, + default=0.3, help="Weight of CTC for joint CTC/Att. decoding" ) args = parser.parse_args()