diff --git a/speechbrain/run_conformer.sh b/speechbrain/run_conformer.sh index 3369995..553ff4e 100644 --- a/speechbrain/run_conformer.sh +++ b/speechbrain/run_conformer.sh @@ -2,17 +2,23 @@ export PYTHONPATH="..":$PYTHONPATH -SOURCE="speechbrain/asr-conformer-transformerlm-librispeech" +SOURCE="speechbrain/asr-conformer-largescaleasr" +BATCH_SIZE=32 +DEVICE_ID=0 +# Run with CTC+Attn python run_eval.py \ --source=$SOURCE \ --speechbrain_pretrained_class_name="EncoderDecoderASR" \ --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ --dataset="librispeech" \ --split="test.clean" \ - --device=0 \ - --batch_size=4 \ - --max_eval_samples=-1 + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --no-streaming \ + --beam_size=10 \ + --ctc_weight_decode=0 python run_eval.py \ --source=$SOURCE \ @@ -20,12 +26,80 @@ python run_eval.py \ --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ --dataset="librispeech" \ --split="test.other" \ - --device=0 \ - --batch_size=4 \ - --max_eval_samples=-1 + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + --ctc_weight_decode=0 + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="ami" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="spgispeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="tedlium" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="earnings22" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="gigaspeech" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ + +python run_eval.py \ + --source=$SOURCE \ + --speechbrain_pretrained_class_name="EncoderDecoderASR" \ + --dataset_path="hf-audio/esb-datasets-test-only-sorted" \ + --dataset="voxpopuli" \ + --split="test" \ + --device=${DEVICE_ID} \ + --batch_size=${BATCH_SIZE} \ + --max_eval_samples=-1 \ + --beam_size=10 \ # Evaluate results RUNDIR=`pwd` && \ cd ../normalizer && \ python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \ -cd $RUNDIR \ No newline at end of file +cd $RUNDIR diff --git a/speechbrain/run_eval.py b/speechbrain/run_eval.py index cf43b03..9e840cd 100644 --- a/speechbrain/run_eval.py +++ b/speechbrain/run_eval.py @@ -18,6 +18,8 @@ def get_model( speechbrain_repository: str, speechbrain_pretrained_class_name: str, + beam_size: int, + ctc_weight_decode: float, **kwargs, ): """Fetch a pretrained SpeechBrain model from the SpeechBrain 🤗 Hub. @@ -29,6 +31,10 @@ def get_model( speechbrain_pretrained_class_name : str The name of the SpeechBrain pretrained class to fetch. E.g. `EncoderASR`. See: https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/pretrained/interfaces.py + beam_size : int + Size of the beam for decoding. + ctc_weight_decode : float + Weight of the CTC prob for decoding with joint CTC/Attn. **kwargs Additional keyword arguments to pass to override the default run options of the pretrained model. @@ -44,20 +50,30 @@ def get_model( """ run_opt_defaults = { - "device": "cpu", + "device": "cuda", "data_parallel_count": -1, "data_parallel_backend": False, "distributed_launch": False, "distributed_backend": "nccl", "jit_module_keys": None, + "precision": "fp16", } - run_opts = {**run_opt_defaults, **kwargs} + run_opts = {**run_opt_defaults} + + overrides = {} + if beam_size: + overrides["test_beam_size"] = beam_size + + if ctc_weight_decode == 0.0: + overrides["scorer"] = None + overrides["ctc_weight_decode"] = ctc_weight_decode kwargs = { "source": f"{speechbrain_repository}", "savedir": f"pretrained_models/{speechbrain_repository}", "run_opts": run_opts, + "overrides": overrides, } try: @@ -66,7 +82,7 @@ def get_model( raise AttributeError( f"SpeechBrain Pretrained class: {speechbrain_pretrained_class_name} not found in pretrained.py" ) - + return model_class.from_hparams(**kwargs) @@ -78,7 +94,11 @@ def main(args): device = f"cuda:{args.device}" model = get_model( - args.source, args.speechbrain_pretrained_class_name, device=device + args.source, + args.speechbrain_pretrained_class_name, + args.beam_size, + args.ctc_weight_decode, + device=device ) def benchmark(batch): @@ -86,18 +106,15 @@ def benchmark(batch): audios = [torch.from_numpy(sample["array"]) for sample in batch["audio"]] minibatch_size = len(audios) - # START TIMING - start_time = time.time() - audios, audio_lens = batch_pad_right(audios) audios = audios.to(device) audio_lens = audio_lens.to(device) - predictions, _ = model.transcribe_batch(audios, audio_lens) - - # END TIMING + + start_time = time.time() + with torch.autocast(device_type="cuda"): + predictions, _ = model.transcribe_batch(audios, audio_lens) runtime = time.time() - start_time - # normalize by minibatch size since we want the per-sample time batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size] # normalize transcriptions with English normalizer @@ -229,9 +246,21 @@ def benchmark(batch): parser.add_argument( "--warmup_steps", type=int, - default=5, + default=2, help="Number of warm-up steps to run before launching the timed runs.", ) + parser.add_argument( + "--beam_size", + type=int, + default=None, + help="Beam size for decoding" + ) + parser.add_argument( + "--ctc_weight_decode", + type=float, + default=0.3, + help="Weight of CTC for joint CTC/Att. decoding" + ) args = parser.parse_args() parser.set_defaults(streaming=True)