From 294acf89903c258cb0d0ed28a01a5162f87a5c82 Mon Sep 17 00:00:00 2001
From: Titouan Parcollet/Embedded AI /SRUK/Engineer/Samsung Electronics
 <t.parcollet@sruk-ccn4.eu.corp.samsungelectronics.net>
Date: Thu, 6 Feb 2025 17:17:11 +0000
Subject: [PATCH 1/4] speechbrain: evaluate largescaleasr conformer, add beam/CTC decode options

---
 speechbrain/run_conformer.sh | 17 +++++++++++------
 speechbrain/run_eval.py      | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/speechbrain/run_conformer.sh b/speechbrain/run_conformer.sh
index 3369995..a065bb0 100644
--- a/speechbrain/run_conformer.sh
+++ b/speechbrain/run_conformer.sh
@@ -2,27 +2,32 @@
 
 export PYTHONPATH="..":$PYTHONPATH
 
-SOURCE="speechbrain/asr-conformer-transformerlm-librispeech"
+SOURCE="speechbrain/asr-conformer-largescaleasr"
 
+# Run with CTC+Attn
 python run_eval.py \
   --source=$SOURCE \
   --speechbrain_pretrained_class_name="EncoderDecoderASR" \
   --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
   --dataset="librispeech" \
   --split="test.clean" \
-  --device=0 \
+  --device=7 \
   --batch_size=4 \
-  --max_eval_samples=-1
+  --max_eval_samples=-1 \
+  --beam_size=40 \
 
+# Run with Attn only
 python run_eval.py \
   --source=$SOURCE \
   --speechbrain_pretrained_class_name="EncoderDecoderASR" \
   --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
   --dataset="librispeech" \
-  --split="test.other" \
-  --device=0 \
+  --split="test.clean" \
+  --device=7 \
   --batch_size=4 \
-  --max_eval_samples=-1
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+  --ctc_weight_decode=0
 
 # Evaluate results
 RUNDIR=`pwd` && \
diff --git a/speechbrain/run_eval.py b/speechbrain/run_eval.py
index cf43b03..8719ab6 100644
--- a/speechbrain/run_eval.py
+++ b/speechbrain/run_eval.py
@@ -18,6 +18,8 @@
 def get_model(
     speechbrain_repository: str,
     speechbrain_pretrained_class_name: str,
+    beam_size: int,
+    ctc_weight_decode: float,
     **kwargs,
 ):
     """Fetch a pretrained SpeechBrain model from the SpeechBrain 🤗 Hub.
@@ -29,6 +31,10 @@ def get_model(
     speechbrain_pretrained_class_name : str
         The name of the SpeechBrain pretrained class to fetch. E.g. `EncoderASR`.
         See: https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/pretrained/interfaces.py
+    beam_size : int
+        Size of the beam for decoding.
+    ctc_weight_decode : float
+        Weight of the CTC prob for decoding with joint CTC/Attn.
     **kwargs
         Additional keyword arguments to pass to override the default run options of the pretrained model.
 
@@ -54,10 +60,18 @@ def get_model(
 
     run_opts = {**run_opt_defaults, **kwargs}
 
+    overrides = {}
+    if beam_size:
+        overrides["test_beam_size"] = beam_size
+    
+    if ctc_weight_decode:
+        overrides["ctc_weight_decode"] = ctc_weight_decode
+
     kwargs = {
         "source": f"{speechbrain_repository}",
         "savedir": f"pretrained_models/{speechbrain_repository}",
         "run_opts": run_opts,
+        "overrides": overrides,
     }
 
     try:
@@ -78,7 +92,11 @@ def main(args):
         device = f"cuda:{args.device}"
 
     model = get_model(
-        args.source, args.speechbrain_pretrained_class_name, device=device
+        args.source, 
+        args.speechbrain_pretrained_class_name, 
+        args.beam_size,
+        args.ctc_weight_decode, 
+        device=device
     )
 
     def benchmark(batch):
@@ -232,6 +250,18 @@ def benchmark(batch):
         default=5,
         help="Number of warm-up steps to run before launching the timed runs.",
     )
+    parser.add_argument(
+        "--beam_size",
+        type=int,
+        default=None,
+        help="Beam size for decoding"
+    )
+    parser.add_argument(
+        "--ctc_weight_decode",
+        type=int,
+        default=None,
+        help="Weight of CTC for joint CTC/Att. decoding"
+    )
     args = parser.parse_args()
     parser.set_defaults(streaming=True)
 

From 77960f50a005d0919f8c0e281ea7cb9cd3a4f37b Mon Sep 17 00:00:00 2001
From: Titouan Parcollet/Embedded AI /SRUK/Engineer/Samsung Electronics
 <t.parcollet@sruk-ccn4.eu.corp.samsungelectronics.net>
Date: Thu, 6 Feb 2025 19:33:25 +0000
Subject: [PATCH 2/4] speechbrain: run conformer eval on all ESB test sets

---
 speechbrain/run_conformer.sh | 174 ++++++++++++++++++++++++++++++++++-
 1 file changed, 170 insertions(+), 4 deletions(-)

diff --git a/speechbrain/run_conformer.sh b/speechbrain/run_conformer.sh
index a065bb0..fc8aeef 100644
--- a/speechbrain/run_conformer.sh
+++ b/speechbrain/run_conformer.sh
@@ -3,6 +3,8 @@
 export PYTHONPATH="..":$PYTHONPATH
 
 SOURCE="speechbrain/asr-conformer-largescaleasr"
+BATCH_SIZE=8
+DEVICE_ID=0
 
 # Run with CTC+Attn
 python run_eval.py \
@@ -11,11 +13,89 @@ python run_eval.py \
   --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
   --dataset="librispeech" \
   --split="test.clean" \
-  --device=7 \
-  --batch_size=4 \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
   --max_eval_samples=-1 \
   --beam_size=40 \
 
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="librispeech" \
+  --split="test.other" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="ami" \
+  --split="test" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="spgispeech" \
+  --split="test" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="tedlium" \
+  --split="test" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="earnings22" \
+  --split="test" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="gigaspeech" \
+  --split="test" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="voxpopuli" \
+  --split="test" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+
+
 # Run with Attn only
 python run_eval.py \
   --source=$SOURCE \
@@ -23,12 +103,98 @@ python run_eval.py \
   --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
   --dataset="librispeech" \
   --split="test.clean" \
-  --device=7 \
-  --batch_size=4 \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+  --ctc_weight_decode=0
+
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="librispeech" \
+  --split="test.other" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+  --ctc_weight_decode=0
+
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="ami" \
+  --split="test" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+  --ctc_weight_decode=0
+
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="spgispeech" \
+  --split="test" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
   --max_eval_samples=-1 \
   --beam_size=40 \
   --ctc_weight_decode=0
 
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="tedlium" \
+  --split="test" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+  --ctc_weight_decode=0
+
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="earnings22" \
+  --split="test" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+  --ctc_weight_decode=0
+
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="gigaspeech" \
+  --split="test" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+  --ctc_weight_decode=0
+
+python run_eval.py \
+  --source=$SOURCE \
+  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
+  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
+  --dataset="voxpopuli" \
+  --split="test" \
+  --device=${DEVICE_ID} \
+  --batch_size=${BATCH_SIZE} \
+  --max_eval_samples=-1 \
+  --beam_size=40 \
+  --ctc_weight_decode=0
+
+
+
 # Evaluate results
 RUNDIR=`pwd` && \
 cd ../normalizer && \

From ff8a8051fbb3bfc72e03b346449c45cce0a40749 Mon Sep 17 00:00:00 2001
From: Titouan Parcollet <parcollet.titouan@gmail.com>
Date: Fri, 7 Feb 2025 11:39:22 +0000
Subject: [PATCH 3/4] speechbrain: default to fp16 and time only transcribe_batch

---
 speechbrain/run_eval.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/speechbrain/run_eval.py b/speechbrain/run_eval.py
index 8719ab6..3519c2d 100644
--- a/speechbrain/run_eval.py
+++ b/speechbrain/run_eval.py
@@ -56,6 +56,7 @@ def get_model(
         "distributed_launch": False,
         "distributed_backend": "nccl",
         "jit_module_keys": None,
+        "precision": "fp16",
     }
 
     run_opts = {**run_opt_defaults, **kwargs}
@@ -102,21 +103,17 @@ def main(args):
     def benchmark(batch):
         # Load audio inputs
         audios = [torch.from_numpy(sample["array"]) for sample in batch["audio"]]
-        minibatch_size = len(audios)
 
-        # START TIMING
-        start_time = time.time()
 
         audios, audio_lens = batch_pad_right(audios)
         audios = audios.to(device)
         audio_lens = audio_lens.to(device)
+        
+        start_time = time.time()
         predictions, _ = model.transcribe_batch(audios, audio_lens)
-
-        # END TIMING
         runtime = time.time() - start_time
 
-        # normalize by minibatch size since we want the per-sample time
-        batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size]
+        batch["transcription_time_s"] = runtime
 
         # normalize transcriptions with English normalizer
         batch["predictions"] = [data_utils.normalizer(pred) for pred in predictions]

From 8fb9ad9f100de03098122c2a61e0c407e319c517 Mon Sep 17 00:00:00 2001
From: Titouan Parcollet <parcollet.titouan@gmail.com>
Date: Fri, 7 Feb 2025 17:20:17 +0000
Subject: [PATCH 4/4] speechbrain: beam 10, float ctc_weight_decode, per-sample timing under autocast

---
 speechbrain/run_conformer.sh | 119 ++++-------------------------------
 speechbrain/run_eval.py      |  24 +++----
 2 files changed, 24 insertions(+), 119 deletions(-)

diff --git a/speechbrain/run_conformer.sh b/speechbrain/run_conformer.sh
index fc8aeef..553ff4e 100644
--- a/speechbrain/run_conformer.sh
+++ b/speechbrain/run_conformer.sh
@@ -3,7 +3,7 @@
 export PYTHONPATH="..":$PYTHONPATH
 
 SOURCE="speechbrain/asr-conformer-largescaleasr"
-BATCH_SIZE=8
+BATCH_SIZE=32
 DEVICE_ID=0
 
 # Run with CTC+Attn
@@ -16,97 +16,8 @@ python run_eval.py \
   --device=${DEVICE_ID} \
   --batch_size=${BATCH_SIZE} \
   --max_eval_samples=-1 \
-  --beam_size=40 \
-
-python run_eval.py \
-  --source=$SOURCE \
-  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
-  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
-  --dataset="librispeech" \
-  --split="test.other" \
-  --device=${DEVICE_ID} \
-  --batch_size=${BATCH_SIZE} \
-  --max_eval_samples=-1 \
-  --beam_size=40 \
-
-python run_eval.py \
-  --source=$SOURCE \
-  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
-  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
-  --dataset="ami" \
-  --split="test" \
-  --device=${DEVICE_ID} \
-  --batch_size=${BATCH_SIZE} \
-  --max_eval_samples=-1 \
-  --beam_size=40 \
-
-python run_eval.py \
-  --source=$SOURCE \
-  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
-  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
-  --dataset="spgispeech" \
-  --split="test" \
-  --device=${DEVICE_ID} \
-  --batch_size=${BATCH_SIZE} \
-  --max_eval_samples=-1 \
-  --beam_size=40 \
-
-python run_eval.py \
-  --source=$SOURCE \
-  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
-  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
-  --dataset="tedlium" \
-  --split="test" \
-  --device=${DEVICE_ID} \
-  --batch_size=${BATCH_SIZE} \
-  --max_eval_samples=-1 \
-  --beam_size=40 \
-
-python run_eval.py \
-  --source=$SOURCE \
-  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
-  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
-  --dataset="earnings22" \
-  --split="test" \
-  --device=${DEVICE_ID} \
-  --batch_size=${BATCH_SIZE} \
-  --max_eval_samples=-1 \
-  --beam_size=40 \
-
-python run_eval.py \
-  --source=$SOURCE \
-  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
-  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
-  --dataset="gigaspeech" \
-  --split="test" \
-  --device=${DEVICE_ID} \
-  --batch_size=${BATCH_SIZE} \
-  --max_eval_samples=-1 \
-  --beam_size=40 \
-
-python run_eval.py \
-  --source=$SOURCE \
-  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
-  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
-  --dataset="voxpopuli" \
-  --split="test" \
-  --device=${DEVICE_ID} \
-  --batch_size=${BATCH_SIZE} \
-  --max_eval_samples=-1 \
-  --beam_size=40 \
-
-
-# Run with Attn only
-python run_eval.py \
-  --source=$SOURCE \
-  --speechbrain_pretrained_class_name="EncoderDecoderASR" \
-  --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
-  --dataset="librispeech" \
-  --split="test.clean" \
-  --device=${DEVICE_ID} \
-  --batch_size=${BATCH_SIZE} \
-  --max_eval_samples=-1 \
-  --beam_size=40 \
+  --no-streaming \
+  --beam_size=10 \
   --ctc_weight_decode=0
 
 python run_eval.py \
@@ -118,7 +29,7 @@ python run_eval.py \
   --device=${DEVICE_ID} \
   --batch_size=${BATCH_SIZE} \
   --max_eval_samples=-1 \
-  --beam_size=40 \
+  --beam_size=10 \
   --ctc_weight_decode=0
 
 python run_eval.py \
@@ -130,8 +41,7 @@ python run_eval.py \
   --device=${DEVICE_ID} \
   --batch_size=${BATCH_SIZE} \
   --max_eval_samples=-1 \
-  --beam_size=40 \
-  --ctc_weight_decode=0
+  --beam_size=10 \
 
 python run_eval.py \
   --source=$SOURCE \
@@ -142,8 +52,7 @@ python run_eval.py \
   --device=${DEVICE_ID} \
   --batch_size=${BATCH_SIZE} \
   --max_eval_samples=-1 \
-  --beam_size=40 \
-  --ctc_weight_decode=0
+  --beam_size=10 \
 
 python run_eval.py \
   --source=$SOURCE \
@@ -154,8 +63,7 @@ python run_eval.py \
   --device=${DEVICE_ID} \
   --batch_size=${BATCH_SIZE} \
   --max_eval_samples=-1 \
-  --beam_size=40 \
-  --ctc_weight_decode=0
+  --beam_size=10 \
 
 python run_eval.py \
   --source=$SOURCE \
@@ -166,8 +74,7 @@ python run_eval.py \
   --device=${DEVICE_ID} \
   --batch_size=${BATCH_SIZE} \
   --max_eval_samples=-1 \
-  --beam_size=40 \
-  --ctc_weight_decode=0
+  --beam_size=10 \
 
 python run_eval.py \
   --source=$SOURCE \
@@ -178,8 +85,7 @@ python run_eval.py \
   --device=${DEVICE_ID} \
   --batch_size=${BATCH_SIZE} \
   --max_eval_samples=-1 \
-  --beam_size=40 \
-  --ctc_weight_decode=0
+  --beam_size=10 \
 
 python run_eval.py \
   --source=$SOURCE \
@@ -190,13 +96,10 @@ python run_eval.py \
   --device=${DEVICE_ID} \
   --batch_size=${BATCH_SIZE} \
   --max_eval_samples=-1 \
-  --beam_size=40 \
-  --ctc_weight_decode=0
-
-
+  --beam_size=10 \
 
 # Evaluate results
 RUNDIR=`pwd` && \
 cd ../normalizer && \
 python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
-cd $RUNDIR
\ No newline at end of file
+cd $RUNDIR
diff --git a/speechbrain/run_eval.py b/speechbrain/run_eval.py
index 3519c2d..9e840cd 100644
--- a/speechbrain/run_eval.py
+++ b/speechbrain/run_eval.py
@@ -50,7 +50,7 @@ def get_model(
     """
 
     run_opt_defaults = {
-        "device": "cpu",
+        "device": "cuda",
         "data_parallel_count": -1,
         "data_parallel_backend": False,
         "distributed_launch": False,
@@ -59,14 +59,15 @@ def get_model(
         "precision": "fp16",
     }
 
-    run_opts = {**run_opt_defaults, **kwargs}
+    run_opts = {**run_opt_defaults}
 
     overrides = {}
     if beam_size:
         overrides["test_beam_size"] = beam_size
     
-    if ctc_weight_decode:
-        overrides["ctc_weight_decode"] = ctc_weight_decode
+    if ctc_weight_decode == 0.0:
+        overrides["scorer"] = None
+    overrides["ctc_weight_decode"] = ctc_weight_decode
 
     kwargs = {
         "source": f"{speechbrain_repository}",
@@ -81,7 +82,7 @@ def get_model(
         raise AttributeError(
             f"SpeechBrain Pretrained class: {speechbrain_pretrained_class_name} not found in pretrained.py"
         )
-
+    
     return model_class.from_hparams(**kwargs)
 
 
@@ -103,17 +104,18 @@ def main(args):
     def benchmark(batch):
         # Load audio inputs
         audios = [torch.from_numpy(sample["array"]) for sample in batch["audio"]]
-
+        minibatch_size = len(audios)
 
         audios, audio_lens = batch_pad_right(audios)
         audios = audios.to(device)
         audio_lens = audio_lens.to(device)
         
         start_time = time.time()
-        predictions, _ = model.transcribe_batch(audios, audio_lens)
+        with torch.autocast(device_type="cuda"):
+            predictions, _ = model.transcribe_batch(audios, audio_lens)
         runtime = time.time() - start_time
 
-        batch["transcription_time_s"] = runtime
+        batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size]
 
         # normalize transcriptions with English normalizer
         batch["predictions"] = [data_utils.normalizer(pred) for pred in predictions]
@@ -244,7 +246,7 @@ def benchmark(batch):
     parser.add_argument(
         "--warmup_steps",
         type=int,
-        default=5,
+        default=2,
         help="Number of warm-up steps to run before launching the timed runs.",
     )
     parser.add_argument(
@@ -255,8 +257,8 @@ def benchmark(batch):
     )
     parser.add_argument(
         "--ctc_weight_decode",
-        type=int,
-        default=None,
+        type=float,
+        default=0.3,
         help="Weight of CTC for joint CTC/Att. decoding"
     )
     args = parser.parse_args()