Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding largescale ASR model for speechbrain #49

Merged
merged 4 commits into from
Feb 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 82 additions & 8 deletions speechbrain/run_conformer.sh
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you are using the same model but different settings, I don't think this would play well when HF outputs results, as the files are saved based on model_name and eval data. Please run on your machine to verify this. Otherwise, the rest all LGTM.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I've played a bit with a local A100, and the decoding seems too slow compared to what I would have expected (versus what we see within SB). I'll investigate a bit.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes please don't run any test - there is something fishy. I'll investigate.

Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,104 @@

# Benchmark the SpeechBrain LargeScaleASR conformer on the ESB test sets,
# then score the accumulated results with the shared normalizer utilities.
export PYTHONPATH="..":$PYTHONPATH

SOURCE="speechbrain/asr-conformer-largescaleasr"
BATCH_SIZE=32
DEVICE_ID=0
# MODEL_ID was previously expected from the environment; fall back to the
# model source so score_results never receives an empty identifier.
MODEL_ID=${MODEL_ID:-$SOURCE}

# LibriSpeech test.clean — joint CTC+Attn disabled (ctc_weight_decode=0),
# attention-only beam search with beam size 10.
python run_eval.py \
--source=$SOURCE \
--speechbrain_pretrained_class_name="EncoderDecoderASR" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="librispeech" \
--split="test.clean" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--no-streaming \
--beam_size=10 \
--ctc_weight_decode=0

# LibriSpeech test.other
python run_eval.py \
--source=$SOURCE \
--speechbrain_pretrained_class_name="EncoderDecoderASR" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="librispeech" \
--split="test.other" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--beam_size=10 \
--ctc_weight_decode=0

# Remaining ESB test sets use the default ctc_weight_decode.
# NOTE: the final argument no longer carries a trailing backslash — the
# previous trailing "\" continued each command onto the following blank line.
python run_eval.py \
--source=$SOURCE \
--speechbrain_pretrained_class_name="EncoderDecoderASR" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="ami" \
--split="test" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--beam_size=10

python run_eval.py \
--source=$SOURCE \
--speechbrain_pretrained_class_name="EncoderDecoderASR" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="spgispeech" \
--split="test" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--beam_size=10

python run_eval.py \
--source=$SOURCE \
--speechbrain_pretrained_class_name="EncoderDecoderASR" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="tedlium" \
--split="test" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--beam_size=10

python run_eval.py \
--source=$SOURCE \
--speechbrain_pretrained_class_name="EncoderDecoderASR" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="earnings22" \
--split="test" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--beam_size=10

python run_eval.py \
--source=$SOURCE \
--speechbrain_pretrained_class_name="EncoderDecoderASR" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="gigaspeech" \
--split="test" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--beam_size=10

python run_eval.py \
--source=$SOURCE \
--speechbrain_pretrained_class_name="EncoderDecoderASR" \
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
--dataset="voxpopuli" \
--split="test" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--beam_size=10

# Evaluate results
RUNDIR=`pwd` && \
cd ../normalizer && \
python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
cd $RUNDIR
53 changes: 41 additions & 12 deletions speechbrain/run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
def get_model(
speechbrain_repository: str,
speechbrain_pretrained_class_name: str,
beam_size: int,
ctc_weight_decode: float,
**kwargs,
):
"""Fetch a pretrained SpeechBrain model from the SpeechBrain 🤗 Hub.
Expand All @@ -29,6 +31,10 @@ def get_model(
speechbrain_pretrained_class_name : str
The name of the SpeechBrain pretrained class to fetch. E.g. `EncoderASR`.
See: https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/pretrained/interfaces.py
beam_size : int
Size of the beam for decoding.
ctc_weight_decode : float
Weight of the CTC prob for decoding with joint CTC/Attn.
**kwargs
Additional keyword arguments to pass to override the default run options of the pretrained model.

Expand All @@ -44,20 +50,30 @@ def get_model(
"""

run_opt_defaults = {
"device": "cpu",
"device": "cuda",
"data_parallel_count": -1,
"data_parallel_backend": False,
"distributed_launch": False,
"distributed_backend": "nccl",
"jit_module_keys": None,
"precision": "fp16",
}

run_opts = {**run_opt_defaults, **kwargs}
run_opts = {**run_opt_defaults}

overrides = {}
if beam_size:
overrides["test_beam_size"] = beam_size

if ctc_weight_decode == 0.0:
overrides["scorer"] = None
overrides["ctc_weight_decode"] = ctc_weight_decode

kwargs = {
"source": f"{speechbrain_repository}",
"savedir": f"pretrained_models/{speechbrain_repository}",
"run_opts": run_opts,
"overrides": overrides,
}

try:
Expand All @@ -66,7 +82,7 @@ def get_model(
raise AttributeError(
f"SpeechBrain Pretrained class: {speechbrain_pretrained_class_name} not found in pretrained.py"
)

return model_class.from_hparams(**kwargs)


Expand All @@ -78,26 +94,27 @@ def main(args):
device = f"cuda:{args.device}"

model = get_model(
args.source, args.speechbrain_pretrained_class_name, device=device
args.source,
args.speechbrain_pretrained_class_name,
args.beam_size,
args.ctc_weight_decode,
device=device
)

def benchmark(batch):
# Load audio inputs
audios = [torch.from_numpy(sample["array"]) for sample in batch["audio"]]
minibatch_size = len(audios)

# START TIMING
start_time = time.time()

audios, audio_lens = batch_pad_right(audios)
audios = audios.to(device)
audio_lens = audio_lens.to(device)
predictions, _ = model.transcribe_batch(audios, audio_lens)

# END TIMING

start_time = time.time()
with torch.autocast(device_type="cuda"):
predictions, _ = model.transcribe_batch(audios, audio_lens)
runtime = time.time() - start_time

# normalize by minibatch size since we want the per-sample time
batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size]

# normalize transcriptions with English normalizer
Expand Down Expand Up @@ -229,9 +246,21 @@ def benchmark(batch):
parser.add_argument(
"--warmup_steps",
type=int,
default=5,
default=2,
help="Number of warm-up steps to run before launching the timed runs.",
)
parser.add_argument(
"--beam_size",
type=int,
default=None,
help="Beam size for decoding"
)
parser.add_argument(
"--ctc_weight_decode",
type=float,
default=0.3,
help="Weight of CTC for joint CTC/Att. decoding"
)
args = parser.parse_args()
parser.set_defaults(streaming=True)

Expand Down