open-asr-leaderboard-speechbrain

Running

App Files Files Community

bezzam HF Staff committed 7 days ago

Commit

3cf412c

verified ·

1 Parent(s): a9dd49d

Update run_eval.py

Browse files

Files changed (1) hide show

run_eval.py +201 -163

run_eval.py CHANGED Viewed

@@ -1,207 +1,224 @@
 import argparse
-import io
-import os
-import torch
 import evaluate
-import soundfile
-from tqdm import tqdm
 from normalizer import data_utils
-import numpy as np
-from nemo.collections.asr.models import ASRModel
-import time
-wer_metric = evaluate.load("wer")
 def main(args):  # Evaluate a NeMo ASR model on one dataset split: cache audio as WAV, transcribe, then report WER and RTFX.
-    data_cache_root = args.data_cache_root if args.data_cache_root is not None else os.getcwd()
-    DATA_CACHE_DIR = os.path.join(data_cache_root, "audio_cache")
-    DATASET_NAME = args.dataset
-    SPLIT_NAME = args.split
-    CACHE_DIR = os.path.join(DATA_CACHE_DIR, DATASET_NAME, SPLIT_NAME)  # <root>/audio_cache/<dataset>/<split>
-    if not os.path.exists(CACHE_DIR):
-        os.makedirs(CACHE_DIR)
-    if args.device >= 0:
-        device = torch.device(f"cuda:{args.device}")
-        compute_dtype=torch.bfloat16  # GPU runs cast the model to bfloat16 below
     else:
-        device = torch.device("cpu")
-        compute_dtype=torch.float32
-    if args.model_id.endswith(".nemo"):  # a path ending in .nemo is restored from a local checkpoint file
-        asr_model = ASRModel.restore_from(args.model_id, map_location=device)
-    else:
-        asr_model = ASRModel.from_pretrained(args.model_id, map_location=device)  # type: ASRModel
-    asr_model.to(compute_dtype)
-    asr_model.eval()
-    print(f"Model size: {sum(p.numel() for p in asr_model.parameters()) / 1e9:.2f}B parameters")
-    dataset = data_utils.load_data(args)
-    if args.max_eval_samples is not None and args.max_eval_samples > 0:
-        print(f"Subsampling dataset to first {args.max_eval_samples} samples !")
-        dataset = dataset.take(args.max_eval_samples)
-    # Prepare data FIRST - this casts audio to proper format with "array" and "sampling_rate" keys
-    dataset = data_utils.prepare_data(dataset)
-    def download_audio_files(batch):
-        # Write each sample's audio to a WAV file under CACHE_DIR and add path, reference and duration columns to the batch
-        audio_paths = []
-        original_audio_paths = []
-        durations = []
-        file_names = batch.get("file_name", [None] * len(batch["audio"]))
-        # Use 'id' column if available, otherwise generate sequential IDs
-        if "id" in batch:
-            ids = batch["id"]
         else:
-            # Generate IDs based on index
-            start_idx = len([f for f in os.listdir(CACHE_DIR) if f.endswith('.wav')]) if os.path.exists(CACHE_DIR) else 0  # offset by the number of .wav files already in CACHE_DIR
-            ids = [f"sample_{start_idx + i}" for i in range(len(batch["audio"]))]
-        for id, file_name, audio_sample in zip(ids, file_names, batch["audio"]):
-            # first step added here to make ID and wav filenames unique
-            # several datasets like earnings22 have a hierarchical structure
-            # for eg. earnings22/test/4432298/281.wav, earnings22/test/4450488/281.wav
-            # lhotse uses the filename (281.wav) here as unique ID to create and name cuts
-            # ref: https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/collation.py#L186
-            original_id = id  # preserve before sanitization for use as audio_filepath
-            id = id.replace('/', '_').removesuffix('.wav')
-            audio_path = os.path.join(CACHE_DIR, f"{id}.wav")
-            audio_array = np.float32(audio_sample["array"])
-            sample_rate = audio_sample["sampling_rate"]
-            if not os.path.exists(audio_path):  # reuse a previously cached file instead of rewriting it
-                os.makedirs(os.path.dirname(audio_path), exist_ok=True)
-                soundfile.write(audio_path, audio_array, sample_rate)
-            audio_paths.append(audio_path)
-            # Prefer the original file_name from the dataset; fall back to the
-            # sample id (before path-sanitization) so audio_filepath in the
-            # JSONL is always a meaningful identifier rather than "sample_N".
-            if file_name is not None:
-                original_audio_paths.append(os.path.basename(str(file_name)))
-            else:
-                original_audio_paths.append(original_id)
-            durations.append(len(audio_array) / sample_rate)  # duration in seconds
-        batch["references"] = batch["norm_text"]
-        batch["audio_filepaths"] = audio_paths
-        batch["original_audio_filepaths"] = original_audio_paths
-        batch["durations"] = durations
-        return batch
-    if asr_model.cfg.decoding.strategy != "beam":  # keep beam search if configured; otherwise switch to batched greedy decoding
-        asr_model.cfg.decoding.strategy = "greedy_batch"
-        asr_model.change_decoding_strategy(asr_model.cfg.decoding)
-    # preparing the offline dataset
-    dataset = dataset.map(download_audio_files, batch_size=args.batch_size, batched=True, remove_columns=["audio"])
-    # Collect cached paths, original file names, durations and references from the mapped dataset
-    all_data = {
-        "audio_filepaths": [],
-        "original_audio_filepaths": [],
-        "durations": [],
         "references": [],
     }
-    data_itr = iter(dataset)
-    for data in tqdm(data_itr, desc="Downloading Samples"):
-        for key in all_data:
-            all_data[key].append(data[key])
-    # Sort audio_filepaths and references by duration, longest first
-    sorted_indices = sorted(range(len(all_data["durations"])), key=lambda k: all_data["durations"][k], reverse=True)
-    all_data["audio_filepaths"] = [all_data["audio_filepaths"][i] for i in sorted_indices]
-    all_data["original_audio_filepaths"] = [all_data["original_audio_filepaths"][i] for i in sorted_indices]
-    all_data["references"] = [all_data["references"][i] for i in sorted_indices]
-    all_data["durations"] = [all_data["durations"][i] for i in sorted_indices]
-    total_time = 0
-    for _ in range(2): # pass 0 warms up on the first 4 batches; pass 1 is the timed run over all files
-        if _ == 0:
-            audio_files = all_data["audio_filepaths"][:args.batch_size * 4] # warmup with 4 batches
-        else:
-            audio_files = all_data["audio_filepaths"]
-        start_time = time.time()
-        with torch.inference_mode(), torch.no_grad():
-            if 'canary' in args.model_id and 'v2' not in args.model_id:
-                pnc = 'nopnc'
-            else:
-                pnc = 'pnc'
-            if 'canary' in args.model_id:  # only model ids containing 'canary' are passed the pnc argument
-                transcriptions = asr_model.transcribe(audio_files, batch_size=args.batch_size, verbose=False, pnc=pnc, num_workers=1)
-            else:
-                transcriptions = asr_model.transcribe(audio_files, batch_size=args.batch_size, verbose=False, num_workers=1)
-        end_time = time.time()
-        if _ == 1:  # only the second (full) pass counts toward the reported time
-            total_time += end_time - start_time
-    total_time = total_time  # NOTE(review): no-op self-assignment
-    # normalize transcriptions with English normalizer
-    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:  # when transcribe() returns a 2-tuple, only its first element is used
-        transcriptions = transcriptions[0]
-    predictions = [data_utils.normalizer(pred.text) for pred in transcriptions]
-    avg_time = total_time / len(all_data["audio_filepaths"])  # mean time per file; individual files are not timed
     # Write manifest results (WER and RTFX)
     manifest_path = data_utils.write_manifest(
-        all_data["references"],
-        predictions,
-        args.model_id,
         args.dataset_path,
         args.dataset,
         args.split,
-        audio_length=all_data["durations"],
-        transcription_time=[avg_time] * len(all_data["audio_filepaths"]),
-        audio_filepaths=all_data["original_audio_filepaths"],
     )
     print("Results saved at path:", os.path.abspath(manifest_path))
-    wer = wer_metric.compute(references=all_data['references'], predictions=predictions)
     wer = round(100 * wer, 2)
-    # transcription_time = sum(all_results["transcription_time"])
-    audio_length = sum(all_data["durations"])
-    rtfx = audio_length / total_time  # seconds of audio per second of timed transcription
-    rtfx = round(rtfx, 2)
-    print("RTFX:", rtfx)
-    print("WER:", wer, "%")
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with NVIDIA NeMo.",
     )
     parser.add_argument(
-        '--dataset_path', type=str, default='hf-audio/open-asr-leaderboard', help='Dataset path. By default, it is `hf-audio/open-asr-leaderboard`'
     )
     parser.add_argument(
-        '--data_cache_root', type=str, default=None, help='Root directory for audio cache. By default, it is the current working directory.'
     )
     parser.add_argument(
         "--dataset",
@@ -223,7 +240,10 @@ if __name__ == "__main__":
         help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
     )
     parser.add_argument(
-        "--batch_size", type=int, default=32, help="Number of samples to go through each streamed batch.",
     )
     parser.add_argument(
         "--max_eval_samples",
@@ -236,6 +256,24 @@ if __name__ == "__main__":
         action="store_true",
         help="Stream the dataset lazily over the network instead of downloading it in full before the evaluation. Off by default for reproducible benchmark timings.",
     )
     args = parser.parse_args()
     main(args)

+"""Script to evaluate a pretrained SpeechBrain model from the 🤗 Hub.
+Authors
+* Adel Moumen 2023 <adel.moumen@univ-avignon.fr>
+* Sanchit Gandhi 2024 <sanchit@huggingface.co>
+"""
 import argparse
+import time
 import evaluate
 from normalizer import data_utils
+from tqdm import tqdm
+import torch
+import speechbrain.inference.ASR as ASR
+from speechbrain.utils.data_utils import batch_pad_right
+import os
+def get_model(
+    speechbrain_repository: str,
+    speechbrain_pretrained_class_name: str,
+    beam_size: int,
+    ctc_weight_decode: float,
+    **kwargs,
+):
+    """Fetch a pretrained SpeechBrain model from the SpeechBrain 🤗 Hub.
+    Arguments
+    ---------
+    speechbrain_repository : str
+        The name of the SpeechBrain repository to fetch the pretrained model from. E.g. `asr-crdnn-rnnlm-librispeech`.
+    speechbrain_pretrained_class_name : str
+        The name of the SpeechBrain pretrained class to fetch. E.g. `EncoderASR`.
+        The class is looked up as an attribute of `speechbrain.inference.ASR`.
+    beam_size : int
+        Size of the beam for decoding. A falsy value (`None` or 0) leaves the
+        model's own `test_beam_size` untouched.
+    ctc_weight_decode : float
+        Weight of the CTC prob for decoding with joint CTC/Attn. `None` leaves
+        the model's own value untouched; `0.0` additionally overrides `scorer`
+        to null.
+    **kwargs
+        Additional keyword arguments to pass to override the default run options of the pretrained model
+        (e.g. `device="cuda:0"`). They take precedence over the defaults below.
+    Returns
+    -------
+    SpeechBrain pretrained model
+        The Pretrained model.
+    Raises
+    ------
+    AttributeError
+        If `speechbrain_pretrained_class_name` is not defined in `speechbrain.inference.ASR`.
+    Example
+    -------
+    >>> from open_asr_leaderboard.speechbrain.run_eval import get_model
+    >>> model = get_model("asr-crdnn-rnnlm-librispeech", "EncoderASR", None, None, device="cuda:0")
+    """
+    run_opt_defaults = {
+        "device": "cuda",
+        "data_parallel_count": -1,
+        "data_parallel_backend": False,
+        "distributed_launch": False,
+        "distributed_backend": "nccl",
+        "jit_module_keys": None,
+        "precision": "fp16",
+    }
+    # Caller-supplied run options (such as the device chosen on the command
+    # line) must win over the defaults; previously they were silently dropped.
+    run_opts = {**run_opt_defaults, **kwargs}
+    # Build overrides as a YAML string so hyperpyyaml applies them during
+    # parsing (before class imports), preventing ImportError for missing classes.
+    override_lines = []
+    if ctc_weight_decode == 0.0:
+        # With a zero CTC weight the `scorer` entry is overridden to null.
+        override_lines.append("scorer: null")
+    if beam_size:
+        override_lines.append(f"test_beam_size: {beam_size}")
+    if ctc_weight_decode is not None:
+        override_lines.append(f"ctc_weight_decode: {ctc_weight_decode}")
+    # Kept under its own name so the run-option kwargs above are not shadowed.
+    hparams_kwargs = {
+        "source": f"{speechbrain_repository}",
+        "savedir": f"pretrained_models/{speechbrain_repository}",
+        "run_opts": run_opts,
+    }
+    if override_lines:
+        hparams_kwargs["overrides"] = "\n".join(override_lines)
+    try:
+        model_class = getattr(ASR, speechbrain_pretrained_class_name)
+    except AttributeError as err:
+        raise AttributeError(
+            f"SpeechBrain Pretrained class: {speechbrain_pretrained_class_name} not found in speechbrain.inference.ASR"
+        ) from err
+    return model_class.from_hparams(**hparams_kwargs)
 def main(args):
+    """Run the evaluation script.
+    Loads the SpeechBrain model named by `args.source`, optionally runs
+    warm-up batches, transcribes the requested dataset split, writes a
+    results manifest and prints the WER and RTFx.
+    Arguments
+    ---------
+    args : argparse.Namespace
+        Parsed command-line arguments; see the parser at the bottom of this file.
+    """
+    if args.device == -1:
+        device = "cpu"
     else:
+        device = f"cuda:{args.device}"
+    # Autocast below targets CUDA; keep it explicitly disabled on CPU runs
+    # instead of requesting CUDA autocast on a machine that may have no GPU.
+    use_autocast = args.device != -1
+    model = get_model(
+        args.source,
+        args.speechbrain_pretrained_class_name,
+        args.beam_size,
+        args.ctc_weight_decode,
+        device=device,
+    )
+    print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters")
+    def benchmark(batch):
+        """Transcribe one batch and attach predictions, references, durations and timings."""
+        # Load audio inputs
+        samples = batch["audio"]
+        audios = [torch.from_numpy(sample["array"]) for sample in samples]
+        minibatch_size = len(audios)
+        # Each duration uses that sample's own sampling rate rather than
+        # assuming the first sample's rate applies to the whole batch.
+        batch["audio_length_s"] = [len(sample["array"]) / sample["sampling_rate"] for sample in samples]
+        batch["audio_filepath"] = data_utils.extract_audio_filepaths_from_batch(batch, minibatch_size)
+        audios, audio_lens = batch_pad_right(audios)
+        audios = audios.to(device)
+        audio_lens = audio_lens.to(device)
+        # perf_counter is monotonic, so the interval cannot be skewed by wall-clock adjustments.
+        start_time = time.perf_counter()
+        with torch.autocast(device_type="cuda", enabled=use_autocast):
+            predictions, _ = model.transcribe_batch(audios, audio_lens)
+        runtime = time.perf_counter() - start_time
+        # The batch runtime is split evenly across its samples.
+        batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size]
+        # normalize transcriptions with English normalizer
+        batch["predictions"] = [data_utils.normalizer(pred) for pred in predictions]
+        batch["references"] = batch["norm_text"]
+        return batch
+    # Skip the warm-up (and its extra dataset load) when no steps are requested.
+    if args.warmup_steps is not None and args.warmup_steps > 0:
+        dataset = data_utils.load_data(args)
+        dataset = data_utils.prepare_data(dataset)
+        num_warmup_samples = args.warmup_steps * args.batch_size
+        if args.streaming:
+            warmup_dataset = dataset.take(num_warmup_samples)
         else:
+            warmup_dataset = dataset.select(range(min(num_warmup_samples, len(dataset))))
+        warmup_dataset = iter(warmup_dataset.map(benchmark, batch_size=args.batch_size, batched=True))
+        # Results are discarded; iterating only forces the warm-up batches to run.
+        for _ in tqdm(warmup_dataset, desc="Warming up..."):
+            continue
+    dataset = data_utils.load_data(args)
+    if args.max_eval_samples is not None and args.max_eval_samples > 0:
+        print(f"Subsampling dataset to first {args.max_eval_samples} samples!")
+        if args.streaming:
+            dataset = dataset.take(args.max_eval_samples)
+        else:
+            dataset = dataset.select(range(min(args.max_eval_samples, len(dataset))))
+    dataset = data_utils.prepare_data(dataset)
+    dataset = dataset.map(
+        benchmark, batch_size=args.batch_size, batched=True, remove_columns=["audio"],
+    )
+    all_results = {
+        "audio_length_s": [],
+        "transcription_time_s": [],
+        "predictions": [],
         "references": [],
+        "audio_filepath": [],
     }
+    result_iter = iter(dataset)
+    for result in tqdm(result_iter, desc="Samples..."):
+        for key in all_results:
+            all_results[key].append(result[key])
     # Write manifest results (WER and RTFX)
     manifest_path = data_utils.write_manifest(
+        all_results["references"],
+        all_results["predictions"],
+        args.source,
         args.dataset_path,
         args.dataset,
         args.split,
+        audio_length=all_results["audio_length_s"],
+        transcription_time=all_results["transcription_time_s"],
+        audio_filepaths=all_results["audio_filepath"],
     )
     print("Results saved at path:", os.path.abspath(manifest_path))
+    wer_metric = evaluate.load("wer")
+    wer = wer_metric.compute(
+        references=all_results["references"], predictions=all_results["predictions"]
+    )
     wer = round(100 * wer, 2)
+    # RTFx: total seconds of audio divided by total seconds spent transcribing.
+    rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2)
+    print("WER:", wer, "%", "RTFx:", rtfx)
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
+        "--source",
+        type=str,
+        required=True,
+        help="SpeechBrain model repository. E.g. `asr-crdnn-rnnlm-librispeech`",
     )
     parser.add_argument(
+        "--speechbrain_pretrained_class_name",
+        type=str,
+        required=True,
+        help="SpeechBrain pretrained class name. E.g. `EncoderASR`",
     )
     parser.add_argument(
+        "--dataset_path",
+        type=str,
+        default="hf-audio/open-asr-leaderboard",
+        help="Dataset path. By default, it is `hf-audio/open-asr-leaderboard`",
     )
     parser.add_argument(
         "--dataset",
         help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
     )
     parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=16,
+        help="Number of samples to go through each streamed batch.",
     )
     parser.add_argument(
         "--max_eval_samples",
         action="store_true",
         help="Stream the dataset lazily over the network instead of downloading it in full before the evaluation. Off by default for reproducible benchmark timings.",
     )
+    parser.add_argument(
+        "--warmup_steps",
+        type=int,
+        default=2,
+        help="Number of warm-up steps to run before launching the timed runs.",
+    )
+    parser.add_argument(
+        "--beam_size",
+        type=int,
+        default=None,
+        help="Beam size for decoding"
+    )
+    parser.add_argument(
+        "--ctc_weight_decode",
+        type=float,
+        default=None,
+        help="Weight of CTC for joint CTC/Att. decoding. Only pass for models that support it (e.g. EncoderDecoderASR)."
+    )
     args = parser.parse_args()
     main(args)