From 7681d7119618cd9047096bff52158bf28bca1d0f Mon Sep 17 00:00:00 2001 From: Weiqing Wang Date: Mon, 10 Mar 2025 11:39:52 -0700 Subject: [PATCH 001/107] initial commit for end-of-utterance detection Signed-off-by: Weiqing Wang --- .../speech_to_text_rnnt_bpe_eou.py | 91 +++++++++++++ .../asr/data/audio_to_eou_label_lhotse.py | 83 ++++++++++++ nemo/collections/asr/models/__init__.py | 1 + .../asr/models/rnnt_bpe_eou_models.py | 120 ++++++++++++++++++ 4 files changed, 295 insertions(+) create mode 100644 examples/asr/asr_transducer/speech_to_text_rnnt_bpe_eou.py create mode 100644 nemo/collections/asr/data/audio_to_eou_label_lhotse.py create mode 100644 nemo/collections/asr/models/rnnt_bpe_eou_models.py diff --git a/examples/asr/asr_transducer/speech_to_text_rnnt_bpe_eou.py b/examples/asr/asr_transducer/speech_to_text_rnnt_bpe_eou.py new file mode 100644 index 000000000000..ba6d6db1ca89 --- /dev/null +++ b/examples/asr/asr_transducer/speech_to_text_rnnt_bpe_eou.py @@ -0,0 +1,91 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +# Preparing the Tokenizer for the dataset +Use the `process_asr_text_tokenizer.py` script under /scripts/tokenizers/ in order to prepare the tokenizer. + +```sh +python /scripts/tokenizers/process_asr_text_tokenizer.py \ + --manifest= + OR + --data_file= \ + --data_root="" \ + --vocab_size= \ + --tokenizer=<"spe" or "wpe"> \ + --no_lower_case \ + --spe_type=<"unigram", "bpe", "char" or "word"> \ + --spe_character_coverage=1.0 \ + --log +``` + +# Training the model +```sh +python speech_to_text_rnnt_bpe.py \ + # (Optional: --config-path= --config-name=) \ + model.train_ds.manifest_filepath= \ + model.validation_ds.manifest_filepath= \ + model.tokenizer.dir= \ + model.tokenizer.type= \ + trainer.devices=-1 \ + trainer.accelerator="gpu" \ + trainer.strategy="ddp" \ + trainer.max_epochs=100 \ + model.optim.name="adamw" \ + model.optim.lr=0.001 \ + model.optim.betas=[0.9,0.999] \ + model.optim.weight_decay=0.0001 \ + model.optim.sched.warmup_steps=2000 + exp_manager.create_wandb_logger=True \ + exp_manager.wandb_logger_kwargs.name="" \ + exp_manager.wandb_logger_kwargs.project="" +``` + +# Fine-tune a model + +For documentation on fine-tuning this model, please visit - +https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations + +""" + +import lightning.pytorch as pl +from omegaconf import OmegaConf + +from nemo.collections.asr.models import EncDecRNNTBPEEOUModel +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg + + +@hydra_runner(config_path="experimental/contextnet_rnnt", config_name="config_rnnt_bpe") +def main(cfg): + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) + exp_manager(trainer, cfg.get("exp_manager", None)) + asr_model = 
EncDecRNNTBPEEOUModel(cfg=cfg.model, trainer=trainer) + + # Initialize the weights of the model from another model, if provided via config + asr_model.maybe_init_from_pretrained_checkpoint(cfg) + + trainer.fit(asr_model) + + if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: + if asr_model.prepare_test(trainer): + trainer.test(asr_model) + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py new file mode 100644 index 000000000000..8aff5d46d88b --- /dev/null +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Dict, Optional, Tuple + +import torch.utils.data +from lhotse.dataset import AudioSamples +from lhotse.dataset.collation import collate_vectors + +from nemo.collections.common.tokenizers.aggregate_tokenizer import TokenizerWrapper +from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec +from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, NeuralType + + +class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): + """ + This dataset is a Lhotse version of diarization dataset in audio_to_diar_label.py. + Unlike native NeMo datasets, Lhotse dataset defines only the mapping from + a CutSet (meta-data) to a mini-batch with PyTorch tensors. + Specifically, it performs tokenization, I/O, augmentation, and feature extraction (if any). + Managing data, sampling, de-duplication across workers/nodes etc. is all handled + by Lhotse samplers instead. 
+ """ + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + """Define the output types of the dataset.""" + return { + 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), + 'a_sig_length': NeuralType(tuple('B'), LengthsType()), + 'targets': NeuralType(('B', 'T', 'N'), LabelsType()), + 'target_length': NeuralType(tuple('B'), LengthsType()), + 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), + } + + def __init__(self, cfg, tokenizer: TokenizerSpec, return_cuts: bool = False): + super().__init__() + self.cfg = cfg + self.tokenizer = TokenizerWrapper(tokenizer) + self.load_audio = AudioSamples(fault_tolerant=True) + self.num_sample_per_mel_frame = int( + self.cfg.get('window_stride', 0.01) * self.cfg.get('sample_rate', 16000) + ) # 160 samples for every 1ms by default + self.num_mel_frame_per_target_frame = int(self.cfg.get('subsampling_factor', 8)) + self.eou_token = self.cfg.get('eou_token', '') + + def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]: + audio, audio_lens, cuts = self.load_audio(cuts) + targets = [] + tokens = [] + for i in range(len(cuts)): + targets.append(self.get_frame_labels(audio_lens[i], self.num_sample_per_mel_frame, self.num_mel_frame_per_target_frame)) + tokens.append(torch.as_tensor(self.tokenizer(cuts[i].text + ' ' + self.eou_token))) + + target_lens = torch.tensor([t.size(0) for t in targets], dtype=torch.long) + targets = collate_vectors(targets, padding_value=0) + token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) + tokens = collate_vectors(tokens, padding_value=0) + + return audio, audio_lens, targets, target_lens, tokens, token_lens + + def get_frame_labels(self, num_samples, num_sample_per_mel_frame: int = 160, num_mel_frame_per_asr_frame: int = 8): + + mel_frame_count = math.ceil((num_samples + 1) / num_sample_per_mel_frame) + hidden_length = math.ceil(mel_frame_count / num_mel_frame_per_asr_frame) + + targets = torch.ones(hidden_length) # speech label + targets[0] = 2 # start of utterance + targets[-1] = 3 # end of utterance + + return targets diff --git a/nemo/collections/asr/models/__init__.py b/nemo/collections/asr/models/__init__.py index 34dead15b33d..9982c1557ce3 100644 --- a/nemo/collections/asr/models/__init__.py +++ b/nemo/collections/asr/models/__init__.py @@ -42,3 +42,4 @@ SpeechEncDecSelfSupervisedModel, ) from nemo.collections.asr.models.transformer_bpe_models import EncDecTransfModelBPE +from nemo.collections.asr.models.rnnt_bpe_eou_models import EncDecRNNTBPEEOUModel diff --git a/nemo/collections/asr/models/rnnt_bpe_eou_models.py b/nemo/collections/asr/models/rnnt_bpe_eou_models.py new file mode 100644 index 000000000000..f3ae409552a2 --- /dev/null +++ b/nemo/collections/asr/models/rnnt_bpe_eou_models.py @@ -0,0 +1,120 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import os +from typing import Dict, List, Optional, Union + +import torch +from lightning.pytorch import Trainer +from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict + +from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _AudioTextDataset +from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset +from nemo.collections.asr.data.audio_to_eou_label_lhotse import LhotseSpeechToTextBpeEOUDataset +from nemo.collections.asr.losses.rnnt import RNNTLoss +from nemo.collections.asr.metrics.wer import WER +from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel +from nemo.collections.asr.parts.mixins import ASRBPEMixin +from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTBPEDecoding, RNNTBPEDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler +from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config +from nemo.core.classes.common import PretrainedModelInfo +from nemo.utils import logging, model_utils + + +class EncDecRNNTBPEEOUModel(EncDecRNNTBPEModel): + """Base class for encoder decoder RNNT-based models with subword tokenization.""" + + def __init__(self, cfg: DictConfig, trainer: Trainer = None): + super().__init__(cfg, trainer) + + def _setup_dataloader_from_config(self, config: Optional[Dict]): + if config.get("use_lhotse"): + return get_lhotse_dataloader_from_config( + config, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. + global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), + dataset=LhotseSpeechToTextBpeEOUDataset( + cfg=config, + tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), + ), + tokenizer=self.tokenizer, + ) + + dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( + config=config, + local_rank=self.local_rank, + global_rank=self.global_rank, + world_size=self.world_size, + tokenizer=self.tokenizer, + preprocessor_cfg=self.cfg.get("preprocessor", None), + ) + + if dataset is None: + return None + + if isinstance(dataset, AudioToBPEDALIDataset): + # DALI Dataset implements dataloader interface + return dataset + + shuffle = config['shuffle'] + if isinstance(dataset, torch.utils.data.IterableDataset): + shuffle = False + + if hasattr(dataset, 'collate_fn'): + collate_fn = dataset.collate_fn + elif hasattr(dataset.datasets[0], 'collate_fn'): + # support datasets that are lists of entries + collate_fn = dataset.datasets[0].collate_fn + else: + # support datasets that are lists of lists + collate_fn = dataset.datasets[0].datasets[0].collate_fn + + batch_sampler = None + if config.get('use_semi_sorted_batching', False): + if not isinstance(dataset, _AudioTextDataset): + raise RuntimeError( + "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " + f"but found dataset of type {type(dataset)}" + ) + # set batch_size and batch_sampler to None to disable automatic batching + batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) + config['batch_size'] = None + config['drop_last'] = False + shuffle = False + + return torch.utils.data.DataLoader( + dataset=dataset, + 
batch_size=config['batch_size'], + sampler=batch_sampler, + batch_sampler=None, + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=shuffle, + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + + def training_step(self, batch, batch_nb): + signal, signal_len, _, _, tokens, token_len = batch + + batch = (signal, signal_len, tokens, token_len) + + return super().training_step(batch, batch_nb) \ No newline at end of file From 68e6b6f673f84160ba837a3f32bf2e8d781df788 Mon Sep 17 00:00:00 2001 From: Weiqing Wang Date: Mon, 10 Mar 2025 11:46:14 -0700 Subject: [PATCH 002/107] change targets to long() type Signed-off-by: Weiqing Wang --- nemo/collections/asr/data/audio_to_eou_label_lhotse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 8aff5d46d88b..2a591115f282 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -76,7 +76,7 @@ def get_frame_labels(self, num_samples, num_sample_per_mel_frame: int = 160, num mel_frame_count = math.ceil((num_samples + 1) / num_sample_per_mel_frame) hidden_length = math.ceil(mel_frame_count / num_mel_frame_per_asr_frame) - targets = torch.ones(hidden_length) # speech label + targets = torch.ones(hidden_length).long() # speech label targets[0] = 2 # start of utterance targets[-1] = 3 # end of utterance From 0069facdab6687c8f5230b04026d116b2b166575 Mon Sep 17 00:00:00 2001 From: Weiqing Wang Date: Mon, 10 Mar 2025 13:26:05 -0700 Subject: [PATCH 003/107] change output_types() Signed-off-by: Weiqing Wang --- nemo/collections/asr/data/audio_to_eou_label_lhotse.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 2a591115f282..e6430b76e77d 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -40,9 +40,10 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: return { 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), 'a_sig_length': NeuralType(tuple('B'), LengthsType()), - 'targets': NeuralType(('B', 'T', 'N'), LabelsType()), + 'targets': NeuralType(('B', 'T'), LabelsType()), 'target_length': NeuralType(tuple('B'), LengthsType()), - 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), + 'token_ids': NeuralType(tuple('B', 'T'), LengthsType(), optional=True), + 'token_length': NeuralType(tuple('B'), LengthsType(), optional=True), } def __init__(self, cfg, tokenizer: TokenizerSpec, return_cuts: bool = False): From 1550b569e5900a52acd8052fd85cd664b0aee592 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 17 Mar 2025 12:54:18 -0400 Subject: [PATCH 004/107] add random padding and refactor for multiple utterances per sample Signed-off-by: stevehuang52 --- .../asr/data/audio_to_eou_label_lhotse.py | 218 +++++++++++++++--- 1 file changed, 188 insertions(+), 30 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index e6430b76e77d..da3ae8a46699 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations 
under the License. +import numpy as np import math from typing import Dict, Optional, Tuple +from omegaconf import DictConfig import torch.utils.data +from lhotse.cut import Cut, CutSet from lhotse.dataset import AudioSamples from lhotse.dataset.collation import collate_vectors @@ -26,29 +29,67 @@ class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): """ - This dataset is a Lhotse version of diarization dataset in audio_to_diar_label.py. - Unlike native NeMo datasets, Lhotse dataset defines only the mapping from - a CutSet (meta-data) to a mini-batch with PyTorch tensors. - Specifically, it performs tokenization, I/O, augmentation, and feature extraction (if any). - Managing data, sampling, de-duplication across workers/nodes etc. is all handled - by Lhotse samplers instead. + This dataset processes the audio data and the corresponding text data to generate the ASR labels, + along with EOU labels for each frame. The audios used in this dataset should only contain speech with + NO precedding or following silence. The dataset also randomly pads non-speech frames before and after + the audio signal for training EOU prediction task. + + To generate EOU labels, the first frame of the audio will be marked as "start of utterance" (labeled as `2`), + while the last frame will be marked as "end of utterance" (labeled as `3`). The rest of the frames in between + will be marked as "speech" (labeled as `1`). + The padded non-speech signals will be marked as "non-speech" (labeled as 0). + + Returns: + audio: torch.Tensor of audio signal + audio_lens: torch.Tensor of audio signal length + eou_targets: torch.Tensor of EOU labels + eou_target_lens: torch.Tensor of EOU label length + text_tokens: torch.Tensor of text text_tokens + text_token_lens: torch.Tensor of text token length + + Padding logic: + 0. Don't pad when `random_padding` is None or during validation/test + 1. randomly draw a probability to decide whether to apply padding + 2. if not padding or audio duration is longer than the maximum duration, + 1) return the original audio and EOU labels + 3. 
if apply padding, + 1) get the max padding duration based on the maximum total duration and the audio duration + 2) randomly draw a total padding duration based on the given distribution + 3) randomly split the total padding duration into pre-padding and post-padding + 4) randomly generate the non-speech signal (audio signal=0) for pre-padding and post-padding + 5) concatenate the pre-padding, audio, and post-padding to get the padded audio signal + 6) update the EOU labels accordingly + + Random padding yaml config: + ``` + random_padding: + padding_prob: 0.99 # probability of applying padding + min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds + max_total_duration: 30.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + pad_normal_mean: 0.5 # mean of normal distribution for padding duration + pad_normal_std: 2.0 # standard deviation of normal distribution for padding duration + ``` + """ @property def output_types(self) -> Optional[Dict[str, NeuralType]]: """Define the output types of the dataset.""" return { - 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), - 'a_sig_length': NeuralType(tuple('B'), LengthsType()), - 'targets': NeuralType(('B', 'T'), LabelsType()), - 'target_length': NeuralType(tuple('B'), LengthsType()), - 'token_ids': NeuralType(tuple('B', 'T'), LengthsType(), optional=True), - 'token_length': NeuralType(tuple('B'), LengthsType(), optional=True), + 'audio': NeuralType(('B', 'T'), AudioSignal()), + 'audio_lens': NeuralType(tuple('B'), LengthsType()), + 'eou_targets': NeuralType(('B', 'T'), LabelsType()), + 'eou_target_lens': NeuralType(tuple('B'), LengthsType()), + 'text_tokens': NeuralType(tuple('B', 'T'), LengthsType(), optional=True), + 'text_token_lens': NeuralType(tuple('B'), LengthsType(), optional=True), } - def __init__(self, cfg, tokenizer: TokenizerSpec, return_cuts: bool = False): + def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, is_train: bool = False): super().__init__() self.cfg = cfg + self.is_train = is_train + self.return_eou_labels = cfg.get('return_eou_labels', True) self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) self.num_sample_per_mel_frame = int( @@ -56,29 +97,146 @@ def __init__(self, cfg, tokenizer: TokenizerSpec, return_cuts: bool = False): ) # 160 samples for every 1ms by default self.num_mel_frame_per_target_frame = int(self.cfg.get('subsampling_factor', 8)) self.eou_token = self.cfg.get('eou_token', '') + self.sou_token = self.cfg.get('sou_token', '') + self.padding_cfg = self.cfg.get('random_padding', None) - def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]: + def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: audio, audio_lens, cuts = self.load_audio(cuts) - targets = [] - tokens = [] + audio_signals = [] + audio_lengths = [] + eou_targets = [] + text_tokens = [] for i in range(len(cuts)): - targets.append(self.get_frame_labels(audio_lens[i], self.num_sample_per_mel_frame, self.num_mel_frame_per_target_frame)) - tokens.append(torch.as_tensor(self.tokenizer(cuts[i].text + ' ' + self.eou_token))) + eou_targets_i = self.get_frame_labels(cuts[i], audio_lens[i]) + text_tokens_i = self.get_text_tokens(cuts[i]) - target_lens = torch.tensor([t.size(0) for t in targets], dtype=torch.long) - targets = collate_vectors(targets, padding_value=0) - token_lens = torch.tensor([t.size(0) for t in tokens], dtype=torch.long) - tokens = 
collate_vectors(tokens, padding_value=0) + audio_i, audio_len_i, eou_targets_i = self.random_pad_audio( + audio[i], audio_lens[i], eou_targets_i + ) + audio_signals.append(audio_i) + audio_lengths.append(audio_len_i) + eou_targets.append(eou_targets_i) + text_tokens.append(text_tokens_i) + + audio_signals = collate_vectors(audio_signals, padding_value=0) + audio_lengths = torch.tensor(audio_lengths, dtype=torch.long) + eou_target_lens = torch.tensor([t.size(0) for t in eou_targets], dtype=torch.long) + eou_targets = collate_vectors(eou_targets, padding_value=0) + text_token_lens = torch.tensor([t.size(0) for t in text_tokens], dtype=torch.long) + text_tokens = collate_vectors(text_tokens, padding_value=0) - return audio, audio_lens, targets, target_lens, tokens, token_lens + if not self.return_eou_labels: + return audio_signals, audio_lengths, text_tokens, text_token_lens + return audio_signals, audio_lengths, eou_targets, eou_target_lens, text_tokens, text_token_lens - def get_frame_labels(self, num_samples, num_sample_per_mel_frame: int = 160, num_mel_frame_per_asr_frame: int = 8): + def _audio_len_to_frame_len(self, num_samples: int): + """ + Convert the raw audio length to the number of frames after audio encoder. + + self.num_sample_per_mel_frame = int( + self.cfg.get('window_stride', 0.01) * self.cfg.get('sample_rate', 16000) + ) # 160 samples for every 1ms by default + self.num_mel_frame_per_target_frame = int(self.cfg.get('subsampling_factor', 8)) + """ + mel_frame_count = math.ceil((num_samples + 1) / self.num_sample_per_mel_frame) + hidden_length = math.ceil(mel_frame_count / self.num_mel_frame_per_target_frame) + return hidden_length + + def get_frame_labels(self, cut: Cut, num_samples: int): + hidden_length = self._audio_len_to_frame_len(num_samples) + + if not cut.has_custom("sou_time") or not cut.has_custom("eou_time"): + # assume only single speech segment + eou_targets = torch.ones(hidden_length).long() # speech label + eou_targets[0] = 2 # start of utterance + eou_targets[-1] = 3 # end of utterance + return eou_targets + + sou_time = cut.custom["sou_time"] + eou_time = cut.custom["eou_time"] + + if not isinstance(sou_time, list) and not isinstance(eou_time, list): + # only single speech segment + sou_time = [sou_time] + eou_time = [eou_time] + + assert len(sou_time) == len(eou_time), f"Number of SOU and EOU do not match: SOU ({len(sou_time)}) vs EOU ({len(eou_time)})" + + eou_targets = torch.zeros(hidden_length).long() + for i in range(len(sou_time)): + sou_idx = self._audio_len_to_frame_len(int((sou_time[i] - cut.start) * self.cfg.sample_rate)) + seg_len_in_secs = eou_time[i] - sou_time[i] + seg_len = self._audio_len_to_frame_len(int(seg_len_in_secs * self.cfg.sample_rate)) + eou_targets[sou_idx:sou_idx+seg_len] = 1 + eou_targets[sou_idx] = 2 # start of utterance + eou_targets[sou_idx+seg_len-1] = 3 # end of utterance + + return eou_targets + + def get_text_tokens(self, cut: Cut): + text = cut.text + if getattr(cut, 'add_sou_eou', True): + text = f"{self.sou_token} {text} {self.eou_token}" + return torch.as_tensor(self.tokenizer(text)) + + def random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_targets: torch.Tensor): + """ + Randomly pad the audio signal with non-speech signal before and after the audio signal. 
+ Args: + audio: torch.Tensor of a single audio signal, shape [T] + audio_len: torch.Tensor of audio signal length, shape [1] + eou_targets: torch.Tensor of EOU labels, shape [T] + Returns: + padded_audio: torch.Tensor of padded audio signal, shape [T+padding] + padded_audio_len: torch.Tensor of padded audio signal length, shape [1] + padded_eou_targets: torch.Tensor of padded EOU labels, shape [T+padding] + padded_eou_targets_len: torch.Tensor of padded EOU label length, shape [1] + """ + p = np.random.rand() + if self.padding_cfg is None or not self.is_train or p > self.padding_cfg.padding_prob: + return audio, audio_len, eou_targets, eou_targets.size(0) - mel_frame_count = math.ceil((num_samples + 1) / num_sample_per_mel_frame) - hidden_length = math.ceil(mel_frame_count / num_mel_frame_per_asr_frame) + duration = audio_len.item() / self.cfg.sample_rate + # if already longer than the maximum duration, return the original audio + if duration >= self.padding_cfg.max_total_duration: + return audio, audio_len, eou_targets, eou_targets.size(0) + + # apply padding + audio = audio[:audio_len] + max_padding_duration = max(0, self.padding_cfg.max_total_duration - duration) + if max_padding_duration <= self.padding_cfg.min_pad_duration: + min_padding_duration = 0 + else: + min_padding_duration = self.padding_cfg.min_pad_duration + + if self.padding_cfg.pad_distribution == 'uniform': + total_padding_duration = np.random.uniform(min_padding_duration, max_padding_duration) + elif self.padding_cfg.pad_distribution == 'normal': + total_padding_duration = np.random.normal(self.padding_cfg.pad_normal_mean, self.padding_cfg.pad_normal_std) + total_padding_duration = max(min_padding_duration, min(max_padding_duration, total_padding_duration)) + else: + raise ValueError(f"Unknown padding distribution: {self.padding_cfg.pad_distribution}") + + pre_padding_duration = np.random.uniform(0, total_padding_duration) + post_padding_duration = total_padding_duration - pre_padding_duration + + pre_padding_len = math.ceil(pre_padding_duration * self.cfg.sample_rate) + post_padding_len = math.ceil(post_padding_duration * self.cfg.sample_rate) + + # pad the audio signal + pre_padding = torch.zeros(pre_padding_len, dtype=audio.dtype) + post_padding = torch.zeros(post_padding_len, dtype=audio.dtype) + padded_audio = torch.cat((pre_padding, audio, post_padding), dim=0) + padded_audio_len = audio_len + pre_padding_len + post_padding_len + + # pad the EOU labels + pre_padding_eou_len = self._audio_len_to_frame_len(pre_padding_len) + post_padding_eou_len = self._audio_len_to_frame_len(post_padding_len) + pre_padding_eou = torch.zeros(pre_padding_eou_len, dtype=eou_targets.dtype) + post_padding_eou = torch.zeros(post_padding_eou_len, dtype=eou_targets.dtype) + padded_eou_targets = torch.cat((pre_padding_eou, eou_targets, post_padding_eou), dim=0) + + return padded_audio, padded_audio_len, padded_eou_targets + - targets = torch.ones(hidden_length).long() # speech label - targets[0] = 2 # start of utterance - targets[-1] = 3 # end of utterance - return targets From 867c7996cc99b7dada136904940233605bab4252 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 17 Mar 2025 13:05:04 -0400 Subject: [PATCH 005/107] add handling multiple text groundtruth Signed-off-by: stevehuang52 --- .../asr/data/audio_to_eou_label_lhotse.py | 91 +++++++++++-------- 1 file changed, 53 insertions(+), 38 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 
da3ae8a46699..ef3326d312df 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import math from typing import Dict, Optional, Tuple -from omegaconf import DictConfig +import numpy as np import torch.utils.data from lhotse.cut import Cut, CutSet from lhotse.dataset import AudioSamples from lhotse.dataset.collation import collate_vectors +from omegaconf import DictConfig from nemo.collections.common.tokenizers.aggregate_tokenizer import TokenizerWrapper from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -29,16 +29,16 @@ class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): """ - This dataset processes the audio data and the corresponding text data to generate the ASR labels, - along with EOU labels for each frame. The audios used in this dataset should only contain speech with - NO precedding or following silence. The dataset also randomly pads non-speech frames before and after + This dataset processes the audio data and the corresponding text data to generate the ASR labels, + along with EOU labels for each frame. The audios used in this dataset should only contain speech with + NO precedding or following silence. The dataset also randomly pads non-speech frames before and after the audio signal for training EOU prediction task. - To generate EOU labels, the first frame of the audio will be marked as "start of utterance" (labeled as `2`), - while the last frame will be marked as "end of utterance" (labeled as `3`). The rest of the frames in between - will be marked as "speech" (labeled as `1`). + To generate EOU labels, the first frame of the audio will be marked as "start of utterance" (labeled as `2`), + while the last frame will be marked as "end of utterance" (labeled as `3`). The rest of the frames in between + will be marked as "speech" (labeled as `1`). The padded non-speech signals will be marked as "non-speech" (labeled as 0). - + Returns: audio: torch.Tensor of audio signal audio_lens: torch.Tensor of audio signal length @@ -50,9 +50,9 @@ class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): Padding logic: 0. Don't pad when `random_padding` is None or during validation/test 1. randomly draw a probability to decide whether to apply padding - 2. if not padding or audio duration is longer than the maximum duration, + 2. if not padding or audio duration is longer than the maximum duration, 1) return the original audio and EOU labels - 3. if apply padding, + 3. 
if apply padding, 1) get the max padding duration based on the maximum total duration and the audio duration 2) randomly draw a total padding duration based on the given distribution 3) randomly split the total padding duration into pre-padding and post-padding @@ -69,8 +69,8 @@ class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' pad_normal_mean: 0.5 # mean of normal distribution for padding duration pad_normal_std: 2.0 # standard deviation of normal distribution for padding duration - ``` - + ``` + """ @property @@ -110,9 +110,7 @@ def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: eou_targets_i = self.get_frame_labels(cuts[i], audio_lens[i]) text_tokens_i = self.get_text_tokens(cuts[i]) - audio_i, audio_len_i, eou_targets_i = self.random_pad_audio( - audio[i], audio_lens[i], eou_targets_i - ) + audio_i, audio_len_i, eou_targets_i = self.random_pad_audio(audio[i], audio_lens[i], eou_targets_i) audio_signals.append(audio_i) audio_lengths.append(audio_len_i) eou_targets.append(eou_targets_i) @@ -124,11 +122,11 @@ def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: eou_targets = collate_vectors(eou_targets, padding_value=0) text_token_lens = torch.tensor([t.size(0) for t in text_tokens], dtype=torch.long) text_tokens = collate_vectors(text_tokens, padding_value=0) - + if not self.return_eou_labels: return audio_signals, audio_lengths, text_tokens, text_token_lens return audio_signals, audio_lengths, eou_targets, eou_target_lens, text_tokens, text_token_lens - + def _audio_len_to_frame_len(self, num_samples: int): """ Convert the raw audio length to the number of frames after audio encoder. @@ -147,37 +145,55 @@ def get_frame_labels(self, cut: Cut, num_samples: int): if not cut.has_custom("sou_time") or not cut.has_custom("eou_time"): # assume only single speech segment - eou_targets = torch.ones(hidden_length).long() # speech label - eou_targets[0] = 2 # start of utterance - eou_targets[-1] = 3 # end of utterance + eou_targets = torch.ones(hidden_length).long() # speech label + eou_targets[0] = 2 # start of utterance + eou_targets[-1] = 3 # end of utterance return eou_targets sou_time = cut.custom["sou_time"] eou_time = cut.custom["eou_time"] - - if not isinstance(sou_time, list) and not isinstance(eou_time, list): - # only single speech segment + if not isinstance(sou_time, list): sou_time = [sou_time] + if not isinstance(eou_time, list): eou_time = [eou_time] - assert len(sou_time) == len(eou_time), f"Number of SOU and EOU do not match: SOU ({len(sou_time)}) vs EOU ({len(eou_time)})" + assert len(sou_time) == len( + eou_time + ), f"Number of SOU and EOU do not match: SOU ({len(sou_time)}) vs EOU ({len(eou_time)})" eou_targets = torch.zeros(hidden_length).long() for i in range(len(sou_time)): sou_idx = self._audio_len_to_frame_len(int((sou_time[i] - cut.start) * self.cfg.sample_rate)) seg_len_in_secs = eou_time[i] - sou_time[i] seg_len = self._audio_len_to_frame_len(int(seg_len_in_secs * self.cfg.sample_rate)) - eou_targets[sou_idx:sou_idx+seg_len] = 1 + eou_targets[sou_idx : sou_idx + seg_len] = 1 eou_targets[sou_idx] = 2 # start of utterance - eou_targets[sou_idx+seg_len-1] = 3 # end of utterance + eou_targets[sou_idx + seg_len - 1] = 3 # end of utterance return eou_targets def get_text_tokens(self, cut: Cut): - text = cut.text - if getattr(cut, 'add_sou_eou', True): - text = f"{self.sou_token} {text} {self.eou_token}" - return torch.as_tensor(self.tokenizer(text)) 
+ if not cut.has_custom("sou_time") or not cut.has_custom("eou_time") or not cut.has_custom("utterances"): + # assume only single speech segment + utterances = [cut.supervisions[0].text] + else: + utterances = cut.custom["utterances"] + sou_time = cut.custom["sou_time"] + eou_time = cut.custom["eou_time"] + if not isinstance(utterances, list): + utterances = [utterances] + if not isinstance(sou_time, list): + sou_time = [sou_time] + if not isinstance(eou_time, list): + eou_time = [eou_time] + + total_text = "" + for text in utterances: + if getattr(cut, 'add_sou_eou', True): + text = f"{self.sou_token} {text} {self.eou_token}" + total_text += text + " " + total_text = total_text.strip() + return torch.as_tensor(self.tokenizer(total_text)) def random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_targets: torch.Tensor): """ @@ -195,7 +211,7 @@ def random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_tar p = np.random.rand() if self.padding_cfg is None or not self.is_train or p > self.padding_cfg.padding_prob: return audio, audio_len, eou_targets, eou_targets.size(0) - + duration = audio_len.item() / self.cfg.sample_rate # if already longer than the maximum duration, return the original audio if duration >= self.padding_cfg.max_total_duration: @@ -212,11 +228,13 @@ def random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_tar if self.padding_cfg.pad_distribution == 'uniform': total_padding_duration = np.random.uniform(min_padding_duration, max_padding_duration) elif self.padding_cfg.pad_distribution == 'normal': - total_padding_duration = np.random.normal(self.padding_cfg.pad_normal_mean, self.padding_cfg.pad_normal_std) + total_padding_duration = np.random.normal( + self.padding_cfg.pad_normal_mean, self.padding_cfg.pad_normal_std + ) total_padding_duration = max(min_padding_duration, min(max_padding_duration, total_padding_duration)) else: raise ValueError(f"Unknown padding distribution: {self.padding_cfg.pad_distribution}") - + pre_padding_duration = np.random.uniform(0, total_padding_duration) post_padding_duration = total_padding_duration - pre_padding_duration @@ -237,6 +255,3 @@ def random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_tar padded_eou_targets = torch.cat((pre_padding_eou, eou_targets, post_padding_eou), dim=0) return padded_audio, padded_audio_len, padded_eou_targets - - - From c9c8a0d70c12978da80a81f6c50b462ce366c632 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 4 Apr 2025 16:56:36 -0400 Subject: [PATCH 006/107] update and add eval scripts Signed-off-by: stevehuang52 --- .../asr/modules/conformer_encoder.py | 10 +- nemo/collections/asr/parts/mixins/mixins.py | 2 +- scripts/asr_end_of_utterance/evaluate_eou.py | 223 ++++++++++++++++++ 3 files changed, 232 insertions(+), 3 deletions(-) create mode 100644 scripts/asr_end_of_utterance/evaluate_eou.py diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index e6b415eab5ae..11eeccd96d43 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -863,8 +863,14 @@ def setup_streaming_params( streaming_cfg.last_channel_cache_size = att_context_size[0] if att_context_size[0] >= 0 else max_context else: if left_chunks is None: - raise ValueError("left_chunks can not be None when chunk_size is set.") - streaming_cfg.last_channel_cache_size = left_chunks * chunk_size + streaming_cfg.last_channel_cache_size = ( + 
att_context_size[0] if att_context_size[0] >= 0 else max_context + ) + logging.warning( + f"left_chunks is not set. Setting it to default: {streaming_cfg.last_channel_cache_size}." + ) + else: + streaming_cfg.last_channel_cache_size = left_chunks * chunk_size if hasattr(self.pre_encode, "get_sampling_frames"): sampling_frames = self.pre_encode.get_sampling_frames() diff --git a/nemo/collections/asr/parts/mixins/mixins.py b/nemo/collections/asr/parts/mixins/mixins.py index 577b6393248c..d99ae3cc70b4 100644 --- a/nemo/collections/asr/parts/mixins/mixins.py +++ b/nemo/collections/asr/parts/mixins/mixins.py @@ -610,7 +610,7 @@ def conformer_stream_step( processed_signal: the input audio signals processed_signal_length: the length of the audios cache_last_channel: the cache tensor for last channel layers like MHA - cache_last_channel_len: engths for cache_last_channel + cache_last_channel_len: lengths for cache_last_channel cache_last_time: the cache tensor for last time layers like convolutions keep_all_outputs: if set to True, would not drop the extra outputs specified by encoder.streaming_cfg.valid_out_len previous_hypotheses: the hypotheses from the previous step for RNNT models diff --git a/scripts/asr_end_of_utterance/evaluate_eou.py b/scripts/asr_end_of_utterance/evaluate_eou.py new file mode 100644 index 000000000000..9159a9c313de --- /dev/null +++ b/scripts/asr_end_of_utterance/evaluate_eou.py @@ -0,0 +1,223 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import json +from dataclasses import dataclass +from pathlib import Path +from pprint import pprint +from typing import List + +import numpy as np + +parser = argparse.ArgumentParser(description="Evaluate end of utterance predictions against reference labels.") +parser.add_argument( + "-p", + "--predictions", + type=str, + required=True, + help="Path to the JSON file containing the predictions.", +) +parser.add_argument( + "-r", + "--references", + type=str, + required=True, + help="Path to the JSON file containing the reference labels.", +) +parser.add_argument( + "-t", + "--threshold", + type=float, + default=0.5, + help="Threshold for considering a prediction as EOU.", +) +parser.add_argument( + "--drop_prefix", + default="", + type=str, + help="Prefix to drop from the audio_filepath in the JSON file.", +) +parser.add_argument( + "-c", + "--collar", + type=float, + default=0.1, + help="Collar time in seconds for matching predictions to references.", +) +parser.add_argument( + "-o", + "--output_dir", + type=str, + default="eou_results/", + help="Output directory to save the evaluation results.", +) + + +@dataclass +class EOUResult: + latency: list + early_cutoff: list + true_positives: int + false_negatives: int + false_positives: int + num_utterances: int + num_predictions: int + + +def load_json(file_path: str, drop_prefix: str = "") -> List[dict]: + """Load a JSON file, then clean the audio_filepath.""" + with open(file_path, "r") as f: + data = json.load(f) + + cleaned_data = [] + for item in data: + audio_filepath = item["audio_filepath"] + if drop_prefix and audio_filepath.startswith(drop_prefix): + audio_filepath = audio_filepath[len(drop_prefix) :] + elif audio_filepath.startswith("./"): + audio_filepath = audio_filepath[2:] + item["audio_filepath"] = audio_filepath + + cleaned_data.append(item) + return cleaned_data + + +def evaluate_eou(prediction: List[dict], reference: List[dict], threshold: float, collar: float) -> EOUResult: + """ + Evaluate end of utterance predictions against reference labels. + Each item in predicition/reference is a dictionary containing: + { + "session_id": str, + "start_time": float, # start time in seconds + "end_time": float, # end time in seconds + "words": str, # transcription of the utterance + "audio_filepath": str, # only in prediction + "eou_prob": float, # only in prediction, probability of EOU in range [0.1] + "eou_pred": bool, # only in prediction + "full_text": str, # only in prediction, which is the full transcription up to the end_time + } + + Args: + predictions (List[dict]): List of dictionaries containing predictions. + references (List[dict]): List of dictionaries containing reference labels. + threshold (float): Threshold for considering a prediction as EOU. + collar (float): Collar time in seconds for matching predictions to references. 
+ """ + + latency = [] + early_cutoff = [] + true_positives = 0 + false_negatives = 0 + false_positives = 0 + num_utterances = len(reference) + num_predictions = len(prediction) + + predicted_eou = [p for p in prediction if p["eou_pred"] > threshold] + predicted_eou = sorted(predicted_eou, key=lambda x: x["start_time"]) + reference = sorted(reference, key=lambda x: x["start_time"]) + + p_idx = 0 + r_idx = 0 + for p_idx in range(len(predicted_eou)): + p = predicted_eou[p_idx] + p_start = p["start_time"] + p_end = p["end_time"] + + while r_idx < len(reference) and reference[r_idx]["end_time"] < p_start: + # Current reference ends before the current predicted utterance starts, find the next reference + r_idx += 1 + + if r_idx >= len(reference): + # No more references to compare against + false_positives += 1 + continue + + r = reference[r_idx] + r_start = r["start_time"] + r_end = r["end_time"] + + if np.abs(p_end - r_end) <= collar: + # Correctly predicted EOU + true_positives += 1 + latency.append(p_end - r_end) + r_idx += 1 + elif r_start <= p_end < r_end - collar: + # Early cutoff + # current predicted EOU is within the current reference utterance + false_positives += 1 + early_cutoff.append(r_end - p_end) + elif r_end + collar < p_end: + # Late EOU + # Current predicted EOU is after the current reference ends + false_negatives += 1 + latency.append(p_end - r_end) + else: + # p_end <= r_start + # Current predicted EOU is before the current reference starts + false_positives += 1 + + if r_idx < len(reference): + # There are remaining references that were not matched + false_negatives += len(reference) - r_idx + + return EOUResult( + latency=latency, + early_cutoff=early_cutoff, + true_positives=true_positives, + false_negatives=false_negatives, + false_positives=false_positives, + num_utterances=num_utterances, + num_predictions=num_predictions, + ) + + +def main(): + args = parser.parse_args() + + predictions = load_json(args.predictions, args.drop_prefix) + references = load_json(args.references, args.drop_prefix) + results = evaluate_eou( + predictions, + references, + threshold=args.threshold, + collar=args.collar, + ) + + f1_score = ( + (2 * results.true_positives / (2 * results.true_positives + results.false_negatives + results.false_positives)) + if (results.true_positives + results.false_negatives + results.false_positives) > 0 + else 0 + ) + + avg_cutoffs = len(results.early_cutoff) / len(results.num_utterances) if len(results.num_utterances) > 0 else 0 + + p80_latency = np.percentile(results.latency, 80) if len(results.latency) > 0 else 0 + p90_latency = np.percentile(results.latency, 90) if len(results.latency) > 0 else 0 + p95_latency = np.percentile(results.latency, 95) if len(results.latency) > 0 else 0 + p99_latency = np.percentile(results.latency, 99) if len(results.latency) > 0 else 0 + # Print the results + print("Evaluation Results:") + print(f"Number of utterances: {results.num_utterances}") + print(f"Number of predictions: {results.num_predictions}") + print(f"F1 Score: {f1_score:.4f}") + print(f"Early cutoff rate: {avg_cutoffs:.4f}") + print(f"P80 Latency: {p80_latency:.4f} seconds") + print(f"P90 Latency: {p90_latency:.4f} seconds") + print(f"P95 Latency: {p95_latency:.4f} seconds") + print(f"P99 Latency: {p99_latency:.4f} seconds") + + +if __name__ == "__main__": + main() From 201b7063f1ed925658ae70955a2f267b9b205c77 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 7 Apr 2025 16:59:10 -0400 Subject: [PATCH 007/107] drop sou label and add eob label 
Signed-off-by: stevehuang52 --- .../asr/data/audio_to_eou_label_lhotse.py | 84 +++++++++++++------ 1 file changed, 60 insertions(+), 24 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index ef3326d312df..2aa633a536c7 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -28,24 +28,40 @@ class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): + EOU_LABEL = 2 + EOB_LABEL = 3 + EOU_STRING = '' + EOB_STRING = '' """ This dataset processes the audio data and the corresponding text data to generate the ASR labels, along with EOU labels for each frame. The audios used in this dataset should only contain speech with NO precedding or following silence. The dataset also randomly pads non-speech frames before and after the audio signal for training EOU prediction task. - To generate EOU labels, the first frame of the audio will be marked as "start of utterance" (labeled as `2`), - while the last frame will be marked as "end of utterance" (labeled as `3`). The rest of the frames in between - will be marked as "speech" (labeled as `1`). + To generate EOU labels, the last frame of utterance will be marked as "end of utterance" (labeled as `2`), + while if it's a backchannel utterance it'll be marked asd "end of backchannel" (labeled as `3`). + The rest of the speech frames will be marked as "speech" (labeled as `1`). The padded non-speech signals will be marked as "non-speech" (labeled as 0). Returns: audio: torch.Tensor of audio signal audio_lens: torch.Tensor of audio signal length - eou_targets: torch.Tensor of EOU labels - eou_target_lens: torch.Tensor of EOU label length text_tokens: torch.Tensor of text text_tokens text_token_lens: torch.Tensor of text token length + eou_targets (optional): torch.Tensor of EOU labels + eou_target_lens (optional): torch.Tensor of EOU label length + + The input manifest should be a jsonl file where each line is a python dictionary. + Example manifest sample: + { + "audio_filepath": "/path/to/audio.wav", + "offset": 0.0, + "duration": 6.0, + "sou_time": [0.3, 4.0], + "eou_time": [1.3, 4.5], + "utterances": ["Tell me a joke", "Ah-ha"], + "is_backchannel": [False, True], + } Padding logic: 0. 
Don't pad when `random_padding` is None or during validation/test @@ -96,8 +112,9 @@ def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, is_train: bool = F self.cfg.get('window_stride', 0.01) * self.cfg.get('sample_rate', 16000) ) # 160 samples for every 1ms by default self.num_mel_frame_per_target_frame = int(self.cfg.get('subsampling_factor', 8)) - self.eou_token = self.cfg.get('eou_token', '') - self.sou_token = self.cfg.get('sou_token', '') + self.eou_string = self.cfg.get('eou_string', self.EOU_STRING) + self.eob_string = self.cfg.get('eob_string', self.EOB_STRING) + self.add_sep_before_eou = self.cfg.get('add_sep_before_eou', False) self.padding_cfg = self.cfg.get('random_padding', None) def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: @@ -146,8 +163,9 @@ def get_frame_labels(self, cut: Cut, num_samples: int): if not cut.has_custom("sou_time") or not cut.has_custom("eou_time"): # assume only single speech segment eou_targets = torch.ones(hidden_length).long() # speech label - eou_targets[0] = 2 # start of utterance - eou_targets[-1] = 3 # end of utterance + eou_targets[-1] = self.EOU_LABEL # by default it's end of utterance + if cut.has_custom("is_backchannel") and cut.custom["is_backchannel"]: + eou_targets[-1] = self.EOB_LABEL # end of backchannel return eou_targets sou_time = cut.custom["sou_time"] @@ -159,7 +177,17 @@ def get_frame_labels(self, cut: Cut, num_samples: int): assert len(sou_time) == len( eou_time - ), f"Number of SOU and EOU do not match: SOU ({len(sou_time)}) vs EOU ({len(eou_time)})" + ), f"Number of SOU time and EOU time do not match: SOU ({len(sou_time)}) vs EOU ({len(eou_time)})" + + if cut.has_custom("is_backchannel"): + is_backchannel = cut.custom["is_backchannel"] + if not isinstance(is_backchannel, list): + is_backchannel = [is_backchannel] + assert len(sou_time) == len( + cut.custom["is_backchannel"] + ), f"Number of SOU and backchannel do not match: SOU ({len(sou_time)}) vs backchannel ({len(is_backchannel)})" + else: + is_backchannel = [False] * len(sou_time) eou_targets = torch.zeros(hidden_length).long() for i in range(len(sou_time)): @@ -167,8 +195,10 @@ def get_frame_labels(self, cut: Cut, num_samples: int): seg_len_in_secs = eou_time[i] - sou_time[i] seg_len = self._audio_len_to_frame_len(int(seg_len_in_secs * self.cfg.sample_rate)) eou_targets[sou_idx : sou_idx + seg_len] = 1 - eou_targets[sou_idx] = 2 # start of utterance - eou_targets[sou_idx + seg_len - 1] = 3 # end of utterance + if is_backchannel[i]: + eou_targets[sou_idx + seg_len - 1] = self.EOB_LABEL # end of backchannel + else: + eou_targets[sou_idx + seg_len - 1] = self.EOU_LABEL # end of utterance return eou_targets @@ -178,20 +208,26 @@ def get_text_tokens(self, cut: Cut): utterances = [cut.supervisions[0].text] else: utterances = cut.custom["utterances"] - sou_time = cut.custom["sou_time"] - eou_time = cut.custom["eou_time"] - if not isinstance(utterances, list): - utterances = [utterances] - if not isinstance(sou_time, list): - sou_time = [sou_time] - if not isinstance(eou_time, list): - eou_time = [eou_time] + + if not isinstance(utterances, list): + utterances = [utterances] + + if cut.has_custom("is_backchannel"): + is_backchannel = cut.custom["is_backchannel"] + if not isinstance(is_backchannel, list): + is_backchannel = [is_backchannel] + assert len(utterances) == len( + cut.custom["is_backchannel"] + ), f"Number of utterances and backchannel do not match: utterance ({len(utterances)}) vs backchannel ({len(is_backchannel)})" + else: + is_backchannel = 
[False] * len(utterances) total_text = "" - for text in utterances: - if getattr(cut, 'add_sou_eou', True): - text = f"{self.sou_token} {text} {self.eou_token}" - total_text += text + " " + for i, text in enumerate(utterances): + eou_string = self.eob_string if is_backchannel[i] else self.eou_string + if self.add_sep_before_eou: + eou_string = " " + eou_string + total_text += text + eou_string + " " total_text = total_text.strip() return torch.as_tensor(self.tokenizer(total_text)) From af380f61c509a0447ded7315b02099f9a6826577 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 8 Apr 2025 13:15:21 -0400 Subject: [PATCH 008/107] update hybrid-rnnt-ctc and rnnt models to use eou dataset Signed-off-by: stevehuang52 --- .../asr/data/audio_to_eou_label_lhotse.py | 5 ++--- .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 16 ++++++++++++---- nemo/collections/asr/models/rnnt_bpe_models.py | 16 ++++++++++++---- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 2aa633a536c7..4a240faafcc2 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -101,10 +101,9 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: 'text_token_lens': NeuralType(tuple('B'), LengthsType(), optional=True), } - def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, is_train: bool = False): + def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec): super().__init__() self.cfg = cfg - self.is_train = is_train self.return_eou_labels = cfg.get('return_eou_labels', True) self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) @@ -245,7 +244,7 @@ def random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_tar padded_eou_targets_len: torch.Tensor of padded EOU label length, shape [1] """ p = np.random.rand() - if self.padding_cfg is None or not self.is_train or p > self.padding_cfg.padding_prob: + if self.padding_cfg is None or p > self.padding_cfg.padding_prob: return audio, audio_len, eou_targets, eou_targets.size(0) duration = audio_len.item() / self.cfg.sample_rate diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index cd04a5ad2462..1d3dbc979598 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -21,6 +21,7 @@ from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_eou_label_lhotse import LhotseSpeechToTextBpeEOUDataset from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset @@ -138,6 +139,16 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): + + if config.get("use_eou", False): + cfg = OmegaConf.create(config) if not isinstance(config, DictConfig) else config + dataset = LhotseSpeechToTextBpeEOUDataset(cfg=cfg, tokenizer=self.tokenizer) + else: + dataset = LhotseSpeechToTextBpeDataset( + tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), + ) + return 
get_lhotse_dataloader_from_config( config, # During transcription, the model is initially loaded on the CPU. @@ -145,10 +156,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # these values must be passed from the configuration. global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), - dataset=LhotseSpeechToTextBpeDataset( - tokenizer=self.tokenizer, - return_cuts=config.get("do_transcribe", False), - ), + dataset=dataset, tokenizer=self.tokenizer, ) diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index cd8667f2f0fe..0faed26c24f6 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -21,6 +21,7 @@ from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_eou_label_lhotse import LhotseSpeechToTextBpeEOUDataset from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset @@ -507,6 +508,16 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig, verbose: bool = Tru def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): + + if config.get("use_eou", False): + cfg = OmegaConf.create(config) if not isinstance(config, DictConfig) else config + dataset = LhotseSpeechToTextBpeEOUDataset(cfg=cfg, tokenizer=self.tokenizer) + else: + dataset = LhotseSpeechToTextBpeDataset( + tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), + ) + return get_lhotse_dataloader_from_config( config, # During transcription, the model is initially loaded on the CPU. @@ -514,10 +525,7 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # these values must be passed from the configuration. 
global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), - dataset=LhotseSpeechToTextBpeDataset( - tokenizer=self.tokenizer, - return_cuts=config.get("do_transcribe", False), - ), + dataset=dataset, tokenizer=self.tokenizer, ) From 82cdb603ba4f09b4ac2673b9743d4c078c20984e Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 8 Apr 2025 13:16:05 -0400 Subject: [PATCH 009/107] set default return eou frame label to false Signed-off-by: stevehuang52 --- nemo/collections/asr/data/audio_to_eou_label_lhotse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 4a240faafcc2..d404e2814da4 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -104,7 +104,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec): super().__init__() self.cfg = cfg - self.return_eou_labels = cfg.get('return_eou_labels', True) + self.return_eou_labels = cfg.get('return_eou_labels', False) self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) self.num_sample_per_mel_frame = int( From 9b6f95d8dc151188ab7fb04ea57d74040072e5d5 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 8 Apr 2025 19:35:23 -0400 Subject: [PATCH 010/107] handle empty utterance Signed-off-by: stevehuang52 --- .../asr/data/audio_to_eou_label_lhotse.py | 10 ++ .../asr/models/rnnt_bpe_eou_models.py | 120 ------------------ 2 files changed, 10 insertions(+), 120 deletions(-) delete mode 100644 nemo/collections/asr/models/rnnt_bpe_eou_models.py diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index d404e2814da4..7576a20b28b7 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -161,6 +161,10 @@ def get_frame_labels(self, cut: Cut, num_samples: int): if not cut.has_custom("sou_time") or not cut.has_custom("eou_time"): # assume only single speech segment + text = cut.supervisions[0].text + if not text: + # skip empty utterances + return torch.zeros(hidden_length).long() eou_targets = torch.ones(hidden_length).long() # speech label eou_targets[-1] = self.EOU_LABEL # by default it's end of utterance if cut.has_custom("is_backchannel") and cut.custom["is_backchannel"]: @@ -190,6 +194,9 @@ def get_frame_labels(self, cut: Cut, num_samples: int): eou_targets = torch.zeros(hidden_length).long() for i in range(len(sou_time)): + if sou_time[i] is None or eou_time[i] is None or sou_time[i] < 0 or eou_time[i] < 0: + # skip empty utterances + continue sou_idx = self._audio_len_to_frame_len(int((sou_time[i] - cut.start) * self.cfg.sample_rate)) seg_len_in_secs = eou_time[i] - sou_time[i] seg_len = self._audio_len_to_frame_len(int(seg_len_in_secs * self.cfg.sample_rate)) @@ -223,6 +230,9 @@ def get_text_tokens(self, cut: Cut): total_text = "" for i, text in enumerate(utterances): + if not text: + # skip empty utterances + continue eou_string = self.eob_string if is_backchannel[i] else self.eou_string if self.add_sep_before_eou: eou_string = " " + eou_string diff --git a/nemo/collections/asr/models/rnnt_bpe_eou_models.py 
b/nemo/collections/asr/models/rnnt_bpe_eou_models.py deleted file mode 100644 index f3ae409552a2..000000000000 --- a/nemo/collections/asr/models/rnnt_bpe_eou_models.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os -from typing import Dict, List, Optional, Union - -import torch -from lightning.pytorch import Trainer -from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict - -from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_text import _AudioTextDataset -from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset -from nemo.collections.asr.data.audio_to_eou_label_lhotse import LhotseSpeechToTextBpeEOUDataset -from nemo.collections.asr.losses.rnnt import RNNTLoss -from nemo.collections.asr.metrics.wer import WER -from nemo.collections.asr.models.rnnt_bpe_models import EncDecRNNTBPEModel -from nemo.collections.asr.parts.mixins import ASRBPEMixin -from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTBPEDecoding, RNNTBPEDecodingConfig -from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler -from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config -from nemo.core.classes.common import PretrainedModelInfo -from nemo.utils import logging, model_utils - - -class EncDecRNNTBPEEOUModel(EncDecRNNTBPEModel): - """Base class for encoder decoder RNNT-based models with subword tokenization.""" - - def __init__(self, cfg: DictConfig, trainer: Trainer = None): - super().__init__(cfg, trainer) - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - if config.get("use_lhotse"): - return get_lhotse_dataloader_from_config( - config, - # During transcription, the model is initially loaded on the CPU. - # To ensure the correct global_rank and world_size are set, - # these values must be passed from the configuration. 
- global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), - world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), - dataset=LhotseSpeechToTextBpeEOUDataset( - cfg=config, - tokenizer=self.tokenizer, - return_cuts=config.get("do_transcribe", False), - ), - tokenizer=self.tokenizer, - ) - - dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( - config=config, - local_rank=self.local_rank, - global_rank=self.global_rank, - world_size=self.world_size, - tokenizer=self.tokenizer, - preprocessor_cfg=self.cfg.get("preprocessor", None), - ) - - if dataset is None: - return None - - if isinstance(dataset, AudioToBPEDALIDataset): - # DALI Dataset implements dataloader interface - return dataset - - shuffle = config['shuffle'] - if isinstance(dataset, torch.utils.data.IterableDataset): - shuffle = False - - if hasattr(dataset, 'collate_fn'): - collate_fn = dataset.collate_fn - elif hasattr(dataset.datasets[0], 'collate_fn'): - # support datasets that are lists of entries - collate_fn = dataset.datasets[0].collate_fn - else: - # support datasets that are lists of lists - collate_fn = dataset.datasets[0].datasets[0].collate_fn - - batch_sampler = None - if config.get('use_semi_sorted_batching', False): - if not isinstance(dataset, _AudioTextDataset): - raise RuntimeError( - "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " - f"but found dataset of type {type(dataset)}" - ) - # set batch_size and batch_sampler to None to disable automatic batching - batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) - config['batch_size'] = None - config['drop_last'] = False - shuffle = False - - return torch.utils.data.DataLoader( - dataset=dataset, - batch_size=config['batch_size'], - sampler=batch_sampler, - batch_sampler=None, - collate_fn=collate_fn, - drop_last=config.get('drop_last', False), - shuffle=shuffle, - num_workers=config.get('num_workers', 0), - pin_memory=config.get('pin_memory', False), - ) - - def training_step(self, batch, batch_nb): - signal, signal_len, _, _, tokens, token_len = batch - - batch = (signal, signal_len, tokens, token_len) - - return super().training_step(batch, batch_nb) \ No newline at end of file From 4228641c961b2cef4082c2998679c25c06537fb2 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 8 Apr 2025 20:20:57 -0400 Subject: [PATCH 011/107] add script for injecting special eou tokens into SPE tokenizer Signed-off-by: stevehuang52 --- .../add_special_tokens_to_sentencepiece.py | 145 ++ .../tokenizers/sentencepiece_model_pb2.py | 1428 +++++++++++++++++ 2 files changed, 1573 insertions(+) create mode 100644 scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py create mode 100644 scripts/asr_end_of_utterance/tokenizers/sentencepiece_model_pb2.py diff --git a/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py b/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py new file mode 100644 index 000000000000..7e08191f0a05 --- /dev/null +++ b/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py @@ -0,0 +1,145 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" +import logging +import sys +import tempfile + +from argparse import ArgumentParser +from pathlib import Path + +import sentencepiece as spm + +from nemo.core.connectors.save_restore_connector import SaveRestoreConnector + +try: + import sentencepiece_model_pb2 as spt +except (ImportError, ModuleNotFoundError): + raise Exception("Ensure that sentencepiece_model_pb2.py has been generated from the protoc compiler") + + +SPECIAL_TOKENS = ["", ""] + +"""Utility to add special tokens to existing sentencepiece models. + +Generate sentencepiece_model_pb2.py in the directory of this script before running +To generate run `protoc --python_out=/scripts/asr_end_of_utterance/tokenizers sentencepiece_model.proto` +inside the src folder in sentencepiece repo +Refer: https://github.com/google/sentencepiece/issues/121 + +Usage: +python add_special_tokens_to_sentencepiece.py \ + --input_file your_model.nemo \ + --output_file /path/to/new/tokenizer.model +""" + + +parser = ArgumentParser(description="Add special tokens to sentencepiece model") +parser.add_argument( + "--input_file", + type=str, + required=True, + help="Path to sentencepiece model file", +) +parser.add_argument( + "--output_file", + type=str, + required=True, + help="Path to sentencepiece model file", +) +parser.add_argument( + "--tokens", + type=str, + nargs='+', + help="Special tokens to add to tokenizer", + default=SPECIAL_TOKENS, +) +parser.add_argument( + "--is_userdefined", + action="store_true", + help="When set, the new tokens are set as user_defined tokens", +) + + +def extract_nemo_tokenizer(nemo_filepath, output_dir): + SaveRestoreConnector._unpack_nemo_file(path2file=nemo_filepath, out_folder=output_dir) + tokenizer = None + for file in Path(output_dir).glob("**/*"): + if file.is_file() and file.name.endswith("tokenizer.model"): + tokenizer = file + break + if tokenizer is None: + raise ValueError(f"Tokenizer not found in {output_dir}: {os.listdir(output_dir)}") + return str(tokenizer.absolute()) + + +def edit_spt_model(input_file, output_file, tokens, is_userdefined): + + token_type = 3 + if is_userdefined: + token_type = 4 + + model = spt.ModelProto() + model.ParseFromString(open(input_file, 'rb').read()) + + for token in tokens: + piece = model.SentencePiece(piece=token, score=0.0, type=token_type) + if piece in model.pieces: + logging.error(f"Special Token '{token}' already exists in the input model!") + sys.exit(1) + model.pieces.append(piece) + + sp = spm.SentencePieceProcessor() + try: + sp.LoadFromSerializedProto(model.SerializeToString()) + for token in tokens: + id = sp.piece_to_id(token) + logging.info(f"Created token '{token}' at ID {id}") + logging.info(f"New tokenizer vocab size: {sp.get_piece_size()}") + except: + logging.error("Could not appropriately configure new tokenizer. 
Verify if the special tokens already exist.") + sys.exit(1) + + with open(output_file, 'wb') as outf: + outf.write(model.SerializeToString()) + + logging.info(f"Created new tokenizer at: {output_file}") + + +def inject_special_tokens(input_file, output_file, tokens, is_userdefined): + if not os.path.exists(input_file): + raise ValueError(f"Input file {input_file} does not exist") + + with tempfile.TemporaryDirectory() as temp_dir: + # Check if input file is a Nemo file + if input_file.endswith(".nemo"): + input_file = extract_nemo_tokenizer(input_file, temp_dir) + logging.info(f"Extracted tokenizer from Nemo file: {input_file}") + else: + input_file = os.path.abspath(input_file) + logging.info(f"Using input file: {input_file}") + + Path(output_file).parent.mkdir(parents=True, exist_ok=True) + if os.path.exists(output_file): + logging.info(f"Output file {output_file} already exists. Overwriting.") + edit_spt_model(input_file, output_file, tokens, is_userdefined) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + args = parser.parse_args() + inject_special_tokens(args.input_file, args.output_file, args.tokens, args.is_userdefined) diff --git a/scripts/asr_end_of_utterance/tokenizers/sentencepiece_model_pb2.py b/scripts/asr_end_of_utterance/tokenizers/sentencepiece_model_pb2.py new file mode 100644 index 000000000000..0ea467f28d15 --- /dev/null +++ b/scripts/asr_end_of_utterance/tokenizers/sentencepiece_model_pb2.py @@ -0,0 +1,1428 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: sentencepiece_model.proto + +import sys + +_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='sentencepiece_model.proto', + package='sentencepiece', + syntax='proto2', + serialized_options=_b('H\003'), + serialized_pb=_b( + '\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\xa4\x0c\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 
\x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12#\n\x19pretokenization_delimiter\x18\x35 \x01(\t:\x00\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05\x12\x16\n\tbos_piece\x18. \x01(\t:\x03\x12\x17\n\teos_piece\x18/ \x01(\t:\x04\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\x12\"\n\x18seed_sentencepieces_file\x18\x36 \x01(\t:\x00\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03' + ), +) + + +_TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor( + name='ModelType', + full_name='sentencepiece.TrainerSpec.ModelType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor(name='UNIGRAM', index=0, number=1, serialized_options=None, type=None), + _descriptor.EnumValueDescriptor(name='BPE', index=1, number=2, serialized_options=None, type=None), + 
_descriptor.EnumValueDescriptor(name='WORD', index=2, number=3, serialized_options=None, type=None), + _descriptor.EnumValueDescriptor(name='CHAR', index=3, number=4, serialized_options=None, type=None), + ], + containing_type=None, + serialized_options=None, + serialized_start=1553, + serialized_end=1606, +) +_sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE) + +_MODELPROTO_SENTENCEPIECE_TYPE = _descriptor.EnumDescriptor( + name='Type', + full_name='sentencepiece.ModelProto.SentencePiece.Type', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor(name='NORMAL', index=0, number=1, serialized_options=None, type=None), + _descriptor.EnumValueDescriptor(name='UNKNOWN', index=1, number=2, serialized_options=None, type=None), + _descriptor.EnumValueDescriptor(name='CONTROL', index=2, number=3, serialized_options=None, type=None), + _descriptor.EnumValueDescriptor(name='USER_DEFINED', index=3, number=4, serialized_options=None, type=None), + _descriptor.EnumValueDescriptor(name='BYTE', index=4, number=6, serialized_options=None, type=None), + _descriptor.EnumValueDescriptor(name='UNUSED', index=5, number=5, serialized_options=None, type=None), + ], + containing_type=None, + serialized_options=None, + serialized_start=2359, + serialized_end=2443, +) +_sym_db.RegisterEnumDescriptor(_MODELPROTO_SENTENCEPIECE_TYPE) + + +_TRAINERSPEC = _descriptor.Descriptor( + name='TrainerSpec', + full_name='sentencepiece.TrainerSpec', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='input', + full_name='sentencepiece.TrainerSpec.input', + index=0, + number=1, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='input_format', + full_name='sentencepiece.TrainerSpec.input_format', + index=1, + number=7, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='model_prefix', + full_name='sentencepiece.TrainerSpec.model_prefix', + index=2, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='model_type', + full_name='sentencepiece.TrainerSpec.model_type', + index=3, + number=3, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=1, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='vocab_size', + full_name='sentencepiece.TrainerSpec.vocab_size', + index=4, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=8000, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='accept_language', + full_name='sentencepiece.TrainerSpec.accept_language', + index=5, + 
number=5, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='self_test_sample_size', + full_name='sentencepiece.TrainerSpec.self_test_sample_size', + index=6, + number=6, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='enable_differential_privacy', + full_name='sentencepiece.TrainerSpec.enable_differential_privacy', + index=7, + number=50, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='differential_privacy_noise_level', + full_name='sentencepiece.TrainerSpec.differential_privacy_noise_level', + index=8, + number=51, + type=2, + cpp_type=6, + label=1, + has_default_value=True, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='differential_privacy_clipping_threshold', + full_name='sentencepiece.TrainerSpec.differential_privacy_clipping_threshold', + index=9, + number=52, + type=4, + cpp_type=4, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='character_coverage', + full_name='sentencepiece.TrainerSpec.character_coverage', + index=10, + number=10, + type=2, + cpp_type=6, + label=1, + has_default_value=True, + default_value=float(0.9995), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='input_sentence_size', + full_name='sentencepiece.TrainerSpec.input_sentence_size', + index=11, + number=11, + type=4, + cpp_type=4, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='shuffle_input_sentence', + full_name='sentencepiece.TrainerSpec.shuffle_input_sentence', + index=12, + number=19, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='mining_sentence_size', + full_name='sentencepiece.TrainerSpec.mining_sentence_size', + index=13, + number=12, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=_b('\030\001'), + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='training_sentence_size', + 
full_name='sentencepiece.TrainerSpec.training_sentence_size', + index=14, + number=13, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=_b('\030\001'), + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='seed_sentencepiece_size', + full_name='sentencepiece.TrainerSpec.seed_sentencepiece_size', + index=15, + number=14, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=1000000, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='shrinking_factor', + full_name='sentencepiece.TrainerSpec.shrinking_factor', + index=16, + number=15, + type=2, + cpp_type=6, + label=1, + has_default_value=True, + default_value=float(0.75), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='max_sentence_length', + full_name='sentencepiece.TrainerSpec.max_sentence_length', + index=17, + number=18, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=4192, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='num_threads', + full_name='sentencepiece.TrainerSpec.num_threads', + index=18, + number=16, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=16, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='num_sub_iterations', + full_name='sentencepiece.TrainerSpec.num_sub_iterations', + index=19, + number=17, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=2, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='max_sentencepiece_length', + full_name='sentencepiece.TrainerSpec.max_sentencepiece_length', + index=20, + number=20, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=16, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='split_by_unicode_script', + full_name='sentencepiece.TrainerSpec.split_by_unicode_script', + index=21, + number=21, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='split_by_number', + full_name='sentencepiece.TrainerSpec.split_by_number', + index=22, + number=23, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='split_by_whitespace', + 
full_name='sentencepiece.TrainerSpec.split_by_whitespace', + index=23, + number=22, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='treat_whitespace_as_suffix', + full_name='sentencepiece.TrainerSpec.treat_whitespace_as_suffix', + index=24, + number=24, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='allow_whitespace_only_pieces', + full_name='sentencepiece.TrainerSpec.allow_whitespace_only_pieces', + index=25, + number=26, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='split_digits', + full_name='sentencepiece.TrainerSpec.split_digits', + index=26, + number=25, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='pretokenization_delimiter', + full_name='sentencepiece.TrainerSpec.pretokenization_delimiter', + index=27, + number=53, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='control_symbols', + full_name='sentencepiece.TrainerSpec.control_symbols', + index=28, + number=30, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='user_defined_symbols', + full_name='sentencepiece.TrainerSpec.user_defined_symbols', + index=29, + number=31, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='required_chars', + full_name='sentencepiece.TrainerSpec.required_chars', + index=30, + number=36, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='byte_fallback', + full_name='sentencepiece.TrainerSpec.byte_fallback', + index=31, + number=35, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + 
name='vocabulary_output_piece_score', + full_name='sentencepiece.TrainerSpec.vocabulary_output_piece_score', + index=32, + number=32, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='hard_vocab_limit', + full_name='sentencepiece.TrainerSpec.hard_vocab_limit', + index=33, + number=33, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='use_all_vocab', + full_name='sentencepiece.TrainerSpec.use_all_vocab', + index=34, + number=34, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='unk_id', + full_name='sentencepiece.TrainerSpec.unk_id', + index=35, + number=40, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='bos_id', + full_name='sentencepiece.TrainerSpec.bos_id', + index=36, + number=41, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=1, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='eos_id', + full_name='sentencepiece.TrainerSpec.eos_id', + index=37, + number=42, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=2, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='pad_id', + full_name='sentencepiece.TrainerSpec.pad_id', + index=38, + number=43, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=-1, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='unk_piece', + full_name='sentencepiece.TrainerSpec.unk_piece', + index=39, + number=45, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='bos_piece', + full_name='sentencepiece.TrainerSpec.bos_piece', + index=40, + number=46, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='eos_piece', + full_name='sentencepiece.TrainerSpec.eos_piece', + index=41, + number=47, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + 
default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='pad_piece', + full_name='sentencepiece.TrainerSpec.pad_piece', + index=42, + number=48, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='unk_surface', + full_name='sentencepiece.TrainerSpec.unk_surface', + index=43, + number=44, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b(" \342\201\207 ").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='train_extremely_large_corpus', + full_name='sentencepiece.TrainerSpec.train_extremely_large_corpus', + index=44, + number=49, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='seed_sentencepieces_file', + full_name='sentencepiece.TrainerSpec.seed_sentencepieces_file', + index=45, + number=54, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[ + _TRAINERSPEC_MODELTYPE, + ], + serialized_options=None, + is_extendable=True, + syntax='proto2', + extension_ranges=[ + (200, 536870912), + ], + oneofs=[], + serialized_start=45, + serialized_end=1617, +) + + +_NORMALIZERSPEC = _descriptor.Descriptor( + name='NormalizerSpec', + full_name='sentencepiece.NormalizerSpec', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', + full_name='sentencepiece.NormalizerSpec.name', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='precompiled_charsmap', + full_name='sentencepiece.NormalizerSpec.precompiled_charsmap', + index=1, + number=2, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b(""), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='add_dummy_prefix', + full_name='sentencepiece.NormalizerSpec.add_dummy_prefix', + index=2, + number=3, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='remove_extra_whitespaces', + 
full_name='sentencepiece.NormalizerSpec.remove_extra_whitespaces', + index=3, + number=4, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='escape_whitespaces', + full_name='sentencepiece.NormalizerSpec.escape_whitespaces', + index=4, + number=5, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='normalization_rule_tsv', + full_name='sentencepiece.NormalizerSpec.normalization_rule_tsv', + index=5, + number=6, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=True, + syntax='proto2', + extension_ranges=[ + (200, 536870912), + ], + oneofs=[], + serialized_start=1620, + serialized_end=1829, +) + + +_SELFTESTDATA_SAMPLE = _descriptor.Descriptor( + name='Sample', + full_name='sentencepiece.SelfTestData.Sample', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='input', + full_name='sentencepiece.SelfTestData.Sample.input', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='expected', + full_name='sentencepiece.SelfTestData.Sample.expected', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1900, + serialized_end=1941, +) + +_SELFTESTDATA = _descriptor.Descriptor( + name='SelfTestData', + full_name='sentencepiece.SelfTestData', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='samples', + full_name='sentencepiece.SelfTestData.samples', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[ + _SELFTESTDATA_SAMPLE, + ], + enum_types=[], + serialized_options=None, + is_extendable=True, + syntax='proto2', + extension_ranges=[ + (200, 536870912), + ], + oneofs=[], + serialized_start=1831, + serialized_end=1952, +) + + +_MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor( + name='SentencePiece', + full_name='sentencepiece.ModelProto.SentencePiece', + filename=None, + file=DESCRIPTOR, + 
containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='piece', + full_name='sentencepiece.ModelProto.SentencePiece.piece', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='score', + full_name='sentencepiece.ModelProto.SentencePiece.score', + index=1, + number=2, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='type', + full_name='sentencepiece.ModelProto.SentencePiece.type', + index=2, + number=3, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=1, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[ + _MODELPROTO_SENTENCEPIECE_TYPE, + ], + serialized_options=None, + is_extendable=True, + syntax='proto2', + extension_ranges=[ + (200, 536870912), + ], + oneofs=[], + serialized_start=2244, + serialized_end=2454, +) + +_MODELPROTO = _descriptor.Descriptor( + name='ModelProto', + full_name='sentencepiece.ModelProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='pieces', + full_name='sentencepiece.ModelProto.pieces', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='trainer_spec', + full_name='sentencepiece.ModelProto.trainer_spec', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='normalizer_spec', + full_name='sentencepiece.ModelProto.normalizer_spec', + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='self_test_data', + full_name='sentencepiece.ModelProto.self_test_data', + index=3, + number=4, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name='denormalizer_spec', + full_name='sentencepiece.ModelProto.denormalizer_spec', + index=4, + number=5, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[ + 
_MODELPROTO_SENTENCEPIECE, + ], + enum_types=[], + serialized_options=None, + is_extendable=True, + syntax='proto2', + extension_ranges=[ + (200, 536870912), + ], + oneofs=[], + serialized_start=1955, + serialized_end=2465, +) + +_TRAINERSPEC.fields_by_name['model_type'].enum_type = _TRAINERSPEC_MODELTYPE +_TRAINERSPEC_MODELTYPE.containing_type = _TRAINERSPEC +_SELFTESTDATA_SAMPLE.containing_type = _SELFTESTDATA +_SELFTESTDATA.fields_by_name['samples'].message_type = _SELFTESTDATA_SAMPLE +_MODELPROTO_SENTENCEPIECE.fields_by_name['type'].enum_type = _MODELPROTO_SENTENCEPIECE_TYPE +_MODELPROTO_SENTENCEPIECE.containing_type = _MODELPROTO +_MODELPROTO_SENTENCEPIECE_TYPE.containing_type = _MODELPROTO_SENTENCEPIECE +_MODELPROTO.fields_by_name['pieces'].message_type = _MODELPROTO_SENTENCEPIECE +_MODELPROTO.fields_by_name['trainer_spec'].message_type = _TRAINERSPEC +_MODELPROTO.fields_by_name['normalizer_spec'].message_type = _NORMALIZERSPEC +_MODELPROTO.fields_by_name['self_test_data'].message_type = _SELFTESTDATA +_MODELPROTO.fields_by_name['denormalizer_spec'].message_type = _NORMALIZERSPEC +DESCRIPTOR.message_types_by_name['TrainerSpec'] = _TRAINERSPEC +DESCRIPTOR.message_types_by_name['NormalizerSpec'] = _NORMALIZERSPEC +DESCRIPTOR.message_types_by_name['SelfTestData'] = _SELFTESTDATA +DESCRIPTOR.message_types_by_name['ModelProto'] = _MODELPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +TrainerSpec = _reflection.GeneratedProtocolMessageType( + 'TrainerSpec', + (_message.Message,), + dict( + DESCRIPTOR=_TRAINERSPEC, + __module__='sentencepiece_model_pb2', + # @@protoc_insertion_point(class_scope:sentencepiece.TrainerSpec) + ), +) +_sym_db.RegisterMessage(TrainerSpec) + +NormalizerSpec = _reflection.GeneratedProtocolMessageType( + 'NormalizerSpec', + (_message.Message,), + dict( + DESCRIPTOR=_NORMALIZERSPEC, + __module__='sentencepiece_model_pb2', + # @@protoc_insertion_point(class_scope:sentencepiece.NormalizerSpec) + ), +) +_sym_db.RegisterMessage(NormalizerSpec) + +SelfTestData = _reflection.GeneratedProtocolMessageType( + 'SelfTestData', + (_message.Message,), + dict( + Sample=_reflection.GeneratedProtocolMessageType( + 'Sample', + (_message.Message,), + dict( + DESCRIPTOR=_SELFTESTDATA_SAMPLE, + __module__='sentencepiece_model_pb2', + # @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData.Sample) + ), + ), + DESCRIPTOR=_SELFTESTDATA, + __module__='sentencepiece_model_pb2', + # @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData) + ), +) +_sym_db.RegisterMessage(SelfTestData) +_sym_db.RegisterMessage(SelfTestData.Sample) + +ModelProto = _reflection.GeneratedProtocolMessageType( + 'ModelProto', + (_message.Message,), + dict( + SentencePiece=_reflection.GeneratedProtocolMessageType( + 'SentencePiece', + (_message.Message,), + dict( + DESCRIPTOR=_MODELPROTO_SENTENCEPIECE, + __module__='sentencepiece_model_pb2', + # @@protoc_insertion_point(class_scope:sentencepiece.ModelProto.SentencePiece) + ), + ), + DESCRIPTOR=_MODELPROTO, + __module__='sentencepiece_model_pb2', + # @@protoc_insertion_point(class_scope:sentencepiece.ModelProto) + ), +) +_sym_db.RegisterMessage(ModelProto) +_sym_db.RegisterMessage(ModelProto.SentencePiece) + + +DESCRIPTOR._options = None +_TRAINERSPEC.fields_by_name['mining_sentence_size']._options = None +_TRAINERSPEC.fields_by_name['training_sentence_size']._options = None +# @@protoc_insertion_point(module_scope) From ca5dd35ba97e60f5585f3e98f7b60dc1634159aa Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 9 Apr 2025 17:09:40 -0400 
Subject: [PATCH 012/107] refactor eou eval utils

Signed-off-by: stevehuang52 

---
 nemo/collections/asr/parts/utils/eou_utils.py | 124 +++++++++++++++++
 scripts/asr_end_of_utterance/evaluate_eou.py  | 118 +++---------------
 2 files changed, 139 insertions(+), 103 deletions(-)
 create mode 100644 nemo/collections/asr/parts/utils/eou_utils.py

diff --git a/nemo/collections/asr/parts/utils/eou_utils.py b/nemo/collections/asr/parts/utils/eou_utils.py
new file mode 100644
index 000000000000..dff89abfea5f
--- /dev/null
+++ b/nemo/collections/asr/parts/utils/eou_utils.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import List
+
+import numpy as np
+
+
+@dataclass
+class EOUResult:
+    latency: list
+    early_cutoff: list
+    true_positives: int
+    false_negatives: int
+    false_positives: int
+    num_utterances: int
+    num_predictions: int
+
+
+def evaluate_eou(prediction: List[dict], reference: List[dict], threshold: float, collar: float) -> EOUResult:
+    """
+    Evaluate end of utterance predictions against reference labels.
+    Each item in prediction/reference is a dictionary containing:
+    {
+        "session_id": str,
+        "start_time": float,  # start time in seconds
+        "end_time": float,  # end time in seconds
+        "words": str,  # transcription of the utterance
+        "audio_filepath": str,  # only in prediction
+        "eou_prob": float,  # only in prediction, probability of EOU in range [0, 1]
+        "eou_pred": bool,  # only in prediction
+        "full_text": str,  # only in prediction, which is the full transcription up to the end_time
+    }
+
+    Args:
+        prediction (List[dict]): List of dictionaries containing predictions.
+        reference (List[dict]): List of dictionaries containing reference labels.
+        threshold (float): Threshold for considering a prediction as EOU.
+        collar (float): Collar time in seconds for matching predictions to references.
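+
+    Example (illustrative numbers only):
+        With collar=0.2, a predicted EOU at t=5.15s against a reference utterance
+        ending at t=5.00s is a true positive with a latency of +0.15s, while a
+        predicted EOU at t=4.50s (inside the utterance, more than the collar
+        before its end) is a false positive with an early cutoff of 0.50s.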
+ """ + + latency = [] + early_cutoff = [] + true_positives = 0 + false_negatives = 0 + false_positives = 0 + num_utterances = len(reference) + num_predictions = len(prediction) + + predicted_eou = [p for p in prediction if p["eou_pred"] > threshold] + predicted_eou = sorted(predicted_eou, key=lambda x: x["start_time"]) + reference = sorted(reference, key=lambda x: x["start_time"]) + + p_idx = 0 + r_idx = 0 + for p_idx in range(len(predicted_eou)): + p = predicted_eou[p_idx] + p_start = p["start_time"] + p_end = p["end_time"] + + while r_idx < len(reference) and reference[r_idx]["end_time"] < p_start: + # Current reference ends before the current predicted utterance starts, find the next reference + r_idx += 1 + + if r_idx >= len(reference): + # No more references to compare against + false_positives += 1 + continue + + r = reference[r_idx] + r_start = r["start_time"] + r_end = r["end_time"] + + if np.abs(p_end - r_end) <= collar: + # Correctly predicted EOU + true_positives += 1 + latency.append(p_end - r_end) + r_idx += 1 + elif r_start <= p_end < r_end - collar: + # Early cutoff + # current predicted EOU is within the current reference utterance + false_positives += 1 + early_cutoff.append(r_end - p_end) + elif r_end + collar < p_end: + # Late EOU + # Current predicted EOU is after the current reference ends + false_negatives += 1 + latency.append(p_end - r_end) + else: + # p_end <= r_start + # Current predicted EOU is before the current reference starts + false_positives += 1 + + if r_idx < len(reference): + # There are remaining references that were not matched + false_negatives += len(reference) - r_idx + + return EOUResult( + latency=latency, + early_cutoff=early_cutoff, + true_positives=true_positives, + false_negatives=false_negatives, + false_positives=false_positives, + num_utterances=num_utterances, + num_predictions=num_predictions, + ) diff --git a/scripts/asr_end_of_utterance/evaluate_eou.py b/scripts/asr_end_of_utterance/evaluate_eou.py index 9159a9c313de..0137b8a10f72 100644 --- a/scripts/asr_end_of_utterance/evaluate_eou.py +++ b/scripts/asr_end_of_utterance/evaluate_eou.py @@ -14,13 +14,12 @@ import argparse import json -from dataclasses import dataclass -from pathlib import Path -from pprint import pprint from typing import List import numpy as np +from nemo.collections.asr.parts.utils.eou_utils import evaluate_eou + parser = argparse.ArgumentParser(description="Evaluate end of utterance predictions against reference labels.") parser.add_argument( "-p", @@ -65,17 +64,6 @@ ) -@dataclass -class EOUResult: - latency: list - early_cutoff: list - true_positives: int - false_negatives: int - false_positives: int - num_utterances: int - num_predictions: int - - def load_json(file_path: str, drop_prefix: str = "") -> List[dict]: """Load a JSON file, then clean the audio_filepath.""" with open(file_path, "r") as f: @@ -94,95 +82,6 @@ def load_json(file_path: str, drop_prefix: str = "") -> List[dict]: return cleaned_data -def evaluate_eou(prediction: List[dict], reference: List[dict], threshold: float, collar: float) -> EOUResult: - """ - Evaluate end of utterance predictions against reference labels. 
- Each item in predicition/reference is a dictionary containing: - { - "session_id": str, - "start_time": float, # start time in seconds - "end_time": float, # end time in seconds - "words": str, # transcription of the utterance - "audio_filepath": str, # only in prediction - "eou_prob": float, # only in prediction, probability of EOU in range [0.1] - "eou_pred": bool, # only in prediction - "full_text": str, # only in prediction, which is the full transcription up to the end_time - } - - Args: - predictions (List[dict]): List of dictionaries containing predictions. - references (List[dict]): List of dictionaries containing reference labels. - threshold (float): Threshold for considering a prediction as EOU. - collar (float): Collar time in seconds for matching predictions to references. - """ - - latency = [] - early_cutoff = [] - true_positives = 0 - false_negatives = 0 - false_positives = 0 - num_utterances = len(reference) - num_predictions = len(prediction) - - predicted_eou = [p for p in prediction if p["eou_pred"] > threshold] - predicted_eou = sorted(predicted_eou, key=lambda x: x["start_time"]) - reference = sorted(reference, key=lambda x: x["start_time"]) - - p_idx = 0 - r_idx = 0 - for p_idx in range(len(predicted_eou)): - p = predicted_eou[p_idx] - p_start = p["start_time"] - p_end = p["end_time"] - - while r_idx < len(reference) and reference[r_idx]["end_time"] < p_start: - # Current reference ends before the current predicted utterance starts, find the next reference - r_idx += 1 - - if r_idx >= len(reference): - # No more references to compare against - false_positives += 1 - continue - - r = reference[r_idx] - r_start = r["start_time"] - r_end = r["end_time"] - - if np.abs(p_end - r_end) <= collar: - # Correctly predicted EOU - true_positives += 1 - latency.append(p_end - r_end) - r_idx += 1 - elif r_start <= p_end < r_end - collar: - # Early cutoff - # current predicted EOU is within the current reference utterance - false_positives += 1 - early_cutoff.append(r_end - p_end) - elif r_end + collar < p_end: - # Late EOU - # Current predicted EOU is after the current reference ends - false_negatives += 1 - latency.append(p_end - r_end) - else: - # p_end <= r_start - # Current predicted EOU is before the current reference starts - false_positives += 1 - - if r_idx < len(reference): - # There are remaining references that were not matched - false_negatives += len(reference) - r_idx - - return EOUResult( - latency=latency, - early_cutoff=early_cutoff, - true_positives=true_positives, - false_negatives=false_negatives, - false_positives=false_positives, - num_utterances=num_utterances, - num_predictions=num_predictions, - ) - - def main(): args = parser.parse_args() @@ -203,16 +102,29 @@ def main(): avg_cutoffs = len(results.early_cutoff) / len(results.num_utterances) if len(results.num_utterances) > 0 else 0 + p80_cutoff = np.percentile(results.early_cutoff, 80) if len(results.early_cutoff) > 0 else 0 + p90_cutoff = np.percentile(results.early_cutoff, 90) if len(results.early_cutoff) > 0 else 0 + p95_cutoff = np.percentile(results.early_cutoff, 95) if len(results.early_cutoff) > 0 else 0 + p99_cutoff = np.percentile(results.early_cutoff, 99) if len(results.early_cutoff) > 0 else 0 + p80_latency = np.percentile(results.latency, 80) if len(results.latency) > 0 else 0 p90_latency = np.percentile(results.latency, 90) if len(results.latency) > 0 else 0 p95_latency = np.percentile(results.latency, 95) if len(results.latency) > 0 else 0 p99_latency = np.percentile(results.latency, 99) 
if len(results.latency) > 0 else 0 + # Print the results + print("======================================") print("Evaluation Results:") print(f"Number of utterances: {results.num_utterances}") print(f"Number of predictions: {results.num_predictions}") print(f"F1 Score: {f1_score:.4f}") + print("======================================") print(f"Early cutoff rate: {avg_cutoffs:.4f}") + print(f"Early cutoff P80: {p80_cutoff:.4f} seconds") + print(f"Early cutoff P90: {p90_cutoff:.4f} seconds") + print(f"Early cutoff P95: {p95_cutoff:.4f} seconds") + print(f"Early cutoff P99: {p99_cutoff:.4f} seconds") + print("======================================") print(f"P80 Latency: {p80_latency:.4f} seconds") print(f"P90 Latency: {p90_latency:.4f} seconds") print(f"P95 Latency: {p95_latency:.4f} seconds") From df3151d2314f8764af096def8beb97ceefe29f8c Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 11 Apr 2025 14:41:45 -0400 Subject: [PATCH 013/107] add eou rnnt training Signed-off-by: stevehuang52 --- .../asr/asr_eou/speech_to_text_rnnt_eou.py | 215 ++++++++++++++++++ .../helpers/convert_nemo_asr_hybrid_to_ctc.py | 2 +- .../speech_to_text_rnnt_bpe_eou.py | 91 -------- .../asr/data/audio_to_eou_label_lhotse.py | 45 ++-- nemo/collections/asr/metrics/wer.py | 4 + nemo/collections/asr/models/__init__.py | 1 - nemo/collections/asr/models/asr_eou_models.py | 215 ++++++++++++++++++ .../common/data/lhotse/nemo_adapters.py | 1 + .../add_special_tokens_to_sentencepiece.py | 56 +++-- 9 files changed, 502 insertions(+), 128 deletions(-) create mode 100644 examples/asr/asr_eou/speech_to_text_rnnt_eou.py delete mode 100644 examples/asr/asr_transducer/speech_to_text_rnnt_bpe_eou.py create mode 100644 nemo/collections/asr/models/asr_eou_models.py diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou.py new file mode 100644 index 000000000000..d4f3cd726c69 --- /dev/null +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou.py @@ -0,0 +1,215 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
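As a quick illustration of the `eou_utils` helpers introduced in PATCH 012 above, the sketch below drives `evaluate_eou` on two toy segments. The session contents, the 0.2 s collar, and the F1 formula are assumptions for illustration; only `evaluate_eou` and `EOUResult` come from `nemo/collections/asr/parts/utils/eou_utils.py`.

```python
# Hedged usage sketch for the refactored EOU evaluation helper (toy data, not NeMo tests).
import numpy as np

from nemo.collections.asr.parts.utils.eou_utils import evaluate_eou

reference = [
    {"session_id": "s1", "start_time": 0.0, "end_time": 2.1, "words": "hello there"},
    {"session_id": "s1", "start_time": 3.0, "end_time": 4.6, "words": "how are you"},
]
prediction = [
    # "eou_pred" is compared against the threshold inside evaluate_eou
    {"session_id": "s1", "start_time": 0.0, "end_time": 2.2, "words": "hello there", "eou_pred": 0.9},
    {"session_id": "s1", "start_time": 3.0, "end_time": 4.5, "words": "how are you", "eou_pred": 0.8},
]

result = evaluate_eou(prediction, reference, threshold=0.5, collar=0.2)

# One common F1 definition over the returned counts (assumed, not taken from the script).
f1 = 2 * result.true_positives / max(
    2 * result.true_positives + result.false_positives + result.false_negatives, 1
)
p90_latency = np.percentile(result.latency, 90) if result.latency else 0.0
print(f"F1: {f1:.4f}, P90 latency: {p90_latency:.4f} s")
```

With a 0.2 s collar both toy predictions fall within their references, so the sketch reports an F1 of 1.0 and latencies of roughly +0.1 s and -0.1 s.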
+ +""" +Example usage: + +```bash +#!/bin/bash + +NEMO_PATH=/home/heh/codes/nemo-eou +export PYTHONPATH=$NEMO_PATH:$PYTHONPATH + +TRAIN_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json +VAL_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json + +PRETRAINED_NEMO=/media/data3/pretrained_models/nemo_asr/stt_en_fastconformer_hybrid_large_streaming_80ms_rnnt.nemo +TOKENIZER_DIR=/media/data3/pretrained_models/nemo_asr/tokenizers/stt_en_fastconformer_hybrid_large_streaming_80ms-eou/ + +BATCH_DURATION=30 +NUM_WORKERS=0 +LIMIT_TRAIN_BATCHES=100 +VAL_CHECK_INTERVAL=100 +MAX_STEPS=1000000 + +EXP_NAME=fastconformer_transducer_bpe_streaming_eou_debug + +SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_rnnt_eou.py +CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/fastconformer/cache_aware_streaming +CONFIG_NAME=fastconformer_transducer_bpe_streaming + +CUDA_VISIBLE_DEVICES=0 python $SCRIPT \ + --config-path $CONFIG_PATH \ + --config-name $CONFIG_NAME \ + ++init_from_nemo_model=$PRETRAINED_NEMO \ + model.encoder.att_context_size="[70,1]" \ + model.tokenizer.dir=$TOKENIZER_DIR \ + model.train_ds.manifest_filepath=$TRAIN_MANIFEST \ + model.validation_ds.manifest_filepath=$VAL_MANIFEST \ + model.train_ds.batch_size=null \ + model.validation_ds.batch_size=null \ + ++model.train_ds.defer_setup=true \ + ++model.train_ds.batch_duration=$BATCH_DURATION \ + ++model.train_ds.num_workers=$NUM_WORKERS \ + ++model.train_ds.quadratic_duration=30 \ + ++model.train_ds.num_buckets=30 \ + ++model.train_ds.num_cuts_for_bins_estimate=10000 \ + ++model.train_ds.bucket_buffer_size=10000 \ + ++model.train_ds.shuffle_buffer_size=10000 \ + ++model.train_ds.shuffle=true \ + ++model.validation_ds.defer_setup=true \ + ++model.validation_ds.batch_duration=$BATCH_DURATION \ + ++model.validation_ds.num_workers=$NUM_WORKERS \ + ++model.validation_ds.quadratic_duration=30 \ + ++model.validation_ds.num_buckets=30 \ + ++model.validation_ds.num_cuts_for_bins_estimate=10000 \ + ++model.validation_ds.bucket_buffer_size=10000 \ + ++model.validation_ds.shuffle_buffer_size=10000 \ + ++model.validation_ds.shuffle=false \ + ~model.test_ds \ + ++trainer.use_distributed_sampler=false \ + ++trainer.limit_train_batches=$LIMIT_TRAIN_BATCHES \ + trainer.val_check_interval=$VAL_CHECK_INTERVAL \ + trainer.max_steps=$MAX_STEPS \ + trainer.max_epochs=-1 \ + exp_manager.name=$EXP_NAME +``` + +""" + + +from typing import Optional + +import lightning.pytorch as pl +from omegaconf import DictConfig, OmegaConf + +from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCBPEModel, EncDecRNNTBPEModel +from nemo.collections.asr.models.asr_eou_models import EncDecRNNTBPEEOUModel +from nemo.collections.asr.modules.rnnt import RNNTDecoder, RNNTJoint +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg + + +def get_pretrained_model_name(cfg: DictConfig) -> Optional[str]: + if hasattr(cfg, 'init_from_ptl_ckpt') and cfg.init_from_ptl_ckpt is not None: + raise NotImplementedError( + "Currently for simplicity of single script for all model types, we only support `init_from_nemo_model` and `init_from_pretrained_model`" + ) + nemo_model_path = cfg.get('init_from_nemo_model', None) + pretrained_name = cfg.get('init_from_pretrained_model', None) + if nemo_model_path is not None and pretrained_name is not None: + raise ValueError("Only pass 
`init_from_nemo_model` or `init_from_pretrained_model` but not both") + elif nemo_model_path is None and pretrained_name is None: + return None + + if nemo_model_path: + return nemo_model_path + if pretrained_name: + return pretrained_name + + +def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_path: str): + """ + load the pretrained model from a .nemo file, taking into account the joint network + """ + if pretrained_model_path.endswith('.nemo'): + pretrained_model = ASRModel.restore_from(restore_path=pretrained_model_path) # type: EncDecRNNTBPEModel + else: + try: + pretrained_model = ASRModel.from_pretrained(pretrained_model_path) # type: EncDecRNNTBPEModel + except Exception as e: + raise ValueError(f"Could not load pretrained model from {pretrained_model_path}.") from e + + if not isinstance(pretrained_model, (EncDecRNNTBPEModel, EncDecHybridRNNTCTCBPEModel)): + raise ValueError( + f"Pretrained model {pretrained_model.__class__} is not EncDecRNNTBPEModel or EncDecHybridRNNTCTCBPEModel." + ) + + # Load encoder state dict into the model + model.encoder.load_state_dict(pretrained_model.encoder.state_dict(), strict=True) + + # Load decoder state dict into the model + decoder = model.decoder # type: RNNTDecoder + pretrained_decoder = pretrained_model.decoder # type: RNNTDecoder + if not isinstance(decoder, RNNTDecoder) or not isinstance(pretrained_decoder, RNNTDecoder): + raise ValueError( + f"Decoder {decoder.__class__} is not RNNTDecoder or pretrained decoder {pretrained_decoder.__class__} is not RNNTDecoder." + ) + + decoder.prediction["dec_rnn"].load_state_dict(pretrained_decoder.prediction["dec_rnn"].state_dict(), strict=True) + + decoder_embed_states = decoder.prediction["embed"].state_dict()['weight'] # shape: [num_classes+2, hid_dim] + pretrained_decoder_embed_states = pretrained_decoder.prediction["embed"].state_dict()[ + 'weight' + ] # shape: [num_classes, hid_dim] + if decoder_embed_states.shape[0] != pretrained_decoder_embed_states.shape[0] + 2: + raise ValueError( + f"Size mismatched between pretrained ({pretrained_decoder_embed_states.shape[0]}+2) and current model ({decoder_embed_states.shape[0]}), skip loading decoder embedding." + ) + + decoder_embed_states[:-3, :] = pretrained_decoder_embed_states[:-1, :] # everything except EOU, EOB and blank + decoder_embed_states[-1, :] = pretrained_decoder_embed_states[-1, :] # blank class + decoder.prediction["embed"].load_state_dict({"weight": decoder_embed_states}, strict=True) + + # Load joint network weights if new model's joint network has two more classes than the pretrained model + joint_network = model.joint # type: RNNTJoint + pretrained_joint_network = pretrained_model.joint # type: RNNTJoint + assert isinstance(joint_network, RNNTJoint), f"Joint network {joint_network.__class__} is not RNNTJoint." + assert isinstance( + pretrained_joint_network, RNNTJoint + ), f"Pretrained joint network {pretrained_joint_network.__class__} is not RNNTJoint." + joint_network.pred.load_state_dict(pretrained_joint_network.pred.state_dict(), strict=True) + joint_network.enc.load_state_dict(pretrained_joint_network.enc.state_dict(), strict=True) + + if joint_network.num_classes_with_blank != pretrained_joint_network.num_classes_with_blank + 2: + logging.info( + f"Size mismatched between pretrained ({pretrained_joint_network.num_classes_with_blank}+2) and current model ({joint_network.num_classes_with_blank}), skip loading joint network." 
+ ) + return + + # Load the joint network weights + pretrained_joint_state = pretrained_joint_network.joint_net.state_dict() + joint_state = joint_network.joint_net.state_dict() + pretrained_joint_clf_weight = pretrained_joint_state['2.weight'] # shape: [num_classes, hid_dim] + pretrained_joint_clf_bias = pretrained_joint_state['2.bias'] if '2.bias' in pretrained_joint_state else None + + # Copy the weights and biases from the pretrained model to the new model + # shape: [num_classes+2, hid_dim] + joint_state['2.weight'][:-3, :] = pretrained_joint_clf_weight[:-1, :] # everything except EOU, EOB and blank + joint_state['2.weight'][-1, :] = pretrained_joint_clf_weight[-1, :] # blank class + if pretrained_joint_clf_bias is not None and '2.bias' in joint_state: + joint_state['2.bias'][:-3] = pretrained_joint_clf_bias[:-1] # everything except EOU, EOB and blank + joint_state['2.bias'][-1] = pretrained_joint_clf_bias[-1] # blank class + + # Load the joint network weights + joint_network.joint_net.load_state_dict(joint_state, strict=True) + logging.info(f"Joint network weights loaded from {pretrained_model_path}.") + + +@hydra_runner( + config_path="../conf/fastconformer/cache_aware_streaming", config_name="fastconformer_transducer_bpe_streaming" +) +def main(cfg): + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) + exp_manager(trainer, cfg.get("exp_manager", None)) + + asr_model = EncDecRNNTBPEEOUModel(cfg=cfg.model, trainer=trainer) + + init_from_model = get_pretrained_model_name(cfg) + if init_from_model: + init_from_pretrained_nemo(asr_model, init_from_model) + + trainer.fit(asr_model) + + if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: + if asr_model.prepare_test(trainer): + trainer.test(asr_model) + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py b/examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py index 199e399ead11..34afa8309084 100644 --- a/examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py +++ b/examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py @@ -20,7 +20,7 @@ in NeMo. The resulting .nemo file will be a pure CTC or RNNT model, and can be used like any other .nemo model including in nemo2riva. -Usage: python convert_nemo_asr_hybrid_to_ctc.py -i /path/to/hybrid.nemo -o /path/to/saved_ctc_model.nemo -m ctc|rnnt +Usage: python convert_nemo_asr_hybrid_to_ctc.py -i /path/to/hybrid.nemo -o /path/to/saved_ctc_model.nemo -t ctc|rnnt """ diff --git a/examples/asr/asr_transducer/speech_to_text_rnnt_bpe_eou.py b/examples/asr/asr_transducer/speech_to_text_rnnt_bpe_eou.py deleted file mode 100644 index ba6d6db1ca89..000000000000 --- a/examples/asr/asr_transducer/speech_to_text_rnnt_bpe_eou.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
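The embedding and joint-head initialization above boils down to a row copy that keeps the blank class in the last row and leaves the two new EOU/EOB rows freshly initialized. A minimal sketch on bare tensors, with an assumed 1024-token vocabulary and 640-dim hidden size:

```python
# Standalone illustration of the row-copy used above (plain torch tensors, not NeMo modules;
# the vocabulary size and hidden dimension are assumptions).
import torch

hid_dim = 640
old_num_classes = 1024 + 1             # pretrained BPE vocab + blank (assumed size)
new_num_classes = old_num_classes + 2  # two extra classes for EOU and EOB

old_weight = torch.randn(old_num_classes, hid_dim)
new_weight = torch.randn(new_num_classes, hid_dim)

new_weight[:-3, :] = old_weight[:-1, :]  # every pretrained class except blank
new_weight[-1, :] = old_weight[-1, :]    # blank stays in the last row
# rows -3 and -2 (EOU, EOB) keep their fresh initialization
```

The same indexing pattern is applied above to both the decoder embedding and the joint classifier's weight and bias.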
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -# Preparing the Tokenizer for the dataset -Use the `process_asr_text_tokenizer.py` script under /scripts/tokenizers/ in order to prepare the tokenizer. - -```sh -python /scripts/tokenizers/process_asr_text_tokenizer.py \ - --manifest= - OR - --data_file= \ - --data_root="" \ - --vocab_size= \ - --tokenizer=<"spe" or "wpe"> \ - --no_lower_case \ - --spe_type=<"unigram", "bpe", "char" or "word"> \ - --spe_character_coverage=1.0 \ - --log -``` - -# Training the model -```sh -python speech_to_text_rnnt_bpe.py \ - # (Optional: --config-path= --config-name=) \ - model.train_ds.manifest_filepath= \ - model.validation_ds.manifest_filepath= \ - model.tokenizer.dir= \ - model.tokenizer.type= \ - trainer.devices=-1 \ - trainer.accelerator="gpu" \ - trainer.strategy="ddp" \ - trainer.max_epochs=100 \ - model.optim.name="adamw" \ - model.optim.lr=0.001 \ - model.optim.betas=[0.9,0.999] \ - model.optim.weight_decay=0.0001 \ - model.optim.sched.warmup_steps=2000 - exp_manager.create_wandb_logger=True \ - exp_manager.wandb_logger_kwargs.name="" \ - exp_manager.wandb_logger_kwargs.project="" -``` - -# Fine-tune a model - -For documentation on fine-tuning this model, please visit - -https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations - -""" - -import lightning.pytorch as pl -from omegaconf import OmegaConf - -from nemo.collections.asr.models import EncDecRNNTBPEEOUModel -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager -from nemo.utils.trainer_utils import resolve_trainer_cfg - - -@hydra_runner(config_path="experimental/contextnet_rnnt", config_name="config_rnnt_bpe") -def main(cfg): - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - - trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) - exp_manager(trainer, cfg.get("exp_manager", None)) - asr_model = EncDecRNNTBPEEOUModel(cfg=cfg.model, trainer=trainer) - - # Initialize the weights of the model from another model, if provided via config - asr_model.maybe_init_from_pretrained_checkpoint(cfg) - - trainer.fit(asr_model) - - if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: - if asr_model.prepare_test(trainer): - trainer.test(asr_model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 7576a20b28b7..00381489473b 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -17,7 +17,7 @@ import numpy as np import torch.utils.data -from lhotse.cut import Cut, CutSet +from lhotse.cut import Cut, CutSet, MonoCut from lhotse.dataset import AudioSamples from lhotse.dataset.collation import collate_vectors from omegaconf import DictConfig @@ -26,12 +26,13 @@ from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, NeuralType +EOU_LABEL = 2 +EOB_LABEL = 3 +EOU_STRING = '' +EOB_STRING = '' + class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): - EOU_LABEL = 2 - EOB_LABEL = 3 - EOU_STRING = '' - EOB_STRING = '' """ This dataset processes the audio data and the corresponding text data to generate the ASR labels, along with EOU labels for each 
frame. The audios used in this dataset should only contain speech with @@ -39,7 +40,7 @@ class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): the audio signal for training EOU prediction task. To generate EOU labels, the last frame of utterance will be marked as "end of utterance" (labeled as `2`), - while if it's a backchannel utterance it'll be marked asd "end of backchannel" (labeled as `3`). + while if it's a backchannel utterance it'll be marked asd "end of backchannel" (labeled as `3`). The rest of the speech frames will be marked as "speech" (labeled as `1`). The padded non-speech signals will be marked as "non-speech" (labeled as 0). @@ -48,10 +49,10 @@ class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): audio_lens: torch.Tensor of audio signal length text_tokens: torch.Tensor of text text_tokens text_token_lens: torch.Tensor of text token length - eou_targets (optional): torch.Tensor of EOU labels + eou_targets (optional): torch.Tensor of EOU labels eou_target_lens (optional): torch.Tensor of EOU label length - The input manifest should be a jsonl file where each line is a python dictionary. + The input manifest should be a jsonl file where each line is a python dictionary. Example manifest sample: { "audio_filepath": "/path/to/audio.wav", @@ -101,23 +102,25 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: 'text_token_lens': NeuralType(tuple('B'), LengthsType(), optional=True), } - def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec): + def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, return_eou_labels: bool = False): super().__init__() self.cfg = cfg - self.return_eou_labels = cfg.get('return_eou_labels', False) + self.return_eou_labels = return_eou_labels self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) self.num_sample_per_mel_frame = int( self.cfg.get('window_stride', 0.01) * self.cfg.get('sample_rate', 16000) ) # 160 samples for every 1ms by default self.num_mel_frame_per_target_frame = int(self.cfg.get('subsampling_factor', 8)) - self.eou_string = self.cfg.get('eou_string', self.EOU_STRING) - self.eob_string = self.cfg.get('eob_string', self.EOB_STRING) + self.eou_string = self.cfg.get('eou_string', EOU_STRING) + self.eob_string = self.cfg.get('eob_string', EOB_STRING) self.add_sep_before_eou = self.cfg.get('add_sep_before_eou', False) self.padding_cfg = self.cfg.get('random_padding', None) + self.drop_pnc = self.cfg.get('drop_pnc', False) + self.pc_strip = self.cfg.get('pc_strip', False) def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: - audio, audio_lens, cuts = self.load_audio(cuts) + audio, audio_lens, _ = self.load_audio(cuts) audio_signals = [] audio_lengths = [] eou_targets = [] @@ -125,7 +128,6 @@ def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: for i in range(len(cuts)): eou_targets_i = self.get_frame_labels(cuts[i], audio_lens[i]) text_tokens_i = self.get_text_tokens(cuts[i]) - audio_i, audio_len_i, eou_targets_i = self.random_pad_audio(audio[i], audio_lens[i], eou_targets_i) audio_signals.append(audio_i) audio_lengths.append(audio_len_i) @@ -158,17 +160,16 @@ def _audio_len_to_frame_len(self, num_samples: int): def get_frame_labels(self, cut: Cut, num_samples: int): hidden_length = self._audio_len_to_frame_len(num_samples) - - if not cut.has_custom("sou_time") or not cut.has_custom("eou_time"): + if not "sou_time" in cut.custom or not "eou_time" in cut.custom: # assume only single speech segment text = cut.supervisions[0].text 
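The label geometry used here follows from two quantities set in `__init__`: `num_sample_per_mel_frame = window_stride * sample_rate` (160 samples per 10 ms hop at 16 kHz) and the encoder `subsampling_factor`. The hunk elides the exact mel-frame computation, so the ceil-based approximation below is an assumption; the label values (1 speech, 2 end of utterance, 3 end of backchannel, 0 padding) follow the class docstring.

```python
# Worked example of the audio-length -> encoder-frame mapping and the resulting EOU labels.
import math
import torch

sample_rate = 16000
window_stride = 0.01                        # 10 ms hop -> 160 samples per mel frame
subsampling_factor = 8                      # 8x subsampling -> 80 ms per encoder frame

num_samples = int(2.0 * sample_rate)        # a 2-second single-utterance cut
num_sample_per_mel_frame = int(window_stride * sample_rate)
mel_frames = math.ceil(num_samples / num_sample_per_mel_frame)   # 200 (ceil is an assumption)
hidden_length = math.ceil(mel_frames / subsampling_factor)       # 25 encoder frames

EOU_LABEL, EOB_LABEL = 2, 3
eou_targets = torch.ones(hidden_length, dtype=torch.long)        # speech frames -> 1
eou_targets[-1] = EOU_LABEL                                      # last frame marks end of utterance
# for a backchannel cut the last frame would get EOB_LABEL instead
```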
if not text: # skip empty utterances return torch.zeros(hidden_length).long() eou_targets = torch.ones(hidden_length).long() # speech label - eou_targets[-1] = self.EOU_LABEL # by default it's end of utterance + eou_targets[-1] = EOU_LABEL # by default it's end of utterance if cut.has_custom("is_backchannel") and cut.custom["is_backchannel"]: - eou_targets[-1] = self.EOB_LABEL # end of backchannel + eou_targets[-1] = EOB_LABEL # end of backchannel return eou_targets sou_time = cut.custom["sou_time"] @@ -202,9 +203,9 @@ def get_frame_labels(self, cut: Cut, num_samples: int): seg_len = self._audio_len_to_frame_len(int(seg_len_in_secs * self.cfg.sample_rate)) eou_targets[sou_idx : sou_idx + seg_len] = 1 if is_backchannel[i]: - eou_targets[sou_idx + seg_len - 1] = self.EOB_LABEL # end of backchannel + eou_targets[sou_idx + seg_len - 1] = EOB_LABEL # end of backchannel else: - eou_targets[sou_idx + seg_len - 1] = self.EOU_LABEL # end of utterance + eou_targets[sou_idx + seg_len - 1] = EOU_LABEL # end of utterance return eou_targets @@ -255,12 +256,12 @@ def random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_tar """ p = np.random.rand() if self.padding_cfg is None or p > self.padding_cfg.padding_prob: - return audio, audio_len, eou_targets, eou_targets.size(0) + return audio, audio_len, eou_targets duration = audio_len.item() / self.cfg.sample_rate # if already longer than the maximum duration, return the original audio if duration >= self.padding_cfg.max_total_duration: - return audio, audio_len, eou_targets, eou_targets.size(0) + return audio, audio_len, eou_targets # apply padding audio = audio[:audio_len] diff --git a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index 312e43983ca5..d8e70d3aaadc 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -296,6 +296,7 @@ def update( targets_lengths: torch.Tensor, predictions_mask: Optional[torch.Tensor] = None, input_ids: Optional[torch.Tensor] = None, + return_hypotheses: Optional[bool] = False, ): """ Updates metric state. @@ -345,6 +346,9 @@ def update( self.scores = torch.tensor(scores, device=self.scores.device, dtype=self.scores.dtype) self.words = torch.tensor(words, device=self.words.device, dtype=self.words.dtype) + if return_hypotheses: + return hypotheses + return None def compute(self): scores = self.scores.detach().float() diff --git a/nemo/collections/asr/models/__init__.py b/nemo/collections/asr/models/__init__.py index 9982c1557ce3..34dead15b33d 100644 --- a/nemo/collections/asr/models/__init__.py +++ b/nemo/collections/asr/models/__init__.py @@ -42,4 +42,3 @@ SpeechEncDecSelfSupervisedModel, ) from nemo.collections.asr.models.transformer_bpe_models import EncDecTransfModelBPE -from nemo.collections.asr.models.rnnt_bpe_eou_models import EncDecRNNTBPEEOUModel diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py new file mode 100644 index 000000000000..1f3f95042dff --- /dev/null +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -0,0 +1,215 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
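The random padding used by the EOU dataset above pairs a pad on each side of the waveform with zero labels on the corresponding encoder frames. A rough sketch under stated assumptions (zero-valued padding, uniform split of the sampled pad budget, a fixed frames-per-second conversion instead of the exact ceil arithmetic):

```python
# Rough sketch of the random pre/post padding step (assumptions: silence padding,
# uniform split between the two sides, simplified frame conversion).
import numpy as np
import torch


def random_pad(audio, eou_targets, sample_rate=16000, frames_per_sec=12.5,
               min_pad_duration=0.5, max_total_duration=30.0):
    duration = audio.numel() / sample_rate
    budget = max_total_duration - duration
    if budget <= 2 * min_pad_duration:
        return audio, eou_targets            # no room left for padding
    total_pad = np.random.uniform(2 * min_pad_duration, budget)
    pre = np.random.uniform(min_pad_duration, total_pad - min_pad_duration)
    post = total_pad - pre
    audio = torch.cat([torch.zeros(int(pre * sample_rate)), audio, torch.zeros(int(post * sample_rate))])
    eou_targets = torch.cat([
        torch.zeros(int(pre * frames_per_sec), dtype=torch.long),   # padded frames -> non-speech label 0
        eou_targets,
        torch.zeros(int(post * frames_per_sec), dtype=torch.long),
    ])
    return audio, eou_targets
```

The `min_pad_duration` and `max_total_duration` names mirror the `random_padding` block of the training config; the normal-distribution variant configured there is omitted from this sketch.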
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, Optional + +import torch +from omegaconf import DictConfig, OmegaConf + +from nemo.collections.asr.data.audio_to_eou_label_lhotse import LhotseSpeechToTextBpeEOUDataset +from nemo.collections.asr.models import EncDecRNNTBPEModel +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis +from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config +from nemo.core.classes.mixins import AccessMixin + +__all__ = ['EncDecRNNTBPEEOUModel'] + + +class EncDecRNNTBPEEOUModel(EncDecRNNTBPEModel): + def _setup_dataloader_from_config(self, config: Optional[Dict]): + cfg = OmegaConf.create(config) if not isinstance(config, DictConfig) else config + dataset = LhotseSpeechToTextBpeEOUDataset(cfg=cfg, tokenizer=self.tokenizer, return_eou_labels=True) + return get_lhotse_dataloader_from_config( + config, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. + global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), + dataset=dataset, + tokenizer=self.tokenizer, + ) + + def training_step(self, batch, batch_nb): + # Reset access registry + if AccessMixin.is_access_enabled(self.model_guid): + AccessMixin.reset_registry(self) + + signal, signal_len, transcript, transcript_len, eou_targets, eou_len = batch + + # forward() only performs encoder forward + encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) + del signal + + # During training, loss must be computed, so decoder forward is necessary + decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) + + if hasattr(self, '_trainer') and self._trainer is not None: + log_every_n_steps = self._trainer.log_every_n_steps + sample_id = self._trainer.global_step + else: + log_every_n_steps = 1 + sample_id = batch_nb + + # If experimental fused Joint-Loss-WER is not used + if not self.joint.fuse_loss_wer: + # Compute full joint and loss + joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) + loss_value = self.loss( + log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length + ) + + # Add auxiliary losses, if registered + loss_value = self.add_auxiliary_losses(loss_value) + + # Reset access registry + if AccessMixin.is_access_enabled(self.model_guid): + AccessMixin.reset_registry(self) + + tensorboard_logs = { + 'train_loss': loss_value, + 'learning_rate': self._optimizer.param_groups[0]['lr'], + 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), + } + + if (sample_id + 1) % log_every_n_steps == 0: + self.wer.update( + predictions=encoded, + predictions_lengths=encoded_len, + targets=transcript, + targets_lengths=transcript_len, + ) + _, scores, words = self.wer.compute() + self.wer.reset() + tensorboard_logs.update({'training_batch_wer': scores.float() / words}) + + else: + # 
If experimental fused Joint-Loss-WER is used + if (sample_id + 1) % log_every_n_steps == 0: + compute_wer = True + else: + compute_wer = False + + # Fused joint step + loss_value, wer, _, _ = self.joint( + encoder_outputs=encoded, + decoder_outputs=decoder, + encoder_lengths=encoded_len, + transcripts=transcript, + transcript_lengths=transcript_len, + compute_wer=compute_wer, + ) + + # Add auxiliary losses, if registered + loss_value = self.add_auxiliary_losses(loss_value) + + # Reset access registry + if AccessMixin.is_access_enabled(self.model_guid): + AccessMixin.reset_registry(self) + + tensorboard_logs = { + 'train_loss': loss_value, + 'learning_rate': self._optimizer.param_groups[0]['lr'], + 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), + } + + if compute_wer: + tensorboard_logs.update({'training_batch_wer': wer}) + + # Log items + self.log_dict(tensorboard_logs) + + # Preserve batch acoustic model T and language model U parameters if normalizing + if self._optim_normalize_joint_txu: + self._optim_normalize_txu = [encoded_len.max(), transcript_len.max()] + + return {'loss': loss_value} + + def predict_step(self, batch, batch_idx, dataloader_idx=0): + signal, signal_len, transcript, transcript_len, eou_targets, eou_len = batch + + # forward() only performs encoder forward + encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) + del signal + + best_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False + ) + + return list(best_hyp_text) + + def validation_pass(self, batch, batch_idx, dataloader_idx=0): + signal, signal_len, transcript, transcript_len = batch + + # forward() only performs encoder forward + encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) + del signal + + tensorboard_logs = {} + + # If experimental fused Joint-Loss-WER is not used + if not self.joint.fuse_loss_wer: + if self.compute_eval_loss: + decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) + joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) + + loss_value = self.loss( + log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length + ) + + tensorboard_logs['val_loss'] = loss_value + + self.wer.update( + predictions=encoded, + predictions_lengths=encoded_len, + targets=transcript, + targets_lengths=transcript_len, + ) + wer, wer_num, wer_denom = self.wer.compute() + self.wer.reset() + + tensorboard_logs['val_wer_num'] = wer_num + tensorboard_logs['val_wer_denom'] = wer_denom + tensorboard_logs['val_wer'] = wer + + else: + # If experimental fused Joint-Loss-WER is used + compute_wer = True + + if self.compute_eval_loss: + decoded, target_len, states = self.decoder(targets=transcript, target_length=transcript_len) + else: + decoded = None + target_len = transcript_len + + # Fused joint step + loss_value, wer, wer_num, wer_denom = self.joint( + encoder_outputs=encoded, + decoder_outputs=decoded, + encoder_lengths=encoded_len, + transcripts=transcript, + transcript_lengths=target_len, + compute_wer=compute_wer, + ) + + if loss_value is not None: + tensorboard_logs['val_loss'] = loss_value + + tensorboard_logs['val_wer_num'] = wer_num + tensorboard_logs['val_wer_denom'] = wer_denom + tensorboard_logs['val_wer'] = wer + + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + return tensorboard_logs diff --git 
a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index 8e15a6cbb39e..9f898425a428 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -116,6 +116,7 @@ def __iter__(self) -> Generator[Cut, None, None]: cut = self._create_cut( audio_path=audio_path, offset=offset, duration=duration, sampling_rate=data.pop("sampling_rate", None) ) + assert isinstance(cut, MonoCut) # Note that start=0 and not start=offset because supervision's start if relative to the # start of the cut; and cut.start is already set to offset cut.supervisions.append( diff --git a/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py b/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py index 7e08191f0a05..c9c276e99243 100644 --- a/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py +++ b/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py @@ -44,22 +44,24 @@ Usage: python add_special_tokens_to_sentencepiece.py \ --input_file your_model.nemo \ - --output_file /path/to/new/tokenizer.model + --output_dir /path/to/new/tokenizer_dir/ """ parser = ArgumentParser(description="Add special tokens to sentencepiece model") parser.add_argument( + "-i", "--input_file", type=str, required=True, help="Path to sentencepiece model file", ) parser.add_argument( - "--output_file", + "-o", + "--output_dir", type=str, required=True, - help="Path to sentencepiece model file", + help="Path to output directory for new tokenizer", ) parser.add_argument( "--tokens", @@ -87,7 +89,15 @@ def extract_nemo_tokenizer(nemo_filepath, output_dir): return str(tokenizer.absolute()) -def edit_spt_model(input_file, output_file, tokens, is_userdefined): +def edit_spt_model(input_file, output_dir, tokens, is_userdefined): + output_dir = Path(output_dir) + + if output_dir.exists(): + logging.warning(f"Output directory {output_dir} already exists. 
Overwriting it.") + + output_dir.mkdir(parents=True, exist_ok=True) + + output_file = str(output_dir / "tokenizer.model") token_type = 3 if is_userdefined: @@ -104,8 +114,8 @@ def edit_spt_model(input_file, output_file, tokens, is_userdefined): model.pieces.append(piece) sp = spm.SentencePieceProcessor() + sp.LoadFromSerializedProto(model.SerializeToString()) try: - sp.LoadFromSerializedProto(model.SerializeToString()) for token in tokens: id = sp.piece_to_id(token) logging.info(f"Created token '{token}' at ID {id}") @@ -116,11 +126,34 @@ def edit_spt_model(input_file, output_file, tokens, is_userdefined): with open(output_file, 'wb') as outf: outf.write(model.SerializeToString()) - logging.info(f"Created new tokenizer at: {output_file}") - -def inject_special_tokens(input_file, output_file, tokens, is_userdefined): + # Write the vocab to file + vocab_file = str(output_dir / "tokenizer.vocab") + with open(vocab_file, "w", encoding="utf-8") as f: + for i in range(sp.get_piece_size()): + piece = sp.id_to_piece(i) + score = sp.get_score(i) # Optional: only available if using newer SentencePiece versions + f.write(f"{piece}\t{score}\n") # Format follows the original vocab format + logging.info(f"Created new tokenizer vocab at: {vocab_file}") + + special_tokens = ["", "", "", ""] + special_tokens.extend(tokens) + vocab_txt_file = str(output_dir / "vocab.txt") + with open(vocab_txt_file, "w", encoding="utf-8") as f: + for i in range(sp.get_piece_size()): + piece = sp.id_to_piece(i) + if piece in special_tokens: + # skip special tokens + continue + token = piece[1:] if piece.startswith("▁") else f"##{piece}" + if len(token) == 0: + tokens = piece[0] + f.write(f"{token}\n") # Format follows the original vocab format + logging.info(f"Created new tokenizer vocab at: {vocab_txt_file}") + + +def inject_special_tokens(input_file, output_dir, tokens, is_userdefined): if not os.path.exists(input_file): raise ValueError(f"Input file {input_file} does not exist") @@ -133,13 +166,10 @@ def inject_special_tokens(input_file, output_file, tokens, is_userdefined): input_file = os.path.abspath(input_file) logging.info(f"Using input file: {input_file}") - Path(output_file).parent.mkdir(parents=True, exist_ok=True) - if os.path.exists(output_file): - logging.info(f"Output file {output_file} already exists. Overwriting.") - edit_spt_model(input_file, output_file, tokens, is_userdefined) + edit_spt_model(input_file, output_dir, tokens, is_userdefined) if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") args = parser.parse_args() - inject_special_tokens(args.input_file, args.output_file, args.tokens, args.is_userdefined) + inject_special_tokens(args.input_file, args.output_dir, args.tokens, args.is_userdefined) From 514b6d2f6dd9bc52921c35fa4f50502c8784bd7d Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 11 Apr 2025 19:41:27 -0400 Subject: [PATCH 014/107] update doc Signed-off-by: stevehuang52 --- examples/asr/asr_eou/speech_to_text_rnnt_eou.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou.py index d4f3cd726c69..894731d8b6ec 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou.py @@ -15,6 +15,15 @@ """ Example usage: +0. Prepare dataset based on /nemo/collections/asr/data/audio_to_eou_label_lhotse.py + +1. 
Add special tokens and to the tokenizer of pretrained model, by refering to the script + /scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py + +2. If pretrained model is HybridRNNTCTCBPEModel, convert it to RNNT using the script + /examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py + +3. Run the following command to train the ASR-EOU model: ```bash #!/bin/bash From a63bef3de0c90d2e8d740ccaa75fd0d9311d4634 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 15 Apr 2025 11:47:34 -0400 Subject: [PATCH 015/107] update data augmentation Signed-off-by: stevehuang52 --- .../asr/asr_eou/speech_to_text_rnnt_eou.py | 34 +- ...astconformer_transducer_bpe_streaming.yaml | 308 ++++++++++++++++++ .../asr/data/audio_to_eou_label_lhotse.py | 162 +++++++-- 3 files changed, 449 insertions(+), 55 deletions(-) create mode 100644 examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou.py index 894731d8b6ec..f78835250783 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou.py @@ -32,6 +32,7 @@ TRAIN_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json VAL_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json +NOISE_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/noise_manifest.json PRETRAINED_NEMO=/media/data3/pretrained_models/nemo_asr/stt_en_fastconformer_hybrid_large_streaming_80ms_rnnt.nemo TOKENIZER_DIR=/media/data3/pretrained_models/nemo_asr/tokenizers/stt_en_fastconformer_hybrid_large_streaming_80ms-eou/ @@ -55,33 +56,16 @@ model.encoder.att_context_size="[70,1]" \ model.tokenizer.dir=$TOKENIZER_DIR \ model.train_ds.manifest_filepath=$TRAIN_MANIFEST \ + model.train_ds.augmentor.noise.manifest_path=$NOISE_MANIFEST \ model.validation_ds.manifest_filepath=$VAL_MANIFEST \ - model.train_ds.batch_size=null \ - model.validation_ds.batch_size=null \ - ++model.train_ds.defer_setup=true \ - ++model.train_ds.batch_duration=$BATCH_DURATION \ - ++model.train_ds.num_workers=$NUM_WORKERS \ - ++model.train_ds.quadratic_duration=30 \ - ++model.train_ds.num_buckets=30 \ - ++model.train_ds.num_cuts_for_bins_estimate=10000 \ - ++model.train_ds.bucket_buffer_size=10000 \ - ++model.train_ds.shuffle_buffer_size=10000 \ - ++model.train_ds.shuffle=true \ - ++model.validation_ds.defer_setup=true \ - ++model.validation_ds.batch_duration=$BATCH_DURATION \ - ++model.validation_ds.num_workers=$NUM_WORKERS \ - ++model.validation_ds.quadratic_duration=30 \ - ++model.validation_ds.num_buckets=30 \ - ++model.validation_ds.num_cuts_for_bins_estimate=10000 \ - ++model.validation_ds.bucket_buffer_size=10000 \ - ++model.validation_ds.shuffle_buffer_size=10000 \ - ++model.validation_ds.shuffle=false \ + model.train_ds.batch_duration=$BATCH_DURATION \ + model.train_ds.num_workers=$NUM_WORKERS \ + model.validation_ds.batch_duration=$BATCH_DURATION \ + model.validation_ds.num_workers=$NUM_WORKERS \ ~model.test_ds \ - ++trainer.use_distributed_sampler=false \ - ++trainer.limit_train_batches=$LIMIT_TRAIN_BATCHES \ + trainer.limit_train_batches=$LIMIT_TRAIN_BATCHES \ trainer.val_check_interval=$VAL_CHECK_INTERVAL \ trainer.max_steps=$MAX_STEPS \ - trainer.max_epochs=-1 \ exp_manager.name=$EXP_NAME ``` @@ -198,9 +182,7 @@ def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_pat logging.info(f"Joint 
network weights loaded from {pretrained_model_path}.") -@hydra_runner( - config_path="../conf/fastconformer/cache_aware_streaming", config_name="fastconformer_transducer_bpe_streaming" -) +@hydra_runner(config_path="../conf/asr_eou", config_name="fastconformer_transducer_bpe_streaming") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml new file mode 100644 index 000000000000..71094058583e --- /dev/null +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -0,0 +1,308 @@ +# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M) with sub-word encoding. + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +name: "FastConformer-Transducer-BPE-Streaming-EOU" + +model: + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: 300 # you may disable batch_duration by setting it to `null` + batch_size: null + shuffle: true + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + random_padding: + prob: 0.5 + min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds + max_total_duration: 30.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + normal_mean: 0.5 # mean of normal distribution for padding duration + normal_std: 2.0 # standard deviation of normal distribution for padding duration + + augmentor: + white_noise: + prob: 0.5 + min_level: -90 + max_level: -46 + gain: + prob: 0.5 + min_gain_dbfs: -10.0 + max_gain_dbfs: 10.0 + noise: + prob: 0.6 + manifest_path: ??? + min_snr_db: 0 + max_snr_db: 20 + max_gain_db: 300.0 + + validation_ds: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: 300 # you may disable batch_duration by setting it to `null` + batch_size: null + shuffle: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + test_ds: + manifest_filepath: ??? 
+ tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: 300 # you may disable batch_duration by setting it to `null` + batch_size: null + shuffle: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. 
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] + att_context_size: [70, 13] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
+ + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + warprnnt_numba_kwargs: + # FastEmit regularization: https://arxiv.org/abs/2010.11148 + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. + + optim: + name: adamw + lr: 5.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: 100000 # computed at runtime if not set + val_check_interval: 1000 # an int for number of iterations + limit_train_batches: ${trainer.val_check_interval} + accelerator: auto + strategy: + _target_: lightning.pytorch.strategies.DDPStrategy + gradient_as_bucket_view: true + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + use_distributed_sampler: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
+ resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 00381489473b..cee03a075069 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -17,14 +17,17 @@ import numpy as np import torch.utils.data -from lhotse.cut import Cut, CutSet, MonoCut +from lhotse.cut import Cut, CutSet, MixedCut from lhotse.dataset import AudioSamples from lhotse.dataset.collation import collate_vectors -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf +from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations +from nemo.collections.asr.parts.preprocessing.segment import AudioSegment from nemo.collections.common.tokenizers.aggregate_tokenizer import TokenizerWrapper from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, NeuralType +from nemo.utils import logging EOU_LABEL = 2 EOB_LABEL = 3 @@ -32,6 +35,10 @@ EOB_STRING = '' +EOU_LENGTH_PERTURBATION = ['speed', 'time_stretch'] +EOU_PROHIBITED_AUGMENTATIONS = ['random_segment'] + + class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): """ This dataset processes the audio data and the corresponding text data to generate the ASR labels, @@ -44,6 +51,23 @@ class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): The rest of the speech frames will be marked as "speech" (labeled as `1`). The padded non-speech signals will be marked as "non-speech" (labeled as 0). + Args: + cfg: DictConfig object container following keys, usually taken from your `model.train_ds` + or `model.validation_ds` config: + ``` + sample_rate: # int, Sample rate of the audio signal + window_stride: # float, Window stride for audio encoder + subsampling_factor: # Subsampling factor for audio encoder + random_padding: # Random padding configuration + prob: 0.9 # probability of applying padding + min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds + max_total_duration: 30.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + normal_mean: 0.5 # mean of normal distribution for padding duration + normal_std: 2.0 # standard deviation of normal distribution for padding duration + ``` + + Returns: audio: torch.Tensor of audio signal audio_lens: torch.Tensor of audio signal length @@ -77,17 +101,6 @@ class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): 5) concatenate the pre-padding, audio, and post-padding to get the padded audio signal 6) update the EOU labels accordingly - Random padding yaml config: - ``` - random_padding: - padding_prob: 0.99 # probability of applying padding - min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds - max_total_duration: 30.0 # maximum total duration of the padded audio in seconds - pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' - pad_normal_mean: 0.5 # mean of normal distribution for padding duration - pad_normal_std: 2.0 # standard deviation of normal distribution for padding duration - ``` - """ @property @@ -108,27 +121,66 @@ def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, return_eou_labels: self.return_eou_labels = 
return_eou_labels self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) + self.sample_rate = self.cfg.get('sample_rate', 16000) self.num_sample_per_mel_frame = int( - self.cfg.get('window_stride', 0.01) * self.cfg.get('sample_rate', 16000) + self.cfg.get('window_stride', 0.01) * self.sample_rate ) # 160 samples for every 1ms by default self.num_mel_frame_per_target_frame = int(self.cfg.get('subsampling_factor', 8)) self.eou_string = self.cfg.get('eou_string', EOU_STRING) self.eob_string = self.cfg.get('eob_string', EOB_STRING) self.add_sep_before_eou = self.cfg.get('add_sep_before_eou', False) self.padding_cfg = self.cfg.get('random_padding', None) - self.drop_pnc = self.cfg.get('drop_pnc', False) - self.pc_strip = self.cfg.get('pc_strip', False) + self.augmentor = None + self.len_augmentor = None + if self.cfg.get('augmentor', None) is not None: + augmentor = {} + len_augmentor = {} + aug_cfg = OmegaConf.to_container(self.cfg.augmentor, resolve=True) + for k, v in aug_cfg.items(): + if k in EOU_PROHIBITED_AUGMENTATIONS: + logging.warning(f"EOU dataset does not support {k} augmentation, skipping.") + continue + if k in EOU_LENGTH_PERTURBATION: + len_augmentor[k] = v + else: + augmentor[k] = v + + if len(augmentor) > 0: + logging.info(f"EOU dataset will apply augmentations: {augmentor}") + self.augmentor = process_augmentations(augmentor) + if len(len_augmentor) > 0: + logging.info(f"EOU dataset will apply length augmentations: {len_augmentor}") + self.len_augmentor = process_augmentations(len_augmentor) def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: - audio, audio_lens, _ = self.load_audio(cuts) + audio, audio_lens, cuts = self.load_audio(cuts) audio_signals = [] audio_lengths = [] eou_targets = [] text_tokens = [] + for i in range(len(cuts)): - eou_targets_i = self.get_frame_labels(cuts[i], audio_lens[i]) - text_tokens_i = self.get_text_tokens(cuts[i]) - audio_i, audio_len_i, eou_targets_i = self.random_pad_audio(audio[i], audio_lens[i], eou_targets_i) + c = cuts[i] + if isinstance(c, MixedCut): + c = c.first_non_padding_cut + + audio_i = audio[i] + audio_len_i = audio_lens[i] + + # Maybe apply speed perturbation, this has to be done before getting the EOU labels + audio_i, audio_len_i = self._maybe_augment_length(audio_i, audio_len_i) + + # Get EOU labels and text tokens + eou_targets_i = self._get_frame_labels(c, audio_len_i) + text_tokens_i = self._get_text_tokens(c) + + # Maybe apply random padding to both sides of the audio + audio_i, audio_len_i, eou_targets_i = self._random_pad_audio(audio_i, audio_len_i, eou_targets_i) + + # Maybe apply augmentations to the audio signal after padding + audio_i, audio_len_i = self._maybe_augment_audio(audio_i, audio_len_i) + + # Append the processed audio, EOU labels, and text tokens to the lists audio_signals.append(audio_i) audio_lengths.append(audio_len_i) eou_targets.append(eou_targets_i) @@ -158,7 +210,7 @@ def _audio_len_to_frame_len(self, num_samples: int): hidden_length = math.ceil(mel_frame_count / self.num_mel_frame_per_target_frame) return hidden_length - def get_frame_labels(self, cut: Cut, num_samples: int): + def _get_frame_labels(self, cut: Cut, num_samples: int): hidden_length = self._audio_len_to_frame_len(num_samples) if not "sou_time" in cut.custom or not "eou_time" in cut.custom: # assume only single speech segment @@ -198,9 +250,9 @@ def get_frame_labels(self, cut: Cut, num_samples: int): if sou_time[i] is None or eou_time[i] is None or sou_time[i] < 0 or eou_time[i] 
< 0: # skip empty utterances continue - sou_idx = self._audio_len_to_frame_len(int((sou_time[i] - cut.start) * self.cfg.sample_rate)) + sou_idx = self._audio_len_to_frame_len(int((sou_time[i] - cut.start) * self.sample_rate)) seg_len_in_secs = eou_time[i] - sou_time[i] - seg_len = self._audio_len_to_frame_len(int(seg_len_in_secs * self.cfg.sample_rate)) + seg_len = self._audio_len_to_frame_len(int(seg_len_in_secs * self.sample_rate)) eou_targets[sou_idx : sou_idx + seg_len] = 1 if is_backchannel[i]: eou_targets[sou_idx + seg_len - 1] = EOB_LABEL # end of backchannel @@ -209,7 +261,7 @@ def get_frame_labels(self, cut: Cut, num_samples: int): return eou_targets - def get_text_tokens(self, cut: Cut): + def _get_text_tokens(self, cut: Cut): if not cut.has_custom("sou_time") or not cut.has_custom("eou_time") or not cut.has_custom("utterances"): # assume only single speech segment utterances = [cut.supervisions[0].text] @@ -241,7 +293,7 @@ def get_text_tokens(self, cut: Cut): total_text = total_text.strip() return torch.as_tensor(self.tokenizer(total_text)) - def random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_targets: torch.Tensor): + def _random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_targets: torch.Tensor): """ Randomly pad the audio signal with non-speech signal before and after the audio signal. Args: @@ -255,7 +307,7 @@ def random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_tar padded_eou_targets_len: torch.Tensor of padded EOU label length, shape [1] """ p = np.random.rand() - if self.padding_cfg is None or p > self.padding_cfg.padding_prob: + if self.padding_cfg is None or p > self.padding_cfg.prob: return audio, audio_len, eou_targets duration = audio_len.item() / self.cfg.sample_rate @@ -274,9 +326,7 @@ def random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_tar if self.padding_cfg.pad_distribution == 'uniform': total_padding_duration = np.random.uniform(min_padding_duration, max_padding_duration) elif self.padding_cfg.pad_distribution == 'normal': - total_padding_duration = np.random.normal( - self.padding_cfg.pad_normal_mean, self.padding_cfg.pad_normal_std - ) + total_padding_duration = np.random.normal(self.padding_cfg.normal_mean, self.padding_cfg.normal_std) total_padding_duration = max(min_padding_duration, min(max_padding_duration, total_padding_duration)) else: raise ValueError(f"Unknown padding distribution: {self.padding_cfg.pad_distribution}") @@ -301,3 +351,57 @@ def random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_tar padded_eou_targets = torch.cat((pre_padding_eou, eou_targets, post_padding_eou), dim=0) return padded_audio, padded_audio_len, padded_eou_targets + + def _maybe_augment_audio(self, audio: torch.Tensor, audio_len: torch.Tensor): + """ + Apply augmentation to the audio signal if augmentor is provided. 
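+        Note: only the first `audio_len` samples are wrapped in an AudioSegment, perturbed
+        in place by the configured augmentor, and converted back to a tensor; the returned
+        length is recomputed from the perturbed samples.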
+ Args: + audio: torch.Tensor of a single audio signal, shape [T] + audio_len: torch.Tensor of audio signal length, shape [1] + Returns: + augmented_audio: torch.Tensor of augmented audio signal, shape [T] + augmented_audio_len: torch.Tensor of augmented audio signal length, shape [1] + """ + if self.augmentor is None: + return audio, audio_len + + # Cast to AudioSegment + audio_segment = AudioSegment( + samples=audio[:audio_len].numpy(), + sample_rate=self.sample_rate, + offset=0, + duration=audio_len.item() / self.sample_rate, + ) + # Apply augmentation + self.augmentor.perturb(audio_segment) + audio = torch.from_numpy(audio_segment.samples).float() + audio_len = audio.size(0) + + return audio, audio_len + + def _maybe_augment_length(self, audio: torch.Tensor, audio_len: torch.Tensor): + """ + Apply length augmentation (e.g., speed perturb) to the audio signal if augmentor is provided. + Args: + audio: torch.Tensor of a single audio signal, shape [T] + audio_len: torch.Tensor of audio signal length, shape [1] + Returns: + augmented_audio: torch.Tensor of augmented audio signal, shape [T] + augmented_audio_len: torch.Tensor of augmented audio signal length, shape [1] + """ + if self.len_augmentor is None: + return audio, audio_len + + # Cast to AudioSegment + audio_segment = AudioSegment( + samples=audio[:audio_len].numpy(), + sample_rate=self.sample_rate, + offset=0, + duration=audio_len.item() / self.sample_rate, + ) + # Apply augmentation + self.len_augmentor.perturb(audio_segment) + audio = torch.from_numpy(audio_segment.samples).float() + audio_len = audio.size(0) + + return audio, audio_len From bc44d9a6924f5d3dbaaf7174c00f428d5b2b6451 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 15 Apr 2025 21:16:35 -0400 Subject: [PATCH 016/107] update data related functions Signed-off-by: stevehuang52 --- .../asr/data/audio_to_eou_label_lhotse.py | 10 +- nemo/collections/asr/models/asr_eou_models.py | 2 + scripts/asr_end_of_utterance/conf/data.yaml | 44 +++++ .../generate_noisy_eval_data.py | 183 ++++++++++++++++++ 4 files changed, 237 insertions(+), 2 deletions(-) create mode 100644 scripts/asr_end_of_utterance/conf/data.yaml create mode 100644 scripts/asr_end_of_utterance/generate_noisy_eval_data.py diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index cee03a075069..26427070f384 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -67,7 +67,6 @@ class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): normal_std: 2.0 # standard deviation of normal distribution for padding duration ``` - Returns: audio: torch.Tensor of audio signal audio_lens: torch.Tensor of audio signal length @@ -115,10 +114,14 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: 'text_token_lens': NeuralType(tuple('B'), LengthsType(), optional=True), } - def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, return_eou_labels: bool = False): + def __init__( + self, cfg: DictConfig, tokenizer: TokenizerSpec, return_eou_labels: bool = False, return_cuts: bool = False + ): super().__init__() self.cfg = cfg self.return_eou_labels = return_eou_labels + self.return_cuts = return_cuts + self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) self.sample_rate = self.cfg.get('sample_rate', 16000) @@ -193,6 +196,9 @@ def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: text_token_lens = 
torch.tensor([t.size(0) for t in text_tokens], dtype=torch.long) text_tokens = collate_vectors(text_tokens, padding_value=0) + if self.return_cuts: + return audio_signals, audio_lengths, cuts + if not self.return_eou_labels: return audio_signals, audio_lengths, text_tokens, text_token_lens return audio_signals, audio_lengths, eou_targets, eou_target_lens, text_tokens, text_token_lens diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 1f3f95042dff..853461988628 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -137,6 +137,7 @@ def training_step(self, batch, batch_nb): return {'loss': loss_value} def predict_step(self, batch, batch_idx, dataloader_idx=0): + # TODO: add EOU metrics signal, signal_len, transcript, transcript_len, eou_targets, eou_len = batch # forward() only performs encoder forward @@ -150,6 +151,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): return list(best_hyp_text) def validation_pass(self, batch, batch_idx, dataloader_idx=0): + # TODO: add EOU metrics signal, signal_len, transcript, transcript_len = batch # forward() only performs encoder forward diff --git a/scripts/asr_end_of_utterance/conf/data.yaml b/scripts/asr_end_of_utterance/conf/data.yaml new file mode 100644 index 000000000000..70f68b81a855 --- /dev/null +++ b/scripts/asr_end_of_utterance/conf/data.yaml @@ -0,0 +1,44 @@ + +output_dir: ??? + +data: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: 16000 + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + batch_duration: 300 # you may disable batch_duration by setting it to `null` + batch_size: null + shuffle: false + seed: 42 + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + random_padding: + prob: 0.5 + min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds + max_total_duration: 30.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + normal_mean: 0.5 # mean of normal distribution for padding duration + normal_std: 2.0 # standard deviation of normal distribution for padding duration + + augmentor: + white_noise: + prob: 0.5 + min_level: -90 + max_level: -46 + gain: + prob: 0.5 + min_gain_dbfs: -10.0 + max_gain_dbfs: 10.0 + noise: + prob: 0.6 + manifest_path: ??? + min_snr_db: 0 + max_snr_db: 20 + max_gain_db: 300.0 \ No newline at end of file diff --git a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py new file mode 100644 index 000000000000..2e9291ebc205 --- /dev/null +++ b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py @@ -0,0 +1,183 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +""" +This script is used to generate noisy evaluation data for ASR and end of utterance detection. + +Example usage: +python generate_noisy_eval_data.py \ + --config-path conf/ \ + --config-name data \ + output_dir=/path/to/output \ + data.manifest_filepath=/path/to/manifest.json \ + data.seed=42 \ + data.noise.manifest_path /path/to/noise_manifest.json + +""" + +from pathlib import Path +from shutil import rmtree + +import lightning.pytorch as pl +import numpy as np +import soundfile as sf +import torch +import yaml +from lhotse.cut import MixedCut +from omegaconf import OmegaConf, open_dict +from tqdm import tqdm + +from nemo.collections.asr.data.audio_to_eou_label_lhotse import LhotseSpeechToTextBpeEOUDataset +from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest +from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config +from nemo.collections.common.parts.preprocessing import parsers +from nemo.core.config import hydra_runner +from nemo.utils import logging + +# Dummy labels for the tokenizer +labels = [ + " ", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "'", +] + + +@hydra_runner(config_path="conf/", config_name="data") +def main(cfg): + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + # Seed everything for reproducibility + seed = cfg.data.get('seed', 42) + logging.info(f'Setting random seed to {seed}') + pl.seed_everything(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + # Make output directory + output_dir = Path(cfg.output_dir) + if output_dir.exists(): + logging.info(f'Removing existing output directory: {output_dir}') + rmtree(output_dir) + logging.info(f'Creating output directory: {output_dir}') + output_dir.mkdir(parents=True, exist_ok=True) + + # Dump the config to the output directory + config = OmegaConf.to_container(cfg, resolve=True) + with open(output_dir / 'config.yaml', 'w') as f: + yaml.dump(config, f) + logging.info(f'Config dumped to {output_dir / "config.yaml"}') + + # Load the input manifest + input_manifest = read_manifest(cfg.data.manifest_filepath) + logging.info(f'Found {len(input_manifest)} items in input manifest: {cfg.data.manifest_filepath}') + manifest_parent_dir = Path(cfg.data.manifest_filepath).parent + if Path(input_manifest[0]["audio_filepath"]).is_absolute(): + output_audio_dir = output_dir / 'wav' + flatten_audio_path = True + else: + output_audio_dir = output_dir + flatten_audio_path = False + + # Load the dataset + tokenizer = parsers.make_parser(labels) # dummy tokenizer + dataset = LhotseSpeechToTextBpeEOUDataset( + cfg=cfg.data, tokenizer=tokenizer, return_eou_labels=False, return_cuts=True + ) + + with open_dict(cfg.data): + cfg.data.force_finite = True + cfg.data.force_map_dataset = True + cfg.data.shuffle = False + + dataloader = get_lhotse_dataloader_from_config( + config=cfg.data, + global_rank=0, + world_size=1, + dataset=dataset, + tokenizer=tokenizer, + ) + + # Generate noisy evaluation data + manifest = [] + for i, batch in enumerate(tqdm(dataloader, desc="Generating noisy evaluation data")): + audio_batch, audio_len_batch, cuts_batch = batch + audio_batch = audio_batch.cpu().numpy() + audio_len_batch = audio_len_batch.cpu().numpy() + + for j in 
range(len(cuts_batch)): + cut = cuts_batch[j] + if isinstance(cut, MixedCut): + cut = cut.first_non_padding_cut + + manifest_item = {} + for k, v in cut.custom.items(): + if k == "dataloading_info": + continue + manifest_item[k] = v + + audio = audio_batch[j][: audio_len_batch[j]] + audio_file = cut.recording.sources[0].source + + if flatten_audio_path: + output_audio_file = output_audio_dir / str(audio_file).replace('/', '_') + else: + output_audio_file = output_audio_dir / Path(audio_file).relative_to(manifest_parent_dir) + + output_audio_file.parent.mkdir(parents=True, exist_ok=True) + sf.write(output_audio_file, audio, dataset.sample_rate) + + manifest_item["audio_filepath"] = str(output_audio_file) + manifest_item["offset"] = 0 + manifest_item["duration"] = audio.shape[0] / dataset.sample_rate + + manifest.append(manifest_item) + + # Write the output manifest + output_manifest_file = output_dir / Path(cfg.data.manifest_filepath).name + write_manifest(output_manifest_file, manifest) + logging.info(f'Output manifest written to {output_manifest_file}') + + +if __name__ == "__main__": + main() From b6081cffffd30b516484e28b825fb83bb4fedcae Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 21 Apr 2025 16:49:26 -0400 Subject: [PATCH 017/107] fix tokenizer with eou tokens Signed-off-by: stevehuang52 --- .../asr/data/audio_to_eou_label_lhotse.py | 25 ++++++++++++++++--- .../add_special_tokens_to_sentencepiece.py | 17 ++++++------- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 26427070f384..f7716befc162 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -31,8 +31,8 @@ EOU_LABEL = 2 EOB_LABEL = 3 -EOU_STRING = '' -EOB_STRING = '' +EOU_STRING = '' +EOB_STRING = '' EOU_LENGTH_PERTURBATION = ['speed', 'time_stretch'] @@ -121,6 +121,10 @@ def __init__( self.cfg = cfg self.return_eou_labels = return_eou_labels self.return_cuts = return_cuts + self.eou_string = self.cfg.get('eou_string', EOU_STRING) + self.eob_string = self.cfg.get('eob_string', EOB_STRING) + + self._check_special_tokens(tokenizer) self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) @@ -129,8 +133,6 @@ def __init__( self.cfg.get('window_stride', 0.01) * self.sample_rate ) # 160 samples for every 1ms by default self.num_mel_frame_per_target_frame = int(self.cfg.get('subsampling_factor', 8)) - self.eou_string = self.cfg.get('eou_string', EOU_STRING) - self.eob_string = self.cfg.get('eob_string', EOB_STRING) self.add_sep_before_eou = self.cfg.get('add_sep_before_eou', False) self.padding_cfg = self.cfg.get('random_padding', None) self.augmentor = None @@ -155,6 +157,21 @@ def __init__( logging.info(f"EOU dataset will apply length augmentations: {len_augmentor}") self.len_augmentor = process_augmentations(len_augmentor) + def _check_special_tokens(self, tokenizer: TokenizerSpec): + """ + Check if the special tokens are in the tokenizer vocab. + """ + special_tokens = set([self.eou_string, self.eob_string]) + vocab_size = tokenizer.vocab_size + special_tokens_in_vocab = set([tokenizer.ids_to_text(vocab_size - 1), tokenizer.ids_to_text(vocab_size - 2)]) + if special_tokens != special_tokens_in_vocab: + raise ValueError( + f"Input special tokens {special_tokens} don't match with the tokenizer vocab {special_tokens_in_vocab}. 
" + f"Please add them to tokenizer or change input `eou_string` and/or `eob_string` accordingly. " + "Special tokens should be added as the last two tokens in the new tokenizer. " + "Please refer to scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py for details." + ) + def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: audio, audio_lens, cuts = self.load_audio(cuts) audio_signals = [] diff --git a/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py b/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py index c9c276e99243..703ff09486a6 100644 --- a/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py +++ b/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py @@ -18,12 +18,12 @@ import logging import sys import tempfile - from argparse import ArgumentParser from pathlib import Path import sentencepiece as spm +from nemo.collections.asr.data.audio_to_eou_label_lhotse import EOB_STRING, EOU_STRING from nemo.core.connectors.save_restore_connector import SaveRestoreConnector try: @@ -32,7 +32,7 @@ raise Exception("Ensure that sentencepiece_model_pb2.py has been generated from the protoc compiler") -SPECIAL_TOKENS = ["", ""] +SPECIAL_TOKENS = [EOU_STRING, EOB_STRING] """Utility to add special tokens to existing sentencepiece models. @@ -70,11 +70,6 @@ help="Special tokens to add to tokenizer", default=SPECIAL_TOKENS, ) -parser.add_argument( - "--is_userdefined", - action="store_true", - help="When set, the new tokens are set as user_defined tokens", -) def extract_nemo_tokenizer(nemo_filepath, output_dir): @@ -153,7 +148,11 @@ def edit_spt_model(input_file, output_dir, tokens, is_userdefined): logging.info(f"Created new tokenizer vocab at: {vocab_txt_file}") -def inject_special_tokens(input_file, output_dir, tokens, is_userdefined): +def inject_special_tokens(input_file, output_dir, tokens, is_userdefined=True): + """ + NOTE: is_userdefined should be set to True in order for ASR model to work + with the new special tokens properly. + """ if not os.path.exists(input_file): raise ValueError(f"Input file {input_file} does not exist") @@ -172,4 +171,4 @@ def inject_special_tokens(input_file, output_dir, tokens, is_userdefined): if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") args = parser.parse_args() - inject_special_tokens(args.input_file, args.output_dir, args.tokens, args.is_userdefined) + inject_special_tokens(args.input_file, args.output_dir, args.tokens) From 442dfec81826ff5008fab71e3897d99e8911ee7c Mon Sep 17 00:00:00 2001 From: Weiqing Wang Date: Tue, 22 Apr 2025 17:49:10 -0700 Subject: [PATCH 018/107] adding eou force aligner Signed-off-by: Weiqing Wang --- tools/nemo_forced_aligner/align_eou.py | 386 +++++++++++++++++++++++++ 1 file changed, 386 insertions(+) create mode 100644 tools/nemo_forced_aligner/align_eou.py diff --git a/tools/nemo_forced_aligner/align_eou.py b/tools/nemo_forced_aligner/align_eou.py new file mode 100644 index 000000000000..26c83f3691c2 --- /dev/null +++ b/tools/nemo_forced_aligner/align_eou.py @@ -0,0 +1,386 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import math +import os +import shutil +from dataclasses import dataclass, field, is_dataclass +from pathlib import Path +from typing import List, Optional + +import torch +from omegaconf import OmegaConf +from utils.data_prep import ( + add_t_start_end_to_utt_obj, + get_batch_starts_ends, + get_batch_variables, + get_manifest_lines_batch, + is_entry_in_all_lines, + is_entry_in_any_lines, +) +from utils.make_ass_files import make_ass_files +from utils.make_ctm_files import make_ctm_files +from utils.make_output_manifest import write_manifest_out_line +from utils.viterbi_decoding import viterbi_decoding + +from nemo.collections.asr.models.ctc_models import EncDecCTCModel +from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel +from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR +from nemo.collections.asr.parts.utils.transcribe_utils import setup_model +from nemo.core.config import hydra_runner +from nemo.utils import logging + +""" +Align the utterances in manifest_filepath. +Results are saved in ctm files in output_dir. + +Arguments: + pretrained_name: string specifying the name of a CTC NeMo ASR model which will be automatically downloaded + from NGC and used for generating the log-probs which we will use to do alignment. + Note: NFA can only use CTC models (not Transducer models) at the moment. + model_path: string specifying the local filepath to a CTC NeMo ASR model which will be used to generate the + log-probs which we will use to do alignment. + Note: NFA can only use CTC models (not Transducer models) at the moment. + Note: if a model_path is provided, it will override the pretrained_name. + manifest_filepath: filepath to the manifest of the data you want to align, + containing 'audio_filepath' and 'text' fields. + output_dir: the folder where output CTM files and new JSON manifest will be saved. + align_using_pred_text: if True, will transcribe the audio using the specified model and then use that transcription + as the reference text for the forced alignment. + transcribe_device: None, or a string specifying the device that will be used for generating log-probs (i.e. "transcribing"). + The string needs to be in a format recognized by torch.device(). If None, NFA will set it to 'cuda' if it is available + (otherwise will set it to 'cpu'). + viterbi_device: None, or string specifying the device that will be used for doing Viterbi decoding. + The string needs to be in a format recognized by torch.device(). If None, NFA will set it to 'cuda' if it is available + (otherwise will set it to 'cpu'). + batch_size: int specifying batch size that will be used for generating log-probs and doing Viterbi decoding. + use_local_attention: boolean flag specifying whether to try to use local attention for the ASR Model (will only + work if the ASR Model is a Conformer model). If local attention is used, we will set the local attention context + size to [64,64]. + additional_segment_grouping_separator: an optional string used to separate the text into smaller segments. 
+ If this is not specified, then the whole text will be treated as a single segment. + remove_blank_tokens_from_ctm: a boolean denoting whether to remove tokens from token-level output CTMs. + audio_filepath_parts_in_utt_id: int specifying how many of the 'parts' of the audio_filepath + we will use (starting from the final part of the audio_filepath) to determine the + utt_id that will be used in the CTM files. Note also that any spaces that are present in the audio_filepath + will be replaced with dashes, so as not to change the number of space-separated elements in the + CTM files. + e.g. if audio_filepath is "/a/b/c/d/e 1.wav" and audio_filepath_parts_in_utt_id is 1 => utt_id will be "e1" + e.g. if audio_filepath is "/a/b/c/d/e 1.wav" and audio_filepath_parts_in_utt_id is 2 => utt_id will be "d_e1" + e.g. if audio_filepath is "/a/b/c/d/e 1.wav" and audio_filepath_parts_in_utt_id is 3 => utt_id will be "c_d_e1" + use_buffered_infer: False, if set True, using streaming to do get the logits for alignment + This flag is useful when aligning large audio file. + However, currently the chunk streaming inference does not support batch inference, + which means even you set batch_size > 1, it will only infer one by one instead of doing + the whole batch inference together. + chunk_len_in_secs: float chunk length in seconds + total_buffer_in_secs: float Length of buffer (chunk + left and right padding) in seconds + chunk_batch_size: int batch size for buffered chunk inference, + which will cut one audio into segments and do inference on chunk_batch_size segments at a time + + simulate_cache_aware_streaming: False, if set True, using cache aware streaming to do get the logits for alignment + + save_output_file_formats: List of strings specifying what type of output files to save (default: ["ctm", "ass"]) + ctm_file_config: CTMFileConfig to specify the configuration of the output CTM files + ass_file_config: ASSFileConfig to specify the configuration of the output ASS files +""" + + +@dataclass +class CTMFileConfig: + remove_blank_tokens: bool = False + # minimum duration (in seconds) for timestamps in the CTM.If any line in the CTM has a + # duration lower than this, it will be enlarged from the middle outwards until it + # meets the minimum_timestamp_duration, or reaches the beginning or end of the audio file. + # Note that this may cause timestamps to overlap. 
+ minimum_timestamp_duration: float = 0 + + +@dataclass +class ASSFileConfig: + fontsize: int = 20 + vertical_alignment: str = "center" + # if resegment_text_to_fill_space is True, the ASS files will use new segments + # such that each segment will not take up more than (approximately) max_lines_per_segment + # when the ASS file is applied to a video + resegment_text_to_fill_space: bool = False + max_lines_per_segment: int = 2 + text_already_spoken_rgb: List[int] = field(default_factory=lambda: [49, 46, 61]) # dark gray + text_being_spoken_rgb: List[int] = field(default_factory=lambda: [57, 171, 9]) # dark green + text_not_yet_spoken_rgb: List[int] = field(default_factory=lambda: [194, 193, 199]) # light gray + + +@dataclass +class AlignmentConfig: + # Required configs + pretrained_name: Optional[str] = None + model_path: Optional[str] = None + manifest_filepath: Optional[str] = None + output_dir: Optional[str] = '.tmp' # set it to .tmp and will be removed after alignment + output_manifest_filepath: Optional[str] = None # only need this file to save sou and eou time + + # General configs + align_using_pred_text: bool = False + transcribe_device: Optional[str] = None + viterbi_device: Optional[str] = None + batch_size: int = 1 + use_local_attention: bool = True + additional_segment_grouping_separator: Optional[str] = None + audio_filepath_parts_in_utt_id: int = 1 + + # Buffered chunked streaming configs + use_buffered_chunked_streaming: bool = False + chunk_len_in_secs: float = 1.6 + total_buffer_in_secs: float = 4.0 + chunk_batch_size: int = 32 + + # Cache aware streaming configs + simulate_cache_aware_streaming: Optional[bool] = False + + # Output file configs + save_output_file_formats: List[str] = field(default_factory=lambda: ["ctm", "ass"]) + ctm_file_config: CTMFileConfig = field(default_factory=lambda: CTMFileConfig()) + ass_file_config: ASSFileConfig = field(default_factory=lambda: ASSFileConfig()) + + # remove tmp dir after alignment + remove_tmp_dir: bool = False + + +@hydra_runner(config_name="AlignmentConfig", schema=AlignmentConfig) +def main(cfg: AlignmentConfig): + + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + if is_dataclass(cfg): + cfg = OmegaConf.structured(cfg) + + # Validate config + if cfg.model_path is None and cfg.pretrained_name is None: + raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None") + + if cfg.model_path is not None and cfg.pretrained_name is not None: + raise ValueError("One of cfg.model_path and cfg.pretrained_name must be None") + + if cfg.manifest_filepath is None: + raise ValueError("cfg.manifest_filepath must be specified") + + if cfg.output_dir is None: + raise ValueError("cfg.output_dir must be specified") + + if cfg.batch_size < 1: + raise ValueError("cfg.batch_size cannot be zero or a negative number") + + if cfg.additional_segment_grouping_separator == "" or cfg.additional_segment_grouping_separator == " ": + raise ValueError("cfg.additional_grouping_separator cannot be empty string or space character") + + if cfg.ctm_file_config.minimum_timestamp_duration < 0: + raise ValueError("cfg.minimum_timestamp_duration cannot be a negative number") + + if cfg.ass_file_config.vertical_alignment not in ["top", "center", "bottom"]: + raise ValueError("cfg.ass_file_config.vertical_alignment must be one of 'top', 'center' or 'bottom'") + + for rgb_list in [ + cfg.ass_file_config.text_already_spoken_rgb, + cfg.ass_file_config.text_already_spoken_rgb, + cfg.ass_file_config.text_already_spoken_rgb, + ]: + if 
len(rgb_list) != 3: + raise ValueError( + "cfg.ass_file_config.text_already_spoken_rgb," + " cfg.ass_file_config.text_being_spoken_rgb," + " and cfg.ass_file_config.text_already_spoken_rgb all need to contain" + " exactly 3 elements." + ) + + # Validate manifest contents + if not is_entry_in_all_lines(cfg.manifest_filepath, "audio_filepath"): + raise RuntimeError( + "At least one line in cfg.manifest_filepath does not contain an 'audio_filepath' entry. " + "All lines must contain an 'audio_filepath' entry." + ) + + if cfg.align_using_pred_text: + if is_entry_in_any_lines(cfg.manifest_filepath, "pred_text"): + raise RuntimeError( + "Cannot specify cfg.align_using_pred_text=True when the manifest at cfg.manifest_filepath " + "contains 'pred_text' entries. This is because the audio will be transcribed and may produce " + "a different 'pred_text'. This may cause confusion." + ) + else: + if not is_entry_in_all_lines(cfg.manifest_filepath, "text"): + raise RuntimeError( + "At least one line in cfg.manifest_filepath does not contain a 'text' entry. " + "NFA requires all lines to contain a 'text' entry when cfg.align_using_pred_text=False." + ) + + # init devices + if cfg.transcribe_device is None: + transcribe_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + else: + transcribe_device = torch.device(cfg.transcribe_device) + logging.info(f"Device to be used for transcription step (`transcribe_device`) is {transcribe_device}") + + if cfg.viterbi_device is None: + viterbi_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + else: + viterbi_device = torch.device(cfg.viterbi_device) + logging.info(f"Device to be used for viterbi step (`viterbi_device`) is {viterbi_device}") + + if transcribe_device.type == 'cuda' or viterbi_device.type == 'cuda': + logging.warning( + 'One or both of transcribe_device and viterbi_device are GPUs. If you run into OOM errors ' + 'it may help to change both devices to be the CPU.' + ) + + # load model + model, _ = setup_model(cfg, transcribe_device) + model.eval() + + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(decoder_type="ctc") + + if cfg.use_local_attention: + logging.info( + "Flag use_local_attention is set to True => will try to use local attention for model if it allows it" + ) + model.change_attention_model(self_attention_model="rel_pos_local_attn", att_context_size=[64, 64]) + + if not (isinstance(model, EncDecCTCModel) or isinstance(model, EncDecHybridRNNTCTCModel)): + raise NotImplementedError( + f"Model is not an instance of NeMo EncDecCTCModel or ENCDecHybridRNNTCTCModel." + " Currently only instances of these models are supported" + ) + + if cfg.ctm_file_config.minimum_timestamp_duration > 0: + logging.warning( + f"cfg.ctm_file_config.minimum_timestamp_duration has been set to {cfg.ctm_file_config.minimum_timestamp_duration} seconds. " + "This may cause the alignments for some tokens/words/additional segments to be overlapping." 
+ ) + + buffered_chunk_params = {} + if cfg.use_buffered_chunked_streaming: + model_cfg = copy.deepcopy(model._cfg) + + OmegaConf.set_struct(model_cfg.preprocessor, False) + # some changes for streaming scenario + model_cfg.preprocessor.dither = 0.0 + model_cfg.preprocessor.pad_to = 0 + + if model_cfg.preprocessor.normalize != "per_feature": + logging.error( + "Only EncDecCTCModelBPE models trained with per_feature normalization are supported currently" + ) + # Disable config overwriting + OmegaConf.set_struct(model_cfg.preprocessor, True) + + feature_stride = model_cfg.preprocessor['window_stride'] + model_stride_in_secs = feature_stride * cfg.model_downsample_factor + total_buffer = cfg.total_buffer_in_secs + chunk_len = float(cfg.chunk_len_in_secs) + tokens_per_chunk = math.ceil(chunk_len / model_stride_in_secs) + mid_delay = math.ceil((chunk_len + (total_buffer - chunk_len) / 2) / model_stride_in_secs) + logging.info(f"tokens_per_chunk is {tokens_per_chunk}, mid_delay is {mid_delay}") + + model = FrameBatchASR( + asr_model=model, + frame_len=chunk_len, + total_buffer=cfg.total_buffer_in_secs, + batch_size=cfg.chunk_batch_size, + ) + buffered_chunk_params = { + "delay": mid_delay, + "model_stride_in_secs": model_stride_in_secs, + "tokens_per_chunk": tokens_per_chunk, + } + # get start and end line IDs of batches + starts, ends = get_batch_starts_ends(cfg.manifest_filepath, cfg.batch_size) + + # init output_timestep_duration = None and we will calculate and update it during the first batch + output_timestep_duration = None + + # init f_manifest_out + os.makedirs(cfg.output_dir, exist_ok=True) + tgt_manifest_name = str(Path(cfg.manifest_filepath).stem) + "_with_output_file_paths.json" + tgt_manifest_filepath = str(Path(cfg.output_dir) / tgt_manifest_name) + f_manifest_out = open(tgt_manifest_filepath, 'w') + + # get alignment and save in CTM batch-by-batch + for start, end in zip(starts, ends): + manifest_lines_batch = get_manifest_lines_batch(cfg.manifest_filepath, start, end) + + (log_probs_batch, y_batch, T_batch, U_batch, utt_obj_batch, output_timestep_duration,) = get_batch_variables( + manifest_lines_batch, + model, + cfg.additional_segment_grouping_separator, + cfg.align_using_pred_text, + cfg.audio_filepath_parts_in_utt_id, + output_timestep_duration, + cfg.simulate_cache_aware_streaming, + cfg.use_buffered_chunked_streaming, + buffered_chunk_params, + ) + + alignments_batch = viterbi_decoding(log_probs_batch, y_batch, T_batch, U_batch, viterbi_device) + + for utt_obj, alignment_utt in zip(utt_obj_batch, alignments_batch): + + utt_obj = add_t_start_end_to_utt_obj(utt_obj, alignment_utt, output_timestep_duration) + + if "ctm" in cfg.save_output_file_formats: + utt_obj = make_ctm_files(utt_obj, cfg.output_dir, cfg.ctm_file_config,) + + if "ass" in cfg.save_output_file_formats: + utt_obj = make_ass_files(utt_obj, cfg.output_dir, cfg.ass_file_config) + + write_manifest_out_line( + f_manifest_out, utt_obj, + ) + + f_manifest_out.close() + + # adding eou processing here + input_manifest_lines = [json.loads(line) for line in open(cfg.manifest_filepath)] + with open(tgt_manifest_filepath, 'r') as f: + for i, line in enumerate(f.readlines()): + item = json.loads(line) + assert os.path.basename(input_manifest_lines[i]['audio_filepath']) == os.path.basename(item['audio_filepath']) + + # get sou/eou time + lines = [line.split() for line in open(item['segments_level_ctm_filepath'])] + start_time = min([float(line[2]) for line in lines]) + end_time = max([float(line[2]) + float(line[3]) for 
line in lines]) + input_manifest_lines[i]['sou_time'] = start_time + input_manifest_lines[i]['eou_time'] = end_time + + with open(cfg.output_manifest_filepath, 'w') as f: + for item in input_manifest_lines: + f.write(json.dumps(item) + '\n') + + if cfg.remove_tmp_dir: # savely removing tmp dir after alignment + for file_or_folder in [tgt_manifest_filepath, os.path.join(cfg.output_dir, 'ctm'), os.path.join(cfg.output_dir, 'ass')]: + if os.path.exists(file_or_folder): + if os.path.isfile(file_or_folder): + os.remove(file_or_folder) + else: + shutil.rmtree(file_or_folder) + if os.path.exists(cfg.output_dir) and len(os.listdir(cfg.output_dir)) == 0: + shutil.rmtree(cfg.output_dir) + + return None + + +if __name__ == "__main__": + main() From 13bdc042e57a4cc5b6b8d33b07486c8afa9a6718 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 23 Apr 2025 15:56:18 -0400 Subject: [PATCH 019/107] update for eou Signed-off-by: stevehuang52 --- .../asr/asr_eou/speech_to_text_rnnt_eou.py | 9 +++- ...astconformer_transducer_bpe_streaming.yaml | 4 +- .../asr/data/audio_to_eou_label_lhotse.py | 2 +- nemo/collections/asr/models/asr_eou_models.py | 4 +- .../add_special_tokens_to_sentencepiece.py | 49 ++++++++++++------- 5 files changed, 43 insertions(+), 25 deletions(-) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou.py index f78835250783..e9e1cd162b94 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou.py @@ -123,6 +123,7 @@ def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_pat # Load encoder state dict into the model model.encoder.load_state_dict(pretrained_model.encoder.state_dict(), strict=True) + logging.info(f"Encoder weights loaded from {pretrained_model_path}.") # Load decoder state dict into the model decoder = model.decoder # type: RNNTDecoder @@ -146,6 +147,7 @@ def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_pat decoder_embed_states[:-3, :] = pretrained_decoder_embed_states[:-1, :] # everything except EOU, EOB and blank decoder_embed_states[-1, :] = pretrained_decoder_embed_states[-1, :] # blank class decoder.prediction["embed"].load_state_dict({"weight": decoder_embed_states}, strict=True) + logging.info(f"Decoder weights loaded from {pretrained_model_path}.") # Load joint network weights if new model's joint network has two more classes than the pretrained model joint_network = model.joint # type: RNNTJoint @@ -158,10 +160,9 @@ def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_pat joint_network.enc.load_state_dict(pretrained_joint_network.enc.state_dict(), strict=True) if joint_network.num_classes_with_blank != pretrained_joint_network.num_classes_with_blank + 2: - logging.info( + raise ValueError( f"Size mismatched between pretrained ({pretrained_joint_network.num_classes_with_blank}+2) and current model ({joint_network.num_classes_with_blank}), skip loading joint network." 
) - return # Load the joint network weights pretrained_joint_state = pretrained_joint_network.joint_net.state_dict() @@ -173,9 +174,13 @@ def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_pat # shape: [num_classes+2, hid_dim] joint_state['2.weight'][:-3, :] = pretrained_joint_clf_weight[:-1, :] # everything except EOU, EOB and blank joint_state['2.weight'][-1, :] = pretrained_joint_clf_weight[-1, :] # blank class + joint_state['2.weight'][-2, :] = 0.0001 # EOB class + joint_state['2.weight'][-3, :] = 0.0001 # EOU class if pretrained_joint_clf_bias is not None and '2.bias' in joint_state: joint_state['2.bias'][:-3] = pretrained_joint_clf_bias[:-1] # everything except EOU, EOB and blank joint_state['2.bias'][-1] = pretrained_joint_clf_bias[-1] # blank class + joint_state['2.bias'][-2] = -1000.0 # EOB class + joint_state['2.bias'][-3] = -1000.0 # EOU class # Load the joint network weights joint_network.joint_net.load_state_dict(joint_state, strict=True) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 71094058583e..4c8539d9d3a4 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -299,8 +299,8 @@ exp_manager: save_top_k: 5 always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - resume_if_exists: false - resume_ignore_no_checkpoint: false + resume_if_exists: true + resume_ignore_no_checkpoint: true create_wandb_logger: false wandb_logger_kwargs: diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index f7716befc162..765747fc02fb 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -218,7 +218,7 @@ def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: if not self.return_eou_labels: return audio_signals, audio_lengths, text_tokens, text_token_lens - return audio_signals, audio_lengths, eou_targets, eou_target_lens, text_tokens, text_token_lens + return audio_signals, audio_lengths, text_tokens, text_token_lens, eou_targets, eou_target_lens def _audio_len_to_frame_len(self, num_samples: int): """ diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 853461988628..a5e0354cbe9f 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -138,7 +138,7 @@ def training_step(self, batch, batch_nb): def predict_step(self, batch, batch_idx, dataloader_idx=0): # TODO: add EOU metrics - signal, signal_len, transcript, transcript_len, eou_targets, eou_len = batch + signal, signal_len, transcript, transcript_len, eou_target, eou_len = batch # forward() only performs encoder forward encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) @@ -152,7 +152,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): def validation_pass(self, batch, batch_idx, dataloader_idx=0): # TODO: add EOU metrics - signal, signal_len, transcript, transcript_len = batch + signal, signal_len, transcript, transcript_len, eou_target, eou_len = batch # forward() only performs encoder 
forward encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) diff --git a/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py b/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py index 703ff09486a6..f2c95c20bd4a 100644 --- a/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py +++ b/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py @@ -70,6 +70,11 @@ help="Special tokens to add to tokenizer", default=SPECIAL_TOKENS, ) +parser.add_argument( + "--extract_only", + action="store_true", + help="Extract tokenizer without adding special tokens", +) def extract_nemo_tokenizer(nemo_filepath, output_dir): @@ -84,7 +89,10 @@ def extract_nemo_tokenizer(nemo_filepath, output_dir): return str(tokenizer.absolute()) -def edit_spt_model(input_file, output_dir, tokens, is_userdefined): +def edit_spt_model(input_file, output_dir, tokens, is_userdefined, extract_only=False): + if extract_only: + logging.info("Extracting tokenizer only, no special tokens will be added.") + output_dir = Path(output_dir) if output_dir.exists(): @@ -101,23 +109,28 @@ def edit_spt_model(input_file, output_dir, tokens, is_userdefined): model = spt.ModelProto() model.ParseFromString(open(input_file, 'rb').read()) - for token in tokens: - piece = model.SentencePiece(piece=token, score=0.0, type=token_type) - if piece in model.pieces: - logging.error(f"Special Token '{token}' already exists in the input model!") - sys.exit(1) - model.pieces.append(piece) + if not extract_only: + for token in tokens: + piece = model.SentencePiece(piece=token, score=0.0, type=token_type) + if piece in model.pieces: + logging.error(f"Special Token '{token}' already exists in the input model!") + sys.exit(1) + model.pieces.append(piece) sp = spm.SentencePieceProcessor() sp.LoadFromSerializedProto(model.SerializeToString()) - try: - for token in tokens: - id = sp.piece_to_id(token) - logging.info(f"Created token '{token}' at ID {id}") - logging.info(f"New tokenizer vocab size: {sp.get_piece_size()}") - except: - logging.error("Could not appropriately configure new tokenizer. Verify if the special tokens already exist.") - sys.exit(1) + + if not extract_only: + try: + for token in tokens: + id = sp.piece_to_id(token) + logging.info(f"Created token '{token}' at ID {id}") + logging.info(f"New tokenizer vocab size: {sp.get_piece_size()}") + except: + logging.error( + "Could not appropriately configure new tokenizer. Verify if the special tokens already exist." + ) + sys.exit(1) with open(output_file, 'wb') as outf: outf.write(model.SerializeToString()) @@ -148,7 +161,7 @@ def edit_spt_model(input_file, output_dir, tokens, is_userdefined): logging.info(f"Created new tokenizer vocab at: {vocab_txt_file}") -def inject_special_tokens(input_file, output_dir, tokens, is_userdefined=True): +def inject_special_tokens(input_file, output_dir, tokens, is_userdefined=True, extract_only=False): """ NOTE: is_userdefined should be set to True in order for ASR model to work with the new special tokens properly. 
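For reference, a minimal invocation of this utility might look as follows; the flag spellings
`--input_file` and `--output_dir` are inferred from the parsed attribute names used in the
`__main__` block, and all paths are placeholders:

```sh
# Append the EOU/EOB special tokens as user-defined pieces to an existing SentencePiece model
python scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py \
    --input_file /path/to/tokenizer.model \
    --output_dir /path/to/tokenizer_with_eou

# Copy the tokenizer artifacts without adding any special tokens
python scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py \
    --input_file /path/to/tokenizer.model \
    --output_dir /path/to/tokenizer_copy \
    --extract_only
```

The new pieces are appended at the end of the vocabulary, which is exactly what the EOU
dataset's `_check_special_tokens` expects (it looks for them in the last two vocabulary slots).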
@@ -165,10 +178,10 @@ def inject_special_tokens(input_file, output_dir, tokens, is_userdefined=True): input_file = os.path.abspath(input_file) logging.info(f"Using input file: {input_file}") - edit_spt_model(input_file, output_dir, tokens, is_userdefined) + edit_spt_model(input_file, output_dir, tokens, is_userdefined, extract_only=extract_only) if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") args = parser.parse_args() - inject_special_tokens(args.input_file, args.output_dir, args.tokens) + inject_special_tokens(args.input_file, args.output_dir, args.tokens, extract_only=args.extract_only) From 813be94509119c0617f9cb266bcaf740c5511b7e Mon Sep 17 00:00:00 2001 From: Weiqing Wang Date: Wed, 23 Apr 2025 14:52:01 -0700 Subject: [PATCH 020/107] fix the case when 'segments_level_ctm_filepath' is not produced Signed-off-by: Weiqing Wang --- tools/nemo_forced_aligner/align_eou.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/nemo_forced_aligner/align_eou.py b/tools/nemo_forced_aligner/align_eou.py index 26c83f3691c2..e8bec17e03b8 100644 --- a/tools/nemo_forced_aligner/align_eou.py +++ b/tools/nemo_forced_aligner/align_eou.py @@ -353,20 +353,23 @@ def main(cfg: AlignmentConfig): # adding eou processing here input_manifest_lines = [json.loads(line) for line in open(cfg.manifest_filepath)] + output_manifest_lines = [] with open(tgt_manifest_filepath, 'r') as f: for i, line in enumerate(f.readlines()): item = json.loads(line) assert os.path.basename(input_manifest_lines[i]['audio_filepath']) == os.path.basename(item['audio_filepath']) # get sou/eou time - lines = [line.split() for line in open(item['segments_level_ctm_filepath'])] - start_time = min([float(line[2]) for line in lines]) - end_time = max([float(line[2]) + float(line[3]) for line in lines]) - input_manifest_lines[i]['sou_time'] = start_time - input_manifest_lines[i]['eou_time'] = end_time + if 'segments_level_ctm_filepath' in item: + lines = [line.split() for line in open(item['segments_level_ctm_filepath'])] + start_time = min([float(line[2]) for line in lines]) + end_time = max([float(line[2]) + float(line[3]) for line in lines]) + input_manifest_lines[i]['sou_time'] = start_time + input_manifest_lines[i]['eou_time'] = end_time + output_manifest_lines.append(input_manifest_lines[i]) with open(cfg.output_manifest_filepath, 'w') as f: - for item in input_manifest_lines: + for item in output_manifest_lines: f.write(json.dumps(item) + '\n') if cfg.remove_tmp_dir: # savely removing tmp dir after alignment From e9cf11ab13f5336b5a37d5f155a70e1113b4beee Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 24 Apr 2025 10:05:46 -0400 Subject: [PATCH 021/107] fix force aligner Signed-off-by: stevehuang52 --- tools/nemo_forced_aligner/align_eou.py | 46 +++++++++++++++----- tools/nemo_forced_aligner/utils/data_prep.py | 37 +++++++++++++--- 2 files changed, 66 insertions(+), 17 deletions(-) diff --git a/tools/nemo_forced_aligner/align_eou.py b/tools/nemo_forced_aligner/align_eou.py index 26c83f3691c2..c7dedfce595c 100644 --- a/tools/nemo_forced_aligner/align_eou.py +++ b/tools/nemo_forced_aligner/align_eou.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -129,8 +129,8 @@ class AlignmentConfig: pretrained_name: Optional[str] = None model_path: Optional[str] = None manifest_filepath: Optional[str] = None - output_dir: Optional[str] = '.tmp' # set it to .tmp and will be removed after alignment - output_manifest_filepath: Optional[str] = None # only need this file to save sou and eou time + output_dir: Optional[str] = '.tmp' # set it to .tmp and will be removed after alignment + output_manifest_filepath: Optional[str] = None # only need this file to save sou and eou time # General configs align_using_pred_text: bool = False @@ -321,7 +321,14 @@ def main(cfg: AlignmentConfig): for start, end in zip(starts, ends): manifest_lines_batch = get_manifest_lines_batch(cfg.manifest_filepath, start, end) - (log_probs_batch, y_batch, T_batch, U_batch, utt_obj_batch, output_timestep_duration,) = get_batch_variables( + ( + log_probs_batch, + y_batch, + T_batch, + U_batch, + utt_obj_batch, + output_timestep_duration, + ) = get_batch_variables( manifest_lines_batch, model, cfg.additional_segment_grouping_separator, @@ -340,15 +347,20 @@ def main(cfg: AlignmentConfig): utt_obj = add_t_start_end_to_utt_obj(utt_obj, alignment_utt, output_timestep_duration) if "ctm" in cfg.save_output_file_formats: - utt_obj = make_ctm_files(utt_obj, cfg.output_dir, cfg.ctm_file_config,) + utt_obj = make_ctm_files( + utt_obj, + cfg.output_dir, + cfg.ctm_file_config, + ) if "ass" in cfg.save_output_file_formats: utt_obj = make_ass_files(utt_obj, cfg.output_dir, cfg.ass_file_config) write_manifest_out_line( - f_manifest_out, utt_obj, + f_manifest_out, + utt_obj, ) - + f_manifest_out.close() # adding eou processing here @@ -356,7 +368,15 @@ def main(cfg: AlignmentConfig): with open(tgt_manifest_filepath, 'r') as f: for i, line in enumerate(f.readlines()): item = json.loads(line) - assert os.path.basename(input_manifest_lines[i]['audio_filepath']) == os.path.basename(item['audio_filepath']) + assert os.path.basename(input_manifest_lines[i]['audio_filepath']) == os.path.basename( + item['audio_filepath'] + ) + + if 'segments_level_ctm_filepath' not in item: + print( + f"`segments_level_ctm_filepath` not found for {input_manifest_lines[i]['audio_filepath']}, skipping" + ) + continue # get sou/eou time lines = [line.split() for line in open(item['segments_level_ctm_filepath'])] @@ -368,9 +388,13 @@ def main(cfg: AlignmentConfig): with open(cfg.output_manifest_filepath, 'w') as f: for item in input_manifest_lines: f.write(json.dumps(item) + '\n') - - if cfg.remove_tmp_dir: # savely removing tmp dir after alignment - for file_or_folder in [tgt_manifest_filepath, os.path.join(cfg.output_dir, 'ctm'), os.path.join(cfg.output_dir, 'ass')]: + + if cfg.remove_tmp_dir: # savely removing tmp dir after alignment + for file_or_folder in [ + tgt_manifest_filepath, + os.path.join(cfg.output_dir, 'ctm'), + os.path.join(cfg.output_dir, 'ass'), + ]: if os.path.exists(file_or_folder): if os.path.isfile(file_or_folder): os.remove(file_or_folder) diff --git a/tools/nemo_forced_aligner/utils/data_prep.py b/tools/nemo_forced_aligner/utils/data_prep.py index c5ee74a13b44..7431dd9c6288 100644 --- a/tools/nemo_forced_aligner/utils/data_prep.py +++ b/tools/nemo_forced_aligner/utils/data_prep.py @@ -22,6 +22,7 @@ from tqdm.auto import tqdm from utils.constants import BLANK_TOKEN, SPACE_TOKEN, V_NEGATIVE_NUM +from nemo.collections.common.parts.preprocessing.manifest import get_full_path from nemo.utils import logging @@ -85,6 +86,7 @@ def get_manifest_lines_batch(manifest_filepath, start, end): for line_i, line 
in enumerate(f): if line_i >= start and line_i <= end: data = json.loads(line) + data["audio_filepath"] = get_full_path(data["audio_filepath"], manifest_filepath) if "text" in data: # remove any BOM, any duplicated spaces, convert any # newline chars to spaces @@ -231,7 +233,12 @@ class Utterance: def get_utt_obj( - text, model, separator, T, audio_filepath, utt_id, + text, + model, + separator, + T, + audio_filepath, + utt_id, ): """ Function to create an Utterance object and add all necessary information to it except @@ -258,7 +265,11 @@ def get_utt_obj( # remove any empty segments segments = [seg for seg in segments if len(seg) > 0] - utt = Utterance(text=text, audio_filepath=audio_filepath, utt_id=utt_id,) + utt = Utterance( + text=text, + audio_filepath=audio_filepath, + utt_id=utt_id, + ) # build up lists: token_ids_with_blanks, segments_and_tokens. # The code for these is different depending on whether we use char-based tokens or not @@ -289,7 +300,14 @@ def get_utt_obj( return utt # build up data structures containing segments/words/tokens - utt.segments_and_tokens.append(Token(text=BLANK_TOKEN, text_cased=BLANK_TOKEN, s_start=0, s_end=0,)) + utt.segments_and_tokens.append( + Token( + text=BLANK_TOKEN, + text_cased=BLANK_TOKEN, + s_start=0, + s_end=0, + ) + ) segment_s_pointer = 1 # first segment will start at s=1 because s=0 is a blank word_s_pointer = 1 # first word will start at s=1 because s=0 is a blank @@ -422,7 +440,14 @@ def get_utt_obj( return utt # build up data structures containing segments/words/tokens - utt.segments_and_tokens.append(Token(text=BLANK_TOKEN, text_cased=BLANK_TOKEN, s_start=0, s_end=0,)) + utt.segments_and_tokens.append( + Token( + text=BLANK_TOKEN, + text_cased=BLANK_TOKEN, + s_start=0, + s_end=0, + ) + ) segment_s_pointer = 1 # first segment will start at s=1 because s=0 is a blank word_s_pointer = 1 # first word will start at s=1 because s=0 is a blank @@ -589,9 +614,9 @@ def add_t_start_end_to_utt_obj(utt_obj, alignment_utt, output_timestep_duration) """ Function to add t_start and t_end (representing time in seconds) to the Utterance object utt_obj. Args: - utt_obj: Utterance object to which we will add t_start and t_end for its + utt_obj: Utterance object to which we will add t_start and t_end for its constituent segments/words/tokens. - alignment_utt: a list of ints indicating which token does the alignment pass through at each + alignment_utt: a list of ints indicating which token does the alignment pass through at each timestep (will take the form [0, 0, 1, 1, ..., ]). output_timestep_duration: a float indicating the duration of a single output timestep from the ASR Model. From fb4a8155a6c13a8a8c2caaff6e23aed2621d5f1b Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 24 Apr 2025 14:40:02 -0400 Subject: [PATCH 022/107] fix aligner Signed-off-by: stevehuang52 --- tools/nemo_forced_aligner/align_eou.py | 109 ++++++++++++++++++------- 1 file changed, 78 insertions(+), 31 deletions(-) diff --git a/tools/nemo_forced_aligner/align_eou.py b/tools/nemo_forced_aligner/align_eou.py index ebc041234287..042bc30a6f99 100644 --- a/tools/nemo_forced_aligner/align_eou.py +++ b/tools/nemo_forced_aligner/align_eou.py @@ -45,7 +45,9 @@ """ Align the utterances in manifest_filepath. -Results are saved in ctm files in output_dir. +Results are saved in ctm files in output_dir as well as json manifest in output_manifest_filepath. +If no output_manifest_filepath is specified, it will save the results in the same parent directory as +the input manifest_filepath. 
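For example, a single-manifest alignment run could be launched as below; the overrides use
Hydra key=value syntax because the script is wrapped with `hydra_runner` over a structured
`AlignmentConfig`, and the checkpoint name and all paths are placeholders (any CTC model
accepted by `setup_model` works):

```sh
python tools/nemo_forced_aligner/align_eou.py \
    pretrained_name="stt_en_fastconformer_ctc_large" \
    manifest_filepath=/path/to/manifest.json \
    output_manifest_filepath=/path/to/manifest-aligned.json \
    remove_tmp_dir=true
```

Each entry of the output manifest then carries `sou_time` and `eou_time` fields derived from
the segment-level CTM produced during alignment.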
Arguments: pretrained_name: string specifying the name of a CTC NeMo ASR model which will be automatically downloaded @@ -58,6 +60,9 @@ manifest_filepath: filepath to the manifest of the data you want to align, containing 'audio_filepath' and 'text' fields. output_dir: the folder where output CTM files and new JSON manifest will be saved. + output_manifest_filepath: Optional[str] = None # output of manfiest with sou_time and eou_time + manifest_pattern: Optional[str] = None # pattern used in Path.glob() for finding manifests + align_using_pred_text: if True, will transcribe the audio using the specified model and then use that transcription as the reference text for the forced alignment. transcribe_device: None, or a string specifying the device that will be used for generating log-probs (i.e. "transcribing"). @@ -128,9 +133,10 @@ class AlignmentConfig: # Required configs pretrained_name: Optional[str] = None model_path: Optional[str] = None - manifest_filepath: Optional[str] = None + manifest_filepath: Optional[str] = None # path to manifest file or directory output_dir: Optional[str] = '.tmp' # set it to .tmp and will be removed after alignment - output_manifest_filepath: Optional[str] = None # only need this file to save sou and eou time + output_manifest_filepath: Optional[str] = None # output of manfiest with sou_time and eou_time + manifest_pattern: Optional[str] = None # pattern used in Path.glob() for finding manifests # General configs align_using_pred_text: bool = False @@ -205,27 +211,6 @@ def main(cfg: AlignmentConfig): " exactly 3 elements." ) - # Validate manifest contents - if not is_entry_in_all_lines(cfg.manifest_filepath, "audio_filepath"): - raise RuntimeError( - "At least one line in cfg.manifest_filepath does not contain an 'audio_filepath' entry. " - "All lines must contain an 'audio_filepath' entry." - ) - - if cfg.align_using_pred_text: - if is_entry_in_any_lines(cfg.manifest_filepath, "pred_text"): - raise RuntimeError( - "Cannot specify cfg.align_using_pred_text=True when the manifest at cfg.manifest_filepath " - "contains 'pred_text' entries. This is because the audio will be transcribed and may produce " - "a different 'pred_text'. This may cause confusion." - ) - else: - if not is_entry_in_all_lines(cfg.manifest_filepath, "text"): - raise RuntimeError( - "At least one line in cfg.manifest_filepath does not contain a 'text' entry. " - "NFA requires all lines to contain a 'text' entry when cfg.align_using_pred_text=False." - ) - # init devices if cfg.transcribe_device is None: transcribe_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -305,6 +290,69 @@ def main(cfg: AlignmentConfig): "model_stride_in_secs": model_stride_in_secs, "tokens_per_chunk": tokens_per_chunk, } + + if Path(cfg.manifest_filepath).is_file(): + manifest_list = [cfg.manifest_filepath] + elif Path(cfg.manifest_filepath).is_dir(): + if cfg.manifest_pattern is not None: + manifest_list = list(Path(cfg.manifest_filepath).glob(cfg.manifest_pattern)) + else: + manifest_list = list(Path(cfg.manifest_filepath).glob("*.json")) + else: + raise ValueError( + f"cfg.manifest_filepath is not a valid file or directory. 
" + f"Please check the path: {cfg.manifest_filepath}" + ) + + origin_output_manifest_filepath = cfg.output_manifest_filepath + logging.info(f"Found {len(manifest_list)} manifest files to process.") + # process each manifest file + for manifest_filepath in manifest_list: + logging.info(f"Processing manifest file: {manifest_filepath}") + cfg.manifest_filepath = str(manifest_filepath) + + if origin_output_manifest_filepath is None: + cfg.output_manifest_filepath = str( + Path(manifest_filepath).parent / f"{Path(manifest_filepath).stem}-aligned.json" + ) + elif len(manifest_list) > 1 and origin_output_manifest_filepath is not None: + raise ValueError( + "cfg.output_manifest_filepath must be None when processing multiple manifest files. " + "Please set it to None." + ) + + if not cfg.remove_tmp_dir and len(manifest_list) > 1: + # if keep alignment files, then we need to set output_dir to be different for each manifest + cfg.output_dir = str(Path(manifest_filepath).parent / f"{Path(manifest_filepath).stem}_alignment") + + process_single_manifest(cfg, model, buffered_chunk_params, viterbi_device) + logging.info(f"Output manifest saved to: {cfg.output_manifest_filepath}") + + logging.info("All manifest files processed successfully.") + + +def process_single_manifest(cfg, model, buffered_chunk_params, viterbi_device): + # Validate manifest contents + if not is_entry_in_all_lines(cfg.manifest_filepath, "audio_filepath"): + raise RuntimeError( + "At least one line in cfg.manifest_filepath does not contain an 'audio_filepath' entry. " + "All lines must contain an 'audio_filepath' entry." + ) + + if cfg.align_using_pred_text: + if is_entry_in_any_lines(cfg.manifest_filepath, "pred_text"): + raise RuntimeError( + "Cannot specify cfg.align_using_pred_text=True when the manifest at cfg.manifest_filepath " + "contains 'pred_text' entries. This is because the audio will be transcribed and may produce " + "a different 'pred_text'. This may cause confusion." + ) + else: + if not is_entry_in_all_lines(cfg.manifest_filepath, "text"): + raise RuntimeError( + "At least one line in cfg.manifest_filepath does not contain a 'text' entry. " + "NFA requires all lines to contain a 'text' entry when cfg.align_using_pred_text=False." 
+ ) + # get start and end line IDs of batches starts, ends = get_batch_starts_ends(cfg.manifest_filepath, cfg.batch_size) @@ -380,13 +428,12 @@ def main(cfg: AlignmentConfig): continue # get sou/eou time - if 'segments_level_ctm_filepath' in item: - lines = [line.split() for line in open(item['segments_level_ctm_filepath'])] - start_time = min([float(line[2]) for line in lines]) - end_time = max([float(line[2]) + float(line[3]) for line in lines]) - input_manifest_lines[i]['sou_time'] = start_time - input_manifest_lines[i]['eou_time'] = end_time - output_manifest_lines.append(input_manifest_lines[i]) + lines = [line.split() for line in open(item['segments_level_ctm_filepath'])] + start_time = min([float(line[2]) for line in lines]) + end_time = max([float(line[2]) + float(line[3]) for line in lines]) + input_manifest_lines[i]['sou_time'] = start_time + input_manifest_lines[i]['eou_time'] = end_time + output_manifest_lines.append(input_manifest_lines[i]) with open(cfg.output_manifest_filepath, 'w') as f: for item in output_manifest_lines: From e8a49cdbd7fc5f0f0e2bc274780a102858b177a8 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 29 Apr 2025 10:21:44 -0400 Subject: [PATCH 023/107] update for asr-eou Signed-off-by: stevehuang52 --- .../asr_eou/speech_to_text_rnnt_eou_eval.py | 127 +++++++ ...ou.py => speech_to_text_rnnt_eou_train.py} | 2 +- ...astconformer_transducer_bpe_streaming.yaml | 4 +- .../asr/data/audio_to_eou_label_lhotse.py | 47 ++- nemo/collections/asr/metrics/wer.py | 25 +- nemo/collections/asr/models/asr_eou_models.py | 350 +++++++++++++++++- nemo/collections/asr/modules/rnnt.py | 18 + .../asr/parts/submodules/rnnt_decoding.py | 12 +- nemo/collections/asr/parts/utils/eou_utils.py | 80 +++- nemo/core/classes/modelPT.py | 4 +- scripts/asr_end_of_utterance/evaluate_eou.py | 4 +- .../generate_noisy_eval_data.py | 16 +- 12 files changed, 632 insertions(+), 57 deletions(-) create mode 100644 examples/asr/asr_eou/speech_to_text_rnnt_eou_eval.py rename examples/asr/asr_eou/{speech_to_text_rnnt_eou.py => speech_to_text_rnnt_eou_train.py} (99%) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_eval.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_eval.py new file mode 100644 index 000000000000..4037dc7eeab3 --- /dev/null +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_eval.py @@ -0,0 +1,127 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Example usage: + +0. Prepare dataset based on /nemo/collections/asr/data/audio_to_eou_label_lhotse.py + +1. Add special tokens and to the tokenizer of pretrained model, by refering to the script + /scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py + +2. If pretrained model is HybridRNNTCTCBPEModel, convert it to RNNT using the script + /examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py + +3. 
Run the following command to train the ASR-EOU model: +```bash +#!/bin/bash + +NEMO_PATH=/home/heh/codes/nemo-eou +export PYTHONPATH=$NEMO_PATH:$PYTHONPATH + +TRAIN_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json +VAL_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json +NOISE_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/noise_manifest.json + +PRETRAINED_NEMO=/media/data3/pretrained_models/nemo_asr/stt_en_fastconformer_hybrid_large_streaming_80ms_rnnt.nemo +TOKENIZER_DIR=/media/data3/pretrained_models/nemo_asr/tokenizers/stt_en_fastconformer_hybrid_large_streaming_80ms-eou/ + +BATCH_DURATION=30 +NUM_WORKERS=0 +LIMIT_TRAIN_BATCHES=100 +VAL_CHECK_INTERVAL=100 +MAX_STEPS=1000000 + +EXP_NAME=fastconformer_transducer_bpe_streaming_eou_debug + +SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_rnnt_eou.py +CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/fastconformer/cache_aware_streaming +CONFIG_NAME=fastconformer_transducer_bpe_streaming + +CUDA_VISIBLE_DEVICES=0 python $SCRIPT \ + --config-path $CONFIG_PATH \ + --config-name $CONFIG_NAME \ + ++init_from_nemo_model=$PRETRAINED_NEMO \ + model.encoder.att_context_size="[70,1]" \ + model.tokenizer.dir=$TOKENIZER_DIR \ + model.train_ds.manifest_filepath=$TRAIN_MANIFEST \ + model.train_ds.augmentor.noise.manifest_path=$NOISE_MANIFEST \ + model.validation_ds.manifest_filepath=$VAL_MANIFEST \ + model.train_ds.batch_duration=$BATCH_DURATION \ + model.train_ds.num_workers=$NUM_WORKERS \ + model.validation_ds.batch_duration=$BATCH_DURATION \ + model.validation_ds.num_workers=$NUM_WORKERS \ + ~model.test_ds \ + trainer.limit_train_batches=$LIMIT_TRAIN_BATCHES \ + trainer.val_check_interval=$VAL_CHECK_INTERVAL \ + trainer.max_steps=$MAX_STEPS \ + exp_manager.name=$EXP_NAME +``` + +""" + + +from typing import Optional + +import lightning.pytorch as pl +from omegaconf import DictConfig, OmegaConf + +from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCBPEModel, EncDecRNNTBPEModel +from nemo.collections.asr.models.asr_eou_models import EncDecRNNTBPEEOUModel +from nemo.collections.asr.modules.rnnt import RNNTDecoder, RNNTJoint +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg + + +def load_model(cfg, trainer): + keys = ['init_from_nemo_model', 'init_from_ptl_ckpt', 'init_from_pretrained_model'] + num_keys = sum([1 for key in keys if key in cfg]) + if num_keys > 1: + raise ValueError( + f"Only one of the following keys should be present in the config: {keys}. Found {num_keys} keys." 
+ ) + if "init_from_nemo_model" in cfg: + model = EncDecRNNTBPEEOUModel.restore_from(cfg.init_from_nemo_model, trainer=trainer) + elif "init_from_ptl_ckpt" in cfg: + model = EncDecRNNTBPEEOUModel.load_from_checkpoint(cfg.init_from_ptl_ckpt, trainer=trainer) + elif "init_from_pretrained_model" in cfg: + model = EncDecRNNTBPEEOUModel.from_pretrained(cfg.init_from_pretrained_model, trainer=trainer) + else: + model = EncDecRNNTBPEEOUModel(cfg=cfg.model, trainer=trainer) + return model + + +@hydra_runner(config_path="../conf/asr_eou", config_name="fastconformer_transducer_bpe_streaming") +def main(cfg): + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) + exp_manager(trainer, cfg.get("exp_manager", None)) + + asr_model = load_model(cfg, trainer) + + if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: + asr_model.setup_test_data(test_data_config=cfg.model.test_ds) + trainer.test(asr_model) + else: + raise ValueError( + "No test dataset provided. Please provide a test dataset in the config file under model.test_ds." + ) + logging.info("Test completed.") + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py similarity index 99% rename from examples/asr/asr_eou/speech_to_text_rnnt_eou.py rename to examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index e9e1cd162b94..6da6665c9d5b 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -35,7 +35,7 @@ NOISE_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/noise_manifest.json PRETRAINED_NEMO=/media/data3/pretrained_models/nemo_asr/stt_en_fastconformer_hybrid_large_streaming_80ms_rnnt.nemo -TOKENIZER_DIR=/media/data3/pretrained_models/nemo_asr/tokenizers/stt_en_fastconformer_hybrid_large_streaming_80ms-eou/ +TOKENIZER_DIR=/media/data3/pretrained_models/nemo_asr/tokenizers/stt_en_fastconformer_hybrid_large_streaming_80ms_eou BATCH_DURATION=30 NUM_WORKERS=0 diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 4c8539d9d3a4..71094058583e 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -299,8 +299,8 @@ exp_manager: save_top_k: 5 always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - resume_if_exists: true - resume_ignore_no_checkpoint: true + resume_if_exists: false + resume_ignore_no_checkpoint: false create_wandb_logger: false wandb_logger_kwargs: diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 765747fc02fb..d3709b0f2285 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -13,7 +13,8 @@ # limitations under the License. 
import math -from typing import Dict, Optional, Tuple +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple import numpy as np import torch.utils.data @@ -39,6 +40,18 @@ EOU_PROHIBITED_AUGMENTATIONS = ['random_segment'] +@dataclass +class AudioToTextEOUBatch: + sample_ids: List | None = None + audio_filepaths: List | None = None + audio_signal: torch.Tensor | None = None + audio_lengths: torch.Tensor | None = None + text_tokens: torch.Tensor | None = None + text_token_lengths: torch.Tensor | None = None + eou_targets: torch.Tensor | None = None + eou_target_lengths: torch.Tensor | None = None + + class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): """ This dataset processes the audio data and the corresponding text data to generate the ASR labels, @@ -114,17 +127,15 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: 'text_token_lens': NeuralType(tuple('B'), LengthsType(), optional=True), } - def __init__( - self, cfg: DictConfig, tokenizer: TokenizerSpec, return_eou_labels: bool = False, return_cuts: bool = False - ): + def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, return_cuts: bool = False): super().__init__() self.cfg = cfg - self.return_eou_labels = return_eou_labels self.return_cuts = return_cuts self.eou_string = self.cfg.get('eou_string', EOU_STRING) self.eob_string = self.cfg.get('eob_string', EOB_STRING) - self._check_special_tokens(tokenizer) + if cfg.get('check_tokenizer', True): + self._check_special_tokens(tokenizer) self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) @@ -178,12 +189,16 @@ def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: audio_lengths = [] eou_targets = [] text_tokens = [] - + sample_ids = [] + audio_filepaths = [] for i in range(len(cuts)): c = cuts[i] if isinstance(c, MixedCut): c = c.first_non_padding_cut + sample_ids.append(c.id) + audio_filepaths.append(c.recording.sources[0].source) + audio_i = audio[i] audio_len_i = audio_lens[i] @@ -216,9 +231,16 @@ def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: if self.return_cuts: return audio_signals, audio_lengths, cuts - if not self.return_eou_labels: - return audio_signals, audio_lengths, text_tokens, text_token_lens - return audio_signals, audio_lengths, text_tokens, text_token_lens, eou_targets, eou_target_lens + return AudioToTextEOUBatch( + sample_ids=sample_ids, + audio_filepaths=audio_filepaths, + audio_signal=audio_signals, + audio_lengths=audio_lengths, + text_tokens=text_tokens, + text_token_lengths=text_token_lens, + eou_targets=eou_targets, + eou_target_lengths=eou_target_lens, + ) def _audio_len_to_frame_len(self, num_samples: int): """ @@ -277,10 +299,11 @@ def _get_frame_labels(self, cut: Cut, num_samples: int): seg_len_in_secs = eou_time[i] - sou_time[i] seg_len = self._audio_len_to_frame_len(int(seg_len_in_secs * self.sample_rate)) eou_targets[sou_idx : sou_idx + seg_len] = 1 + last_idx = min(sou_idx + seg_len - 1, hidden_length - 1) if is_backchannel[i]: - eou_targets[sou_idx + seg_len - 1] = EOB_LABEL # end of backchannel + eou_targets[last_idx] = EOB_LABEL # end of backchannel else: - eou_targets[sou_idx + seg_len - 1] = EOU_LABEL # end of utterance + eou_targets[last_idx] = EOU_LABEL # end of utterance return eou_targets diff --git a/nemo/collections/asr/metrics/wer.py b/nemo/collections/asr/metrics/wer.py index d8e70d3aaadc..e950fa7ee188 100644 --- a/nemo/collections/asr/metrics/wer.py +++ b/nemo/collections/asr/metrics/wer.py @@ -12,6 +12,7 @@ # 
See the License for the specific language governing permissions and # limitations under the License. +from copy import deepcopy from typing import List, Optional, Tuple, Union import editdistance @@ -255,6 +256,7 @@ def __init__( batch_dim_index=0, dist_sync_on_step=False, sync_on_compute=True, + return_hypotheses=False, ): super().__init__(dist_sync_on_step=dist_sync_on_step, sync_on_compute=sync_on_compute) @@ -263,30 +265,33 @@ def __init__( self.log_prediction = log_prediction self.fold_consecutive = fold_consecutive self.batch_dim_index = batch_dim_index + self.return_hypotheses = return_hypotheses self.decode = None if isinstance(self.decoding, AbstractRNNTDecoding): self.decode = lambda predictions, predictions_lengths, predictions_mask, input_ids, targets: self.decoding.rnnt_decoder_predictions_tensor( - encoder_output=predictions, encoded_lengths=predictions_lengths + encoder_output=predictions, encoded_lengths=predictions_lengths, return_hypotheses=return_hypotheses ) elif isinstance(self.decoding, AbstractCTCDecoding): self.decode = lambda predictions, predictions_lengths, predictions_mask, input_ids, targets: self.decoding.ctc_decoder_predictions_tensor( decoder_outputs=predictions, decoder_lengths=predictions_lengths, fold_consecutive=self.fold_consecutive, + return_hypotheses=return_hypotheses, ) elif isinstance(self.decoding, AbstractMultiTaskDecoding): self.decode = lambda predictions, prediction_lengths, predictions_mask, input_ids, targets: self.decoding.decode_predictions_tensor( encoder_hidden_states=predictions, encoder_input_mask=predictions_mask, decoder_input_ids=input_ids, - return_hypotheses=False, + return_hypotheses=return_hypotheses, ) else: raise TypeError(f"WER metric does not support decoding of type {type(self.decoding)}") self.add_state("scores", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) self.add_state("words", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) + self.hypotheses = None def update( self, @@ -296,7 +301,6 @@ def update( targets_lengths: torch.Tensor, predictions_mask: Optional[torch.Tensor] = None, input_ids: Optional[torch.Tensor] = None, - return_hypotheses: Optional[bool] = False, ): """ Updates metric state. @@ -346,11 +350,22 @@ def update( self.scores = torch.tensor(scores, device=self.scores.device, dtype=self.scores.dtype) self.words = torch.tensor(words, device=self.words.device, dtype=self.words.dtype) - if return_hypotheses: - return hypotheses + self.hypotheses = hypotheses return None def compute(self): scores = self.scores.detach().float() words = self.words.detach().float() return scores / words, scores, words + + def reset(self): + super().reset() + self.hypotheses = None + + def get_hypotheses(self): + """ + Returns the hypotheses generated during the last call to update. + """ + if self.hypotheses is None: + raise ValueError("No hypotheses available. Please call update() first.") + return deepcopy(self.hypotheses) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index a5e0354cbe9f..8f7bdabe9745 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -12,24 +12,91 @@ # See the License for the specific language governing permissions and # limitations under the License. 
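For context on the `WER` changes above, a hedged usage sketch; `model`, `encoded`, `encoded_len`, `transcript` and `transcript_len` are placeholders for an RNNT model's decoding object and the usual encoder/target tensors:

```python
# Minimal sketch of the updated WER metric with hypothesis retention.
from nemo.collections.asr.metrics.wer import WER

wer = WER(
    decoding=model.decoding,        # AbstractRNNTDecoding / AbstractCTCDecoding instance
    batch_dim_index=0,
    dist_sync_on_step=True,
    return_hypotheses=True,         # new flag: keep decoded Hypothesis objects
)

wer.update(
    predictions=encoded,            # encoder output, (B, D, T)
    predictions_lengths=encoded_len,
    targets=transcript,
    targets_lengths=transcript_len,
)
hypotheses = wer.get_hypotheses()   # deep copy of the last batch's hypotheses
wer_value, scores, words = wer.compute()
wer.reset()                         # also clears the stored hypotheses
```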
-from typing import Dict, Optional +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.asr.data.audio_to_eou_label_lhotse import LhotseSpeechToTextBpeEOUDataset +from omegaconf import DictConfig, OmegaConf, open_dict + +from nemo.collections.asr.data.audio_to_eou_label_lhotse import ( + EOB_LABEL, + EOB_STRING, + EOU_LABEL, + EOU_STRING, + AudioToTextEOUBatch, + LhotseSpeechToTextBpeEOUDataset, +) +from nemo.collections.asr.metrics.wer import WER from nemo.collections.asr.models import EncDecRNNTBPEModel +from nemo.collections.asr.parts.utils.eou_utils import ( + EOUResult, + cal_eou_metrics_from_frame_labels, + flatten_nested_list, +) from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config +from nemo.collections.common.data.utils import move_data_to_device from nemo.core.classes.mixins import AccessMixin __all__ = ['EncDecRNNTBPEEOUModel'] +@dataclass +class EOUPrediction: + eou_probs: Optional[List[float]] = None + eob_probs: Optional[List[float]] = None + eou_preds: Optional[List[bool]] = None + eob_preds: Optional[List[bool]] = None + + class EncDecRNNTBPEEOUModel(EncDecRNNTBPEModel): + def __init__(self, cfg: DictConfig, trainer): + + self._patch_decoding_cfg(cfg) + super().__init__(cfg=cfg, trainer=trainer) + + self.eou_token = self.tokenizer.token_to_id(EOU_STRING) + self.eob_token = self.tokenizer.token_to_id(EOB_STRING) + self.frame_len_in_secs = self.cfg.preprocessor.window_stride * self.cfg.encoder.subsampling_factor + + self.wer = WER( + decoding=self.decoding, + batch_dim_index=0, + use_cer=self._cfg.get('use_cer', False), + log_prediction=self._cfg.get('log_prediction', True), + dist_sync_on_step=True, + return_hypotheses=True, + ) + + # Setup fused Joint step if flag is set + if self.joint.fuse_loss_wer: + self.joint.set_loss(self.loss) + self.joint.set_wer(self.wer) + + def _patch_decoding_cfg(self, cfg: DictConfig): + """ + Patch the decoding config as needed for EOU computation + """ + with open_dict(cfg): + if cfg.decoding.strategy in ['greedy', 'greedy_batch']: + cfg.decoding.greedy.preserve_alignments = True + cfg.decoding.greedy.compute_timestamps = True + elif cfg.decoding.strategy in ['beam', 'tsd', 'alsd', 'maes']: + cfg.decoding.beam.preserve_alignments = True + cfg.decoding.beam.compute_timestamps = True + + def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any: + """ + PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device + """ + batch = move_data_to_device(batch, device) + return batch + def _setup_dataloader_from_config(self, config: Optional[Dict]): cfg = OmegaConf.create(config) if not isinstance(config, DictConfig) else config - dataset = LhotseSpeechToTextBpeEOUDataset(cfg=cfg, tokenizer=self.tokenizer, return_eou_labels=True) + dataset = LhotseSpeechToTextBpeEOUDataset( + cfg=cfg, tokenizer=self.tokenizer, return_cuts=config.get("do_transcribe", False) + ) return get_lhotse_dataloader_from_config( config, # During transcription, the model is initially loaded on the CPU. 
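Two details of the setup above, spelled out with typical cache-aware FastConformer values; the concrete numbers are assumptions rather than values read from this patch:

```python
# Hedged sketch: frame duration used to convert EOU/EOB frame indices into seconds.
window_stride = 0.01      # cfg.preprocessor.window_stride, assumed 10 ms feature hop
subsampling_factor = 8    # cfg.encoder.subsampling_factor, assumed FastConformer 8x
frame_len_in_secs = window_stride * subsampling_factor
assert abs(frame_len_in_secs - 0.08) < 1e-9  # matches the 0.08 s default in eou_utils.py

# _patch_decoding_cfg forces these flags so per-step alignments and timestamps survive
# decoding, which the hypothesis-based EOU extraction later in this file relies on:
#   decoding.greedy.preserve_alignments = True
#   decoding.greedy.compute_timestamps  = True
```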
@@ -41,12 +108,15 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): tokenizer=self.tokenizer, ) - def training_step(self, batch, batch_nb): + def training_step(self, batch: AudioToTextEOUBatch, batch_nb): # Reset access registry if AccessMixin.is_access_enabled(self.model_guid): AccessMixin.reset_registry(self) - signal, signal_len, transcript, transcript_len, eou_targets, eou_len = batch + signal = batch.audio_signal + signal_len = batch.audio_lengths + transcript = batch.text_tokens + transcript_len = batch.text_token_lengths # forward() only performs encoder forward encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) @@ -136,9 +206,9 @@ def training_step(self, batch, batch_nb): return {'loss': loss_value} - def predict_step(self, batch, batch_idx, dataloader_idx=0): - # TODO: add EOU metrics - signal, signal_len, transcript, transcript_len, eou_target, eou_len = batch + def predict_step(self, batch: AudioToTextEOUBatch, batch_idx, dataloader_idx=0): + signal = batch.audio_signal + signal_len = batch.audio_lengths # forward() only performs encoder forward encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) @@ -150,16 +220,21 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0): return list(best_hyp_text) - def validation_pass(self, batch, batch_idx, dataloader_idx=0): - # TODO: add EOU metrics - signal, signal_len, transcript, transcript_len, eou_target, eou_len = batch + def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader_idx: int = 0): + signal = batch.audio_signal + signal_len = batch.audio_lengths + transcript = batch.text_tokens + transcript_len = batch.text_token_lengths # forward() only performs encoder forward encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) del signal tensorboard_logs = {} - + text_gt = self._get_text_from_tokens(transcript, transcript_len) + tensorboard_logs['val_sample_id'] = batch.sample_ids + tensorboard_logs['val_audio_filepath'] = batch.audio_filepaths + tensorboard_logs['val_text_gt'] = text_gt # If experimental fused Joint-Loss-WER is not used if not self.joint.fuse_loss_wer: if self.compute_eval_loss: @@ -178,12 +253,21 @@ def validation_pass(self, batch, batch_idx, dataloader_idx=0): targets=transcript, targets_lengths=transcript_len, ) + hypotheses = self.wer.get_hypotheses() + + text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) + eou_predictions = self.get_eou_predictions_from_hypotheses(hypotheses, batch) + eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) + wer, wer_num, wer_denom = self.wer.compute() self.wer.reset() tensorboard_logs['val_wer_num'] = wer_num tensorboard_logs['val_wer_denom'] = wer_denom tensorboard_logs['val_wer'] = wer + tensorboard_logs['val_eou_metrics'] = eou_metrics_list + tensorboard_logs['val_eob_metrics'] = eob_metrics_list + tensorboard_logs['val_text_pred'] = text_pred else: # If experimental fused Joint-Loss-WER is used @@ -203,15 +287,253 @@ def validation_pass(self, batch, batch_idx, dataloader_idx=0): transcripts=transcript, transcript_lengths=target_len, compute_wer=compute_wer, + keep_hypotheses=True, ) + hypotheses = self.joint.get_hypotheses() + text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) + + eou_predictions = self.get_eou_predictions_from_hypotheses(hypotheses, batch) + del hypotheses + + eou_metrics_list, eob_metrics_list = 
self._calculate_eou_metrics(eou_predictions, batch) + del eou_predictions + if loss_value is not None: tensorboard_logs['val_loss'] = loss_value tensorboard_logs['val_wer_num'] = wer_num tensorboard_logs['val_wer_denom'] = wer_denom tensorboard_logs['val_wer'] = wer + tensorboard_logs['val_eou_metrics'] = eou_metrics_list + tensorboard_logs['val_eob_metrics'] = eob_metrics_list + tensorboard_logs['val_text_pred'] = text_pred self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) return tensorboard_logs + + def _get_text_from_tokens(self, tokens: torch.Tensor, tokens_len: Optional[torch.Tensor] = None) -> List[str]: + """ + Convert tokens to text. + Args: + tokens: tensor of tokens + Returns: + text: list of text + """ + text_list = [] + for i in range(len(tokens)): + tokens_i = tokens[i] + if tokens_len is not None: + tokens_i = tokens[i][: tokens_len[i]] + text = self.tokenizer.ids_to_text(tokens_i) + text_list.append(text) + return text_list + + def get_eou_predictions_from_hypotheses( + self, hypotheses: List[Hypothesis], batch: AudioToTextEOUBatch + ) -> List[EOUPrediction]: + """ + Get EOU predictions from the hypotheses. + Args: + hypotheses: batch of hypotheses + Returns: + eou_predictions: list of EOU predictions + """ + eou_predictions = [] + + for hyp in hypotheses: + # Process one hypothesis at a time + eou_probs = [] + eob_probs = [] + eou_preds = [] + eob_preds = [] + for alignment in hyp.alignments: + # Process for each timestamp + probs = torch.softmax(torch.stack([a[0] for a in alignment], dim=0), dim=-1) # unfold RNNT preds + tokens = torch.stack([a[1] for a in alignment], dim=0) # unfold RNNT preds + # Get the max prob for eou and eob + # and check if eou and eob are predicted + max_eou_prob = probs[:, self.eou_token].max().item() + max_eob_prob = probs[:, self.eob_token].max().item() + eou_pred = torch.any(tokens == self.eou_token).item() + eob_pred = torch.any(tokens == self.eob_token).item() + + eou_probs.append(max_eou_prob) + eob_probs.append(max_eob_prob) + eou_preds.append(eou_pred) + eob_preds.append(eob_pred) + + eou_predictions.append( + EOUPrediction( + eou_probs=eou_probs, + eob_probs=eob_probs, + eou_preds=eou_preds, + eob_preds=eob_preds, + ) + ) + + return eou_predictions + + def _pad_to_same_length(self, eou_labels: List[float], eou_preds: List[float]) -> Tuple[List[float], List[float]]: + """ + Pad the EOU labels and predictions to the same length. + Args: + eou_labels: list of EOU labels + eou_preds: list of EOU predictions + Returns: + eou_labels: list of EOU labels, padded to the same length + eou_preds: list of EOU predictions, padded to the same length + """ + if len(eou_labels) < len(eou_preds): + eou_labels = eou_labels + [0] * (len(eou_preds) - len(eou_labels)) + elif len(eou_labels) > len(eou_preds): + eou_preds = eou_preds + [0] * (len(eou_labels) - len(eou_preds)) + return eou_labels, eou_preds + + def _calculate_eou_metrics( + self, eou_predictions: List[EOUPrediction], batch: AudioToTextEOUBatch + ) -> Tuple[List, List]: + """ + Calculate EOU metrics. 
+ Args: + eou_predictions: list of EOU predictions + batch: batch of data + Returns: + eou_metrics_list: list of EOU metrics, each is of type EOUResult + eob_metrics_list: list of EOB metrics, each is of type EOUResult + """ + # Get the ground truth EOU labels + eou_labels = batch.eou_targets + eou_labels_len = batch.eou_target_lengths + + # Calculate EOU metrics + eou_metrics_list = [] + eob_metrics_list = [] + for i, eou_prediction in enumerate(eou_predictions): + eou_preds_i = [float(x) for x in eou_prediction.eou_preds] + eob_preds_i = [float(x) for x in eou_prediction.eob_preds] + + eou_labels_i = (eou_labels[i][: eou_labels_len[i]] == EOU_LABEL).float().tolist() + eob_labels_i = (eou_labels[i][: eou_labels_len[i]] == EOB_LABEL).float().tolist() + + # Pad the EOU labels and predictions to the same length with zeros + eou_labels_i, eou_preds_i = self._pad_to_same_length(eou_labels_i, eou_preds_i) + eob_labels_i, eob_preds_i = self._pad_to_same_length(eob_labels_i, eob_preds_i) + + # Calculate EOU metrics + eou_metrics = cal_eou_metrics_from_frame_labels( + prediction=eou_preds_i, + reference=eou_labels_i, + threshold=0.0, + collar=0.0, + frame_len_in_secs=self.frame_len_in_secs, + ) # type: EOUResult + + eob_metrics = cal_eou_metrics_from_frame_labels( + prediction=eob_preds_i, + reference=eob_labels_i, + threshold=0.0, + collar=0.0, + frame_len_in_secs=self.frame_len_in_secs, + ) + + eou_metrics_list.append(eou_metrics) + eob_metrics_list.append(eob_metrics) + + del eou_labels_i + del eou_preds_i + del eob_labels_i + del eob_preds_i + del eou_prediction + + return eou_metrics_list, eob_metrics_list + + def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str = "val"): + assert mode in ['val', 'test'], f"Invalid mode: {mode}. Must be 'val' or 'test'." 
+ # Aggregate WER metrics + if self.compute_eval_loss: + loss_mean = torch.stack([x[f'{mode}_loss'] for x in outputs]).mean() + loss_log = {f'{mode}_loss': loss_mean} + else: + loss_log = {} + wer_num = torch.stack([x[f'{mode}_wer_num'] for x in outputs]).sum() + wer_denom = torch.stack([x[f'{mode}_wer_denom'] for x in outputs]).sum() + tensorboard_logs = {**loss_log, f'{mode}_wer': wer_num.float() / wer_denom} + + # Aggregate EOU/EOB metrics + eou_metrics = [] # type: List[EOUResult] + eob_metrics = [] # type: List[EOUResult] + for x in outputs: + eou_metrics.extend(x[f'{mode}_eou_metrics']) + eob_metrics.extend(x[f'{mode}_eob_metrics']) + + num_utterances = sum([x.num_utterances for x in eou_metrics]) + eou_latency = flatten_nested_list([x.latency for x in eou_metrics]) + eou_early_cutoff = flatten_nested_list([x.early_cutoff for x in eou_metrics]) + eob_latency = flatten_nested_list([x.latency for x in eob_metrics]) + eob_early_cutoff = flatten_nested_list([x.early_cutoff for x in eob_metrics]) + + eou_avg_num_early_cutoff = len(eou_early_cutoff) / num_utterances + eob_avg_num_early_cutoff = len(eob_early_cutoff) / num_utterances + if len(eou_latency) == 0: + eou_latency = [0.0] + if len(eou_early_cutoff) == 0: + eou_early_cutoff = [0.0] + if len(eob_latency) == 0: + eob_latency = [0.0] + if len(eob_early_cutoff) == 0: + eob_early_cutoff = [0.0] + + eou_missing = [x.missing for x in eou_metrics] + eob_missing = [x.missing for x in eob_metrics] + + eou_latency = torch.tensor(eou_latency) + eou_latency_p90 = torch.quantile(eou_latency, 0.9).item() + eou_latency_p95 = torch.quantile(eou_latency, 0.95).item() + eou_latency_p99 = torch.quantile(eou_latency, 0.99).item() + + eou_early_cutoff = torch.tensor(eou_early_cutoff) + eou_early_cutoff_p90 = torch.quantile(eou_early_cutoff, 0.9).item() + eou_early_cutoff_p95 = torch.quantile(eou_early_cutoff, 0.95).item() + eou_early_cutoff_p99 = torch.quantile(eou_early_cutoff, 0.99).item() + + eob_latency = torch.tensor(eob_latency) + eob_latency_p90 = torch.quantile(eob_latency, 0.9).item() + eob_latency_p95 = torch.quantile(eob_latency, 0.95).item() + eob_latency_p99 = torch.quantile(eob_latency, 0.99).item() + + eob_early_cutoff = torch.tensor(eob_early_cutoff) + eob_early_cutoff_p90 = torch.quantile(eob_early_cutoff, 0.9).item() + eob_early_cutoff_p95 = torch.quantile(eob_early_cutoff, 0.95).item() + eob_early_cutoff_p99 = torch.quantile(eob_early_cutoff, 0.99).item() + + tensorboard_logs[f'{mode}_eou_latency_p90'] = eou_latency_p90 + tensorboard_logs[f'{mode}_eou_latency_p95'] = eou_latency_p95 + tensorboard_logs[f'{mode}_eou_latency_p99'] = eou_latency_p99 + + tensorboard_logs[f'{mode}_eou_early_cutoff_p90'] = eou_early_cutoff_p90 + tensorboard_logs[f'{mode}_eou_early_cutoff_p95'] = eou_early_cutoff_p95 + tensorboard_logs[f'{mode}_eou_early_cutoff_p99'] = eou_early_cutoff_p99 + + tensorboard_logs[f'{mode}_eob_latency_p90'] = eob_latency_p90 + tensorboard_logs[f'{mode}_eob_latency_p95'] = eob_latency_p95 + tensorboard_logs[f'{mode}_eob_latency_p99'] = eob_latency_p99 + + tensorboard_logs[f'{mode}_eob_early_cutoff_p90'] = eob_early_cutoff_p90 + tensorboard_logs[f'{mode}_eob_early_cutoff_p95'] = eob_early_cutoff_p95 + tensorboard_logs[f'{mode}_eob_early_cutoff_p99'] = eob_early_cutoff_p99 + + tensorboard_logs[f'{mode}_eou_early_cutoff_avg_num'] = eou_avg_num_early_cutoff + tensorboard_logs[f'{mode}_eob_early_cutoff_avg_num'] = eob_avg_num_early_cutoff + + tensorboard_logs[f'{mode}_eou_missing'] = sum(eou_missing) / num_utterances + 
tensorboard_logs[f'{mode}_eob_missing'] = sum(eob_missing) / num_utterances + + return {**loss_log, 'log': tensorboard_logs} + + def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): + return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='val') + + def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): + return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='test') diff --git a/nemo/collections/asr/modules/rnnt.py b/nemo/collections/asr/modules/rnnt.py index ac130ab2c2c3..785f4b62400e 100644 --- a/nemo/collections/asr/modules/rnnt.py +++ b/nemo/collections/asr/modules/rnnt.py @@ -1225,6 +1225,7 @@ def input_types(self): "transcripts": NeuralType(('B', 'T'), LabelsType(), optional=True), "transcript_lengths": NeuralType(tuple('B'), LengthsType(), optional=True), "compute_wer": NeuralType(optional=True), + "keep_hypotheses": NeuralType(optional=True), } @property @@ -1337,6 +1338,8 @@ def __init__( # to change, requires running ``model.temperature = T`` explicitly self.temperature = 1.0 + self.hypotheses = None + @typecheck() def forward( self, @@ -1346,6 +1349,7 @@ def forward( transcripts: Optional[torch.Tensor] = None, transcript_lengths: Optional[torch.Tensor] = None, compute_wer: bool = False, + keep_hypotheses: bool = False, ) -> Union[torch.Tensor, List[Optional[torch.Tensor]]]: # encoder = (B, D, T) # decoder = (B, D, U) if passed, else None @@ -1383,6 +1387,7 @@ def forward( wers, wer_nums, wer_denoms = [], [], [] target_lengths = [] batch_size = int(encoder_outputs.size(0)) # actual batch size + hypotheses = [] # Iterate over batch using fused_batch_size steps for batch_idx in range(0, batch_size, self._fused_batch_size): @@ -1467,6 +1472,9 @@ def forward( targets=sub_transcripts, targets_lengths=sub_transcript_lens, ) + + hyp = self.wer.get_hypotheses() if keep_hypotheses else [] + # Sync and all_reduce on all processes, compute global WER wer, wer_num, wer_denom = self.wer.compute() self.wer.reset() @@ -1477,6 +1485,7 @@ def forward( wers.append(wer) wer_nums.append(wer_num) wer_denoms.append(wer_denom) + hypotheses.extend(hyp) del sub_enc, sub_transcripts, sub_enc_lens, sub_transcript_lens @@ -1494,8 +1503,17 @@ def forward( wer_num = None wer_denom = None + self.hypotheses = hypotheses if keep_hypotheses else None return losses, wer, wer_num, wer_denom + def get_hypotheses(self): + """ + Returns the hypotheses generated during the last forward pass. + """ + if self.hypotheses is None: + raise ValueError("No hypotheses were generated during the last forward pass.") + return self.hypotheses + def project_encoder(self, encoder_output: torch.Tensor) -> torch.Tensor: """ Project the encoder output to the joint hidden dimension. 
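The fused Joint changes above can be exercised as sketched below, assuming fused Joint-Loss-WER is enabled (`joint.fuse_loss_wer` with `set_loss`/`set_wer` already called); tensor names are placeholders following the usual NeMo RNNT shapes:

```python
# Hedged sketch of the fused Joint-Loss-WER call with the new keep_hypotheses flag.
losses, wer, wer_num, wer_denom = model.joint(
    encoder_outputs=encoded,            # (B, D, T)
    decoder_outputs=decoder_out,        # (B, D, U)
    encoder_lengths=encoded_len,
    transcripts=transcript,
    transcript_lengths=transcript_len,
    compute_wer=True,
    keep_hypotheses=True,               # new: retain per-sample Hypothesis objects
)
hypotheses = model.joint.get_hypotheses()   # raises if keep_hypotheses was not set
```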
diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index 1a50f10d3ed4..01e61998b0f0 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -548,7 +548,7 @@ def rnnt_decoder_predictions_tensor( return all_hyp else: - hypotheses = self.decode_hypothesis(prediction_list) # type: List[str] + hypotheses = self.decode_hypothesis(prediction_list) # type: List[Hypothesis] # If computing timestamps if self.compute_timestamps is True: @@ -821,13 +821,13 @@ def compute_rnnt_timestamps(self, hypothesis: Hypothesis, timestamp_type: str = for t in range(len(char_offsets)): # Subtract one here for the extra RNNT BLANK token emitted to designate "End of timestep" num_flattened_tokens += len(char_offsets[t]['char']) - 1 + if char_offsets[t]['char'][-1] != self.blank_id: + num_flattened_tokens += 1 # Add one back if it reaches max steps without blank token if num_flattened_tokens != len(hypothesis.text): - raise ValueError( - f"`char_offsets`: {char_offsets} and `processed_tokens`: {hypothesis.text}" - " have to be of the same length, but are: " - f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:" - f" {len(hypothesis.text)}" + raise RuntimeError( + f"Number of tokens in hypothesis ({len(hypothesis.text)}) does not match the number of offsets " + f"({num_flattened_tokens}). Please check the hypothesis and offsets manually." ) encoded_char_offsets = copy.deepcopy(char_offsets) diff --git a/nemo/collections/asr/parts/utils/eou_utils.py b/nemo/collections/asr/parts/utils/eou_utils.py index dff89abfea5f..8b41ca6c0b56 100644 --- a/nemo/collections/asr/parts/utils/eou_utils.py +++ b/nemo/collections/asr/parts/utils/eou_utils.py @@ -15,6 +15,8 @@ from dataclasses import dataclass from typing import List +import numpy as np + @dataclass class EOUResult: @@ -25,12 +27,26 @@ class EOUResult: false_positives: int num_utterances: int num_predictions: int + missing: int + + +def flatten_nested_list(nested_list: List[List[float]]) -> List[float]: + """ + Flatten a nested list into a single list. + Args: + nested_list (List[List]): A nested list to be flattened. + Returns: + List: A flattened list. + """ + return [item for sublist in nested_list for item in sublist] -def evaluate_eou(prediction: List[dict], reference: List[dict], threshold: float, collar: float) -> EOUResult: +def evaluate_eou( + *, prediction: List[dict], reference: List[dict], threshold: float, collar: float, do_sorting: bool = True +) -> EOUResult: """ Evaluate end of utterance predictions against reference labels. - Each item in predicition/reference is a dictionary containing: + Each item in predicition/reference is a dictionary in SegLST containing: { "session_id": str, "start_time": float, # start time in seconds @@ -47,6 +63,9 @@ def evaluate_eou(prediction: List[dict], reference: List[dict], threshold: float references (List[dict]): List of dictionaries containing reference labels. threshold (float): Threshold for considering a prediction as EOU. collar (float): Collar time in seconds for matching predictions to references. + do_sorting (bool): Whether to sort the predictions and references by start time. + Returns: + EOUResult: A dataclass containing the evaluation results. 
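A hedged usage example, assuming each entry carries `start_time`/`end_time` in seconds plus an `eou_prob` score as in the schema above (all values below are made up):

```python
# Illustrative call only; field values are placeholders.
from nemo.collections.asr.parts.utils.eou_utils import evaluate_eou

pred = [
    {"session_id": "s1", "start_time": 0.00, "end_time": 1.20, "eou_prob": 0.92},
    {"session_id": "s1", "start_time": 2.50, "end_time": 3.10, "eou_prob": 0.10},  # dropped by threshold
]
ref = [
    {"session_id": "s1", "start_time": 0.00, "end_time": 1.25, "eou_prob": 1.0},
]
result = evaluate_eou(prediction=pred, reference=ref, threshold=0.5, collar=0.2)
print(result.num_utterances, result.num_predictions, result.missing)
```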
""" latency = [] @@ -56,10 +75,13 @@ def evaluate_eou(prediction: List[dict], reference: List[dict], threshold: float false_positives = 0 num_utterances = len(reference) num_predictions = len(prediction) + missing = 0 - predicted_eou = [p for p in prediction if p["eou_pred"] > threshold] - predicted_eou = sorted(predicted_eou, key=lambda x: x["start_time"]) - reference = sorted(reference, key=lambda x: x["start_time"]) + predicted_eou = [p for p in prediction if p["eou_prob"] > threshold] + + if do_sorting: + predicted_eou = sorted(predicted_eou, key=lambda x: x["start_time"]) + reference = sorted(reference, key=lambda x: x["start_time"]) p_idx = 0 r_idx = 0 @@ -96,6 +118,7 @@ def evaluate_eou(prediction: List[dict], reference: List[dict], threshold: float # Current predicted EOU is after the current reference ends false_negatives += 1 latency.append(p_end - r_end) + r_idx += 1 else: # p_end <= r_start # Current predicted EOU is before the current reference starts @@ -104,6 +127,7 @@ def evaluate_eou(prediction: List[dict], reference: List[dict], threshold: float if r_idx < len(reference): # There are remaining references that were not matched false_negatives += len(reference) - r_idx + missing += len(reference) - r_idx return EOUResult( latency=latency, @@ -113,4 +137,50 @@ def evaluate_eou(prediction: List[dict], reference: List[dict], threshold: float false_positives=false_positives, num_utterances=num_utterances, num_predictions=num_predictions, + missing=missing, + ) + + +def get_SegLST_from_frame_labels(frame_labels: List[int], frame_len_in_secs: float = 0.08) -> List[dict]: + """ + Convert frame labels to SegLST format. + Args: + frame_labels (List[int]): List of frame labels. + frame_len_in_secs (float): Length of each frame in seconds. + Returns: + List[dict]: List of dictionaries in SegLST format. + """ + seg_lst = [] + start_time = 0.0 + for i, label in enumerate(frame_labels): + if label > 0: + end_time = start_time + frame_len_in_secs * i + seg_lst.append({"start_time": start_time, "end_time": end_time, "eou_prob": label}) + start_time = end_time + return seg_lst + + +def cal_eou_metrics_from_frame_labels( + *, prediction: List, reference: List, threshold: float = 0.5, collar: float = 0, frame_len_in_secs: float = 0.08 +) -> EOUResult: + """ + Calculate EOU metrics from lists of predictions and references. + Args: + prediction (List): List of floats containing predicted EOU probabilities. + reference (List): List of binary floats containing reference EOU probabilities. + threshold (float): Threshold for considering a prediction as EOU. + collar (float): Collar time in seconds for matching predictions to references. + frame_len_in_secs (float): Length of each frame in seconds. + """ + + if len(prediction) != len(reference): + raise ValueError( + f"Prediction ({len(prediction)}) and reference ({len(reference)}) lists must have the same length." 
+ ) + + pred_seg_lst = get_SegLST_from_frame_labels(prediction, frame_len_in_secs) + ref_seg_lst = get_SegLST_from_frame_labels(reference, frame_len_in_secs) + eou_metrics = evaluate_eou( + prediction=pred_seg_lst, reference=ref_seg_lst, threshold=threshold, collar=collar, do_sorting=False ) + return eou_metrics diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 78f0715251be..c0129df8837a 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -1408,7 +1408,7 @@ def maybe_init_from_pretrained_checkpoint(self, cfg: OmegaConf, map_location: st if isinstance(cfg.init_from_ptl_ckpt, str): # Restore checkpoint ckpt_path = cfg.pop('init_from_ptl_ckpt') - ckpt = torch.load(ckpt_path, map_location=map_location) + ckpt = torch.load(ckpt_path, map_location=map_location, weights_only=False) # Restore checkpoint into current model self.load_state_dict(ckpt['state_dict'], strict=False) @@ -1422,7 +1422,7 @@ def maybe_init_from_pretrained_checkpoint(self, cfg: OmegaConf, map_location: st for model_load_cfg in model_load_dict.values(): ckpt_path = model_load_cfg.path # Restore model - ckpt = torch.load(ckpt_path, map_location=map_location) + ckpt = torch.load(ckpt_path, map_location=map_location, weights_only=False) include = model_load_cfg.pop('include', [""]) exclude = model_load_cfg.pop('exclude', []) diff --git a/scripts/asr_end_of_utterance/evaluate_eou.py b/scripts/asr_end_of_utterance/evaluate_eou.py index 0137b8a10f72..85e8c4b79fd4 100644 --- a/scripts/asr_end_of_utterance/evaluate_eou.py +++ b/scripts/asr_end_of_utterance/evaluate_eou.py @@ -88,8 +88,8 @@ def main(): predictions = load_json(args.predictions, args.drop_prefix) references = load_json(args.references, args.drop_prefix) results = evaluate_eou( - predictions, - references, + prediction=predictions, + reference=references, threshold=args.threshold, collar=args.collar, ) diff --git a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py index 2e9291ebc205..b91189d36b3b 100644 --- a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py +++ b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py @@ -119,16 +119,16 @@ def main(cfg): output_audio_dir = output_dir flatten_audio_path = False - # Load the dataset - tokenizer = parsers.make_parser(labels) # dummy tokenizer - dataset = LhotseSpeechToTextBpeEOUDataset( - cfg=cfg.data, tokenizer=tokenizer, return_eou_labels=False, return_cuts=True - ) - + # Patch data config with open_dict(cfg.data): cfg.data.force_finite = True cfg.data.force_map_dataset = True cfg.data.shuffle = False + cfg.data.check_tokenizer = False # No need to check tokenizer in LhotseSpeechToTextBpeEOUDataset + + # Load the dataset + tokenizer = parsers.make_parser(labels) # dummy tokenizer + dataset = LhotseSpeechToTextBpeEOUDataset(cfg=cfg.data, tokenizer=tokenizer, return_cuts=True) dataloader = get_lhotse_dataloader_from_config( config=cfg.data, @@ -160,14 +160,14 @@ def main(cfg): audio_file = cut.recording.sources[0].source if flatten_audio_path: - output_audio_file = output_audio_dir / str(audio_file).replace('/', '_') + output_audio_file = output_audio_dir / str(audio_file).replace('/', '_')[:255] else: output_audio_file = output_audio_dir / Path(audio_file).relative_to(manifest_parent_dir) output_audio_file.parent.mkdir(parents=True, exist_ok=True) sf.write(output_audio_file, audio, dataset.sample_rate) - manifest_item["audio_filepath"] = str(output_audio_file) + manifest_item["audio_filepath"] 
= str(output_audio_file.relative_to(output_audio_dir)) manifest_item["offset"] = 0 manifest_item["duration"] = audio.shape[0] / dataset.sample_rate From 5667d717453097959970e0b1fb709412f0e0596f Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 29 Apr 2025 12:58:34 -0400 Subject: [PATCH 024/107] clean up and update infer Signed-off-by: stevehuang52 --- .../asr_eou/speech_to_text_rnnt_eou_eval.py | 90 ++++--------------- .../asr_eou/speech_to_text_rnnt_eou_train.py | 2 +- nemo/collections/asr/models/asr_eou_models.py | 45 ++++++++-- 3 files changed, 54 insertions(+), 83 deletions(-) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_eval.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_eval.py index 4037dc7eeab3..1aca0df1c9aa 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_eval.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_eval.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,74 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Example usage: - -0. Prepare dataset based on /nemo/collections/asr/data/audio_to_eou_label_lhotse.py - -1. Add special tokens and to the tokenizer of pretrained model, by refering to the script - /scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py - -2. If pretrained model is HybridRNNTCTCBPEModel, convert it to RNNT using the script - /examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py - -3. Run the following command to train the ASR-EOU model: -```bash -#!/bin/bash - -NEMO_PATH=/home/heh/codes/nemo-eou -export PYTHONPATH=$NEMO_PATH:$PYTHONPATH - -TRAIN_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json -VAL_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json -NOISE_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/noise_manifest.json - -PRETRAINED_NEMO=/media/data3/pretrained_models/nemo_asr/stt_en_fastconformer_hybrid_large_streaming_80ms_rnnt.nemo -TOKENIZER_DIR=/media/data3/pretrained_models/nemo_asr/tokenizers/stt_en_fastconformer_hybrid_large_streaming_80ms-eou/ - -BATCH_DURATION=30 -NUM_WORKERS=0 -LIMIT_TRAIN_BATCHES=100 -VAL_CHECK_INTERVAL=100 -MAX_STEPS=1000000 - -EXP_NAME=fastconformer_transducer_bpe_streaming_eou_debug - -SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_rnnt_eou.py -CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/fastconformer/cache_aware_streaming -CONFIG_NAME=fastconformer_transducer_bpe_streaming - -CUDA_VISIBLE_DEVICES=0 python $SCRIPT \ - --config-path $CONFIG_PATH \ - --config-name $CONFIG_NAME \ - ++init_from_nemo_model=$PRETRAINED_NEMO \ - model.encoder.att_context_size="[70,1]" \ - model.tokenizer.dir=$TOKENIZER_DIR \ - model.train_ds.manifest_filepath=$TRAIN_MANIFEST \ - model.train_ds.augmentor.noise.manifest_path=$NOISE_MANIFEST \ - model.validation_ds.manifest_filepath=$VAL_MANIFEST \ - model.train_ds.batch_duration=$BATCH_DURATION \ - model.train_ds.num_workers=$NUM_WORKERS \ - model.validation_ds.batch_duration=$BATCH_DURATION \ - model.validation_ds.num_workers=$NUM_WORKERS \ - ~model.test_ds \ - trainer.limit_train_batches=$LIMIT_TRAIN_BATCHES \ - trainer.val_check_interval=$VAL_CHECK_INTERVAL \ - trainer.max_steps=$MAX_STEPS \ - 
exp_manager.name=$EXP_NAME -``` - -""" - - -from typing import Optional import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf +import torch +from omegaconf import OmegaConf, open_dict -from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCBPEModel, EncDecRNNTBPEModel from nemo.collections.asr.models.asr_eou_models import EncDecRNNTBPEEOUModel -from nemo.collections.asr.modules.rnnt import RNNTDecoder, RNNTJoint from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -87,20 +25,20 @@ def load_model(cfg, trainer): - keys = ['init_from_nemo_model', 'init_from_ptl_ckpt', 'init_from_pretrained_model'] - num_keys = sum([1 for key in keys if key in cfg]) - if num_keys > 1: - raise ValueError( - f"Only one of the following keys should be present in the config: {keys}. Found {num_keys} keys." - ) if "init_from_nemo_model" in cfg: + logging.info(f"Loading model from local file: {cfg.init_from_nemo_model}") model = EncDecRNNTBPEEOUModel.restore_from(cfg.init_from_nemo_model, trainer=trainer) - elif "init_from_ptl_ckpt" in cfg: - model = EncDecRNNTBPEEOUModel.load_from_checkpoint(cfg.init_from_ptl_ckpt, trainer=trainer) elif "init_from_pretrained_model" in cfg: + logging.info(f"Loading model from remote: {cfg.init_from_pretrained_model}") model = EncDecRNNTBPEEOUModel.from_pretrained(cfg.init_from_pretrained_model, trainer=trainer) else: - model = EncDecRNNTBPEEOUModel(cfg=cfg.model, trainer=trainer) + raise ValueError( + "Please provide either 'init_from_nemo_model' or 'init_from_pretrained_model' in the config file." + ) + if "init_from_ptl_ckpt" in cfg: + logging.info(f"Loading weights from checkpoint: {cfg.init_from_ptl_ckpt}") + state_dict = torch.load(cfg.init_from_ptl_ckpt, map_location='cpu', weights_only=False)['state_dict'] + model.load_state_dict(state_dict, strict=True) return model @@ -113,6 +51,10 @@ def main(cfg): asr_model = load_model(cfg, trainer) + if "save_pred_to_file" in cfg: + with open_dict(asr_model.cfg): + asr_model.cfg.save_pred_to_file = cfg.save_pred_to_file + if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: asr_model.setup_test_data(test_data_config=cfg.model.test_ds) trainer.test(asr_model) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index 6da6665c9d5b..ef7b7fa2ff82 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 8f7bdabe9745..9662dc4b8b67 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -13,9 +13,11 @@ # limitations under the License. 
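The evaluation entry point above now restores a `.nemo` model first and can then overwrite its weights from a Lightning checkpoint; roughly the following, with placeholder paths:

```python
# Hedged sketch of the weight-override path used by load_model() above.
import torch
from nemo.collections.asr.models.asr_eou_models import EncDecRNNTBPEEOUModel

model = EncDecRNNTBPEEOUModel.restore_from("/path/to/asr_eou_model.nemo")
ckpt = torch.load("/path/to/last.ckpt", map_location="cpu", weights_only=False)
model.load_state_dict(ckpt["state_dict"], strict=True)
```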
from dataclasses import dataclass +from pathlib import Path from typing import Any, Dict, List, Optional, Tuple import torch +from lightning.pytorch.utilities import rank_zero_only from omegaconf import DictConfig, OmegaConf, open_dict from nemo.collections.asr.data.audio_to_eou_label_lhotse import ( @@ -33,10 +35,12 @@ cal_eou_metrics_from_frame_labels, flatten_nested_list, ) +from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.data.utils import move_data_to_device from nemo.core.classes.mixins import AccessMixin +from nemo.utils import logging __all__ = ['EncDecRNNTBPEEOUModel'] @@ -294,10 +298,8 @@ def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) eou_predictions = self.get_eou_predictions_from_hypotheses(hypotheses, batch) - del hypotheses eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) - del eou_predictions if loss_value is not None: tensorboard_logs['val_loss'] = loss_value @@ -441,16 +443,12 @@ def _calculate_eou_metrics( eou_metrics_list.append(eou_metrics) eob_metrics_list.append(eob_metrics) - del eou_labels_i - del eou_preds_i - del eob_labels_i - del eob_preds_i - del eou_prediction - return eou_metrics_list, eob_metrics_list def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str = "val"): assert mode in ['val', 'test'], f"Invalid mode: {mode}. Must be 'val' or 'test'." + self._maybe_save_predictions(outputs, mode=mode, dataloader_idx=dataloader_idx) + # Aggregate WER metrics if self.compute_eval_loss: loss_mean = torch.stack([x[f'{mode}_loss'] for x in outputs]).mean() @@ -537,3 +535,34 @@ def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='test') + + @rank_zero_only + def _maybe_save_predictions(self, outputs: List[Dict], mode: str = "val", dataloader_idx: int = 0): + """ + Save predictions to disk. 
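Each line of the resulting file is one JSON dict of roughly the following shape (a sketch with placeholder values; the file name additionally receives a `.<dataloader_idx>.json` suffix):

```python
# Placeholder example of a single prediction-manifest entry.
example_line = {
    "sample_id": "cut-0001",                      # lhotse cut id
    "audio_filepath": "/data/audio/utt_0001.wav",
    "eou_text": "reference transcript",           # decoded from the ground-truth tokens
    "eou_pred_text": "hypothesis transcript",     # decoded from the model hypothesis
}
```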
+ Args: + outputs: list of outputs + mode: mode of the model, either 'val' or 'test' + """ + + if not self.cfg.get('save_pred_to_file', None): + return + + output_file = Path(self.cfg.save_pred_to_file) + output_file.parent.mkdir(parents=True, exist_ok=True) + + output_file = output_file.with_suffix(f'.{dataloader_idx}.json') + + manifest = [] + for output in outputs: + for i in range(len(output[f'{mode}_sample_id'])): + item = { + "sample_id": output[f'{mode}_sample_id'][i], + "audio_filepath": output[f'{mode}_audio_filepath'][i], + "eou_text": output[f'{mode}_text_gt'][i], + "eou_pred_text": output[f'{mode}_text_pred'][i], + } + manifest.append(item) + write_manifest(output_file, manifest) + logging.info(f"Predictions saved to {output_file}") + return output_file From c9502b49953af420ef3b3325e5ff385c0965ab63 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 7 May 2025 19:53:31 -0400 Subject: [PATCH 025/107] update Signed-off-by: stevehuang52 --- .../asr_eou/speech_to_text_rnnt_eou_train.py | 2 -- tools/nemo_forced_aligner/align_eou.py | 27 +++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index ef7b7fa2ff82..b80d34a746d6 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -174,8 +174,6 @@ def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_pat # shape: [num_classes+2, hid_dim] joint_state['2.weight'][:-3, :] = pretrained_joint_clf_weight[:-1, :] # everything except EOU, EOB and blank joint_state['2.weight'][-1, :] = pretrained_joint_clf_weight[-1, :] # blank class - joint_state['2.weight'][-2, :] = 0.0001 # EOB class - joint_state['2.weight'][-3, :] = 0.0001 # EOU class if pretrained_joint_clf_bias is not None and '2.bias' in joint_state: joint_state['2.bias'][:-3] = pretrained_joint_clf_bias[:-1] # everything except EOU, EOB and blank joint_state['2.bias'][-1] = pretrained_joint_clf_bias[-1] # blank class diff --git a/tools/nemo_forced_aligner/align_eou.py b/tools/nemo_forced_aligner/align_eou.py index 042bc30a6f99..1a38418751af 100644 --- a/tools/nemo_forced_aligner/align_eou.py +++ b/tools/nemo_forced_aligner/align_eou.py @@ -19,6 +19,7 @@ import shutil from dataclasses import dataclass, field, is_dataclass from pathlib import Path +from string import punctuation from typing import List, Optional import torch @@ -163,6 +164,30 @@ class AlignmentConfig: # remove tmp dir after alignment remove_tmp_dir: bool = False + clean_text: bool = True + + +def clean_text(manifest: List[dict]): + punctuations = punctuation.replace("'", "") + # replace_with_space = [char for char in '/?*\",.:=?_{|}~¨«·»¡¿„…‧‹›≪≫!:;ː→'] + replace_with_blank = [char for char in '`¨´‘’“”`ʻ‘’“"‘”'] + replace_with_apos = [char for char in '‘’ʻ‘’‘'] + + valid_chars = "abcdefghijklmnopqrstuvwxyz'" + for i in range(len(manifest)): + text = manifest[i]["text"].strip().lower() # type: str + text = text.translate(str.maketrans("", "", punctuations)) + new_text = "" + for c in text: + if c in valid_chars: + new_text += c + text = new_text + for c in replace_with_blank: + text = text.replace(c, "") + for c in replace_with_apos: + text = text.replace(c, "'") + manifest[i]["text"] = text + return manifest @hydra_runner(config_name="AlignmentConfig", schema=AlignmentConfig) @@ -369,6 +394,8 @@ def process_single_manifest(cfg, model, buffered_chunk_params, viterbi_device): for start, 
end in zip(starts, ends): manifest_lines_batch = get_manifest_lines_batch(cfg.manifest_filepath, start, end) + if cfg.clean_text: + manifest_lines_batch = clean_text(manifest_lines_batch) ( log_probs_batch, y_batch, From 016e5cc82c33d0160a00dda21c4b2ee86393f408 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 12 May 2025 10:26:22 -0400 Subject: [PATCH 026/107] update Signed-off-by: stevehuang52 --- .../asr_eou/speech_to_text_rnnt_eou_train.py | 45 ++++++++++- ...astconformer_transducer_bpe_streaming.yaml | 24 +++--- .../asr/data/audio_to_eou_label_lhotse.py | 4 +- scripts/asr_end_of_utterance/conf/data.yaml | 1 + .../generate_noisy_eval_data.py | 59 ++++++++++----- tools/nemo_forced_aligner/align_eou.py | 74 +++++++++++++++---- 6 files changed, 159 insertions(+), 48 deletions(-) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index b80d34a746d6..a8184121846b 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -104,7 +104,7 @@ def get_pretrained_model_name(cfg: DictConfig) -> Optional[str]: return pretrained_name -def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_path: str): +def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_path: str, cfg: DictConfig): """ load the pretrained model from a .nemo file, taking into account the joint network """ @@ -170,15 +170,52 @@ def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_pat pretrained_joint_clf_weight = pretrained_joint_state['2.weight'] # shape: [num_classes, hid_dim] pretrained_joint_clf_bias = pretrained_joint_state['2.bias'] if '2.bias' in pretrained_joint_state else None + token_init_method = cfg.model.get('token_init_method', 'constant') # Copy the weights and biases from the pretrained model to the new model # shape: [num_classes+2, hid_dim] joint_state['2.weight'][:-3, :] = pretrained_joint_clf_weight[:-1, :] # everything except EOU, EOB and blank joint_state['2.weight'][-1, :] = pretrained_joint_clf_weight[-1, :] # blank class + + value = None + if token_init_method == 'min': + # set the EOU and EOB class to the minimum value of the pretrained model + value = pretrained_joint_clf_weight.min(dim=0)[0] + elif token_init_method == 'max': + # set the EOU and EOB class to the maximum value of the pretrained model + value = pretrained_joint_clf_weight.max(dim=0)[0] + elif token_init_method == 'mean': + # set the EOU and EOB class to the mean value of the pretrained model + value = pretrained_joint_clf_weight.mean(dim=0) + elif token_init_method == 'constant': + value = cfg.model.get('token_init_weight_value', 0.01) + elif token_init_method: + raise ValueError(f"Unknown token_init_method: {token_init_method}.") + + if value is not None: + joint_state['2.weight'][-2, :] = value # EOB class + joint_state['2.weight'][-3, :] = value # EOU class + if pretrained_joint_clf_bias is not None and '2.bias' in joint_state: joint_state['2.bias'][:-3] = pretrained_joint_clf_bias[:-1] # everything except EOU, EOB and blank joint_state['2.bias'][-1] = pretrained_joint_clf_bias[-1] # blank class - joint_state['2.bias'][-2] = -1000.0 # EOB class - joint_state['2.bias'][-3] = -1000.0 # EOU class + value = None + if token_init_method == 'constant': + value = cfg.model.get('token_init_bias_value', -600.0) + elif token_init_method == 'min': + # set the EOU and EOB class to the minimum value of the pretrained model + value = 
pretrained_joint_clf_bias.min() + elif token_init_method == 'max': + # set the EOU and EOB class to the maximum value of the pretrained model + value = pretrained_joint_clf_bias.max() + elif token_init_method == 'mean': + # set the EOU and EOB class to the mean value of the pretrained model + value = pretrained_joint_clf_bias.mean() + elif token_init_method: + raise ValueError(f"Unknown token_init_method: {token_init_method}.") + + if value is not None: + joint_state['2.bias'][-2] = value # EOB class + joint_state['2.bias'][-3] = value # EOU class # Load the joint network weights joint_network.joint_net.load_state_dict(joint_state, strict=True) @@ -196,7 +233,7 @@ def main(cfg): init_from_model = get_pretrained_model_name(cfg) if init_from_model: - init_from_pretrained_nemo(asr_model, init_from_model) + init_from_pretrained_nemo(asr_model, init_from_model, cfg) trainer.fit(asr_model) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 71094058583e..7c7abe0f027b 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -8,6 +8,10 @@ name: "FastConformer-Transducer-BPE-Streaming-EOU" model: + token_init_method: "min" # choices=['min', 'max', 'mean', 'constant'] + token_init_weight_value: 0.01 # only applicable when token_init_method='constant' + token_init_bias_value: -500.0 # only applicable when token_init_method='constant' + sample_rate: 16000 compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. log_prediction: true # enables logging sample predictions in the output during training @@ -25,8 +29,8 @@ model: max_duration: 30 # you may need to update it for your dataset min_duration: 0.1 defer_setup: true - batch_duration: 300 # you may disable batch_duration by setting it to `null` - batch_size: null + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 shuffle: true num_workers: 8 pin_memory: true @@ -37,12 +41,12 @@ model: shuffle_buffer_size: 10000 random_padding: - prob: 0.5 + prob: 0.9 min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds - max_total_duration: 30.0 # maximum total duration of the padded audio in seconds + max_total_duration: 35.0 # maximum total duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' - normal_mean: 0.5 # mean of normal distribution for padding duration - normal_std: 2.0 # standard deviation of normal distribution for padding duration + normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' + normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' augmentor: white_noise: @@ -67,8 +71,8 @@ model: max_duration: 30 # you may need to update it for your dataset min_duration: 0.1 defer_setup: true - batch_duration: 300 # you may disable batch_duration by setting it to `null` - batch_size: null + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 shuffle: false num_workers: 8 pin_memory: true @@ -85,8 +89,8 @@ model: max_duration: 30 # you may need to update it for your dataset min_duration: 0.1 defer_setup: true - batch_duration: 300 # you may disable batch_duration by setting it to `null` - batch_size: null 
+ batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 shuffle: false num_workers: 8 pin_memory: true diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index d3709b0f2285..0ae3dff25f54 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -285,7 +285,7 @@ def _get_frame_labels(self, cut: Cut, num_samples: int): if not isinstance(is_backchannel, list): is_backchannel = [is_backchannel] assert len(sou_time) == len( - cut.custom["is_backchannel"] + is_backchannel ), f"Number of SOU and backchannel do not match: SOU ({len(sou_time)}) vs backchannel ({len(is_backchannel)})" else: is_backchannel = [False] * len(sou_time) @@ -322,7 +322,7 @@ def _get_text_tokens(self, cut: Cut): if not isinstance(is_backchannel, list): is_backchannel = [is_backchannel] assert len(utterances) == len( - cut.custom["is_backchannel"] + is_backchannel ), f"Number of utterances and backchannel do not match: utterance ({len(utterances)}) vs backchannel ({len(is_backchannel)})" else: is_backchannel = [False] * len(utterances) diff --git a/scripts/asr_end_of_utterance/conf/data.yaml b/scripts/asr_end_of_utterance/conf/data.yaml index 70f68b81a855..a4731e6cce8f 100644 --- a/scripts/asr_end_of_utterance/conf/data.yaml +++ b/scripts/asr_end_of_utterance/conf/data.yaml @@ -2,6 +2,7 @@ output_dir: ??? data: + pattern: "*.json" manifest_filepath: ??? tarred_audio_filepaths: null sample_rate: 16000 diff --git a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py index b91189d36b3b..f6f4185a2739 100644 --- a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py +++ b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py @@ -27,6 +27,7 @@ """ +from copy import deepcopy from pathlib import Path from shutil import rmtree @@ -46,7 +47,7 @@ from nemo.core.config import hydra_runner from nemo.utils import logging -# Dummy labels for the tokenizer +# Dummy labels for the dummy tokenizer labels = [ " ", "a", @@ -94,13 +95,21 @@ def main(cfg): torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False + # Patch data config + with open_dict(cfg.data): + cfg.data.force_finite = True + cfg.data.force_map_dataset = True + cfg.data.shuffle = False + cfg.data.check_tokenizer = False # No need to check tokenizer in LhotseSpeechToTextBpeEOUDataset + # Make output directory output_dir = Path(cfg.output_dir) - if output_dir.exists(): + if output_dir.exists() and cfg.get('overwrite', False): logging.info(f'Removing existing output directory: {output_dir}') rmtree(output_dir) - logging.info(f'Creating output directory: {output_dir}') - output_dir.mkdir(parents=True, exist_ok=True) + if not output_dir.exists(): + logging.info(f'Creating output directory: {output_dir}') + output_dir.mkdir(parents=True, exist_ok=True) # Dump the config to the output directory config = OmegaConf.to_container(cfg, resolve=True) @@ -108,10 +117,29 @@ def main(cfg): yaml.dump(config, f) logging.info(f'Config dumped to {output_dir / "config.yaml"}') + input_manifest_file = Path(cfg.data.manifest_filepath) + if input_manifest_file.is_dir(): + pattern = cfg.data.get('pattern', '*.json') + manifest_list = list(input_manifest_file.glob(pattern)) + if not manifest_list: + raise ValueError(f"No files found in {input_manifest_file} matching pattern `{pattern}`") + else: + manifest_list = [Path(x) for 
x in str(input_manifest_file).split(",")] + + logging.info(f'Found {len(manifest_list)} manifest files to process...') + + for i, manifest_file in enumerate(manifest_list): + logging.info(f'[{i+1}/{len(manifest_list)}] Processing {manifest_file}...') + data_cfg = deepcopy(cfg.data) + data_cfg.manifest_filepath = str(manifest_file) + process_manifest(data_cfg, output_dir) + + +def process_manifest(data_cfg, output_dir): # Load the input manifest - input_manifest = read_manifest(cfg.data.manifest_filepath) - logging.info(f'Found {len(input_manifest)} items in input manifest: {cfg.data.manifest_filepath}') - manifest_parent_dir = Path(cfg.data.manifest_filepath).parent + input_manifest = read_manifest(data_cfg.manifest_filepath) + logging.info(f'Found {len(input_manifest)} items in input manifest: {data_cfg.manifest_filepath}') + manifest_parent_dir = Path(data_cfg.manifest_filepath).parent if Path(input_manifest[0]["audio_filepath"]).is_absolute(): output_audio_dir = output_dir / 'wav' flatten_audio_path = True @@ -119,19 +147,12 @@ def main(cfg): output_audio_dir = output_dir flatten_audio_path = False - # Patch data config - with open_dict(cfg.data): - cfg.data.force_finite = True - cfg.data.force_map_dataset = True - cfg.data.shuffle = False - cfg.data.check_tokenizer = False # No need to check tokenizer in LhotseSpeechToTextBpeEOUDataset - # Load the dataset tokenizer = parsers.make_parser(labels) # dummy tokenizer - dataset = LhotseSpeechToTextBpeEOUDataset(cfg=cfg.data, tokenizer=tokenizer, return_cuts=True) + dataset = LhotseSpeechToTextBpeEOUDataset(cfg=data_cfg, tokenizer=tokenizer, return_cuts=True) dataloader = get_lhotse_dataloader_from_config( - config=cfg.data, + config=data_cfg, global_rank=0, world_size=1, dataset=dataset, @@ -160,9 +181,9 @@ def main(cfg): audio_file = cut.recording.sources[0].source if flatten_audio_path: - output_audio_file = output_audio_dir / str(audio_file).replace('/', '_')[:255] + output_audio_file = output_audio_dir / str(audio_file).replace('/', '_')[:255] # type: Path else: - output_audio_file = output_audio_dir / Path(audio_file).relative_to(manifest_parent_dir) + output_audio_file = output_audio_dir / Path(audio_file).relative_to(manifest_parent_dir) # type: Path output_audio_file.parent.mkdir(parents=True, exist_ok=True) sf.write(output_audio_file, audio, dataset.sample_rate) @@ -174,7 +195,7 @@ def main(cfg): manifest.append(manifest_item) # Write the output manifest - output_manifest_file = output_dir / Path(cfg.data.manifest_filepath).name + output_manifest_file = output_dir / Path(data_cfg.manifest_filepath).name write_manifest(output_manifest_file, manifest) logging.info(f'Output manifest written to {output_manifest_file}') diff --git a/tools/nemo_forced_aligner/align_eou.py b/tools/nemo_forced_aligner/align_eou.py index 1a38418751af..13a99e5d6d2c 100644 --- a/tools/nemo_forced_aligner/align_eou.py +++ b/tools/nemo_forced_aligner/align_eou.py @@ -17,6 +17,7 @@ import math import os import shutil +import uuid from dataclasses import dataclass, field, is_dataclass from pathlib import Path from string import punctuation @@ -166,6 +167,24 @@ class AlignmentConfig: remove_tmp_dir: bool = False clean_text: bool = True + # For multi-node multi-gpu processing + num_nodes: int = 1 # total num of nodes/machines + num_gpus: int = 1 # num of GPUs per node/machine + node_idx: int = 0 # current node index + gpu_idx: int = 0 # current GPU index + + +def drop_pnc(text): + """ + Clean the text by removing invalid characters and converting to lowercase. 
+ + :param text: Input text. + :return: Cleaned text. + """ + valid_chars = "abcdefghijklmnopqrstuvwxyz'" + text = text.lower() + return ''.join([c for c in text if c in valid_chars or c.isspace() or c == "'"]) + def clean_text(manifest: List[dict]): punctuations = punctuation.replace("'", "") @@ -173,15 +192,10 @@ def clean_text(manifest: List[dict]): replace_with_blank = [char for char in '`¨´‘’“”`ʻ‘’“"‘”'] replace_with_apos = [char for char in '‘’ʻ‘’‘'] - valid_chars = "abcdefghijklmnopqrstuvwxyz'" for i in range(len(manifest)): text = manifest[i]["text"].strip().lower() # type: str text = text.translate(str.maketrans("", "", punctuations)) - new_text = "" - for c in text: - if c in valid_chars: - new_text += c - text = new_text + text = drop_pnc(text) for c in replace_with_blank: text = text.replace(c, "") for c in replace_with_apos: @@ -190,6 +204,36 @@ def clean_text(manifest: List[dict]): return manifest +def get_manifests_for_this_rank(manifest_list, num_nodes, num_gpus, node_idx, gpu_idx): + """ + Get the manifest files for this rank. + """ + if len(manifest_list) == 0: + return manifest_list + + assert num_nodes > 0, "num_nodes must be greater than 0" + assert num_gpus > 0, "num_gpus must be greater than 0" + assert 0 <= node_idx < num_nodes, f"node_idx {node_idx} must be between 0 and {num_nodes - 1}" + assert 0 <= gpu_idx < num_gpus, f"gpu_idx {gpu_idx} must be between 0 and {num_gpus - 1}" + + manifests_this_node = [] + for i, manifest_file in enumerate(manifest_list): + if num_nodes > 1: + if i % num_nodes == node_idx: + manifests_this_node.append(manifest_file) + else: + manifests_this_node.append(manifest_file) + + manifests_this_gpu = [] + for i, manifest_file in enumerate(manifests_this_node): + if num_gpus > 1: + if i % num_gpus == gpu_idx: + manifests_this_gpu.append(manifest_file) + else: + manifests_this_gpu.append(manifest_file) + return manifests_this_gpu + + @hydra_runner(config_name="AlignmentConfig", schema=AlignmentConfig) def main(cfg: AlignmentConfig): @@ -208,8 +252,8 @@ def main(cfg: AlignmentConfig): if cfg.manifest_filepath is None: raise ValueError("cfg.manifest_filepath must be specified") - if cfg.output_dir is None: - raise ValueError("cfg.output_dir must be specified") + if cfg.output_dir is None and not cfg.remove_tmp_dir: + raise ValueError("cfg.output_dir must be specified if cfg.remove_tmp_dir is False") if cfg.batch_size < 1: raise ValueError("cfg.batch_size cannot be zero or a negative number") @@ -330,6 +374,8 @@ def main(cfg: AlignmentConfig): ) origin_output_manifest_filepath = cfg.output_manifest_filepath + + manifest_list = get_manifests_for_this_rank(manifest_list, cfg.num_nodes, cfg.num_gpus, cfg.node_idx, cfg.gpu_idx) logging.info(f"Found {len(manifest_list)} manifest files to process.") # process each manifest file for manifest_filepath in manifest_list: @@ -337,9 +383,8 @@ def main(cfg: AlignmentConfig): cfg.manifest_filepath = str(manifest_filepath) if origin_output_manifest_filepath is None: - cfg.output_manifest_filepath = str( - Path(manifest_filepath).parent / f"{Path(manifest_filepath).stem}-aligned.json" - ) + manifest_stem = Path(manifest_filepath).stem.replace("-aligned", "") + cfg.output_manifest_filepath = str(Path(manifest_filepath).parent / f"{manifest_stem}-aligned.json") elif len(manifest_list) > 1 and origin_output_manifest_filepath is not None: raise ValueError( "cfg.output_manifest_filepath must be None when processing multiple manifest files. 
" @@ -356,7 +401,7 @@ def main(cfg: AlignmentConfig): logging.info("All manifest files processed successfully.") -def process_single_manifest(cfg, model, buffered_chunk_params, viterbi_device): +def process_single_manifest(cfg: AlignmentConfig, model, buffered_chunk_params, viterbi_device): # Validate manifest contents if not is_entry_in_all_lines(cfg.manifest_filepath, "audio_filepath"): raise RuntimeError( @@ -384,6 +429,9 @@ def process_single_manifest(cfg, model, buffered_chunk_params, viterbi_device): # init output_timestep_duration = None and we will calculate and update it during the first batch output_timestep_duration = None + if cfg.remove_tmp_dir and cfg.output_dir is None: + cfg.output_dir = f"alignment-{uuid.uuid4()}" + # init f_manifest_out os.makedirs(cfg.output_dir, exist_ok=True) tgt_manifest_name = str(Path(cfg.manifest_filepath).stem) + "_with_output_file_paths.json" @@ -466,7 +514,7 @@ def process_single_manifest(cfg, model, buffered_chunk_params, viterbi_device): for item in output_manifest_lines: f.write(json.dumps(item) + '\n') - if cfg.remove_tmp_dir: # savely removing tmp dir after alignment + if cfg.remove_tmp_dir: # safely removing tmp dir after alignment for file_or_folder in [ tgt_manifest_filepath, os.path.join(cfg.output_dir, 'ctm'), From 78dbb45be08017e0c2bcadcaf438de080df1eec3 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 12 May 2025 15:35:54 -0400 Subject: [PATCH 027/107] fix rnnt_decoding for empty string Signed-off-by: stevehuang52 --- nemo/collections/asr/parts/submodules/rnnt_decoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index 01e61998b0f0..6cbf9863169f 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -1207,7 +1207,7 @@ def _get_segment_offsets( previous_word_index = i continue - elif word[-1] in segment_delimiter_tokens or word in segment_delimiter_tokens: + elif len(word) and (word[-1] in segment_delimiter_tokens or word in segment_delimiter_tokens): segment_words.append(word) if segment_words: segment_offsets.append( From cd6da4068e82ef0428bdfdf891d3f4c7cd4ea91c Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 12 May 2025 16:28:11 -0400 Subject: [PATCH 028/107] update cfg Signed-off-by: stevehuang52 --- .../conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 7c7abe0f027b..82101374f651 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -253,15 +253,14 @@ model: optim: name: adamw - lr: 5.0 + lr: 1e-4 # optimizer arguments betas: [0.9, 0.98] weight_decay: 1e-3 # scheduler setup sched: - name: NoamAnnealing - d_model: ${model.encoder.d_model} + name: CosineAnnealing # scheduler config override warmup_steps: 10000 warmup_ratio: null From f8cf80ab571fdbdb6c391822333521a97b4eb0c7 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 12 May 2025 17:58:33 -0400 Subject: [PATCH 029/107] update cfg Signed-off-by: stevehuang52 --- .../conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 82101374f651..b51632644bca 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -10,7 +10,7 @@ name: "FastConformer-Transducer-BPE-Streaming-EOU" model: token_init_method: "min" # choices=['min', 'max', 'mean', 'constant'] token_init_weight_value: 0.01 # only applicable when token_init_method='constant' - token_init_bias_value: -500.0 # only applicable when token_init_method='constant' + token_init_bias_value: -800.0 # only applicable when token_init_method='constant' sample_rate: 16000 compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. From 9362c21758b671252c41dd21ddfc64a3d2d7a009 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 13 May 2025 10:54:27 -0400 Subject: [PATCH 030/107] update padding augment Signed-off-by: stevehuang52 --- ...astconformer_transducer_bpe_streaming.yaml | 3 ++- .../asr/data/audio_to_eou_label_lhotse.py | 21 +++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index b51632644bca..0710d678465c 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -43,7 +43,8 @@ model: random_padding: prob: 0.9 min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds - max_total_duration: 35.0 # maximum total duration of the padded audio in seconds + max_pad_duration: 3.0 # maximum duration of pre/post padding in seconds + max_total_duration: 30.0 # maximum total duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 0ae3dff25f54..2f34b1159212 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -74,6 +74,7 @@ class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): random_padding: # Random padding configuration prob: 0.9 # probability of applying padding min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds + max_pad_duration: 2.0 # maximum duration of pre/post padding in seconds max_total_duration: 30.0 # maximum total duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution for padding duration @@ -354,6 +355,7 @@ def _random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_ta """ p = np.random.rand() if self.padding_cfg is None or p > self.padding_cfg.prob: + # don't apply padding return audio, audio_len, eou_targets duration = audio_len.item() / self.cfg.sample_rate @@ -363,11 +365,12 @@ def _random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_ta # apply padding audio = audio[:audio_len] + 
max_padding_duration = max(0, self.padding_cfg.max_total_duration - duration) - if max_padding_duration <= self.padding_cfg.min_pad_duration: + if max_padding_duration <= 2 * self.padding_cfg.min_pad_duration: min_padding_duration = 0 else: - min_padding_duration = self.padding_cfg.min_pad_duration + min_padding_duration = 2 * self.padding_cfg.min_pad_duration if self.padding_cfg.pad_distribution == 'uniform': total_padding_duration = np.random.uniform(min_padding_duration, max_padding_duration) @@ -377,8 +380,18 @@ def _random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_ta else: raise ValueError(f"Unknown padding distribution: {self.padding_cfg.pad_distribution}") - pre_padding_duration = np.random.uniform(0, total_padding_duration) - post_padding_duration = total_padding_duration - pre_padding_duration + if min_padding_duration == 0: + pre_padding_duration = total_padding_duration / 2 + post_padding_duration = total_padding_duration / 2 + else: + pre_padding_duration = np.random.uniform( + min_padding_duration, total_padding_duration - min_padding_duration + ) + post_padding_duration = total_padding_duration - pre_padding_duration + + if self.padding_cfg.max_pad_duration is not None: + pre_padding_duration = min(pre_padding_duration, self.padding_cfg.max_pad_duration) + post_padding_duration = min(post_padding_duration, self.padding_cfg.max_pad_duration) pre_padding_len = math.ceil(pre_padding_duration * self.cfg.sample_rate) post_padding_len = math.ceil(post_padding_duration * self.cfg.sample_rate) From 9513e4236b546df855dc2fb58081e06a6f1e5e76 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 13 May 2025 11:31:27 -0400 Subject: [PATCH 031/107] update Signed-off-by: stevehuang52 --- examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index a8184121846b..e173b44141bc 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -187,7 +187,7 @@ def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_pat # set the EOU and EOB class to the mean value of the pretrained model value = pretrained_joint_clf_weight.mean(dim=0) elif token_init_method == 'constant': - value = cfg.model.get('token_init_weight_value', 0.01) + value = cfg.model.get('token_init_weight_value', None) elif token_init_method: raise ValueError(f"Unknown token_init_method: {token_init_method}.") @@ -200,7 +200,7 @@ def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_pat joint_state['2.bias'][-1] = pretrained_joint_clf_bias[-1] # blank class value = None if token_init_method == 'constant': - value = cfg.model.get('token_init_bias_value', -600.0) + value = cfg.model.get('token_init_bias_value', None) elif token_init_method == 'min': # set the EOU and EOB class to the minimum value of the pretrained model value = pretrained_joint_clf_bias.min() From 516d9f4f2c93e0f457bbb8e7be730292873a384a Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 13 May 2025 14:00:26 -0400 Subject: [PATCH 032/107] update Signed-off-by: stevehuang52 --- .../fastconformer_transducer_bpe_streaming.yaml | 11 ++++++----- scripts/asr_end_of_utterance/conf/data.yaml | 1 + .../generate_noisy_eval_data.py | 13 ++++++++++++- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git 
a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 0710d678465c..f7149cb3215a 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -8,9 +8,9 @@ name: "FastConformer-Transducer-BPE-Streaming-EOU" model: - token_init_method: "min" # choices=['min', 'max', 'mean', 'constant'] - token_init_weight_value: 0.01 # only applicable when token_init_method='constant' - token_init_bias_value: -800.0 # only applicable when token_init_method='constant' + token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] + token_init_weight_value: null # only applicable when token_init_method='constant' + token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' sample_rate: 16000 compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. @@ -254,15 +254,16 @@ model: optim: name: adamw - lr: 1e-4 + lr: 1e-4 # 1e-4 # optimizer arguments betas: [0.9, 0.98] weight_decay: 1e-3 # scheduler setup sched: - name: CosineAnnealing + name: CosineAnnealing # NoamAnnealing CosineAnnealing # scheduler config override + # d_model: ${model.encoder.d_model} warmup_steps: 10000 warmup_ratio: null min_lr: 1e-6 diff --git a/scripts/asr_end_of_utterance/conf/data.yaml b/scripts/asr_end_of_utterance/conf/data.yaml index a4731e6cce8f..93056488edf4 100644 --- a/scripts/asr_end_of_utterance/conf/data.yaml +++ b/scripts/asr_end_of_utterance/conf/data.yaml @@ -23,6 +23,7 @@ data: random_padding: prob: 0.5 min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds + max_pad_duration: 5.0 # maximum duration of pre/post padding in seconds max_total_duration: 30.0 # maximum total duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution for padding duration diff --git a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py index f6f4185a2739..5a0c00476fbd 100644 --- a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py +++ b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py @@ -16,7 +16,7 @@ """ This script is used to generate noisy evaluation data for ASR and end of utterance detection. 
-Example usage: +Example usage with a single manifest input: python generate_noisy_eval_data.py \ --config-path conf/ \ --config-name data \ @@ -25,6 +25,17 @@ data.seed=42 \ data.noise.manifest_path /path/to/noise_manifest.json + +Example usage with multiple manifests matching a pattern: +python generate_noisy_eval_data.py \ + --config-path conf/ \ + --config-name data \ + output_dir=/path/to/output/dir \ + data.manifest_filepath=/path/to/manifest/dir/ \ + data.pattern="*.json" \ + data.seed=42 \ + data.noise.manifest_path /path/to/noise_manifest.json + """ from copy import deepcopy From a68531fd74e3a566e008f068129d47658295a251 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 13 May 2025 14:38:42 -0400 Subject: [PATCH 033/107] update cfg Signed-off-by: stevehuang52 --- .../asr_eou/fastconformer_transducer_bpe_streaming.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index f7149cb3215a..08ce859ea1eb 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -43,7 +43,7 @@ model: random_padding: prob: 0.9 min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds - max_pad_duration: 3.0 # maximum duration of pre/post padding in seconds + max_pad_duration: 5.0 # maximum duration of pre/post padding in seconds max_total_duration: 30.0 # maximum total duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' @@ -254,16 +254,16 @@ model: optim: name: adamw - lr: 1e-4 # 1e-4 + lr: 5.0 # 1e-4 # optimizer arguments betas: [0.9, 0.98] weight_decay: 1e-3 # scheduler setup sched: - name: CosineAnnealing # NoamAnnealing CosineAnnealing + name: NoamAnnealing # NoamAnnealing CosineAnnealing # scheduler config override - # d_model: ${model.encoder.d_model} + d_model: ${model.encoder.d_model} warmup_steps: 10000 warmup_ratio: null min_lr: 1e-6 From 1915e02933398cc8223e16a5bba77c63d595a6ad Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 14 May 2025 10:28:03 -0400 Subject: [PATCH 034/107] fix eob metric logging Signed-off-by: stevehuang52 --- nemo/collections/asr/models/asr_eou_models.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 9662dc4b8b67..92a7c7cbd2d2 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -466,14 +466,16 @@ def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str eou_metrics.extend(x[f'{mode}_eou_metrics']) eob_metrics.extend(x[f'{mode}_eob_metrics']) - num_utterances = sum([x.num_utterances for x in eou_metrics]) + num_eou_utterances = sum([x.num_utterances for x in eou_metrics]) eou_latency = flatten_nested_list([x.latency for x in eou_metrics]) eou_early_cutoff = flatten_nested_list([x.early_cutoff for x in eou_metrics]) + + num_eob_utterances = sum([x.num_utterances for x in eob_metrics]) eob_latency = flatten_nested_list([x.latency for x in eob_metrics]) eob_early_cutoff = flatten_nested_list([x.early_cutoff for x in eob_metrics]) - eou_avg_num_early_cutoff = len(eou_early_cutoff) / num_utterances - 
eob_avg_num_early_cutoff = len(eob_early_cutoff) / num_utterances + eou_avg_num_early_cutoff = len(eou_early_cutoff) / num_eou_utterances + eob_avg_num_early_cutoff = len(eob_early_cutoff) / num_eob_utterances if len(eou_latency) == 0: eou_latency = [0.0] if len(eou_early_cutoff) == 0: @@ -525,8 +527,8 @@ def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str tensorboard_logs[f'{mode}_eou_early_cutoff_avg_num'] = eou_avg_num_early_cutoff tensorboard_logs[f'{mode}_eob_early_cutoff_avg_num'] = eob_avg_num_early_cutoff - tensorboard_logs[f'{mode}_eou_missing'] = sum(eou_missing) / num_utterances - tensorboard_logs[f'{mode}_eob_missing'] = sum(eob_missing) / num_utterances + tensorboard_logs[f'{mode}_eou_missing'] = sum(eou_missing) / num_eou_utterances + tensorboard_logs[f'{mode}_eob_missing'] = sum(eob_missing) / num_eob_utterances return {**loss_log, 'log': tensorboard_logs} From 3114f2c62fe53c84279476c3e879cf3bb6fd2e57 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 15 May 2025 10:34:48 -0400 Subject: [PATCH 035/107] refactor and add hybrid model Signed-off-by: stevehuang52 --- ...eou_eval.py => speech_to_text_eou_eval.py} | 6 +- .../speech_to_text_hybrid_eou_train.py | 331 ++++++++ ...r_hybrid_transducer_ctc_bpe_streaming.yaml | 333 ++++++++ ...astconformer_transducer_bpe_streaming.yaml | 5 +- nemo/collections/asr/models/asr_eou_models.py | 762 ++++++++++++------ nemo/collections/asr/modules/rnnt.py | 4 +- 6 files changed, 1193 insertions(+), 248 deletions(-) rename examples/asr/asr_eou/{speech_to_text_rnnt_eou_eval.py => speech_to_text_eou_eval.py} (90%) create mode 100644 examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py create mode 100644 examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_eval.py b/examples/asr/asr_eou/speech_to_text_eou_eval.py similarity index 90% rename from examples/asr/asr_eou/speech_to_text_rnnt_eou_eval.py rename to examples/asr/asr_eou/speech_to_text_eou_eval.py index 1aca0df1c9aa..122f1739d76e 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_eval.py +++ b/examples/asr/asr_eou/speech_to_text_eou_eval.py @@ -17,7 +17,7 @@ import torch from omegaconf import OmegaConf, open_dict -from nemo.collections.asr.models.asr_eou_models import EncDecRNNTBPEEOUModel +from nemo.collections.asr.models import ASRModel from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -27,10 +27,10 @@ def load_model(cfg, trainer): if "init_from_nemo_model" in cfg: logging.info(f"Loading model from local file: {cfg.init_from_nemo_model}") - model = EncDecRNNTBPEEOUModel.restore_from(cfg.init_from_nemo_model, trainer=trainer) + model = ASRModel.restore_from(cfg.init_from_nemo_model, trainer=trainer) elif "init_from_pretrained_model" in cfg: logging.info(f"Loading model from remote: {cfg.init_from_pretrained_model}") - model = EncDecRNNTBPEEOUModel.from_pretrained(cfg.init_from_pretrained_model, trainer=trainer) + model = ASRModel.from_pretrained(cfg.init_from_pretrained_model, trainer=trainer) else: raise ValueError( "Please provide either 'init_from_nemo_model' or 'init_from_pretrained_model' in the config file." 
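A note on the recurring weight-surgery pattern: the patches above (for the RNNT joint network) and the hybrid-model script below (for the CTC decoder) both copy a pretrained classifier into a classifier that has two extra rows for the new `<EOU>` and `<EOB>` tokens, and fill only those two rows according to `token_init_method`. The following is a minimal, self-contained sketch of that idea under stated assumptions; `expand_classifier`, the toy shapes, and the row layout `[tokens..., <EOU>, <EOB>, blank]` are illustrative and are not the actual NeMo module code.

```python
from typing import Optional, Tuple

import torch


def expand_classifier(
    weight: torch.Tensor,  # [num_classes + 1, hid]: token rows followed by blank
    bias: torch.Tensor,    # [num_classes + 1]
    token_init_method: str = "constant",
    weight_value: Optional[float] = None,
    bias_value: Optional[float] = -1000.0,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Grow the classifier by two rows (<EOU>, <EOB>) inserted just before blank."""
    num_old, hid = weight.shape
    new_weight = weight.new_zeros(num_old + 2, hid)
    new_bias = bias.new_zeros(num_old + 2)

    # Copy all original token rows and keep blank as the last row.
    new_weight[:-3] = weight[:-1]
    new_weight[-1] = weight[-1]
    new_bias[:-3] = bias[:-1]
    new_bias[-1] = bias[-1]

    # Initialize the two new rows (<EOU> at index -3, <EOB> at index -2).
    if token_init_method == "min":
        w_init, b_init = weight.min(dim=0).values, bias.min()
    elif token_init_method == "max":
        w_init, b_init = weight.max(dim=0).values, bias.max()
    elif token_init_method == "mean":
        w_init, b_init = weight.mean(dim=0), bias.mean()
    elif token_init_method == "constant":
        w_init, b_init = weight_value, bias_value  # None leaves the new rows as zeros in this sketch
    else:
        raise ValueError(f"Unknown token_init_method: {token_init_method}")

    if w_init is not None:
        new_weight[-3] = w_init
        new_weight[-2] = w_init
    if b_init is not None:
        new_bias[-3] = b_init
        new_bias[-2] = b_init
    return new_weight, new_bias


if __name__ == "__main__":
    w, b = torch.randn(11, 640), torch.randn(11)  # toy size: 10 tokens + blank
    nw, nb = expand_classifier(w, b, token_init_method="constant", bias_value=-1000.0)
    print(nw.shape, nb.shape)  # torch.Size([13, 640]) torch.Size([13])
```

With `token_init_method: "constant"` and a strongly negative `token_init_bias_value` (e.g. -1000.0, as in the updated YAML), the new `<EOU>`/`<EOB>` logits start out heavily suppressed, which presumably lets the fine-tuned model behave like the pretrained ASR at the start of training and learn to emit the new tokens gradually.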
diff --git a/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py b/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py new file mode 100644 index 000000000000..ac070a8cb578 --- /dev/null +++ b/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py @@ -0,0 +1,331 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Example usage: + +0. Prepare dataset based on /nemo/collections/asr/data/audio_to_eou_label_lhotse.py + +1. Add special tokens and to the tokenizer of pretrained model, by refering to the script + /scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py + +2. If pretrained model is HybridRNNTCTCBPEModel, convert it to RNNT using the script + /examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py + +3. Run the following command to train the ASR-EOU model: +```bash +#!/bin/bash + +NEMO_PATH=/home/heh/codes/nemo-eou +export PYTHONPATH=$NEMO_PATH:$PYTHONPATH + +TRAIN_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json +VAL_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json +NOISE_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/noise_manifest.json + +PRETRAINED_NEMO=/media/data3/pretrained_models/nemo_asr/stt_en_fastconformer_hybrid_large_streaming_80ms_rnnt.nemo +TOKENIZER_DIR=/media/data3/pretrained_models/nemo_asr/tokenizers/stt_en_fastconformer_hybrid_large_streaming_80ms_eou + +BATCH_DURATION=30 +NUM_WORKERS=0 +LIMIT_TRAIN_BATCHES=100 +VAL_CHECK_INTERVAL=100 +MAX_STEPS=1000000 + +EXP_NAME=fastconformer_transducer_bpe_streaming_eou_debug + +SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_rnnt_eou.py +CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/fastconformer/cache_aware_streaming +CONFIG_NAME=fastconformer_transducer_bpe_streaming + +CUDA_VISIBLE_DEVICES=0 python $SCRIPT \ + --config-path $CONFIG_PATH \ + --config-name $CONFIG_NAME \ + ++init_from_nemo_model=$PRETRAINED_NEMO \ + model.encoder.att_context_size="[70,1]" \ + model.tokenizer.dir=$TOKENIZER_DIR \ + model.train_ds.manifest_filepath=$TRAIN_MANIFEST \ + model.train_ds.augmentor.noise.manifest_path=$NOISE_MANIFEST \ + model.validation_ds.manifest_filepath=$VAL_MANIFEST \ + model.train_ds.batch_duration=$BATCH_DURATION \ + model.train_ds.num_workers=$NUM_WORKERS \ + model.validation_ds.batch_duration=$BATCH_DURATION \ + model.validation_ds.num_workers=$NUM_WORKERS \ + ~model.test_ds \ + trainer.limit_train_batches=$LIMIT_TRAIN_BATCHES \ + trainer.val_check_interval=$VAL_CHECK_INTERVAL \ + trainer.max_steps=$MAX_STEPS \ + exp_manager.name=$EXP_NAME +``` + +""" + + +from typing import Optional + +import lightning.pytorch as pl +from omegaconf import DictConfig, OmegaConf + +from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCBPEModel, EncDecRNNTBPEModel +from nemo.collections.asr.models.asr_eou_models import EncDecHybridRNNTCTCBPEEOUModel, EncDecRNNTBPEEOUModel +from 
nemo.collections.asr.modules.conv_asr import ConvASRDecoder +from nemo.collections.asr.modules.rnnt import RNNTDecoder, RNNTJoint +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg + + +def get_pretrained_model_name(cfg: DictConfig) -> Optional[str]: + if hasattr(cfg, 'init_from_ptl_ckpt') and cfg.init_from_ptl_ckpt is not None: + raise NotImplementedError( + "Currently for simplicity of single script for all model types, we only support `init_from_nemo_model` and `init_from_pretrained_model`" + ) + nemo_model_path = cfg.get('init_from_nemo_model', None) + pretrained_name = cfg.get('init_from_pretrained_model', None) + if nemo_model_path is not None and pretrained_name is not None: + raise ValueError("Only pass `init_from_nemo_model` or `init_from_pretrained_model` but not both") + elif nemo_model_path is None and pretrained_name is None: + return None + + if nemo_model_path: + return nemo_model_path + if pretrained_name: + return pretrained_name + + +def init_from_pretrained_nemo(model: EncDecHybridRNNTCTCBPEModel, pretrained_model_path: str, cfg: DictConfig): + """ + load the pretrained model from a .nemo file, taking into account the joint network + """ + if pretrained_model_path.endswith('.nemo'): + pretrained_model = ASRModel.restore_from(restore_path=pretrained_model_path) # type: EncDecRNNTBPEModel + else: + try: + pretrained_model = ASRModel.from_pretrained(pretrained_model_path) # type: EncDecRNNTBPEModel + except Exception as e: + raise ValueError(f"Could not load pretrained model from {pretrained_model_path}.") from e + + if not isinstance(pretrained_model, (EncDecRNNTBPEModel, EncDecHybridRNNTCTCBPEModel)): + raise ValueError( + f"Pretrained model {pretrained_model.__class__} is not EncDecRNNTBPEModel or EncDecHybridRNNTCTCBPEModel." + ) + + # Load encoder state dict into the model + model.encoder.load_state_dict(pretrained_model.encoder.state_dict(), strict=True) + logging.info(f"Encoder weights loaded from {pretrained_model_path}.") + + # Load decoder state dict into the model + decoder = model.decoder # type: RNNTDecoder + pretrained_decoder = pretrained_model.decoder # type: RNNTDecoder + if not isinstance(decoder, RNNTDecoder) or not isinstance(pretrained_decoder, RNNTDecoder): + raise ValueError( + f"Decoder {decoder.__class__} is not RNNTDecoder or pretrained decoder {pretrained_decoder.__class__} is not RNNTDecoder." + ) + + decoder.prediction["dec_rnn"].load_state_dict(pretrained_decoder.prediction["dec_rnn"].state_dict(), strict=True) + + decoder_embed_states = decoder.prediction["embed"].state_dict()['weight'] # shape: [num_classes+2, hid_dim] + pretrained_decoder_embed_states = pretrained_decoder.prediction["embed"].state_dict()[ + 'weight' + ] # shape: [num_classes, hid_dim] + if decoder_embed_states.shape[0] != pretrained_decoder_embed_states.shape[0] + 2: + raise ValueError( + f"Size mismatched between pretrained ({pretrained_decoder_embed_states.shape[0]}+2) and current model ({decoder_embed_states.shape[0]}), skip loading decoder embedding." 
+ ) + + decoder_embed_states[:-3, :] = pretrained_decoder_embed_states[:-1, :] # everything except EOU, EOB and blank + decoder_embed_states[-1, :] = pretrained_decoder_embed_states[-1, :] # blank class + decoder.prediction["embed"].load_state_dict({"weight": decoder_embed_states}, strict=True) + logging.info(f"Decoder weights loaded from {pretrained_model_path}.") + + # Load joint network weights if new model's joint network has two more classes than the pretrained model + joint_network = model.joint # type: RNNTJoint + pretrained_joint_network = pretrained_model.joint # type: RNNTJoint + assert isinstance(joint_network, RNNTJoint), f"Joint network {joint_network.__class__} is not RNNTJoint." + assert isinstance( + pretrained_joint_network, RNNTJoint + ), f"Pretrained joint network {pretrained_joint_network.__class__} is not RNNTJoint." + joint_network.pred.load_state_dict(pretrained_joint_network.pred.state_dict(), strict=True) + joint_network.enc.load_state_dict(pretrained_joint_network.enc.state_dict(), strict=True) + + if joint_network.num_classes_with_blank != pretrained_joint_network.num_classes_with_blank + 2: + raise ValueError( + f"Size mismatched between pretrained ({pretrained_joint_network.num_classes_with_blank}+2) and current model ({joint_network.num_classes_with_blank}), skip loading joint network." + ) + + # Load the joint network weights + pretrained_joint_state = pretrained_joint_network.joint_net.state_dict() + joint_state = joint_network.joint_net.state_dict() + pretrained_joint_clf_weight = pretrained_joint_state['2.weight'] # shape: [num_classes, hid_dim] + pretrained_joint_clf_bias = pretrained_joint_state['2.bias'] if '2.bias' in pretrained_joint_state else None + + token_init_method = cfg.model.get('token_init_method', 'constant') + # Copy the weights and biases from the pretrained model to the new model + # shape: [num_classes+2, hid_dim] + joint_state['2.weight'][:-3, :] = pretrained_joint_clf_weight[:-1, :] # everything except EOU, EOB and blank + joint_state['2.weight'][-1, :] = pretrained_joint_clf_weight[-1, :] # blank class + + value = None + if token_init_method == 'min': + # set the EOU and EOB class to the minimum value of the pretrained model + value = pretrained_joint_clf_weight.min(dim=0)[0] + elif token_init_method == 'max': + # set the EOU and EOB class to the maximum value of the pretrained model + value = pretrained_joint_clf_weight.max(dim=0)[0] + elif token_init_method == 'mean': + # set the EOU and EOB class to the mean value of the pretrained model + value = pretrained_joint_clf_weight.mean(dim=0) + elif token_init_method == 'constant': + value = cfg.model.get('token_init_weight_value', None) + elif token_init_method: + raise ValueError(f"Unknown token_init_method: {token_init_method}.") + + if value is not None: + joint_state['2.weight'][-2, :] = value # EOB class + joint_state['2.weight'][-3, :] = value # EOU class + + if pretrained_joint_clf_bias is not None and '2.bias' in joint_state: + joint_state['2.bias'][:-3] = pretrained_joint_clf_bias[:-1] # everything except EOU, EOB and blank + joint_state['2.bias'][-1] = pretrained_joint_clf_bias[-1] # blank class + value = None + if token_init_method == 'constant': + value = cfg.model.get('token_init_bias_value', None) + elif token_init_method == 'min': + # set the EOU and EOB class to the minimum value of the pretrained model + value = pretrained_joint_clf_bias.min() + elif token_init_method == 'max': + # set the EOU and EOB class to the maximum value of the pretrained model + value = 
pretrained_joint_clf_bias.max() + elif token_init_method == 'mean': + # set the EOU and EOB class to the mean value of the pretrained model + value = pretrained_joint_clf_bias.mean() + elif token_init_method: + raise ValueError(f"Unknown token_init_method: {token_init_method}.") + + if value is not None: + joint_state['2.bias'][-2] = value # EOB class + joint_state['2.bias'][-3] = value # EOU class + + # Load the joint network weights + joint_network.joint_net.load_state_dict(joint_state, strict=True) + logging.info(f"Joint network weights loaded from {pretrained_model_path}.") + + # Load the CTC decoder weights if the model is EncDecHybridRNNTCTCBPEEOUModel + if not hasattr(model, 'ctc_decoder') or not isinstance(model, EncDecHybridRNNTCTCBPEEOUModel): + return + if not hasattr(pretrained_model, 'ctc_decoder') or not isinstance(pretrained_model, EncDecHybridRNNTCTCBPEModel): + raise ValueError( + f"CTC decoder {model.ctc_decoder.__class__} is not EncDecHybridRNNTCTCBPEEOUModel or pretrained CTC decoder {pretrained_model.ctc_decoder.__class__} is not EncDecHybridRNNTCTCBPEModel." + ) + + ctc_decoder = model.ctc_decoder # type: ConvASRDecoder + pretrained_ctc_decoder = pretrained_model.ctc_decoder # type: ConvASRDecoder + assert isinstance(ctc_decoder, ConvASRDecoder), f"CTC decoder {ctc_decoder.__class__} is not ConvASRDecoder." + assert isinstance( + pretrained_ctc_decoder, ConvASRDecoder + ), f"Pretrained CTC decoder {pretrained_ctc_decoder.__class__} is not ConvASRDecoder." + + ctc_decoder_state = ctc_decoder.state_dict() + pretrained_ctc_decoder_state = pretrained_ctc_decoder.state_dict() + + if ctc_decoder._num_classes == pretrained_ctc_decoder._num_classes: + logging.info("CTC decoder weights loaded from pretrained model with same shape.") + ctc_decoder.load_state_dict(pretrained_ctc_decoder_state, strict=True) + return + elif ctc_decoder._num_classes != pretrained_ctc_decoder._num_classes + 2: + raise ValueError( + f"Size mismatched between pretrained ({pretrained_ctc_decoder._num_classes}+2) and current model ({ctc_decoder._num_classes}), skip loading CTC decoder." 
+ ) + + pretrained_weight = pretrained_ctc_decoder_state['decoder_layers.0.weight'] # shape: [num_classes, hid_dim, 1] + pretrained_bias = ( + pretrained_ctc_decoder_state['decoder_layers.0.bias'] + if 'decoder_layers.0.bias' in pretrained_ctc_decoder_state + else None + ) # shape: [num_classes] + + # Copy the weights and biases from the pretrained model to the new model + ctc_decoder_state['decoder_layers.0.weight'][:-3, :, :] = pretrained_weight[ + :-1, :, : + ] # everything except EOU, EOB and blank + ctc_decoder_state['decoder_layers.0.weight'][-1, :, :] = pretrained_weight[-1, :, :] # blank class + value = None + if token_init_method == 'min': + # set the EOU and EOB class to the minimum value of the pretrained model + value = pretrained_weight.min(dim=0)[0] + elif token_init_method == 'max': + # set the EOU and EOB class to the maximum value of the pretrained model + value = pretrained_weight.max(dim=0)[0] + elif token_init_method == 'mean': + # set the EOU and EOB class to the mean value of the pretrained model + value = pretrained_weight.mean(dim=0) + elif token_init_method == 'constant': + value = cfg.model.get('token_init_weight_value', None) + elif token_init_method: + raise ValueError(f"Unknown token_init_method: {token_init_method}.") + + if value is not None: + ctc_decoder_state['decoder_layers.0.weight'][-2, :] = value # EOB class + ctc_decoder_state['decoder_layers.0.weight'][-3, :] = value # EOU class + + if pretrained_bias is not None and 'decoder_layers.0.bias' in ctc_decoder_state: + ctc_decoder_state['decoder_layers.0.bias'][:-3] = pretrained_bias[:-1] # everything except EOU, EOB and blank + ctc_decoder_state['decoder_layers.0.bias'][-1] = pretrained_bias[-1] # blank class + value = None + if token_init_method == 'constant': + value = cfg.model.get('token_init_bias_value', None) + elif token_init_method == 'min': + # set the EOU and EOB class to the minimum value of the pretrained model + value = pretrained_bias.min() + elif token_init_method == 'max': + # set the EOU and EOB class to the maximum value of the pretrained model + value = pretrained_bias.max() + elif token_init_method == 'mean': + # set the EOU and EOB class to the mean value of the pretrained model + value = pretrained_bias.mean() + elif token_init_method: + raise ValueError(f"Unknown token_init_method: {token_init_method}.") + if value is not None: + ctc_decoder_state['decoder_layers.0.bias'][-2] = value + ctc_decoder_state['decoder_layers.0.bias'][-3] = value + + # Load the CTC decoder weights + model.ctc_decoder.load_state_dict(ctc_decoder_state, strict=True) + logging.info(f"CTC decoder weights loaded from {pretrained_model_path}.") + return + + +@hydra_runner(config_path="../conf/asr_eou", config_name="fastconformer_transducer_bpe_streaming") +def main(cfg): + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) + exp_manager(trainer, cfg.get("exp_manager", None)) + + asr_model = EncDecHybridRNNTCTCBPEEOUModel(cfg=cfg.model, trainer=trainer) + + init_from_model = get_pretrained_model_name(cfg) + if init_from_model: + init_from_pretrained_nemo(asr_model, init_from_model, cfg) + + trainer.fit(asr_model) + + if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: + if asr_model.prepare_test(trainer): + trainer.test(asr_model) + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml 
b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml new file mode 100644 index 000000000000..21dce1e4f743 --- /dev/null +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml @@ -0,0 +1,333 @@ +# It contains the default values for training a cache-aware streaming FastConformer-Hybrid-Transducer-CTC ASR model, large size (~115M) with sub-word encoding. +# The model would have two decoders: RNNT (Transducer) and CTC + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +# Note: if training loss does not converge, you may increase warm-up to 20K. + +name: "FastConformer-Hybrid-Transducer-CTC-BPE-Streaming-EOU" + +model: + token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] + token_init_weight_value: null # only applicable when token_init_method='constant' + token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' + + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + random_padding: + prob: 0.9 + min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds + max_pad_duration: 5.0 # maximum duration of pre/post padding in seconds + max_total_duration: 30.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' + normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' + + augmentor: + white_noise: + prob: 0.5 + min_level: -90 + max_level: -46 + gain: + prob: 0.5 + min_gain_dbfs: -10.0 + max_gain_dbfs: 10.0 + noise: + prob: 0.6 + manifest_path: ??? + min_snr_db: 0 + max_snr_db: 20 + max_gain_db: 300.0 + + validation_ds: + manifest_filepath: ??? 
+ tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + drop_last: true + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + test_ds: + manifest_filepath: null + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + drop_last: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. 
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] + att_context_size: [70, 13] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
+ + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # The section which would contain the decoder and decoding configs of the auxiliary CTC decoder + aux_ctc: + ctc_loss_weight: 0.3 # the weight used to combine the CTC loss with the RNNT loss + use_cer: false + ctc_reduction: 'mean_batch' + decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: null + num_classes: -1 + vocabulary: [] + decoding: + strategy: "greedy" + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + warprnnt_numba_kwargs: + # FastEmit regularization: https://arxiv.org/abs/2010.11148 + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. + + optim: + name: adamw + lr: 5.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: 100000 # computed at runtime if not set + val_check_interval: 1000 # an int for number of iterations + limit_train_batches: ${trainer.val_check_interval} + accelerator: auto + strategy: + _target_: lightning.pytorch.strategies.DDPStrategy + gradient_as_bucket_view: true + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + use_distributed_sampler: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
+ resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 08ce859ea1eb..976a84383da8 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -32,6 +32,7 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: true + drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 @@ -77,6 +78,7 @@ model: shuffle: false num_workers: 8 pin_memory: true + drop_last: true quadratic_duration: 30 num_buckets: 30 num_cuts_for_bins_estimate: 10000 @@ -84,7 +86,7 @@ model: shuffle_buffer_size: 10000 test_ds: - manifest_filepath: ??? + manifest_filepath: null tarred_audio_filepaths: null sample_rate: ${model.sample_rate} max_duration: 30 # you may need to update it for your dataset @@ -93,6 +95,7 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: false + drop_last: false num_workers: 8 pin_memory: true quadratic_duration: 30 diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 92a7c7cbd2d2..26420fda1dbb 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -29,7 +29,7 @@ LhotseSpeechToTextBpeEOUDataset, ) from nemo.collections.asr.metrics.wer import WER -from nemo.collections.asr.models import EncDecRNNTBPEModel +from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel, EncDecRNNTBPEModel from nemo.collections.asr.parts.utils.eou_utils import ( EOUResult, cal_eou_metrics_from_frame_labels, @@ -42,7 +42,7 @@ from nemo.core.classes.mixins import AccessMixin from nemo.utils import logging -__all__ = ['EncDecRNNTBPEEOUModel'] +__all__ = ['EncDecRNNTBPEEOUModel', 'EncDecHybridRNNTCTCBPEEOUModel'] @dataclass @@ -53,7 +53,269 @@ class EOUPrediction: eob_preds: Optional[List[bool]] = None -class EncDecRNNTBPEEOUModel(EncDecRNNTBPEModel): +class ASREOUModelMixin: + def _patch_decoding_cfg(self, cfg: DictConfig): + """ + Patch the decoding config as needed for EOU computation + """ + with open_dict(cfg): + cfg.decoding.preserve_alignments = True + cfg.decoding.compute_timestamps = True + + def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any: + """ + PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device + """ + batch = move_data_to_device(batch, device) + return batch + + def _get_text_from_tokens(self, tokens: torch.Tensor, tokens_len: Optional[torch.Tensor] = None) -> List[str]: + """ + Convert tokens to text. + Args: + tokens: tensor of tokens + Returns: + text: list of text + """ + text_list = [] + for i in range(len(tokens)): + tokens_i = tokens[i] + if tokens_len is not None: + tokens_i = tokens[i][: tokens_len[i]] + tokens_i = [int(x) for x in tokens_i if x < self.tokenizer.vocab_size] + text = self.tokenizer.ids_to_text(tokens_i) + text_list.append(text) + return text_list + + def _get_eou_predictions_from_hypotheses( + self, hypotheses: List[Hypothesis], batch: AudioToTextEOUBatch + ) -> List[EOUPrediction]: + """ + Get EOU predictions from the hypotheses. 
+ Args: + hypotheses: batch of hypotheses + Returns: + eou_predictions: list of EOU predictions + """ + eou_predictions = [] + + for hyp in hypotheses: + # Process one hypothesis at a time + eou_probs = [] + eob_probs = [] + eou_preds = [] + eob_preds = [] + if isinstance(hyp.alignments, tuple): + alignments = [hyp.alignments] # CTC + else: + alignments = hyp.alignments # RNNT + for alignment in alignments: + # Process for each timestamp + if isinstance(alignment, tuple): + # CTC + probs = torch.softmax(alignment[0], dim=-1) + tokens = alignment[1] + else: + # RNNT, alignment is a list of tuples + probs = torch.softmax(torch.stack([a[0] for a in alignment], dim=0), dim=-1) # unfold RNNT preds + tokens = torch.stack([a[1] for a in alignment], dim=0) # unfold RNNT preds + + # Get the max prob for eou and eob + # and check if eou and eob are predicted + max_eou_prob = probs[:, self.eou_token].max().item() + max_eob_prob = probs[:, self.eob_token].max().item() + eou_pred = torch.any(tokens == self.eou_token).item() + eob_pred = torch.any(tokens == self.eob_token).item() + + eou_probs.append(max_eou_prob) + eob_probs.append(max_eob_prob) + eou_preds.append(eou_pred) + eob_preds.append(eob_pred) + + eou_predictions.append( + EOUPrediction( + eou_probs=eou_probs, + eob_probs=eob_probs, + eou_preds=eou_preds, + eob_preds=eob_preds, + ) + ) + + return eou_predictions + + def _pad_to_same_length(self, eou_labels: List[float], eou_preds: List[float]) -> Tuple[List[float], List[float]]: + """ + Pad the EOU labels and predictions to the same length. + Args: + eou_labels: list of EOU labels + eou_preds: list of EOU predictions + Returns: + eou_labels: list of EOU labels, padded to the same length + eou_preds: list of EOU predictions, padded to the same length + """ + if len(eou_labels) < len(eou_preds): + eou_labels = eou_labels + [0] * (len(eou_preds) - len(eou_labels)) + elif len(eou_labels) > len(eou_preds): + eou_preds = eou_preds + [0] * (len(eou_labels) - len(eou_preds)) + return eou_labels, eou_preds + + def _calculate_eou_metrics( + self, eou_predictions: List[EOUPrediction], batch: AudioToTextEOUBatch + ) -> Tuple[List, List]: + """ + Calculate EOU metrics. 
+ Args: + eou_predictions: list of EOU predictions + batch: batch of data + Returns: + eou_metrics_list: list of EOU metrics, each is of type EOUResult + eob_metrics_list: list of EOB metrics, each is of type EOUResult + """ + # Get the ground truth EOU labels + eou_labels = batch.eou_targets + eou_labels_len = batch.eou_target_lengths + + # Calculate EOU metrics + eou_metrics_list = [] + eob_metrics_list = [] + for i, eou_prediction in enumerate(eou_predictions): + eou_preds_i = [float(x) for x in eou_prediction.eou_preds] + eob_preds_i = [float(x) for x in eou_prediction.eob_preds] + + eou_labels_i = (eou_labels[i][: eou_labels_len[i]] == EOU_LABEL).float().tolist() + eob_labels_i = (eou_labels[i][: eou_labels_len[i]] == EOB_LABEL).float().tolist() + + # Pad the EOU labels and predictions to the same length with zeros + eou_labels_i, eou_preds_i = self._pad_to_same_length(eou_labels_i, eou_preds_i) + eob_labels_i, eob_preds_i = self._pad_to_same_length(eob_labels_i, eob_preds_i) + + # Calculate EOU metrics + eou_metrics = cal_eou_metrics_from_frame_labels( + prediction=eou_preds_i, + reference=eou_labels_i, + threshold=0.0, + collar=0.0, + frame_len_in_secs=self.frame_len_in_secs, + ) # type: EOUResult + + eob_metrics = cal_eou_metrics_from_frame_labels( + prediction=eob_preds_i, + reference=eob_labels_i, + threshold=0.0, + collar=0.0, + frame_len_in_secs=self.frame_len_in_secs, + ) + + eou_metrics_list.append(eou_metrics) + eob_metrics_list.append(eob_metrics) + + return eou_metrics_list, eob_metrics_list + + def _aggregate_eou_metrics(self, outputs: List[dict], mode: str): + # Aggregate EOU/EOB metrics + eou_metrics = [] # type: List[EOUResult] + eob_metrics = [] # type: List[EOUResult] + for x in outputs: + eou_metrics.extend(x[f'{mode}_eou_metrics']) + eob_metrics.extend(x[f'{mode}_eob_metrics']) + num_eou_utterances = sum([x.num_utterances for x in eou_metrics]) + eou_latency = flatten_nested_list([x.latency for x in eou_metrics]) + eou_early_cutoff = flatten_nested_list([x.early_cutoff for x in eou_metrics]) + + num_eob_utterances = sum([x.num_utterances for x in eob_metrics]) + eob_latency = flatten_nested_list([x.latency for x in eob_metrics]) + eob_early_cutoff = flatten_nested_list([x.early_cutoff for x in eob_metrics]) + + eou_avg_num_early_cutoff = len(eou_early_cutoff) / num_eou_utterances if num_eou_utterances > 0 else 0.0 + eob_avg_num_early_cutoff = len(eob_early_cutoff) / num_eob_utterances if num_eob_utterances > 0 else 0.0 + if len(eou_latency) == 0: + eou_latency = [0.0] + if len(eou_early_cutoff) == 0: + eou_early_cutoff = [0.0] + if len(eob_latency) == 0: + eob_latency = [0.0] + if len(eob_early_cutoff) == 0: + eob_early_cutoff = [0.0] + + eou_missing = [x.missing for x in eou_metrics] + eob_missing = [x.missing for x in eob_metrics] + + eou_latency = torch.tensor(eou_latency) + eou_latency_p90 = torch.quantile(eou_latency, 0.9).item() + eou_latency_p95 = torch.quantile(eou_latency, 0.95).item() + + eou_early_cutoff = torch.tensor(eou_early_cutoff) + eou_early_cutoff_p90 = torch.quantile(eou_early_cutoff, 0.9).item() + eou_early_cutoff_p95 = torch.quantile(eou_early_cutoff, 0.95).item() + + eob_latency = torch.tensor(eob_latency) + eob_latency_p90 = torch.quantile(eob_latency, 0.9).item() + eob_latency_p95 = torch.quantile(eob_latency, 0.95).item() + + eob_early_cutoff = torch.tensor(eob_early_cutoff) + eob_early_cutoff_p90 = torch.quantile(eob_early_cutoff, 0.9).item() + eob_early_cutoff_p95 = torch.quantile(eob_early_cutoff, 0.95).item() + + tensorboard_logs = {} 
+ tensorboard_logs[f'{mode}_eou_latency_p90'] = eou_latency_p90 + tensorboard_logs[f'{mode}_eou_latency_p95'] = eou_latency_p95 + + tensorboard_logs[f'{mode}_eou_early_cutoff_p90'] = eou_early_cutoff_p90 + tensorboard_logs[f'{mode}_eou_early_cutoff_p95'] = eou_early_cutoff_p95 + + tensorboard_logs[f'{mode}_eob_latency_p90'] = eob_latency_p90 + tensorboard_logs[f'{mode}_eob_latency_p95'] = eob_latency_p95 + + tensorboard_logs[f'{mode}_eob_early_cutoff_p90'] = eob_early_cutoff_p90 + tensorboard_logs[f'{mode}_eob_early_cutoff_p95'] = eob_early_cutoff_p95 + + tensorboard_logs[f'{mode}_eou_early_cutoff_avg_num'] = eou_avg_num_early_cutoff + tensorboard_logs[f'{mode}_eob_early_cutoff_avg_num'] = eob_avg_num_early_cutoff + + tensorboard_logs[f'{mode}_eou_missing'] = ( + sum(eou_missing) / num_eou_utterances if num_eou_utterances > 0 else 0.0 + ) + tensorboard_logs[f'{mode}_eob_missing'] = ( + sum(eob_missing) / num_eob_utterances if num_eob_utterances > 0 else 0.0 + ) + + return tensorboard_logs + + @rank_zero_only + def _maybe_save_predictions(self, outputs: List[Dict], mode: str = "val", dataloader_idx: int = 0): + """ + Save predictions to disk. + Args: + outputs: list of outputs + mode: mode of the model, either 'val' or 'test' + """ + + if not self.cfg.get('save_pred_to_file', None): + return + + output_file = Path(self.cfg.save_pred_to_file) + output_file.parent.mkdir(parents=True, exist_ok=True) + + output_file = output_file.with_suffix(f'.{dataloader_idx}.json') + + manifest = [] + for output in outputs: + for i in range(len(output[f'{mode}_sample_id'])): + item = { + "sample_id": output[f'{mode}_sample_id'][i], + "audio_filepath": output[f'{mode}_audio_filepath'][i], + "eou_text": output[f'{mode}_text_gt'][i], + "eou_pred_text": output[f'{mode}_text_pred'][i], + } + if f"{mode}_text_pred_ctc" in output: + item["eou_pred_text_ctc"] = output[f"{mode}_text_pred_ctc"][i] + manifest.append(item) + write_manifest(output_file, manifest) + logging.info(f"Predictions saved to {output_file}") + return output_file + + +class EncDecRNNTBPEEOUModel(EncDecRNNTBPEModel, ASREOUModelMixin): def __init__(self, cfg: DictConfig, trainer): self._patch_decoding_cfg(cfg) @@ -77,25 +339,6 @@ def __init__(self, cfg: DictConfig, trainer): self.joint.set_loss(self.loss) self.joint.set_wer(self.wer) - def _patch_decoding_cfg(self, cfg: DictConfig): - """ - Patch the decoding config as needed for EOU computation - """ - with open_dict(cfg): - if cfg.decoding.strategy in ['greedy', 'greedy_batch']: - cfg.decoding.greedy.preserve_alignments = True - cfg.decoding.greedy.compute_timestamps = True - elif cfg.decoding.strategy in ['beam', 'tsd', 'alsd', 'maes']: - cfg.decoding.beam.preserve_alignments = True - cfg.decoding.beam.compute_timestamps = True - - def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any: - """ - PTL hook: https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#transfer-batch-to-device - """ - batch = move_data_to_device(batch, device) - return batch - def _setup_dataloader_from_config(self, config: Optional[Dict]): cfg = OmegaConf.create(config) if not isinstance(config, DictConfig) else config dataset = LhotseSpeechToTextBpeEOUDataset( @@ -235,10 +478,12 @@ def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader del signal tensorboard_logs = {} - text_gt = self._get_text_from_tokens(transcript, transcript_len) - tensorboard_logs['val_sample_id'] = batch.sample_ids - tensorboard_logs['val_audio_filepath'] = 
batch.audio_filepaths - tensorboard_logs['val_text_gt'] = text_gt + + if self.cfg.get('save_pred_to_file', None): + text_gt = self._get_text_from_tokens(transcript, transcript_len) + tensorboard_logs['val_sample_id'] = batch.sample_ids + tensorboard_logs['val_audio_filepath'] = batch.audio_filepaths + tensorboard_logs['val_text_gt'] = text_gt # If experimental fused Joint-Loss-WER is not used if not self.joint.fuse_loss_wer: if self.compute_eval_loss: @@ -259,8 +504,11 @@ def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader ) hypotheses = self.wer.get_hypotheses() - text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) - eou_predictions = self.get_eou_predictions_from_hypotheses(hypotheses, batch) + if self.cfg.get('save_pred_to_file', None): + text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) + tensorboard_logs['val_text_pred'] = text_pred + + eou_predictions = self._get_eou_predictions_from_hypotheses(hypotheses, batch) eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) wer, wer_num, wer_denom = self.wer.compute() @@ -271,7 +519,6 @@ def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader tensorboard_logs['val_wer'] = wer tensorboard_logs['val_eou_metrics'] = eou_metrics_list tensorboard_logs['val_eob_metrics'] = eob_metrics_list - tensorboard_logs['val_text_pred'] = text_pred else: # If experimental fused Joint-Loss-WER is used @@ -295,9 +542,12 @@ def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader ) hypotheses = self.joint.get_hypotheses() - text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) - eou_predictions = self.get_eou_predictions_from_hypotheses(hypotheses, batch) + if self.cfg.get('save_pred_to_file', None): + text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) + tensorboard_logs['val_text_pred'] = text_pred + + eou_predictions = self._get_eou_predictions_from_hypotheses(hypotheses, batch) eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) @@ -309,141 +559,257 @@ def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader tensorboard_logs['val_wer'] = wer tensorboard_logs['val_eou_metrics'] = eou_metrics_list tensorboard_logs['val_eob_metrics'] = eob_metrics_list - tensorboard_logs['val_text_pred'] = text_pred self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) return tensorboard_logs - def _get_text_from_tokens(self, tokens: torch.Tensor, tokens_len: Optional[torch.Tensor] = None) -> List[str]: - """ - Convert tokens to text. - Args: - tokens: tensor of tokens - Returns: - text: list of text - """ - text_list = [] - for i in range(len(tokens)): - tokens_i = tokens[i] - if tokens_len is not None: - tokens_i = tokens[i][: tokens_len[i]] - text = self.tokenizer.ids_to_text(tokens_i) - text_list.append(text) - return text_list + def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str = "val"): + assert mode in ['val', 'test'], f"Invalid mode: {mode}. Must be 'val' or 'test'." + self._maybe_save_predictions(outputs, mode=mode, dataloader_idx=dataloader_idx) - def get_eou_predictions_from_hypotheses( - self, hypotheses: List[Hypothesis], batch: AudioToTextEOUBatch - ) -> List[EOUPrediction]: - """ - Get EOU predictions from the hypotheses. 
- Args: - hypotheses: batch of hypotheses - Returns: - eou_predictions: list of EOU predictions - """ - eou_predictions = [] + # Aggregate WER metrics + if self.compute_eval_loss: + loss_mean = torch.stack([x[f'{mode}_loss'] for x in outputs]).mean() + loss_log = {f'{mode}_loss': loss_mean} + else: + loss_log = {} + wer_num = torch.stack([x[f'{mode}_wer_num'] for x in outputs]).sum() + wer_denom = torch.stack([x[f'{mode}_wer_denom'] for x in outputs]).sum() + tensorboard_logs = {**loss_log, f'{mode}_wer': wer_num.float() / wer_denom} - for hyp in hypotheses: - # Process one hypothesis at a time - eou_probs = [] - eob_probs = [] - eou_preds = [] - eob_preds = [] - for alignment in hyp.alignments: - # Process for each timestamp - probs = torch.softmax(torch.stack([a[0] for a in alignment], dim=0), dim=-1) # unfold RNNT preds - tokens = torch.stack([a[1] for a in alignment], dim=0) # unfold RNNT preds - # Get the max prob for eou and eob - # and check if eou and eob are predicted - max_eou_prob = probs[:, self.eou_token].max().item() - max_eob_prob = probs[:, self.eob_token].max().item() - eou_pred = torch.any(tokens == self.eou_token).item() - eob_pred = torch.any(tokens == self.eob_token).item() + eou_metrics = self._aggregate_eou_metrics(outputs, mode=mode) + tensorboard_logs.update(eou_metrics) - eou_probs.append(max_eou_prob) - eob_probs.append(max_eob_prob) - eou_preds.append(eou_pred) - eob_preds.append(eob_pred) + return {**loss_log, 'log': tensorboard_logs} - eou_predictions.append( - EOUPrediction( - eou_probs=eou_probs, - eob_probs=eob_probs, - eou_preds=eou_preds, - eob_preds=eob_preds, + def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): + return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='val') + + def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): + return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='test') + + +class EncDecHybridRNNTCTCBPEEOUModel(EncDecHybridRNNTCTCBPEModel, ASREOUModelMixin): + def __init__(self, cfg: DictConfig, trainer): + self._patch_decoding_cfg(cfg) + if cfg.aux_ctc.get('decoding', None) is not None: + with open_dict(cfg): + cfg.aux_ctc.decoding.preserve_alignments = True + cfg.aux_ctc.decoding.compute_timestamps = True + + super().__init__(cfg=cfg, trainer=trainer) + + self.eou_token = self.tokenizer.token_to_id(EOU_STRING) + self.eob_token = self.tokenizer.token_to_id(EOB_STRING) + self.frame_len_in_secs = self.cfg.preprocessor.window_stride * self.cfg.encoder.subsampling_factor + + self.wer = WER( + decoding=self.decoding, + batch_dim_index=0, + use_cer=self._cfg.get('use_cer', False), + log_prediction=self._cfg.get('log_prediction', True), + dist_sync_on_step=True, + return_hypotheses=True, + ) + + self.ctc_wer = WER( + decoding=self.ctc_decoding, + use_cer=self.cfg.aux_ctc.get('use_cer', False), + dist_sync_on_step=True, + log_prediction=self.cfg.get("log_prediction", False), + return_hypotheses=True, + ) + + # Setup fused Joint step if flag is set + if self.joint.fuse_loss_wer: + self.joint.set_loss(self.loss) + self.joint.set_wer(self.wer) + + def _setup_dataloader_from_config(self, config: Optional[Dict]): + cfg = OmegaConf.create(config) if not isinstance(config, DictConfig) else config + dataset = LhotseSpeechToTextBpeEOUDataset( + cfg=cfg, tokenizer=self.tokenizer, return_cuts=config.get("do_transcribe", False) + ) + return get_lhotse_dataloader_from_config( + config, + # During transcription, the model is initially loaded on the CPU. 
+ # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. + global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), + dataset=dataset, + tokenizer=self.tokenizer, + ) + + def training_step(self, batch: AudioToTextEOUBatch, batch_nb): + signal = batch.audio_signal + signal_len = batch.audio_lengths + transcript = batch.text_tokens + transcript_len = batch.text_token_lengths + + new_batch = (signal, signal_len, transcript, transcript_len) + return super().training_step(new_batch, batch_nb) + + def predict_step(self, batch: AudioToTextEOUBatch, batch_idx, dataloader_idx=0): + signal = batch.audio_signal + signal_len = batch.audio_lengths + transcript = batch.text_tokens + transcript_len = batch.text_token_lengths + sample_ids = batch.sample_ids + new_batch = (signal, signal_len, transcript, transcript_len, sample_ids) + return super().predict_step(new_batch, batch_idx, dataloader_idx) + + def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader_idx: int = 0): + signal = batch.audio_signal + signal_len = batch.audio_lengths + transcript = batch.text_tokens + transcript_len = batch.text_token_lengths + + encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) + del signal + + tensorboard_logs = {} + + if self.cfg.get('save_pred_to_file', None): + text_gt = self._get_text_from_tokens(transcript, transcript_len) + tensorboard_logs['val_sample_id'] = batch.sample_ids + tensorboard_logs['val_audio_filepath'] = batch.audio_filepaths + tensorboard_logs['val_text_gt'] = text_gt + + loss_value = None + + # If experimental fused Joint-Loss-WER is not used + if not self.joint.fuse_loss_wer: + if self.compute_eval_loss: + decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) + joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) + + loss_value = self.loss( + log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length ) + tensorboard_logs['val_loss'] = loss_value + + self.wer.update( + predictions=encoded, + predictions_lengths=encoded_len, + targets=transcript, + targets_lengths=transcript_len, ) - return eou_predictions + hypotheses = self.wer.get_hypotheses() - def _pad_to_same_length(self, eou_labels: List[float], eou_preds: List[float]) -> Tuple[List[float], List[float]]: - """ - Pad the EOU labels and predictions to the same length. - Args: - eou_labels: list of EOU labels - eou_preds: list of EOU predictions - Returns: - eou_labels: list of EOU labels, padded to the same length - eou_preds: list of EOU predictions, padded to the same length - """ - if len(eou_labels) < len(eou_preds): - eou_labels = eou_labels + [0] * (len(eou_preds) - len(eou_labels)) - elif len(eou_labels) > len(eou_preds): - eou_preds = eou_preds + [0] * (len(eou_labels) - len(eou_preds)) - return eou_labels, eou_preds + if self.cfg.get('save_pred_to_file', None): + text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) + tensorboard_logs['val_text_pred'] = text_pred - def _calculate_eou_metrics( - self, eou_predictions: List[EOUPrediction], batch: AudioToTextEOUBatch - ) -> Tuple[List, List]: - """ - Calculate EOU metrics. 
- Args: - eou_predictions: list of EOU predictions - batch: batch of data - Returns: - eou_metrics_list: list of EOU metrics, each is of type EOUResult - eob_metrics_list: list of EOB metrics, each is of type EOUResult - """ - # Get the ground truth EOU labels - eou_labels = batch.eou_targets - eou_labels_len = batch.eou_target_lengths + eou_predictions = self._get_eou_predictions_from_hypotheses(hypotheses, batch) + eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) - # Calculate EOU metrics - eou_metrics_list = [] - eob_metrics_list = [] - for i, eou_prediction in enumerate(eou_predictions): - eou_preds_i = [float(x) for x in eou_prediction.eou_preds] - eob_preds_i = [float(x) for x in eou_prediction.eob_preds] + wer, wer_num, wer_denom = self.wer.compute() + self.wer.reset() - eou_labels_i = (eou_labels[i][: eou_labels_len[i]] == EOU_LABEL).float().tolist() - eob_labels_i = (eou_labels[i][: eou_labels_len[i]] == EOB_LABEL).float().tolist() + tensorboard_logs['val_wer_num'] = wer_num + tensorboard_logs['val_wer_denom'] = wer_denom + tensorboard_logs['val_wer'] = wer + tensorboard_logs['val_eou_metrics'] = eou_metrics_list + tensorboard_logs['val_eob_metrics'] = eob_metrics_list + tensorboard_logs['val_text_pred'] = text_pred - # Pad the EOU labels and predictions to the same length with zeros - eou_labels_i, eou_preds_i = self._pad_to_same_length(eou_labels_i, eou_preds_i) - eob_labels_i, eob_preds_i = self._pad_to_same_length(eob_labels_i, eob_preds_i) + else: + # If experimental fused Joint-Loss-WER is used + compute_wer = True - # Calculate EOU metrics - eou_metrics = cal_eou_metrics_from_frame_labels( - prediction=eou_preds_i, - reference=eou_labels_i, - threshold=0.0, - collar=0.0, - frame_len_in_secs=self.frame_len_in_secs, - ) # type: EOUResult + if self.compute_eval_loss: + decoded, target_len, states = self.decoder(targets=transcript, target_length=transcript_len) + else: + decoded = None + target_len = transcript_len - eob_metrics = cal_eou_metrics_from_frame_labels( - prediction=eob_preds_i, - reference=eob_labels_i, - threshold=0.0, - collar=0.0, - frame_len_in_secs=self.frame_len_in_secs, + # Fused joint step + loss_value, wer, wer_num, wer_denom = self.joint( + encoder_outputs=encoded, + decoder_outputs=decoded, + encoder_lengths=encoded_len, + transcripts=transcript, + transcript_lengths=target_len, + compute_wer=compute_wer, + keep_hypotheses=True, ) + hypotheses = self.joint.get_hypotheses() - eou_metrics_list.append(eou_metrics) - eob_metrics_list.append(eob_metrics) + if self.cfg.get('save_pred_to_file', None): + text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) + tensorboard_logs['val_text_pred'] = text_pred - return eou_metrics_list, eob_metrics_list + eou_predictions = self._get_eou_predictions_from_hypotheses(hypotheses, batch) + + eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) + + if loss_value is not None: + tensorboard_logs['val_loss'] = loss_value + + tensorboard_logs['val_wer_num'] = wer_num + tensorboard_logs['val_wer_denom'] = wer_denom + tensorboard_logs['val_wer'] = wer + tensorboard_logs['val_eou_metrics'] = eou_metrics_list + tensorboard_logs['val_eob_metrics'] = eob_metrics_list + + log_probs = self.ctc_decoder(encoder_output=encoded) + if self.compute_eval_loss: + ctc_loss = self.ctc_loss( + log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len + ) + tensorboard_logs['val_ctc_loss'] = ctc_loss + 
tensorboard_logs['val_rnnt_loss'] = loss_value + loss_value = (1 - self.ctc_loss_weight) * loss_value + self.ctc_loss_weight * ctc_loss + tensorboard_logs['val_loss'] = loss_value + + self.ctc_wer.update( + predictions=log_probs, + targets=transcript, + targets_lengths=transcript_len, + predictions_lengths=encoded_len, + ) + hypotheses_ctc = self.ctc_wer.get_hypotheses() + + if self.cfg.get('save_pred_to_file', None): + text_pred_ctc = self._get_text_from_tokens([x.y_sequence for x in hypotheses_ctc]) + tensorboard_logs['val_text_pred_ctc'] = text_pred_ctc + + eou_predictions_ctc = self._get_eou_predictions_from_hypotheses(hypotheses_ctc, batch) + eou_metrics_list_ctc, eob_metrics_list_ctc = self._calculate_eou_metrics(eou_predictions_ctc, batch) + + ctc_wer, ctc_wer_num, ctc_wer_denom = self.ctc_wer.compute() + self.ctc_wer.reset() + + tensorboard_logs['val_wer_num_ctc'] = ctc_wer_num + tensorboard_logs['val_wer_denom_ctc'] = ctc_wer_denom + tensorboard_logs['val_wer_ctc'] = ctc_wer + tensorboard_logs['val_eou_metrics_ctc'] = eou_metrics_list_ctc + tensorboard_logs['val_eob_metrics_ctc'] = eob_metrics_list_ctc + + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + loss_value, additional_logs = self.add_interctc_losses( + loss_value, + transcript, + transcript_len, + compute_wer=True, + compute_loss=self.compute_eval_loss, + log_wer_num_denom=True, + log_prefix="val_", + ) + if self.compute_eval_loss: + # overriding total loss value. Note that the previous + # rnnt + ctc loss is available in metrics as "val_final_loss" now + tensorboard_logs['val_loss'] = loss_value + tensorboard_logs.update(additional_logs) + # Reset access registry + if AccessMixin.is_access_enabled(self.model_guid): + AccessMixin.reset_registry(self) + + return tensorboard_logs def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str = "val"): assert mode in ['val', 'test'], f"Invalid mode: {mode}. Must be 'val' or 'test'." 
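(Reader's note: the latency and early-cutoff values aggregated in `_aggregate_eou_metrics` come from frame-level alignments scaled by `frame_len_in_secs`, which both EOU model classes compute as the preprocessor window stride times the encoder subsampling factor. A minimal sketch of that conversion is shown below, assuming the default values from the streaming configs in this patch (window_stride=0.01, subsampling_factor=8); the 3-frame offset is a hypothetical illustration, not taken from the patch.)

```python
# Illustrative sketch of the frame-to-seconds mapping used by the EOU metrics.
window_stride = 0.01       # seconds per feature frame (model.preprocessor.window_stride)
subsampling_factor = 8     # encoder downsampling (model.encoder.subsampling_factor)
frame_len_in_secs = window_stride * subsampling_factor  # 0.08 s per encoder output frame

# A hypothetical EOU prediction firing 3 encoder frames after the reference EOU frame
# corresponds to roughly 3 * 0.08 = 0.24 s of latency.
latency_frames = 3
print(round(latency_frames * frame_len_in_secs, 2))  # 0.24
```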
@@ -459,76 +825,17 @@ def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str wer_denom = torch.stack([x[f'{mode}_wer_denom'] for x in outputs]).sum() tensorboard_logs = {**loss_log, f'{mode}_wer': wer_num.float() / wer_denom} - # Aggregate EOU/EOB metrics - eou_metrics = [] # type: List[EOUResult] - eob_metrics = [] # type: List[EOUResult] - for x in outputs: - eou_metrics.extend(x[f'{mode}_eou_metrics']) - eob_metrics.extend(x[f'{mode}_eob_metrics']) - - num_eou_utterances = sum([x.num_utterances for x in eou_metrics]) - eou_latency = flatten_nested_list([x.latency for x in eou_metrics]) - eou_early_cutoff = flatten_nested_list([x.early_cutoff for x in eou_metrics]) - - num_eob_utterances = sum([x.num_utterances for x in eob_metrics]) - eob_latency = flatten_nested_list([x.latency for x in eob_metrics]) - eob_early_cutoff = flatten_nested_list([x.early_cutoff for x in eob_metrics]) - - eou_avg_num_early_cutoff = len(eou_early_cutoff) / num_eou_utterances - eob_avg_num_early_cutoff = len(eob_early_cutoff) / num_eob_utterances - if len(eou_latency) == 0: - eou_latency = [0.0] - if len(eou_early_cutoff) == 0: - eou_early_cutoff = [0.0] - if len(eob_latency) == 0: - eob_latency = [0.0] - if len(eob_early_cutoff) == 0: - eob_early_cutoff = [0.0] - - eou_missing = [x.missing for x in eou_metrics] - eob_missing = [x.missing for x in eob_metrics] - - eou_latency = torch.tensor(eou_latency) - eou_latency_p90 = torch.quantile(eou_latency, 0.9).item() - eou_latency_p95 = torch.quantile(eou_latency, 0.95).item() - eou_latency_p99 = torch.quantile(eou_latency, 0.99).item() - - eou_early_cutoff = torch.tensor(eou_early_cutoff) - eou_early_cutoff_p90 = torch.quantile(eou_early_cutoff, 0.9).item() - eou_early_cutoff_p95 = torch.quantile(eou_early_cutoff, 0.95).item() - eou_early_cutoff_p99 = torch.quantile(eou_early_cutoff, 0.99).item() - - eob_latency = torch.tensor(eob_latency) - eob_latency_p90 = torch.quantile(eob_latency, 0.9).item() - eob_latency_p95 = torch.quantile(eob_latency, 0.95).item() - eob_latency_p99 = torch.quantile(eob_latency, 0.99).item() - - eob_early_cutoff = torch.tensor(eob_early_cutoff) - eob_early_cutoff_p90 = torch.quantile(eob_early_cutoff, 0.9).item() - eob_early_cutoff_p95 = torch.quantile(eob_early_cutoff, 0.95).item() - eob_early_cutoff_p99 = torch.quantile(eob_early_cutoff, 0.99).item() - - tensorboard_logs[f'{mode}_eou_latency_p90'] = eou_latency_p90 - tensorboard_logs[f'{mode}_eou_latency_p95'] = eou_latency_p95 - tensorboard_logs[f'{mode}_eou_latency_p99'] = eou_latency_p99 - - tensorboard_logs[f'{mode}_eou_early_cutoff_p90'] = eou_early_cutoff_p90 - tensorboard_logs[f'{mode}_eou_early_cutoff_p95'] = eou_early_cutoff_p95 - tensorboard_logs[f'{mode}_eou_early_cutoff_p99'] = eou_early_cutoff_p99 - - tensorboard_logs[f'{mode}_eob_latency_p90'] = eob_latency_p90 - tensorboard_logs[f'{mode}_eob_latency_p95'] = eob_latency_p95 - tensorboard_logs[f'{mode}_eob_latency_p99'] = eob_latency_p99 - - tensorboard_logs[f'{mode}_eob_early_cutoff_p90'] = eob_early_cutoff_p90 - tensorboard_logs[f'{mode}_eob_early_cutoff_p95'] = eob_early_cutoff_p95 - tensorboard_logs[f'{mode}_eob_early_cutoff_p99'] = eob_early_cutoff_p99 + if self.ctc_loss_weight > 0: + ctc_wer_num = torch.stack([x['val_wer_num_ctc'] for x in outputs]).sum() + ctc_wer_denom = torch.stack([x['val_wer_denom_ctc'] for x in outputs]).sum() + tensorboard_logs['val_wer_ctc'] = ctc_wer_num.float() / ctc_wer_denom - tensorboard_logs[f'{mode}_eou_early_cutoff_avg_num'] = eou_avg_num_early_cutoff - 
tensorboard_logs[f'{mode}_eob_early_cutoff_avg_num'] = eob_avg_num_early_cutoff + eou_metrics = self._aggregate_eou_metrics(outputs, mode) + tensorboard_logs.update(eou_metrics) - tensorboard_logs[f'{mode}_eou_missing'] = sum(eou_missing) / num_eou_utterances - tensorboard_logs[f'{mode}_eob_missing'] = sum(eob_missing) / num_eob_utterances + eou_metrics_ctc = self._aggregate_eou_metrics(outputs, mode) + for key, value in eou_metrics_ctc.items(): + tensorboard_logs[f'{key}_ctc'] = value return {**loss_log, 'log': tensorboard_logs} @@ -537,34 +844,3 @@ def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='test') - - @rank_zero_only - def _maybe_save_predictions(self, outputs: List[Dict], mode: str = "val", dataloader_idx: int = 0): - """ - Save predictions to disk. - Args: - outputs: list of outputs - mode: mode of the model, either 'val' or 'test' - """ - - if not self.cfg.get('save_pred_to_file', None): - return - - output_file = Path(self.cfg.save_pred_to_file) - output_file.parent.mkdir(parents=True, exist_ok=True) - - output_file = output_file.with_suffix(f'.{dataloader_idx}.json') - - manifest = [] - for output in outputs: - for i in range(len(output[f'{mode}_sample_id'])): - item = { - "sample_id": output[f'{mode}_sample_id'][i], - "audio_filepath": output[f'{mode}_audio_filepath'][i], - "eou_text": output[f'{mode}_text_gt'][i], - "eou_pred_text": output[f'{mode}_text_pred'][i], - } - manifest.append(item) - write_manifest(output_file, manifest) - logging.info(f"Predictions saved to {output_file}") - return output_file diff --git a/nemo/collections/asr/modules/rnnt.py b/nemo/collections/asr/modules/rnnt.py index 785f4b62400e..5670cdfbca2a 100644 --- a/nemo/collections/asr/modules/rnnt.py +++ b/nemo/collections/asr/modules/rnnt.py @@ -1511,7 +1511,9 @@ def get_hypotheses(self): Returns the hypotheses generated during the last forward pass. """ if self.hypotheses is None: - raise ValueError("No hypotheses were generated during the last forward pass.") + raise ValueError( + "No hypotheses were generated during the last forward pass. Did you set keep_hypotheses=True in forward()?" 
+ ) return self.hypotheses def project_encoder(self, encoder_output: torch.Tensor) -> torch.Tensor: From f83dc6fa38959b5ec7a4db2693ca6e9f7d8954af Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 15 May 2025 10:35:31 -0400 Subject: [PATCH 036/107] update cfg Signed-off-by: stevehuang52 --- examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py b/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py index ac070a8cb578..5a123396278e 100644 --- a/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py @@ -307,7 +307,7 @@ def init_from_pretrained_nemo(model: EncDecHybridRNNTCTCBPEModel, pretrained_mod return -@hydra_runner(config_path="../conf/asr_eou", config_name="fastconformer_transducer_bpe_streaming") +@hydra_runner(config_path="../conf/asr_eou", config_name="fastconformer_hybrid_transducer_ctc_bpe_streaming") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') From c4783b170a11bb3b2c2253040ba7b422973cef10 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 16 May 2025 20:11:07 -0400 Subject: [PATCH 037/107] update EOU models Signed-off-by: stevehuang52 --- .../speech_to_text_hybrid_frame_eou_train.py | 152 +++++ ...former_hybrid_asr_frame_eou_streaming.yaml | 419 +++++++++++++ ...r_hybrid_transducer_ctc_bpe_streaming.yaml | 2 +- ...astconformer_transducer_bpe_streaming.yaml | 2 +- nemo/collections/asr/losses/ssl_losses/mlm.py | 11 +- nemo/collections/asr/models/asr_eou_models.py | 557 +++++++++++++++++- .../asr/modules/conformer_encoder.py | 17 +- .../modules/ssl_modules/multi_layer_feat.py | 19 +- 8 files changed, 1140 insertions(+), 39 deletions(-) create mode 100644 examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py create mode 100644 examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml diff --git a/examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py b/examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py new file mode 100644 index 000000000000..ea79980da691 --- /dev/null +++ b/examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py @@ -0,0 +1,152 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Example usage: + +0. Prepare dataset based on /nemo/collections/asr/data/audio_to_eou_label_lhotse.py + +1. Add special tokens and to the tokenizer of pretrained model, by refering to the script + /scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py + +2. If pretrained model is HybridRNNTCTCBPEModel, convert it to RNNT using the script + /examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py + +3. 
Run the following command to train the ASR-EOU model: +```bash +#!/bin/bash + +NEMO_PATH=/home/heh/codes/nemo-eou +export PYTHONPATH=$NEMO_PATH:$PYTHONPATH + +TRAIN_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json +VAL_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json +NOISE_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/noise_manifest.json + +PRETRAINED_NEMO=/media/data3/pretrained_models/nemo_asr/stt_en_fastconformer_hybrid_large_streaming_80ms_rnnt.nemo +TOKENIZER_DIR=/media/data3/pretrained_models/nemo_asr/tokenizers/stt_en_fastconformer_hybrid_large_streaming_80ms_eou + +BATCH_DURATION=30 +NUM_WORKERS=0 +LIMIT_TRAIN_BATCHES=100 +VAL_CHECK_INTERVAL=100 +MAX_STEPS=1000000 + +EXP_NAME=fastconformer_transducer_bpe_streaming_eou_debug + +SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_rnnt_eou.py +CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/fastconformer/cache_aware_streaming +CONFIG_NAME=fastconformer_transducer_bpe_streaming + +CUDA_VISIBLE_DEVICES=0 python $SCRIPT \ + --config-path $CONFIG_PATH \ + --config-name $CONFIG_NAME \ + ++init_from_nemo_model=$PRETRAINED_NEMO \ + model.encoder.att_context_size="[70,1]" \ + model.tokenizer.dir=$TOKENIZER_DIR \ + model.train_ds.manifest_filepath=$TRAIN_MANIFEST \ + model.train_ds.augmentor.noise.manifest_path=$NOISE_MANIFEST \ + model.validation_ds.manifest_filepath=$VAL_MANIFEST \ + model.train_ds.batch_duration=$BATCH_DURATION \ + model.train_ds.num_workers=$NUM_WORKERS \ + model.validation_ds.batch_duration=$BATCH_DURATION \ + model.validation_ds.num_workers=$NUM_WORKERS \ + ~model.test_ds \ + trainer.limit_train_batches=$LIMIT_TRAIN_BATCHES \ + trainer.val_check_interval=$VAL_CHECK_INTERVAL \ + trainer.max_steps=$MAX_STEPS \ + exp_manager.name=$EXP_NAME +``` + +""" + + +from typing import Optional + +import lightning.pytorch as pl +import torch +from omegaconf import DictConfig, OmegaConf + +from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCBPEModel, EncDecRNNTBPEModel +from nemo.collections.asr.models.asr_eou_models import EncDecHybridASRFrameEOUModel +from nemo.collections.asr.modules.conv_asr import ConvASRDecoder +from nemo.collections.asr.modules.rnnt import RNNTDecoder, RNNTJoint +from nemo.core.classes import typecheck +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager +from nemo.utils.trainer_utils import resolve_trainer_cfg + +typecheck.set_typecheck_enabled(False) + + +def load_from_pretrained_model(model: ASRModel, cfg: DictConfig) -> ASRModel: + args = [ + 'init_from_nemo_model', + 'init_from_pretrained_model', + 'init_from_ptl_ckpt', + ] + arg_matches = [(1 if arg in cfg and arg is not None else 0) for arg in args] + + if sum(arg_matches) == 0: + # model weights do not need to be restored + return model + + if sum(arg_matches) > 1: + raise ValueError( + f"Cannot pass more than one model initialization arguments to config!\n" + f"Found : {[args[idx] for idx, arg_present in enumerate(arg_matches) if arg_present]}" + ) + + if cfg.get('init_from_nemo_model', None) is not None: + logging.info(f"Loading pretrained model from local: {cfg.init_from_nemo_model}") + pretrained_model = ASRModel.restore_from(cfg.init_from_nemo_model, map_location='cpu') + pretrained_state_dict = pretrained_model.state_dict() + elif cfg.get('init_from_pretrained_model', None) is not None: + logging.info(f"Loading pretrained model from remote: 
{cfg.init_from_pretrained_model}") + pretrained_model = ASRModel.from_pretrained(cfg.init_from_pretrained_model, map_location='cpu') + pretrained_state_dict = pretrained_model.state_dict() + elif cfg.get('init_from_ptl_ckpt', None) is not None: + logging.info(f"Loading pretrained PTL checkpoint from local: {cfg.init_from_ptl_ckpt}") + pretrained_state_dict = torch.load(cfg.init_from_ptl_ckpt, map_location='cpu', weights_only=False)[ + 'state_dict' + ] + + # Load the pretrained model state dict into the current model + encoder_states = {k: v for k, v in pretrained_state_dict.items() if k.startswith("encoder.")} + model.encoder.load_state_dict(encoder_states, strict=True) + model.load_state_dict(pretrained_state_dict, strict=False) + return model + + +@hydra_runner(config_path="../conf/asr_eou", config_name="fastconformer_hybrid_asr_frame_eou_streaming") +def main(cfg): + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') + + trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) + exp_manager(trainer, cfg.get("exp_manager", None)) + + asr_model = EncDecHybridASRFrameEOUModel(cfg=cfg.model, trainer=trainer) + + asr_model = load_from_pretrained_model(asr_model, cfg) + + trainer.fit(asr_model) + + if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: + if asr_model.prepare_test(trainer): + trainer.test(asr_model) + + +if __name__ == '__main__': + main() # noqa pylint: disable=no-value-for-parameter diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml new file mode 100644 index 000000000000..ceb76ae647be --- /dev/null +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml @@ -0,0 +1,419 @@ +# It contains the default values for training a cache-aware streaming FastConformer-Hybrid-Transducer-CTC ASR model, large size (~115M) with sub-word encoding. +# The model would have two decoders: RNNT (Transducer) and CTC + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +# Note: if training loss does not converge, you may increase warm-up to 20K. + +name: "FastConformer-Hybrid-ASR-Frame-EOU-Streaming" + +model: + token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] + token_init_weight_value: null # only applicable when token_init_method='constant' + token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' + layer_idx_list: [0, -1] # extract features from the first and last layers of ASR encoder + num_eou_classes: 4 + eou_class_weights: null + rnnt_loss_weight: 0.0 + ctc_loss_weight: 0.0 + eou_loss_weight: 1.0 + use_ctc_pred: false + freeze_encoder: true + freeze_ctc: true + freeze_rnnt: true + + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. 
Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + check_tokenizer: false + + random_padding: + prob: 0.9 + min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds + max_pad_duration: 5.0 # maximum duration of pre/post padding in seconds + max_total_duration: 30.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' + normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' + + augmentor: + white_noise: + prob: 0.5 + min_level: -90 + max_level: -46 + gain: + prob: 0.5 + min_gain_dbfs: -10.0 + max_gain_dbfs: 10.0 + noise: + prob: 0.6 + manifest_path: ??? + min_snr_db: 0 + max_snr_db: 20 + max_gain_db: 300.0 + + validation_ds: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + drop_last: true + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + check_tokenizer: false + + test_ds: + manifest_filepath: null + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + drop_last: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + check_tokenizer: false + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. 
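+    # For the single-lookahead setting used below ([70,1]), the formula above gives an
+    # expected look-ahead of 1*8*0.01 = 0.08s (80ms).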
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] + att_context_size: [70, 1] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
+ + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # The section which would contain the decoder and decoding configs of the auxiliary CTC decoder + aux_ctc: + ctc_loss_weight: ${model.ctc_loss_weight} # the weight used to combine the CTC loss with the RNNT loss + use_cer: false + ctc_reduction: 'mean_batch' + decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: null + num_classes: -1 + vocabulary: [] + decoding: + strategy: "greedy" + + aggregator: + _target_: nemo.collections.asr.modules.ssl_modules.multi_layer_feat.Aggregator + mode: "weighted_sum" + weights: null + layer_idx_list: ${model.layer_idx_list} + + eou_encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.encoder.d_model} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 2 + d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 1 # NO subsampling + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. 
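+    # The EOU encoder ties its attention context to the main ASR encoder (att_context_size below)
+    # and, with subsampling_factor=1, applies no additional subsampling.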
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] + att_context_size: ${model.encoder.att_context_size} # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: ${model.encoder.pos_emb_max_len} + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + eou_classifier: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: ${model.eou_encoder.d_model} + num_classes: ${model.num_eou_classes} + add_blank: false + + eou_loss: + weight: ${model.eou_class_weights} + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + warprnnt_numba_kwargs: + # FastEmit regularization: https://arxiv.org/abs/2010.11148 + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. + + optim: + name: adamw + lr: 5.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: 100000 # computed at runtime if not set + val_check_interval: 1000 # an int for number of iterations + limit_train_batches: ${trainer.val_check_interval} + accelerator: auto + strategy: + _target_: lightning.pytorch.strategies.DDPStrategy + gradient_as_bucket_view: true + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. 
+ enable_progress_bar: True + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + use_distributed_sampler: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_eou_macro_acc" + mode: "max" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml index 21dce1e4f743..d76b5269429f 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml @@ -166,7 +166,7 @@ model: # An example of settings for multi-lookahead: # att_context_size: [[70,13],[70,6],[70,1],[70,0]] # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] - att_context_size: [70, 13] # -1 means unlimited context + att_context_size: [70, 1] # -1 means unlimited context att_context_style: chunked_limited # regular or chunked_limited att_context_probs: null diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 976a84383da8..a5a906f6c67a 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -161,7 +161,7 @@ model: # An example of settings for multi-lookahead: # att_context_size: [[70,13],[70,6],[70,1],[70,0]] # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] - att_context_size: [70, 13] # -1 means unlimited context + att_context_size: [70, 1] # -1 means unlimited context att_context_style: chunked_limited # regular or chunked_limited att_context_probs: null diff --git a/nemo/collections/asr/losses/ssl_losses/mlm.py b/nemo/collections/asr/losses/ssl_losses/mlm.py index 424374869c3d..4ed6f580bbb2 100644 --- a/nemo/collections/asr/losses/ssl_losses/mlm.py +++ b/nemo/collections/asr/losses/ssl_losses/mlm.py @@ -65,11 +65,14 @@ def forward( if masks is None: masks = spec_masks - # B,D,T -> B,T,D - masks = masks.transpose(1, 2) + if masks is None: + masks = torch.ones_like(decoder_outputs, dtype=torch.bool) + else: + # B,D,T -> B,T,D + masks = masks.transpose(1, 2) - masks = masks.reshape(masks.shape[0], masks.shape[1] // self.combine_time_steps, -1) - masks = masks.mean(-1) > self.mask_threshold + masks = masks.reshape(masks.shape[0], masks.shape[1] // self.combine_time_steps, -1) + masks = masks.mean(-1) > self.mask_threshold out_masked_only = decoder_outputs[masks] targets = F.pad(targets, 
(0, masks.shape[-1] - targets.shape[-1])) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 26420fda1dbb..fbf613beb0f3 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -17,8 +17,10 @@ from typing import Any, Dict, List, Optional, Tuple import torch +import torch.nn as nn from lightning.pytorch.utilities import rank_zero_only -from omegaconf import DictConfig, OmegaConf, open_dict +from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict +from torchmetrics import Accuracy from nemo.collections.asr.data.audio_to_eou_label_lhotse import ( EOB_LABEL, @@ -30,6 +32,7 @@ ) from nemo.collections.asr.metrics.wer import WER from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel, EncDecRNNTBPEModel +from nemo.collections.asr.modules.conformer_encoder import ConformerMultiLayerFeatureExtractor from nemo.collections.asr.parts.utils.eou_utils import ( EOUResult, cal_eou_metrics_from_frame_labels, @@ -39,6 +42,8 @@ from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.data.utils import move_data_to_device +from nemo.collections.common.losses import CrossEntropyLoss +from nemo.core.classes.common import Serialization from nemo.core.classes.mixins import AccessMixin from nemo.utils import logging @@ -106,31 +111,31 @@ def _get_eou_predictions_from_hypotheses( eou_preds = [] eob_preds = [] if isinstance(hyp.alignments, tuple): - alignments = [hyp.alignments] # CTC + # CTC + probs = torch.softmax(hyp.alignments[0], dim=-1) # [time, num_classes] + tokens = hyp.alignments[1] + eou_probs = probs[:, self.eou_token].tolist() + eob_probs = probs[:, self.eob_token].tolist() + eou_preds = [int(x) == self.eou_token for x in tokens] + eob_preds = [int(x) == self.eob_token for x in tokens] else: - alignments = hyp.alignments # RNNT - for alignment in alignments: - # Process for each timestamp - if isinstance(alignment, tuple): - # CTC - probs = torch.softmax(alignment[0], dim=-1) - tokens = alignment[1] - else: - # RNNT, alignment is a list of tuples + # RNNT, each timestamp has a list of (prob, token) tuples + for alignment in hyp.alignments: + # Process for each timestamp probs = torch.softmax(torch.stack([a[0] for a in alignment], dim=0), dim=-1) # unfold RNNT preds tokens = torch.stack([a[1] for a in alignment], dim=0) # unfold RNNT preds - # Get the max prob for eou and eob - # and check if eou and eob are predicted - max_eou_prob = probs[:, self.eou_token].max().item() - max_eob_prob = probs[:, self.eob_token].max().item() - eou_pred = torch.any(tokens == self.eou_token).item() - eob_pred = torch.any(tokens == self.eob_token).item() + # Get the max prob for eou and eob + # and check if eou and eob are predicted + max_eou_prob = probs[:, self.eou_token].max().item() + max_eob_prob = probs[:, self.eob_token].max().item() + eou_pred = torch.any(tokens == self.eou_token).item() + eob_pred = torch.any(tokens == self.eob_token).item() - eou_probs.append(max_eou_prob) - eob_probs.append(max_eob_prob) - eou_preds.append(eou_pred) - eob_preds.append(eob_pred) + eou_probs.append(max_eou_prob) + eob_probs.append(max_eob_prob) + eou_preds.append(eou_pred) + eob_preds.append(eob_pred) eou_predictions.append( EOUPrediction( @@ -212,6 +217,9 @@ def _calculate_eou_metrics( return eou_metrics_list, eob_metrics_list def _aggregate_eou_metrics(self, outputs: 
List[dict], mode: str): + if f'{mode}_eou_metrics' not in outputs[0]: + return {} + # Aggregate EOU/EOB metrics eou_metrics = [] # type: List[EOUResult] eob_metrics = [] # type: List[EOUResult] @@ -844,3 +852,510 @@ def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='test') + + +class EncDecHybridASRFrameEOUModel(EncDecHybridRNNTCTCBPEModel, ASREOUModelMixin): + def __init__(self, cfg: DictConfig, trainer): + super().__init__(cfg=cfg, trainer=trainer) + self.frame_len_in_secs = self.cfg.preprocessor.window_stride * self.cfg.encoder.subsampling_factor + self.layer_idx_list = self.cfg.get('layer_idx_list', []) + assert isinstance(self.layer_idx_list, (list, ListConfig)), "cfg.layer_idx_list must be a list" + num_encoder_layers = len(self.encoder.layers) + if -1 not in self.layer_idx_list and num_encoder_layers - 1 not in self.layer_idx_list: + self.layer_idx_list.append(num_encoder_layers - 1) + self.encoder = ConformerMultiLayerFeatureExtractor(self.encoder, self.layer_idx_list) + self.aggregator = Serialization.from_config_dict(cfg.aggregator) + self.eou_encoder = Serialization.from_config_dict(cfg.eou_encoder) + self.eou_classifier = Serialization.from_config_dict(cfg.eou_classifier) + self.num_eou_classes = cfg.num_eou_classes + self.rnnt_loss_weight = cfg.rnnt_loss_weight + self.ctc_loss_weight = cfg.ctc_loss_weight + self.eou_loss_weight = cfg.eou_loss_weight + self.use_ctc_pred = cfg.get('use_ctc_pred', False) + self.eou_loss = self._setup_eou_loss() + + if cfg.freeze_encoder: + self.encoder.freeze() + if cfg.freeze_rnnt: + self.decoder.freeze() + self.joint.freeze() + if cfg.freeze_ctc: + self.ctc_decoder.freeze() + + self.macro_accuracy = Accuracy(num_classes=self.num_eou_classes, average='macro', task="multiclass") + + def _setup_eou_loss(self): + if "eou_loss" in self.cfg: + weight = self.cfg.eou_loss.get("weight", None) + if weight in [None, "none", "None"]: + weight = [1.0] * self.num_eou_classes + elif len(weight) != self.num_eou_classes: + raise ValueError( + f"Length of weight must match the number of classes {self.num_eou_classes}, but got {weight}" + ) + logging.info(f"Using cross-entropy with weights: {weight}") + else: + weight = [1.0] * self.num_eou_classes + return CrossEntropyLoss(logits_ndim=3, weight=weight) + + def get_label_masks(self, labels: torch.Tensor, labels_len: torch.Tensor) -> torch.Tensor: + mask = torch.arange(labels.size(1))[None, :].to(labels.device) < labels_len[:, None] + return mask.to(labels.device, dtype=bool) + + def _setup_dataloader_from_config(self, config: Optional[Dict]): + cfg = OmegaConf.create(config) if not isinstance(config, DictConfig) else config + dataset = LhotseSpeechToTextBpeEOUDataset( + cfg=cfg, tokenizer=self.tokenizer, return_cuts=config.get("do_transcribe", False) + ) + return get_lhotse_dataloader_from_config( + config, + # During transcription, the model is initially loaded on the CPU. + # To ensure the correct global_rank and world_size are set, + # these values must be passed from the configuration. 
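+            # When do_transcribe=True, global_rank/world_size therefore come from the config
+            # rather than from the CPU-initialized model.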
+ global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), + world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), + dataset=dataset, + tokenizer=self.tokenizer, + ) + + def forward( + self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None + ): + has_input_signal = input_signal is not None and input_signal_length is not None + has_processed_signal = processed_signal is not None and processed_signal_length is not None + if (has_input_signal ^ has_processed_signal) is False: + raise ValueError( + f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " + " with ``processed_signal`` and ``processed_signal_len`` arguments." + ) + + if not has_processed_signal: + processed_signal, processed_signal_length = self.preprocessor( + input_signal=input_signal, + length=input_signal_length, + ) + + # Spec augment is not applied during evaluation/testing + if self.spec_augmentation is not None and self.training: + processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) + + encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) + return encoded, encoded_len + + def get_eou_prediction( + self, + encoded_all: List[torch.Tensor], + encoded_len_all: List[torch.Tensor], + ctc_pred: Optional[torch.Tensor] = None, + ): + if ctc_pred is not None and self.use_ctc_pred: + encoded_all[-1] = ctc_pred + eou_encoded, eou_encoded_len = self.aggregator(encoded_all, encoded_len_all) + eou_encoded, eou_encoded_len = self.eou_encoder(eou_encoded, eou_encoded_len) + eou_pred = self.eou_classifier(eou_encoded) + return eou_pred, eou_encoded_len + + def trim_eou_preds_labels( + self, + eou_pred: torch.Tensor, + eou_pred_len: torch.Tensor, + eou_labels: torch.Tensor, + eou_labels_len: torch.Tensor, + ): + seq_len = eou_pred.size(1) + if eou_labels.size(1) > seq_len: + eou_labels = eou_labels[:, :seq_len] + eou_labels_len = eou_labels_len.clamp(max=seq_len) + elif eou_labels.size(1) < seq_len: + seq_len = eou_labels.size(1) + eou_pred = eou_pred[:, :seq_len] + eou_pred_len = eou_pred_len.clamp(max=seq_len) + + # get the min between the eou_encoded_len and eou_labels_len + eou_valid_len = torch.min(eou_pred_len, eou_labels_len) + + return eou_pred, eou_labels, eou_valid_len + + def get_eou_loss( + self, + eou_pred: torch.Tensor, + eou_pred_len: torch.Tensor, + eou_labels: torch.Tensor, + eou_labels_len: torch.Tensor, + ): + eou_pred, eou_labels, eou_valid_len = self.trim_eou_preds_labels( + eou_pred, eou_pred_len, eou_labels, eou_labels_len + ) + eou_loss = self.eou_loss( + logits=eou_pred, + labels=eou_labels, + loss_mask=self.get_label_masks(eou_labels, eou_valid_len), + ) + return eou_loss + + def training_step(self, batch: AudioToTextEOUBatch, batch_nb): + signal = batch.audio_signal + signal_len = batch.audio_lengths + transcript = batch.text_tokens + transcript_len = batch.text_token_lengths + eou_labels = batch.eou_targets + eou_labels_len = batch.eou_target_lengths + + # Reset access registry + if AccessMixin.is_access_enabled(self.model_guid): + AccessMixin.reset_registry(self) + + encoded_all, encoded_len_all = self.forward(input_signal=signal, input_signal_length=signal_len) + del signal + + encoded = encoded_all[-1] + encoded_len = encoded_len_all[-1] + + # During training, loss must be computed, so decoder forward is necessary + decoder, target_length, 
states = self.decoder(targets=transcript, target_length=transcript_len) + + if hasattr(self, '_trainer') and self._trainer is not None: + log_every_n_steps = self._trainer.log_every_n_steps + sample_id = self._trainer.global_step + else: + log_every_n_steps = 1 + sample_id = batch_nb + + if (sample_id + 1) % log_every_n_steps == 0: + compute_wer = True + else: + compute_wer = False + + tensorboard_logs = { + 'learning_rate': self._optimizer.param_groups[0]['lr'], + 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), + } + + loss_value = None + if self.rnnt_loss_weight > 0: + # If fused Joint-Loss-WER is not used + if not self.joint.fuse_loss_wer: + # Compute full joint and loss + joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) + loss_value = self.loss( + log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length + ) + + # Add auxiliary losses, if registered + loss_value = self.add_auxiliary_losses(loss_value) + + if compute_wer: + self.wer.update( + predictions=encoded, + predictions_lengths=encoded_len, + targets=transcript, + targets_lengths=transcript_len, + ) + _, scores, words = self.wer.compute() + self.wer.reset() + tensorboard_logs.update({'training_batch_wer': scores.float() / words}) + + else: # If fused Joint-Loss-WER is used + # Fused joint step + loss_value, wer, _, _ = self.joint( + encoder_outputs=encoded, + decoder_outputs=decoder, + encoder_lengths=encoded_len, + transcripts=transcript, + transcript_lengths=transcript_len, + compute_wer=compute_wer, + ) + + # Add auxiliary losses, if registered + loss_value = self.add_auxiliary_losses(loss_value) + + if compute_wer: + tensorboard_logs.update({'training_batch_wer': wer}) + + if self.ctc_loss_weight > 0: + log_probs = self.ctc_decoder(encoder_output=encoded) + ctc_log_probs = log_probs + ctc_loss = self.ctc_loss( + log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len + ) + tensorboard_logs['train_rnnt_loss'] = loss_value + tensorboard_logs['train_ctc_loss'] = ctc_loss + loss_value = (1 - self.ctc_loss_weight) * loss_value + self.ctc_loss_weight * ctc_loss + if compute_wer: + self.ctc_wer.update( + predictions=log_probs, + targets=transcript, + targets_lengths=transcript_len, + predictions_lengths=encoded_len, + ) + ctc_wer, _, _ = self.ctc_wer.compute() + self.ctc_wer.reset() + tensorboard_logs.update({'training_batch_wer_ctc': ctc_wer}) + elif self.use_ctc_pred: + ctc_log_probs = self.ctc_decoder(encoder_output=encoded) + else: + ctc_log_probs = None + + eou_pred, eou_pred_len = self.get_eou_prediction(encoded_all, encoded_len_all, ctc_log_probs) + eou_loss = self.get_eou_loss(eou_pred, eou_pred_len, eou_labels, eou_labels_len) + loss_value = loss_value + self.eou_loss_weight * eou_loss if loss_value is not None else eou_loss + tensorboard_logs['train_eou_loss'] = eou_loss + + # note that we want to apply interctc independent of whether main ctc + # loss is used or not (to allow rnnt + interctc training). 
+ # assuming ``ctc_loss_weight=0.3`` and interctc is applied to a single + # layer with weight of ``0.1``, the total loss will be + # ``loss = 0.9 * (0.3 * ctc_loss + 0.7 * rnnt_loss) + 0.1 * interctc_loss`` + loss_value, additional_logs = self.add_interctc_losses( + loss_value, transcript, transcript_len, compute_wer=compute_wer + ) + tensorboard_logs.update(additional_logs) + tensorboard_logs['train_loss'] = loss_value + # Reset access registry + if AccessMixin.is_access_enabled(self.model_guid): + AccessMixin.reset_registry(self) + + # Log items + self.log_dict(tensorboard_logs) + + # Preserve batch acoustic model T and language model U parameters if normalizing + if self._optim_normalize_joint_txu: + self._optim_normalize_txu = [encoded_len.max(), transcript_len.max()] + + return {'loss': loss_value} + + def predict_step(self, batch: AudioToTextEOUBatch, batch_idx, dataloader_idx=0): + signal = batch.audio_signal + signal_len = batch.audio_lengths + sample_ids = batch.sample_ids + + encoded_all, encoded_len_all = self.forward(input_signal=signal, input_signal_length=signal_len) + del signal + + best_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( + encoder_output=encoded_all[-1], encoded_lengths=encoded_len_all[-1], return_hypotheses=False + ) + if isinstance(sample_ids, torch.Tensor): + sample_ids = sample_ids.cpu().detach().numpy() + + eou_pred, eou_pred_len = self.get_eou_prediction(encoded_all, encoded_len_all) + eou_predictions = [eou_pred[i][: eou_pred_len[i]] for i in range(len(eou_pred))] + return zip(sample_ids, best_hyp_text, eou_predictions) + + def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader_idx: int = 0): + signal = batch.audio_signal + signal_len = batch.audio_lengths + transcript = batch.text_tokens + transcript_len = batch.text_token_lengths + eou_labels = batch.eou_targets + eou_labels_len = batch.eou_target_lengths + + # forward() only performs encoder forward + encoded_all, encoded_len_all = self.forward(input_signal=signal, input_signal_length=signal_len) + del signal + + tensorboard_logs = {} + + if self.cfg.get('save_pred_to_file', None): + text_gt = self._get_text_from_tokens(transcript, transcript_len) + tensorboard_logs['val_sample_id'] = batch.sample_ids + tensorboard_logs['val_audio_filepath'] = batch.audio_filepaths + tensorboard_logs['val_text_gt'] = text_gt + + loss_value = None + encoded = encoded_all[-1] + encoded_len = encoded_len_all[-1] + # If experimental fused Joint-Loss-WER is not used + if not self.joint.fuse_loss_wer: + if self.compute_eval_loss: + decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) + joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) + + loss_value = self.loss( + log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length + ) + tensorboard_logs['val_loss'] = loss_value + + self.wer.update( + predictions=encoded, + predictions_lengths=encoded_len, + targets=transcript, + targets_lengths=transcript_len, + ) + + if self.cfg.get('save_pred_to_file', None): + hypotheses = self.wer.get_hypotheses() + text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) + tensorboard_logs['val_text_pred'] = text_pred + + wer, wer_num, wer_denom = self.wer.compute() + self.wer.reset() + + tensorboard_logs['val_wer_num'] = wer_num + tensorboard_logs['val_wer_denom'] = wer_denom + tensorboard_logs['val_wer'] = wer + else: + # If experimental fused Joint-Loss-WER is used + compute_wer = True + + if 
self.compute_eval_loss: + decoded, target_len, states = self.decoder(targets=transcript, target_length=transcript_len) + else: + decoded = None + target_len = transcript_len + + # Fused joint step + loss_value, wer, wer_num, wer_denom = self.joint( + encoder_outputs=encoded, + decoder_outputs=decoded, + encoder_lengths=encoded_len, + transcripts=transcript, + transcript_lengths=target_len, + compute_wer=compute_wer, + keep_hypotheses=True, + ) + if self.cfg.get('save_pred_to_file', None): + hypotheses = self.joint.get_hypotheses() + text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) + tensorboard_logs['val_text_pred'] = text_pred + + if loss_value is not None: + tensorboard_logs['val_loss'] = loss_value + + tensorboard_logs['val_wer_num'] = wer_num + tensorboard_logs['val_wer_denom'] = wer_denom + tensorboard_logs['val_wer'] = wer + + log_probs = self.ctc_decoder(encoder_output=encoded) + if self.compute_eval_loss: + ctc_loss = self.ctc_loss( + log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len + ) + tensorboard_logs['val_ctc_loss'] = ctc_loss + tensorboard_logs['val_rnnt_loss'] = loss_value + loss_value = (1 - self.ctc_loss_weight) * loss_value + self.ctc_loss_weight * ctc_loss + tensorboard_logs['val_loss'] = loss_value + self.ctc_wer.update( + predictions=log_probs, + targets=transcript, + targets_lengths=transcript_len, + predictions_lengths=encoded_len, + ) + + if self.cfg.get('save_pred_to_file', None): + hypotheses_ctc = self.ctc_wer.get_hypotheses() + text_pred_ctc = self._get_text_from_tokens([x.y_sequence for x in hypotheses_ctc]) + tensorboard_logs['val_text_pred_ctc'] = text_pred_ctc + + ctc_wer, ctc_wer_num, ctc_wer_denom = self.ctc_wer.compute() + self.ctc_wer.reset() + tensorboard_logs['val_wer_num_ctc'] = ctc_wer_num + tensorboard_logs['val_wer_denom_ctc'] = ctc_wer_denom + tensorboard_logs['val_wer_ctc'] = ctc_wer + + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + loss_value, additional_logs = self.add_interctc_losses( + loss_value, + transcript, + transcript_len, + compute_wer=True, + compute_loss=self.compute_eval_loss, + log_wer_num_denom=True, + log_prefix="val_", + ) + if self.compute_eval_loss: + # overriding total loss value. 
Note that the previous + # rnnt + ctc loss is available in metrics as "val_final_loss" now + tensorboard_logs['val_loss'] = loss_value + tensorboard_logs.update(additional_logs) + # Reset access registry + if AccessMixin.is_access_enabled(self.model_guid): + AccessMixin.reset_registry(self) + + # Calculate EOU metrics + eou_pred, eou_pred_len = self.get_eou_prediction(encoded_all, encoded_len_all, log_probs) + + eou_loss = self.get_eou_loss(eou_pred, eou_pred_len, eou_labels, eou_labels_len) + tensorboard_logs['val_eou_loss'] = eou_loss + + eou_pred, eou_labels, eou_valid_len = self.trim_eou_preds_labels( + eou_pred, eou_pred_len, eou_labels, eou_labels_len + ) + + for i in range(eou_pred.size(0)): + self.macro_accuracy.update(preds=eou_pred[i][: eou_valid_len[i]], target=eou_labels[i][: eou_valid_len[i]]) + stats = self.macro_accuracy._final_state() + self.macro_accuracy.reset() + tensorboard_logs['val_eou_acc_stats'] = stats + + eou_predictions = self._get_eou_predictions_from_frames(eou_pred, eou_valid_len) + eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) + + tensorboard_logs['val_eou_metrics'] = eou_metrics_list + tensorboard_logs['val_eob_metrics'] = eob_metrics_list + + return tensorboard_logs + + def _get_eou_predictions_from_frames( + self, eou_pred: torch.Tensor, eou_pred_len: torch.Tensor + ) -> List[EOUPrediction]: + eou_predictions = [] + for i in range(eou_pred.size(0)): + eou_logits_i = eou_pred[i][: eou_pred_len[i]] # [time, num_classes] + eou_probs = eou_logits_i[:, EOU_LABEL].detach().cpu().numpy().tolist() + eob_probs = eou_logits_i[:, EOB_LABEL].detach().cpu().numpy().tolist() + eou_frame_prediction = eou_logits_i.argmax(dim=-1).cpu().numpy().tolist() + eou_preds = [int(x == EOU_LABEL) for x in eou_frame_prediction] + eob_preds = [int(x == EOB_LABEL) for x in eou_frame_prediction] + eou_predictions.append( + EOUPrediction( + eou_probs=eou_probs, + eob_probs=eob_probs, + eou_preds=eou_preds, + eob_preds=eob_preds, + ) + ) + return eou_predictions + + def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str = "val"): + assert mode in ['val', 'test'], f"Invalid mode: {mode}. Must be 'val' or 'test'." 
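+        # Save per-sample predictions if enabled, then aggregate WER and EOU/EOB metrics across batches.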
+ self._maybe_save_predictions(outputs, mode=mode, dataloader_idx=dataloader_idx) + + # Aggregate WER metrics + if self.compute_eval_loss: + loss_mean = torch.stack([x[f'{mode}_loss'] for x in outputs]).mean() + loss_log = {f'{mode}_loss': loss_mean} + else: + loss_log = {} + wer_num = torch.stack([x[f'{mode}_wer_num'] for x in outputs]).sum() + wer_denom = torch.stack([x[f'{mode}_wer_denom'] for x in outputs]).sum() + tensorboard_logs = {**loss_log, f'{mode}_wer': wer_num.float() / wer_denom} + + if self.ctc_loss_weight > 0: + ctc_wer_num = torch.stack([x[f'{mode}_wer_num_ctc'] for x in outputs]).sum() + ctc_wer_denom = torch.stack([x[f'{mode}_wer_denom_ctc'] for x in outputs]).sum() + tensorboard_logs['val_wer_ctc'] = ctc_wer_num.float() / ctc_wer_denom + + eou_metrics = self._aggregate_eou_metrics(outputs, mode) + tensorboard_logs.update(eou_metrics) + + eou_metrics_ctc = self._aggregate_eou_metrics(outputs, mode) + for key, value in eou_metrics_ctc.items(): + tensorboard_logs[f'{key}_ctc'] = value + + self.macro_accuracy.tp = torch.stack([x[f'{mode}_eou_acc_stats'][0] for x in outputs]).sum(axis=0) + self.macro_accuracy.fp = torch.stack([x[f'{mode}_eou_acc_stats'][1] for x in outputs]).sum(axis=0) + self.macro_accuracy.tn = torch.stack([x[f'{mode}_eou_acc_stats'][2] for x in outputs]).sum(axis=0) + self.macro_accuracy.fn = torch.stack([x[f'{mode}_eou_acc_stats'][3] for x in outputs]).sum(axis=0) + macro_accuracy_score = self.macro_accuracy.compute() + self.macro_accuracy.reset() + tensorboard_logs[f'{mode}_eou_macro_acc'] = macro_accuracy_score + + return {**loss_log, 'log': tensorboard_logs} + + def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): + return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='val') + + def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): + return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='test') diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index 11eeccd96d43..26c4f9f10ed1 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -1175,10 +1175,17 @@ def __init__( ): super().__init__() self.encoder = encoder - self.layer_idx_list = [int(l) for l in layer_idx_list] - for x in self.layer_idx_list: - if x < 0 or x >= len(encoder.layers): - raise ValueError(f"layer index {x} out of range [0, {len(encoder.layers)})") + self.num_layers = len(encoder.layers) + self.layer_idx_list = [] + if not layer_idx_list: + layer_idx_list = list(range(self.num_layers)) + for lid in layer_idx_list: + if lid < -self.num_layers or lid >= self.num_layers: + raise ValueError(f"Invalid layer index {lid} for ConformerEncoder with {self.num_layers} layers.") + if lid < 0: + lid = self.num_layers + lid + self.layer_idx_list.append(lid) + self.layer_idx_list.sort() self.enc_access_cfg = { "interctc": { "capture_layers": self.layer_idx_list, @@ -1214,6 +1221,8 @@ def forward( encoded_list = [] encoded_len_list = [] for layer_idx in self.layer_idx_list: + if layer_idx < 0: + layer_idx = self.num_layers + layer_idx try: layer_outputs = total_registry[f"interctc/layer_output_{layer_idx}"] layer_lengths = total_registry[f"interctc/layer_length_{layer_idx}"] diff --git a/nemo/collections/asr/modules/ssl_modules/multi_layer_feat.py b/nemo/collections/asr/modules/ssl_modules/multi_layer_feat.py index b1ff1c1cc74b..10babc492abb 100644 --- a/nemo/collections/asr/modules/ssl_modules/multi_layer_feat.py 
+++ b/nemo/collections/asr/modules/ssl_modules/multi_layer_feat.py @@ -92,14 +92,17 @@ def __init__(self, encoder, aggregator: Optional[Callable] = None, layer_idx_lis super().__init__() self.encoder = encoder self.aggregator = aggregator - self.layer_idx_list = ( - [int(l) for l in layer_idx_list] - if layer_idx_list is not None - else [i for i in range(len(self.encoder.layers))] - ) - for x in self.layer_idx_list: - if x < 0 or x >= len(self.encoder.layers): - raise ValueError(f"layer index {x} out of range [0, {len(self.encoder.layers)})") + self.num_layers = len(encoder.layers) + self.layer_idx_list = [] + if not layer_idx_list: + layer_idx_list = list(range(self.num_layers)) + for lid in layer_idx_list: + if lid < -self.num_layers or lid >= self.num_layers: + raise ValueError(f"Invalid layer index {lid} for ConformerEncoder with {self.num_layers} layers.") + if lid < 0: + lid = self.num_layers + lid + self.layer_idx_list.append(lid) + self.layer_idx_list.sort() logging.info(f"Extracting features from layers {self.layer_idx_list}") self.access_cfg = { "interctc": { From b5fd67fddaaa55e7ba58011284c4deea36e4b0b1 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 16 May 2025 20:17:19 -0400 Subject: [PATCH 038/107] update cfg Signed-off-by: stevehuang52 --- .../asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml index ceb76ae647be..b8f2b05da30d 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml @@ -29,7 +29,7 @@ model: sample_rate: 16000 compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. 
- log_prediction: true # enables logging sample predictions in the output during training + log_prediction: false # enables logging sample predictions in the output during training skip_nan_grad: false model_defaults: From 27a26a9f0670a2e9c4946ef3e6f875013d341df8 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Sat, 17 May 2025 15:33:16 -0400 Subject: [PATCH 039/107] update Signed-off-by: stevehuang52 --- ...former_hybrid_asr_frame_eou_streaming.yaml | 5 ++++- .../asr/data/audio_to_eou_label_lhotse.py | 10 +++++++--- nemo/collections/asr/models/asr_eou_models.py | 20 ++++++++++--------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml index b8f2b05da30d..54fc9ef6d37e 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml @@ -18,7 +18,7 @@ model: token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' layer_idx_list: [0, -1] # extract features from the first and last layers of ASR encoder num_eou_classes: 4 - eou_class_weights: null + eou_class_weights: [1,1,100,100] rnnt_loss_weight: 0.0 ctc_loss_weight: 0.0 eou_loss_weight: 1.0 @@ -56,6 +56,7 @@ model: bucket_buffer_size: 10000 shuffle_buffer_size: 10000 check_tokenizer: false + add_eou_to_text: false random_padding: prob: 0.9 @@ -101,6 +102,7 @@ model: bucket_buffer_size: 10000 shuffle_buffer_size: 10000 check_tokenizer: false + add_eou_to_text: false test_ds: manifest_filepath: null @@ -121,6 +123,7 @@ model: bucket_buffer_size: 10000 shuffle_buffer_size: 10000 check_tokenizer: false + add_eou_to_text: false # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py # We recommend to use vocab size of 1024 with SPE Unigram for most languages diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 2f34b1159212..493e45718269 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -146,6 +146,7 @@ def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, return_cuts: bool ) # 160 samples for every 1ms by default self.num_mel_frame_per_target_frame = int(self.cfg.get('subsampling_factor', 8)) self.add_sep_before_eou = self.cfg.get('add_sep_before_eou', False) + self.add_eou_to_text = self.cfg.get('add_eou_to_text', True) self.padding_cfg = self.cfg.get('random_padding', None) self.augmentor = None self.len_augmentor = None @@ -333,9 +334,12 @@ def _get_text_tokens(self, cut: Cut): if not text: # skip empty utterances continue - eou_string = self.eob_string if is_backchannel[i] else self.eou_string - if self.add_sep_before_eou: - eou_string = " " + eou_string + if self.add_eou_to_text: + eou_string = self.eob_string if is_backchannel[i] else self.eou_string + if self.add_sep_before_eou: + eou_string = " " + eou_string + else: + eou_string = "" total_text += text + eou_string + " " total_text = total_text.strip() return torch.as_tensor(self.tokenizer(total_text)) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index fbf613beb0f3..d654e7033dd4 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -216,16 +216,22 @@ def 
_calculate_eou_metrics( return eou_metrics_list, eob_metrics_list - def _aggregate_eou_metrics(self, outputs: List[dict], mode: str): - if f'{mode}_eou_metrics' not in outputs[0]: + def _aggregate_eou_metrics(self, outputs: List[dict], mode: str, is_ctc: bool = False): + if f'{mode}_eou_metrics' not in outputs[0] and not is_ctc: + return {} + if f'{mode}_eou_metrics_ctc' not in outputs[0] and is_ctc: return {} # Aggregate EOU/EOB metrics eou_metrics = [] # type: List[EOUResult] eob_metrics = [] # type: List[EOUResult] for x in outputs: - eou_metrics.extend(x[f'{mode}_eou_metrics']) - eob_metrics.extend(x[f'{mode}_eob_metrics']) + if is_ctc: + eou_metrics.extend(x[f'{mode}_eou_metrics_ctc']) + eob_metrics.extend(x[f'{mode}_eob_metrics_ctc']) + else: + eou_metrics.extend(x[f'{mode}_eou_metrics']) + eob_metrics.extend(x[f'{mode}_eob_metrics']) num_eou_utterances = sum([x.num_utterances for x in eou_metrics]) eou_latency = flatten_nested_list([x.latency for x in eou_metrics]) eou_early_cutoff = flatten_nested_list([x.early_cutoff for x in eou_metrics]) @@ -841,7 +847,7 @@ def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str eou_metrics = self._aggregate_eou_metrics(outputs, mode) tensorboard_logs.update(eou_metrics) - eou_metrics_ctc = self._aggregate_eou_metrics(outputs, mode) + eou_metrics_ctc = self._aggregate_eou_metrics(outputs, mode, is_ctc=True) for key, value in eou_metrics_ctc.items(): tensorboard_logs[f'{key}_ctc'] = value @@ -1340,10 +1346,6 @@ def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str eou_metrics = self._aggregate_eou_metrics(outputs, mode) tensorboard_logs.update(eou_metrics) - eou_metrics_ctc = self._aggregate_eou_metrics(outputs, mode) - for key, value in eou_metrics_ctc.items(): - tensorboard_logs[f'{key}_ctc'] = value - self.macro_accuracy.tp = torch.stack([x[f'{mode}_eou_acc_stats'][0] for x in outputs]).sum(axis=0) self.macro_accuracy.fp = torch.stack([x[f'{mode}_eou_acc_stats'][1] for x in outputs]).sum(axis=0) self.macro_accuracy.tn = torch.stack([x[f'{mode}_eou_acc_stats'][2] for x in outputs]).sum(axis=0) From 6f1e59b1f1e7248060956a76a5426a6c6df66f50 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 19 May 2025 14:08:31 -0400 Subject: [PATCH 040/107] refactor percentile calculation Signed-off-by: stevehuang52 --- nemo/collections/asr/models/asr_eou_models.py | 60 ++++++++++--------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index d654e7033dd4..bed8089217e9 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -16,8 +16,8 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple +import numpy as np import torch -import torch.nn as nn from lightning.pytorch.utilities import rank_zero_only from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict from torchmetrics import Accuracy @@ -216,6 +216,25 @@ def _calculate_eou_metrics( return eou_metrics_list, eob_metrics_list + def _get_percentiles(self, values: List[float], percentiles: List[float], tag: str = "") -> Dict[str, float]: + """ + Get the percentiles of a list of values. 
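+        When a tag is given, keys of the returned dict are formatted as '{tag}_p{percentile}',
+        e.g. 'val_eou_latency_p90'.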
+ Args: + values: list of values + percentiles: list of percentiles + Returns: + metrics: Dict of percentiles + """ + if len(values) == 0: + return [0.0] * len(percentiles) + results = np.percentile(values, percentiles).tolist() + metrics = {} + if tag: + tag += "_" + for i, p in enumerate(percentiles): + metrics[f'{tag}p{int(p)}'] = float(results[i]) + return metrics + def _aggregate_eou_metrics(self, outputs: List[dict], mode: str, is_ctc: bool = False): if f'{mode}_eou_metrics' not in outputs[0] and not is_ctc: return {} @@ -254,34 +273,21 @@ def _aggregate_eou_metrics(self, outputs: List[dict], mode: str, is_ctc: bool = eou_missing = [x.missing for x in eou_metrics] eob_missing = [x.missing for x in eob_metrics] - eou_latency = torch.tensor(eou_latency) - eou_latency_p90 = torch.quantile(eou_latency, 0.9).item() - eou_latency_p95 = torch.quantile(eou_latency, 0.95).item() - - eou_early_cutoff = torch.tensor(eou_early_cutoff) - eou_early_cutoff_p90 = torch.quantile(eou_early_cutoff, 0.9).item() - eou_early_cutoff_p95 = torch.quantile(eou_early_cutoff, 0.95).item() - - eob_latency = torch.tensor(eob_latency) - eob_latency_p90 = torch.quantile(eob_latency, 0.9).item() - eob_latency_p95 = torch.quantile(eob_latency, 0.95).item() - - eob_early_cutoff = torch.tensor(eob_early_cutoff) - eob_early_cutoff_p90 = torch.quantile(eob_early_cutoff, 0.9).item() - eob_early_cutoff_p95 = torch.quantile(eob_early_cutoff, 0.95).item() - tensorboard_logs = {} - tensorboard_logs[f'{mode}_eou_latency_p90'] = eou_latency_p90 - tensorboard_logs[f'{mode}_eou_latency_p95'] = eou_latency_p95 - - tensorboard_logs[f'{mode}_eou_early_cutoff_p90'] = eou_early_cutoff_p90 - tensorboard_logs[f'{mode}_eou_early_cutoff_p95'] = eou_early_cutoff_p95 - - tensorboard_logs[f'{mode}_eob_latency_p90'] = eob_latency_p90 - tensorboard_logs[f'{mode}_eob_latency_p95'] = eob_latency_p95 + target_percentiles = [50, 90, 95] + eou_latency_metrics = self._get_percentiles(eou_latency, target_percentiles, tag=f'{mode}_eou_latency') + eou_early_cutoff_metrics = self._get_percentiles( + eou_early_cutoff, target_percentiles, tag=f'{mode}_eou_early_cutoff' + ) + eob_latency_metrics = self._get_percentiles(eob_latency, target_percentiles, tag=f'{mode}_eob_latency') + eob_early_cutoff_metrics = self._get_percentiles( + eob_early_cutoff, target_percentiles, tag=f'{mode}_eob_early_cutoff' + ) - tensorboard_logs[f'{mode}_eob_early_cutoff_p90'] = eob_early_cutoff_p90 - tensorboard_logs[f'{mode}_eob_early_cutoff_p95'] = eob_early_cutoff_p95 + tensorboard_logs.update(eou_latency_metrics) + tensorboard_logs.update(eou_early_cutoff_metrics) + tensorboard_logs.update(eob_latency_metrics) + tensorboard_logs.update(eob_early_cutoff_metrics) tensorboard_logs[f'{mode}_eou_early_cutoff_avg_num'] = eou_avg_num_early_cutoff tensorboard_logs[f'{mode}_eob_early_cutoff_avg_num'] = eob_avg_num_early_cutoff From 5c8af18a1c99ff779f4f7d258806750bb386cc53 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 21 May 2025 16:56:55 -0400 Subject: [PATCH 041/107] update augmentation Signed-off-by: stevehuang52 --- ...astconformer_transducer_bpe_streaming.yaml | 1 - .../asr/data/audio_to_eou_label_lhotse.py | 43 ++++++++++++++++++- scripts/asr_end_of_utterance/conf/data.yaml | 6 +-- .../generate_noisy_eval_data.py | 10 +++-- tools/nemo_forced_aligner/align_eou.py | 4 +- 5 files changed, 53 insertions(+), 11 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml 
index a5a906f6c67a..21c13f0fe645 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -165,7 +165,6 @@ model: att_context_style: chunked_limited # regular or chunked_limited att_context_probs: null - xscaling: true # scales up the input embeddings by sqrt(d_model) pos_emb_max_len: 5000 diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 493e45718269..2fc16d36def0 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -30,6 +30,8 @@ from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, NeuralType from nemo.utils import logging +NON_SPEECH_LABEL = 0 +SPEECH_LABEL = 1 EOU_LABEL = 2 EOB_LABEL = 3 EOU_STRING = '' @@ -141,12 +143,14 @@ def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, return_cuts: bool self.tokenizer = TokenizerWrapper(tokenizer) self.load_audio = AudioSamples(fault_tolerant=True) self.sample_rate = self.cfg.get('sample_rate', 16000) + self.window_stride = self.cfg.get('window_stride', 0.01) self.num_sample_per_mel_frame = int( - self.cfg.get('window_stride', 0.01) * self.sample_rate + self.window_stride * self.sample_rate ) # 160 samples for every 1ms by default self.num_mel_frame_per_target_frame = int(self.cfg.get('subsampling_factor', 8)) self.add_sep_before_eou = self.cfg.get('add_sep_before_eou', False) self.add_eou_to_text = self.cfg.get('add_eou_to_text', True) + self.pad_eou_label_secs = self.cfg.get('pad_eou_label_secs', 0.0) self.padding_cfg = self.cfg.get('random_padding', None) self.augmentor = None self.len_augmentor = None @@ -257,6 +261,39 @@ def _audio_len_to_frame_len(self, num_samples: int): hidden_length = math.ceil(mel_frame_count / self.num_mel_frame_per_target_frame) return hidden_length + def _repeat_eou_labels(self, eou_targets: torch.Tensor) -> torch.Tensor: + """ + Repeat EOU labels according to self.pad_eou_label_secs + Args: + eou_targets: torch.Tensor of EOU labels, shape [T] + Returns: + eou_targets: torch.Tensor of padded EOU labels, shape [T] + """ + if not self.pad_eou_label_secs or self.pad_eou_label_secs <= 0: + return eou_targets + + eou_len = self._audio_len_to_frame_len(int(self.pad_eou_label_secs * self.sample_rate)) + + i = 0 + while i < eou_targets.size(0): + if eou_targets[i] == EOU_LABEL or eou_targets[i] == EOB_LABEL: + # repeat the label for the next eou_len samples + start = i + end = min(i + eou_len, eou_targets.size(0)) + j = start + 1 + while j < end: + if eou_targets[j] != NON_SPEECH_LABEL: + # do not overwrite the label if it's not non-speech + break + j += 1 + end = min(j, end) + # fill the non-speech label with the current EOU/EOB label + eou_targets[start:end] = eou_targets[i] + i = end + else: + i += 1 + return eou_targets + def _get_frame_labels(self, cut: Cut, num_samples: int): hidden_length = self._audio_len_to_frame_len(num_samples) if not "sou_time" in cut.custom or not "eou_time" in cut.custom: @@ -300,7 +337,7 @@ def _get_frame_labels(self, cut: Cut, num_samples: int): sou_idx = self._audio_len_to_frame_len(int((sou_time[i] - cut.start) * self.sample_rate)) seg_len_in_secs = eou_time[i] - sou_time[i] seg_len = self._audio_len_to_frame_len(int(seg_len_in_secs * self.sample_rate)) - eou_targets[sou_idx : sou_idx + seg_len] = 1 + eou_targets[sou_idx : sou_idx + seg_len] = SPEECH_LABEL last_idx = min(sou_idx + seg_len - 1, 
hidden_length - 1) if is_backchannel[i]: eou_targets[last_idx] = EOB_LABEL # end of backchannel @@ -360,6 +397,7 @@ def _random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_ta p = np.random.rand() if self.padding_cfg is None or p > self.padding_cfg.prob: # don't apply padding + eou_targets = self._repeat_eou_labels(eou_targets) return audio, audio_len, eou_targets duration = audio_len.item() / self.cfg.sample_rate @@ -413,6 +451,7 @@ def _random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_ta post_padding_eou = torch.zeros(post_padding_eou_len, dtype=eou_targets.dtype) padded_eou_targets = torch.cat((pre_padding_eou, eou_targets, post_padding_eou), dim=0) + padded_eou_targets = self._repeat_eou_labels(padded_eou_targets) return padded_audio, padded_audio_len, padded_eou_targets def _maybe_augment_audio(self, audio: torch.Tensor, audio_len: torch.Tensor): diff --git a/scripts/asr_end_of_utterance/conf/data.yaml b/scripts/asr_end_of_utterance/conf/data.yaml index 93056488edf4..97147640d23c 100644 --- a/scripts/asr_end_of_utterance/conf/data.yaml +++ b/scripts/asr_end_of_utterance/conf/data.yaml @@ -22,8 +22,8 @@ data: random_padding: prob: 0.5 - min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds - max_pad_duration: 5.0 # maximum duration of pre/post padding in seconds + min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds + max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds max_total_duration: 30.0 # maximum total duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution for padding duration @@ -39,7 +39,7 @@ data: min_gain_dbfs: -10.0 max_gain_dbfs: 10.0 noise: - prob: 0.6 + prob: 0.99 manifest_path: ??? 
min_snr_db: 0 max_snr_db: 20 diff --git a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py index 5a0c00476fbd..3a6612e23607 100644 --- a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py +++ b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py @@ -93,11 +93,15 @@ @hydra_runner(config_path="conf/", config_name="data") def main(cfg): - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - # Seed everything for reproducibility - seed = cfg.data.get('seed', 42) + seed = cfg.data.get('seed', None) + if seed is None: + seed = np.random.randint(0, 2**32 - 1) + logging.info(f'No seed provided, using random seed: {seed}') logging.info(f'Setting random seed to {seed}') + with open_dict(cfg): + cfg.data.seed = seed + logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') pl.seed_everything(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) diff --git a/tools/nemo_forced_aligner/align_eou.py b/tools/nemo_forced_aligner/align_eou.py index 13a99e5d6d2c..6bc864ff90fa 100644 --- a/tools/nemo_forced_aligner/align_eou.py +++ b/tools/nemo_forced_aligner/align_eou.py @@ -147,7 +147,7 @@ class AlignmentConfig: batch_size: int = 1 use_local_attention: bool = True additional_segment_grouping_separator: Optional[str] = None - audio_filepath_parts_in_utt_id: int = 1 + audio_filepath_parts_in_utt_id: int = 4 # Buffered chunked streaming configs use_buffered_chunked_streaming: bool = False @@ -383,7 +383,7 @@ def main(cfg: AlignmentConfig): cfg.manifest_filepath = str(manifest_filepath) if origin_output_manifest_filepath is None: - manifest_stem = Path(manifest_filepath).stem.replace("-aligned", "") + manifest_stem = Path(manifest_filepath).stem cfg.output_manifest_filepath = str(Path(manifest_filepath).parent / f"{manifest_stem}-aligned.json") elif len(manifest_list) > 1 and origin_output_manifest_filepath is not None: raise ValueError( From 3b1f3546bdb1e255e91ea99bac3750a7ec7d8c9f Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 22 May 2025 10:53:52 -0400 Subject: [PATCH 042/107] update cfg Signed-off-by: stevehuang52 --- ...fastconformer_hybrid_asr_frame_eou_streaming.yaml | 12 ++++++++---- ...onformer_hybrid_transducer_ctc_bpe_streaming.yaml | 10 +++++----- .../fastconformer_transducer_bpe_streaming.yaml | 8 ++++---- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml index 54fc9ef6d37e..ae7055176f9e 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml @@ -26,6 +26,7 @@ model: freeze_encoder: true freeze_ctc: true freeze_rnnt: true + pad_eou_label_secs: 0.0 sample_rate: 16000 compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. 
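To make the new `pad_eou_label_secs` knob concrete: the `_repeat_eou_labels` helper added to the Lhotse dataset above stretches each EOU/EOB frame label over the trailing non-speech frames, for up to `pad_eou_label_secs` seconds' worth of frames, and stops early if it reaches speech or another event. A rough standalone sketch of that loop on a toy label sequence (the 12.5 frames/sec rate and the labels below are illustrative assumptions, not values taken from the patch):

```python
import torch

NON_SPEECH, SPEECH, EOU, EOB = 0, 1, 2, 3

def repeat_eou_labels(labels: torch.Tensor, pad_secs: float, frames_per_sec: float = 12.5) -> torch.Tensor:
    # Extend each EOU/EOB label over following non-speech frames, up to pad_secs seconds.
    if pad_secs <= 0:
        return labels
    pad_len = int(round(pad_secs * frames_per_sec))
    i = 0
    while i < labels.size(0):
        if labels[i] in (EOU, EOB):
            end = min(i + pad_len, labels.size(0))
            j = i + 1
            while j < end and labels[j] == NON_SPEECH:  # never overwrite speech or another event
                j += 1
            labels[i:j] = labels[i]
            i = j
        else:
            i += 1
    return labels

labels = torch.tensor([1, 1, 2, 0, 0, 0, 0, 1, 1, 3, 0, 0])
print(repeat_eou_labels(labels.clone(), pad_secs=0.24))
# tensor([1, 1, 2, 2, 2, 0, 0, 1, 1, 3, 3, 3])
```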
@@ -57,12 +58,13 @@ model: shuffle_buffer_size: 10000 check_tokenizer: false add_eou_to_text: false + pad_eou_label_secs: ${model.pad_eou_label_secs} random_padding: - prob: 0.9 - min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds - max_pad_duration: 5.0 # maximum duration of pre/post padding in seconds - max_total_duration: 30.0 # maximum total duration of the padded audio in seconds + prob: 0.99 + min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds + max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds + max_total_duration: 40.0 # maximum total duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' @@ -103,6 +105,7 @@ model: shuffle_buffer_size: 10000 check_tokenizer: false add_eou_to_text: false + pad_eou_label_secs: ${model.pad_eou_label_secs} test_ds: manifest_filepath: null @@ -124,6 +127,7 @@ model: shuffle_buffer_size: 10000 check_tokenizer: false add_eou_to_text: false + pad_eou_label_secs: ${model.pad_eou_label_secs} # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py # We recommend to use vocab size of 1024 with SPE Unigram for most languages diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml index d76b5269429f..50ec93a13dc3 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml @@ -47,14 +47,14 @@ model: shuffle_buffer_size: 10000 random_padding: - prob: 0.9 - min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds - max_pad_duration: 5.0 # maximum duration of pre/post padding in seconds - max_total_duration: 30.0 # maximum total duration of the padded audio in seconds + prob: 0.99 + min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds + max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds + max_total_duration: 40.0 # maximum total duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' - + augmentor: white_noise: prob: 0.5 diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 21c13f0fe645..ed7bcfbc6656 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -42,10 +42,10 @@ model: shuffle_buffer_size: 10000 random_padding: - prob: 0.9 - min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds - max_pad_duration: 5.0 # maximum duration of pre/post padding in seconds - max_total_duration: 30.0 # maximum total duration of the padded audio in seconds + prob: 0.99 + min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds + max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds + max_total_duration: 40.0 # maximum total 
duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' From dec9694ca4227119fe312da0f29519a520c8be87 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 27 May 2025 13:33:34 -0400 Subject: [PATCH 043/107] update model and cfg Signed-off-by: stevehuang52 --- .../asr/asr_eou/speech_to_text_eou_eval.py | 23 +- .../asr_eou/speech_to_text_rnnt_eou_train.py | 4 + ...er_hybrid_asr_frame_fc_eou_streaming.yaml} | 2 +- ...r_hybrid_asr_frame_lstm_eou_streaming.yaml | 427 ++++++++++++++++++ nemo/collections/asr/models/asr_eou_models.py | 58 ++- nemo/collections/asr/modules/lstm_decoder.py | 13 +- .../legacy/checkpoint_averaging.py | 2 +- 7 files changed, 512 insertions(+), 17 deletions(-) rename examples/asr/conf/asr_eou/{fastconformer_hybrid_asr_frame_eou_streaming.yaml => fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml} (99%) create mode 100644 examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml diff --git a/examples/asr/asr_eou/speech_to_text_eou_eval.py b/examples/asr/asr_eou/speech_to_text_eou_eval.py index 122f1739d76e..8deac45932b6 100644 --- a/examples/asr/asr_eou/speech_to_text_eou_eval.py +++ b/examples/asr/asr_eou/speech_to_text_eou_eval.py @@ -15,16 +15,21 @@ import lightning.pytorch as pl import torch -from omegaconf import OmegaConf, open_dict + +torch.set_float32_matmul_precision("highest") +from omegaconf import DictConfig, OmegaConf, open_dict from nemo.collections.asr.models import ASRModel +from nemo.core.classes import typecheck from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager from nemo.utils.trainer_utils import resolve_trainer_cfg +typecheck.set_typecheck_enabled(False) + -def load_model(cfg, trainer): +def load_model(cfg: DictConfig, trainer: pl.Trainer) -> ASRModel: if "init_from_nemo_model" in cfg: logging.info(f"Loading model from local file: {cfg.init_from_nemo_model}") model = ASRModel.restore_from(cfg.init_from_nemo_model, trainer=trainer) @@ -35,7 +40,7 @@ def load_model(cfg, trainer): raise ValueError( "Please provide either 'init_from_nemo_model' or 'init_from_pretrained_model' in the config file." 
) - if "init_from_ptl_ckpt" in cfg: + if cfg.get("init_from_ptl_ckpt", None): logging.info(f"Loading weights from checkpoint: {cfg.init_from_ptl_ckpt}") state_dict = torch.load(cfg.init_from_ptl_ckpt, map_location='cpu', weights_only=False)['state_dict'] model.load_state_dict(state_dict, strict=True) @@ -50,12 +55,18 @@ def main(cfg): exp_manager(trainer, cfg.get("exp_manager", None)) asr_model = load_model(cfg, trainer) + asr_model = asr_model.eval() # Set the model to evaluation mode + if hasattr(asr_model, 'wer'): + asr_model.wer.log_prediction = False - if "save_pred_to_file" in cfg: - with open_dict(asr_model.cfg): + with open_dict(asr_model.cfg): + if "save_pred_to_file" in cfg: asr_model.cfg.save_pred_to_file = cfg.save_pred_to_file - + if "calclate_eou_metrics" in cfg: + asr_model.cfg.calclate_eou_metrics = cfg.calclate_eou_metrics if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: + with open_dict(cfg.model.test_ds): + cfg.model.test_ds.pad_eou_label_secs = asr_model.cfg.get('pad_eou_label_secs', 0.0) asr_model.setup_test_data(test_data_config=cfg.model.test_ds) trainer.test(asr_model) else: diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index e173b44141bc..038f91e87bad 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -235,6 +235,10 @@ def main(cfg): if init_from_model: init_from_pretrained_nemo(asr_model, init_from_model, cfg) + if cfg.model.get("freeze_encoder", False): + logging.info("Freezing encoder weights.") + asr_model.encoder.freeze() + trainer.fit(asr_model) if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml similarity index 99% rename from examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml rename to examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml index ae7055176f9e..79977b0f1b1b 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_eou_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml @@ -338,7 +338,7 @@ model: stochastic_depth_mode: linear # linear or uniform stochastic_depth_start_layer: 1 - eou_classifier: + eou_decoder: _target_: nemo.collections.asr.modules.ConvASRDecoder feat_in: ${model.eou_encoder.d_model} num_classes: ${model.num_eou_classes} diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml new file mode 100644 index 000000000000..8653c608d883 --- /dev/null +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml @@ -0,0 +1,427 @@ +# It contains the default values for training a cache-aware streaming FastConformer-Hybrid-Transducer-CTC ASR model, large size (~115M) with sub-word encoding. 
+# The model would have two decoders: RNNT (Transducer) and CTC + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +# Note: if training loss does not converge, you may increase warm-up to 20K. + +name: "FastConformer-Hybrid-ASR-Frame-EOU-Streaming" + +model: + token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] + token_init_weight_value: null # only applicable when token_init_method='constant' + token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' + layer_idx_list: [0, -1] # extract features from the first and last layers of ASR encoder + num_eou_classes: 4 + eou_class_weights: [1,1,100,100] + rnnt_loss_weight: 0.0 + ctc_loss_weight: 0.0 + eou_loss_weight: 1.0 + use_ctc_pred: false + freeze_encoder: true + freeze_ctc: true + freeze_rnnt: true + pad_eou_label_secs: 0.0 + + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: false # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + check_tokenizer: false + add_eou_to_text: false + pad_eou_label_secs: ${model.pad_eou_label_secs} + + random_padding: + prob: 0.99 + min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds + max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds + max_total_duration: 40.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' + normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' + + augmentor: + white_noise: + prob: 0.5 + min_level: -90 + max_level: -46 + gain: + prob: 0.5 + min_gain_dbfs: -10.0 + max_gain_dbfs: 10.0 + noise: + prob: 0.6 + manifest_path: ??? + min_snr_db: 0 + max_snr_db: 20 + max_gain_db: 300.0 + + validation_ds: + manifest_filepath: ??? 
+ tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + drop_last: true + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + check_tokenizer: false + add_eou_to_text: false + pad_eou_label_secs: ${model.pad_eou_label_secs} + + test_ds: + manifest_filepath: null + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + drop_last: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + check_tokenizer: false + add_eou_to_text: false + pad_eou_label_secs: ${model.pad_eou_label_secs} + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. 
During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. + # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] + att_context_size: [70, 1] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
+ + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # The section which would contain the decoder and decoding configs of the auxiliary CTC decoder + aux_ctc: + ctc_loss_weight: ${model.ctc_loss_weight} # the weight used to combine the CTC loss with the RNNT loss + use_cer: false + ctc_reduction: 'mean_batch' + decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: null + num_classes: -1 + vocabulary: [] + decoding: + strategy: "greedy" + + aggregator: + _target_: nemo.collections.asr.modules.ssl_modules.multi_layer_feat.Aggregator + mode: "weighted_sum" + weights: null + layer_idx_list: ${model.layer_idx_list} + + eou_encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.encoder.d_model} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 2 + d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 1 # NO subsampling + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. 
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] + att_context_size: ${model.encoder.att_context_size} # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: ${model.encoder.pos_emb_max_len} + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + eou_decoder: + _target_: nemo.collections.asr.modules.LSTMDecoder + feat_in: ${model.eou_encoder.d_model} + num_classes: ${model.num_eou_classes} + lstm_hidden_size: 128 + add_blank: false + + eou_loss: + weight: ${model.eou_class_weights} + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + warprnnt_numba_kwargs: + # FastEmit regularization: https://arxiv.org/abs/2010.11148 + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. + + optim: + name: adamw + lr: 5.0 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing + d_model: ${model.encoder.d_model} + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: 100000 # computed at runtime if not set + val_check_interval: 1000 # an int for number of iterations + limit_train_batches: ${trainer.val_check_interval} + accelerator: auto + strategy: + _target_: lightning.pytorch.strategies.DDPStrategy + gradient_as_bucket_view: true + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. 
+ enable_progress_bar: True + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + use_distributed_sampler: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_eou_macro_acc" + mode: "max" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index bed8089217e9..572533443fa0 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -33,6 +33,7 @@ from nemo.collections.asr.metrics.wer import WER from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel, EncDecRNNTBPEModel from nemo.collections.asr.modules.conformer_encoder import ConformerMultiLayerFeatureExtractor +from nemo.collections.asr.parts.mixins import TranscribeConfig from nemo.collections.asr.parts.utils.eou_utils import ( EOUResult, cal_eou_metrics_from_frame_labels, @@ -375,6 +376,11 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): tokenizer=self.tokenizer, ) + def _transcribe_forward(self, batch: AudioToTextEOUBatch, trcfg: TranscribeConfig): + encoded, encoded_len = self.forward(input_signal=batch.audio_signal, input_signal_length=batch.audio_lengths) + output = dict(encoded=encoded, encoded_len=encoded_len) + return output + def training_step(self, batch: AudioToTextEOUBatch, batch_nb): # Reset access registry if AccessMixin.is_access_enabled(self.model_guid): @@ -567,9 +573,12 @@ def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) tensorboard_logs['val_text_pred'] = text_pred - eou_predictions = self._get_eou_predictions_from_hypotheses(hypotheses, batch) - - eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) + if self.cfg.get('calculate_eou_metrics', True): + eou_predictions = self._get_eou_predictions_from_hypotheses(hypotheses, batch) + eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) + else: + eou_metrics_list = [] + eob_metrics_list = [] if loss_value is not None: tensorboard_logs['val_loss'] = loss_value @@ -598,11 +607,41 @@ def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str wer_denom = torch.stack([x[f'{mode}_wer_denom'] for x in outputs]).sum() tensorboard_logs = {**loss_log, f'{mode}_wer': wer_num.float() / wer_denom} - eou_metrics = self._aggregate_eou_metrics(outputs, mode=mode) + if self.cfg.get('calculate_eou_metrics', True): + eou_metrics = 
self._aggregate_eou_metrics(outputs, mode=mode) tensorboard_logs.update(eou_metrics) return {**loss_log, 'log': tensorboard_logs} + # def test_step(self, batch: AudioToTextEOUBatch, batch_idx, dataloader_idx=0): + # # logs = self.validation_pass(batch, batch_idx, dataloader_idx=dataloader_idx) + # # test_logs = {name.replace("val_", "test_"): value for name, value in logs.items()} + + # signal = batch.audio_signal + # signal_len = batch.audio_lengths + # transcript = batch.text_tokens + # transcript_len = batch.text_token_lengths + + # # forward() only performs encoder forward + # encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) + # del signal + + # tensorboard_logs = {} + # hypotheses = self.decoding.rnnt_decoder_predictions_tensor( + # encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=True + # ) + # eou_predictions = self._get_eou_predictions_from_hypotheses(hypotheses, batch) + # eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) + # tensorboard_logs['test_eou_metrics'] = eou_metrics_list + # tensorboard_logs['test_eob_metrics'] = eob_metrics_list + + # test_logs = tensorboard_logs + # if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: + # self.test_step_outputs[dataloader_idx].append(test_logs) + # else: + # self.test_step_outputs.append(test_logs) + # return test_logs + def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='val') @@ -878,7 +917,7 @@ def __init__(self, cfg: DictConfig, trainer): self.encoder = ConformerMultiLayerFeatureExtractor(self.encoder, self.layer_idx_list) self.aggregator = Serialization.from_config_dict(cfg.aggregator) self.eou_encoder = Serialization.from_config_dict(cfg.eou_encoder) - self.eou_classifier = Serialization.from_config_dict(cfg.eou_classifier) + self.eou_decoder = Serialization.from_config_dict(cfg.eou_decoder) self.num_eou_classes = cfg.num_eou_classes self.rnnt_loss_weight = cfg.rnnt_loss_weight self.ctc_loss_weight = cfg.ctc_loss_weight @@ -964,7 +1003,7 @@ def get_eou_prediction( encoded_all[-1] = ctc_pred eou_encoded, eou_encoded_len = self.aggregator(encoded_all, encoded_len_all) eou_encoded, eou_encoded_len = self.eou_encoder(eou_encoded, eou_encoded_len) - eou_pred = self.eou_classifier(eou_encoded) + eou_pred = self.eou_decoder(eou_encoded) return eou_pred, eou_encoded_len def trim_eou_preds_labels( @@ -1298,8 +1337,8 @@ def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader for i in range(eou_pred.size(0)): self.macro_accuracy.update(preds=eou_pred[i][: eou_valid_len[i]], target=eou_labels[i][: eou_valid_len[i]]) stats = self.macro_accuracy._final_state() - self.macro_accuracy.reset() tensorboard_logs['val_eou_acc_stats'] = stats + self.macro_accuracy.reset() eou_predictions = self._get_eou_predictions_from_frames(eou_pred, eou_valid_len) eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) @@ -1340,6 +1379,10 @@ def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str loss_log = {f'{mode}_loss': loss_mean} else: loss_log = {} + + eou_loss_mean = torch.stack([x[f'{mode}_eou_loss'] for x in outputs]).mean() + loss_log[f'{mode}_eou_loss'] = eou_loss_mean + wer_num = torch.stack([x[f'{mode}_wer_num'] for x in outputs]).sum() wer_denom = torch.stack([x[f'{mode}_wer_denom'] for x in outputs]).sum() tensorboard_logs = {**loss_log, 
f'{mode}_wer': wer_num.float() / wer_denom} @@ -1352,6 +1395,7 @@ def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str eou_metrics = self._aggregate_eou_metrics(outputs, mode) tensorboard_logs.update(eou_metrics) + self.macro_accuracy.reset() self.macro_accuracy.tp = torch.stack([x[f'{mode}_eou_acc_stats'][0] for x in outputs]).sum(axis=0) self.macro_accuracy.fp = torch.stack([x[f'{mode}_eou_acc_stats'][1] for x in outputs]).sum(axis=0) self.macro_accuracy.tn = torch.stack([x[f'{mode}_eou_acc_stats'][2] for x in outputs]).sum(axis=0) diff --git a/nemo/collections/asr/modules/lstm_decoder.py b/nemo/collections/asr/modules/lstm_decoder.py index 9bb60e2fabca..8c41c5657f52 100644 --- a/nemo/collections/asr/modules/lstm_decoder.py +++ b/nemo/collections/asr/modules/lstm_decoder.py @@ -45,7 +45,16 @@ def input_types(self): def output_types(self): return OrderedDict({"logprobs": NeuralType(('B', 'T', 'D'), LogprobsType())}) - def __init__(self, feat_in, num_classes, lstm_hidden_size, vocabulary=None, bidirectional=False, num_layers=1): + def __init__( + self, + feat_in, + num_classes, + lstm_hidden_size, + vocabulary=None, + bidirectional=False, + num_layers=1, + add_blank=True, + ): super().__init__() if vocabulary is not None: @@ -57,7 +66,7 @@ def __init__(self, feat_in, num_classes, lstm_hidden_size, vocabulary=None, bidi self.__vocabulary = vocabulary self._feat_in = feat_in # Add 1 for blank char - self._num_classes = num_classes + 1 + self._num_classes = num_classes + 1 if add_blank else num_classes self.lstm_layer = nn.LSTM( input_size=feat_in, diff --git a/scripts/checkpoint_averaging/legacy/checkpoint_averaging.py b/scripts/checkpoint_averaging/legacy/checkpoint_averaging.py index 846777fe70b5..863fc820acb9 100755 --- a/scripts/checkpoint_averaging/legacy/checkpoint_averaging.py +++ b/scripts/checkpoint_averaging/legacy/checkpoint_averaging.py @@ -127,7 +127,7 @@ def main(): logging.info(f"Averaging {n} checkpoints ...") for ix, path in enumerate(tqdm(checkpoint_paths, total=n, desc='Averaging checkpoints')): - checkpoint = torch.load(path, map_location=device) + checkpoint = torch.load(path, map_location=device, weights_only=False) if 'state_dict' in checkpoint: checkpoint = checkpoint['state_dict'] From 33ca51c411f4564e351367796d52600cacbdb061 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 27 May 2025 15:00:04 -0400 Subject: [PATCH 044/107] update frame eou Signed-off-by: stevehuang52 --- ...r_hybrid_asr_frame_lstm_eou_streaming.yaml | 63 ++----------------- nemo/collections/asr/models/asr_eou_models.py | 5 +- 2 files changed, 7 insertions(+), 61 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml index 8653c608d883..1f8eb895b39d 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml @@ -280,69 +280,14 @@ model: weights: null layer_idx_list: ${model.layer_idx_list} - eou_encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.encoder.d_model} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 2 - d_model: 512 - use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules - - # Sub-sampling parameters - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, 
dw_striding - subsampling_factor: 1 # NO subsampling - subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model - causal_downsampling: true - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large - # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s - - # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. - # The first item in the list would be the default during test/validation/inference. - # An example of settings for multi-lookahead: - # att_context_size: [[70,13],[70,6],[70,1],[70,0]] - # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] - att_context_size: ${model.encoder.att_context_size} # -1 means unlimited context - att_context_style: chunked_limited # regular or chunked_limited - att_context_probs: null - - xscaling: true # scales up the input embeddings by sqrt(d_model) - pos_emb_max_len: ${model.encoder.pos_emb_max_len} - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly - conv_context_size: causal - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 + eou_encoder: null eou_decoder: _target_: nemo.collections.asr.modules.LSTMDecoder - feat_in: ${model.eou_encoder.d_model} + feat_in: ${model.encoder.d_model} num_classes: ${model.num_eou_classes} - lstm_hidden_size: 128 + lstm_hidden_size: 256 + num_layers: 4 add_blank: false eou_loss: diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 572533443fa0..c07d1f1eaed9 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -916,7 +916,7 @@ def __init__(self, cfg: DictConfig, trainer): self.layer_idx_list.append(num_encoder_layers - 1) self.encoder = ConformerMultiLayerFeatureExtractor(self.encoder, self.layer_idx_list) self.aggregator = Serialization.from_config_dict(cfg.aggregator) - self.eou_encoder = Serialization.from_config_dict(cfg.eou_encoder) + self.eou_encoder = Serialization.from_config_dict(cfg.eou_encoder) if 
cfg.eou_encoder is not None else None self.eou_decoder = Serialization.from_config_dict(cfg.eou_decoder) self.num_eou_classes = cfg.num_eou_classes self.rnnt_loss_weight = cfg.rnnt_loss_weight @@ -1002,7 +1002,8 @@ def get_eou_prediction( if ctc_pred is not None and self.use_ctc_pred: encoded_all[-1] = ctc_pred eou_encoded, eou_encoded_len = self.aggregator(encoded_all, encoded_len_all) - eou_encoded, eou_encoded_len = self.eou_encoder(eou_encoded, eou_encoded_len) + if self.eou_encoder is not None: + eou_encoded, eou_encoded_len = self.eou_encoder(eou_encoded, eou_encoded_len) eou_pred = self.eou_decoder(eou_encoded) return eou_pred, eou_encoded_len From d726602de1d2712f1afc60251b002e240f034bfd Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 27 May 2025 15:02:53 -0400 Subject: [PATCH 045/107] update cfg Signed-off-by: stevehuang52 --- .../fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml index 1f8eb895b39d..9050643cf151 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml @@ -313,15 +313,14 @@ model: optim: name: adamw - lr: 5.0 + lr: 0.0005 # optimizer arguments betas: [0.9, 0.98] weight_decay: 1e-3 # scheduler setup sched: - name: NoamAnnealing - d_model: ${model.encoder.d_model} + name: CosineAnnealing # scheduler config override warmup_steps: 10000 warmup_ratio: null From 5c194a4691bcedac05fba10e2bf5ffce43467d8b Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 27 May 2025 16:18:22 -0400 Subject: [PATCH 046/107] add adapter to eou Signed-off-by: stevehuang52 --- .../asr/asr_eou/speech_to_text_eou_eval.py | 2 +- .../asr_eou/speech_to_text_rnnt_eou_train.py | 86 +++- ...rmer_transducer_bpe_streaming_adapter.yaml | 367 ++++++++++++++++++ 3 files changed, 452 insertions(+), 3 deletions(-) create mode 100644 examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml diff --git a/examples/asr/asr_eou/speech_to_text_eou_eval.py b/examples/asr/asr_eou/speech_to_text_eou_eval.py index 8deac45932b6..b0879b227d46 100644 --- a/examples/asr/asr_eou/speech_to_text_eou_eval.py +++ b/examples/asr/asr_eou/speech_to_text_eou_eval.py @@ -67,7 +67,7 @@ def main(cfg): if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: with open_dict(cfg.model.test_ds): cfg.model.test_ds.pad_eou_label_secs = asr_model.cfg.get('pad_eou_label_secs', 0.0) - asr_model.setup_test_data(test_data_config=cfg.model.test_ds) + asr_model.setup_multiple_test_data(test_data_config=cfg.model.test_ds) trainer.test(asr_model) else: raise ValueError( diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index 038f91e87bad..dd6766379600 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -71,21 +71,94 @@ """ - +from dataclasses import is_dataclass from typing import Optional import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf +from omegaconf import DictConfig, OmegaConf, open_dict from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCBPEModel, EncDecRNNTBPEModel from nemo.collections.asr.models.asr_eou_models import 
EncDecRNNTBPEEOUModel from nemo.collections.asr.modules.rnnt import RNNTDecoder, RNNTJoint +from nemo.core import adapter_mixins from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager from nemo.utils.trainer_utils import resolve_trainer_cfg +def add_global_adapter_cfg(model, global_adapter_cfg): + # Convert to DictConfig from dict or Dataclass + if is_dataclass(global_adapter_cfg): + global_adapter_cfg = OmegaConf.structured(global_adapter_cfg) + + if not isinstance(global_adapter_cfg, DictConfig): + global_adapter_cfg = DictConfig(global_adapter_cfg) + + # Update the model.cfg with information about the new adapter global cfg + with open_dict(global_adapter_cfg), open_dict(model.cfg): + if 'adapters' not in model.cfg: + model.cfg.adapters = OmegaConf.create({}) + + # Add the global config for adapters to the model's internal config + model.cfg.adapters[model.adapter_global_cfg_key] = global_adapter_cfg + + # Update all adapter modules (that already exist) with this global adapter config + model.update_adapter_cfg(model.cfg.adapters) + + +def update_model_config_to_support_adapter(model_cfg): + with open_dict(model_cfg): + # Update encoder adapter compatible config + adapter_metadata = adapter_mixins.get_registered_adapter(model_cfg.encoder._target_) + if adapter_metadata is not None: + model_cfg.encoder._target_ = adapter_metadata.adapter_class_path + + +def setup_adapters(cfg: DictConfig, model: ASRModel): + # Setup adapters + with open_dict(cfg.model.adapter): + # Extract the name of the adapter (must be give for training) + adapter_name = cfg.model.adapter.pop("adapter_name") + adapter_type = cfg.model.adapter.pop("adapter_type") + adapter_module_name = cfg.model.adapter.pop("adapter_module_name", None) + adapter_state_dict_name = cfg.model.adapter.pop("adapter_state_dict_name", None) + + # Resolve the config of the specified `adapter_type` + if adapter_type not in cfg.model.adapter.keys(): + raise ValueError( + f"Adapter type ({adapter_type}) config could not be found. Adapter setup config - \n" + f"{OmegaConf.to_yaml(cfg.model.adapter)}" + ) + + adapter_type_cfg = cfg.model.adapter[adapter_type] + print(f"Found `{adapter_type}` config :\n" f"{OmegaConf.to_yaml(adapter_type_cfg)}") + + # Augment adapter name with module name, if not provided by user + if adapter_module_name is not None and ':' not in adapter_name: + adapter_name = f'{adapter_module_name}:{adapter_name}' + + # Extract the global adapter config, if provided + adapter_global_cfg = cfg.model.adapter.pop(model.adapter_global_cfg_key, None) + if adapter_global_cfg is not None: + add_global_adapter_cfg(model, adapter_global_cfg) + + model.add_adapter(adapter_name, cfg=adapter_type_cfg) + assert model.is_adapter_available() + + # Disable all other adapters, enable just the current adapter. + model.set_enabled_adapters(enabled=False) # disable all adapters prior to training + model.set_enabled_adapters(adapter_name, enabled=True) # enable just one adapter by name + + # First, Freeze all the weights of the model (not just encoder, everything) + model.freeze() + # Activate dropout() and other modules that depend on train mode. 
+ model = model.train() + # Then, Unfreeze just the adapter weights that were enabled above (no part of encoder/decoder/joint/etc) + model.unfreeze_enabled_adapters() + return model + + def get_pretrained_model_name(cfg: DictConfig) -> Optional[str]: if hasattr(cfg, 'init_from_ptl_ckpt') and cfg.init_from_ptl_ckpt is not None: raise NotImplementedError( @@ -229,6 +302,9 @@ def main(cfg): trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) exp_manager(trainer, cfg.get("exp_manager", None)) + if cfg.model.get("adapter", None) is not None: + update_model_config_to_support_adapter(cfg.model) + asr_model = EncDecRNNTBPEEOUModel(cfg=cfg.model, trainer=trainer) init_from_model = get_pretrained_model_name(cfg) @@ -239,6 +315,12 @@ def main(cfg): logging.info("Freezing encoder weights.") asr_model.encoder.freeze() + if cfg.model.get("adapter", None) is not None: + asr_model = setup_adapters(cfg, asr_model) + + import pdb + + pdb.set_trace() trainer.fit(asr_model) if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml new file mode 100644 index 000000000000..f8d0b787a8f1 --- /dev/null +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml @@ -0,0 +1,367 @@ +# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M) with sub-word encoding. + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +name: "FastConformer-Transducer-BPE-Streaming-EOU" + +model: + token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] + token_init_weight_value: null # only applicable when token_init_method='constant' + token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' + + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + adapter: + ### Config of the adapter training/eval script ### + adapter_name: "eou-adapter" # Name of the adapter, used by the script + adapter_type: "linear" # Type of the adapter. Corresponds to the subconfigs below. + adapter_module_name: null # Name of the adapter module. Combine multiple modules with '+' between module names. + adapter_state_dict_name: "adapters.pt" # If the individual adapters must be saved, a file name can be provided here. null disables this. 
+ + ### Adapter Configs ### + # Linear / Houlsby Adapter (https://arxiv.org/abs/1902.00751) + linear: + # Config of the adapter module itself + _target_: nemo.collections.common.parts.adapter_modules.LinearAdapter + in_features: ${model.encoder.d_model} # User must provide the output dimension of the layers of the model, which is the input dimension of this adapter. + dim: 32 # The hidden dimension of the adapter, as chosen by user, but small values are preferred to reduce param count. + activation: swish + norm_position: 'pre' # Can be `pre` or `post` + dropout: 0.0 # float, dropout for the adapter + + # Adapter strategy config + adapter_strategy: + _target_: nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy + stochastic_depth: 0.0 # float, setting to > 0 will enable stochastic depth for each adapter block. + l2_lambda: 0.0 # float, setting to > 0 will enable l2 norm auxiliary loss for each adapter's output. + + # Tiny-Attention Adapter (https://arxiv.org/abs/2211.01979) + # NOTE: Only supported for Attention based encoders. Make sure to pass `adapter_module_name` as "encoder" + tiny_attn: + # Config of the adapter module itself + # Defaults to Relative Positional Encoding MHA + # _target_ can instead be .MultiHeadAttentionAdapter if Conformer was originally using Absolute Positional Encoding. + _target_: nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.RelPositionMultiHeadAttentionAdapter + n_feat: ${model.encoder.d_model} # User must provide the output dimension of the layers of the model, which is the input dimension of this adapter. + n_head: 1 # Number of heads for attention. + proj_dim: -1 # Can be `null` - to avoid projection, > 0 for explicit dim, or -1 to default to `n_head` + dropout_rate: 0.0 # float, dropout for the adapter + + # Adapter strategy config + adapter_strategy: + _target_: nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module.MHAResidualAddAdapterStrategy + stochastic_depth: 0.0 # float, setting to > 0 will enable stochastic depth for each adapter block. + l2_lambda: 0.0 # float, setting to > 0 will enable l2 norm auxiliary loss for each adapter's output. + + # Optional global config available to all adapters at a global level. + # A global config is shared across every layer of the adapters, defining global properties rather + # than properties local to the adapter (as defined above). + # This can be useful in order to select *which type of adapter* is added, *what adapters to enable*, + # and further global operations that can decide dynamically how to support the requested adapter. + global_cfg: + check_encoder_adapter: True # ASR adapter key, determines whether to check if encoder adapter modules is supported + check_decoder_adapter: True # ASR adapter key, determines whether to check if decoder adapter modules is supported + check_joint_adapter: True # ASR adapter key, determines whether to check if joint adapter modules is supported + + train_ds: + manifest_filepath: ??? 
+ tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + random_padding: + prob: 0.99 + min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds + max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds + max_total_duration: 40.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' + normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' + + augmentor: + white_noise: + prob: 0.5 + min_level: -90 + max_level: -46 + gain: + prob: 0.5 + min_gain_dbfs: -10.0 + max_gain_dbfs: 10.0 + noise: + prob: 0.6 + manifest_path: ??? + min_snr_db: 0 + max_snr_db: 20 + max_gain_db: 300.0 + + validation_ds: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + drop_last: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + test_ds: + manifest_filepath: null + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + drop_last: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. 
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] + att_context_size: [70, 1] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
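The encoder comments above give the streaming look-ahead formula, `att_context_size[1] * subsampling_factor * window_stride`. A tiny helper that evaluates it for this config's values (plain arithmetic, nothing NeMo-specific):

```python
def context_seconds(num_frames: int, subsampling_factor: int = 8, window_stride: float = 0.01) -> float:
    # look-ahead(secs) = att_context_size[1] * subsampling_factor * window_stride
    return num_frames * subsampling_factor * window_stride


print(context_seconds(1))    # 0.08 s look-ahead for att_context_size: [70, 1]
print(context_seconds(70))   # 5.6 s of left (history) context
print(context_seconds(13))   # 1.04 s, the example quoted in the comment
```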
+ + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + warprnnt_numba_kwargs: + # FastEmit regularization: https://arxiv.org/abs/2010.11148 + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. + + optim: + name: adamw + lr: 5.0 # 1e-4 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing # NoamAnnealing CosineAnnealing + # scheduler config override + d_model: ${model.encoder.d_model} + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: 100000 # computed at runtime if not set + val_check_interval: 1000 # an int for number of iterations + limit_train_batches: ${trainer.val_check_interval} + accelerator: auto + strategy: + _target_: lightning.pytorch.strategies.DDPStrategy + gradient_as_bucket_view: true + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + use_distributed_sampler: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
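Because the scheduler above is `NoamAnnealing`, the `lr: 5.0` setting acts as a Noam multiplier rather than an absolute learning rate. A sketch of the implied schedule, assuming the standard Noam formula; the `min_lr` handling is simplified relative to NeMo's scheduler:

```python
def noam_lr(step: int, lr: float = 5.0, d_model: int = 512,
            warmup_steps: int = 10000, min_lr: float = 1e-6) -> float:
    # Standard Noam schedule: lr * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
    step = max(step, 1)
    scale = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    return max(lr * scale, min_lr)  # min_lr clamp, simplified here


# Peak (~2.2e-3 with these values) is reached at step == warmup_steps, then decays as step^-0.5.
for s in (100, 1_000, 10_000, 100_000):
    print(s, f"{noam_lr(s):.2e}")
```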
+ resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null From a1a5cbdb8e17efbb15d17c2b67708f929d268cf6 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 27 May 2025 21:11:41 -0400 Subject: [PATCH 047/107] remove pdb Signed-off-by: stevehuang52 --- examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index dd6766379600..9cbf451ba68c 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -318,9 +318,6 @@ def main(cfg): if cfg.model.get("adapter", None) is not None: asr_model = setup_adapters(cfg, asr_model) - import pdb - - pdb.set_trace() trainer.fit(asr_model) if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: From 3114b909ea8c47868cecc8011a1b4ff0c9ad78e0 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 27 May 2025 22:29:25 -0400 Subject: [PATCH 048/107] update cfg Signed-off-by: stevehuang52 --- .../asr_eou/fastconformer_transducer_bpe_streaming.yaml | 3 --- nemo/collections/asr/models/asr_eou_models.py | 8 ++++++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index ed7bcfbc6656..5ef9a1b6557b 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -32,7 +32,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: true - drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 @@ -78,7 +77,6 @@ model: shuffle: false num_workers: 8 pin_memory: true - drop_last: true quadratic_duration: 30 num_buckets: 30 num_cuts_for_bins_estimate: 10000 @@ -95,7 +93,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: false - drop_last: false num_workers: 8 pin_memory: true quadratic_duration: 30 diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index c07d1f1eaed9..d4de081ef557 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -534,8 +534,12 @@ def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) tensorboard_logs['val_text_pred'] = text_pred - eou_predictions = self._get_eou_predictions_from_hypotheses(hypotheses, batch) - eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) + if self.cfg.get('calculate_eou_metrics', True): + eou_predictions = self._get_eou_predictions_from_hypotheses(hypotheses, batch) + eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) + else: + eou_metrics_list = [] + eob_metrics_list = [] wer, wer_num, wer_denom = self.wer.compute() self.wer.reset() From c2706862bd177e8da2a42bb797de77433dfdd47a Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 28 May 2025 09:33:26 -0400 Subject: [PATCH 049/107] update cfg Signed-off-by: stevehuang52 --- .../fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml | 3 --- 
...fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml | 3 --- ...fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml | 3 --- .../fastconformer_transducer_bpe_streaming_adapter.yaml | 3 --- scripts/asr_end_of_utterance/conf/data.yaml | 8 ++++---- 5 files changed, 4 insertions(+), 16 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml index 79977b0f1b1b..c31c30a1b195 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml @@ -48,7 +48,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: true - drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 @@ -95,7 +94,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: false - drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 @@ -117,7 +115,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: false - drop_last: false num_workers: 8 pin_memory: true quadratic_duration: 30 diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml index 9050643cf151..3578fc683057 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml @@ -48,7 +48,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: true - drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 @@ -95,7 +94,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: false - drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 @@ -117,7 +115,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: false - drop_last: false num_workers: 8 pin_memory: true quadratic_duration: 30 diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml index 50ec93a13dc3..8f61b4295aa9 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml @@ -37,7 +37,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: true - drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 @@ -81,7 +80,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: false - drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 @@ -100,7 +98,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: false - drop_last: false num_workers: 8 pin_memory: true quadratic_duration: 30 diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml index f8d0b787a8f1..72253e835bec 100644 --- 
a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml @@ -84,7 +84,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: true - drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 @@ -130,7 +129,6 @@ model: shuffle: false num_workers: 8 pin_memory: true - drop_last: true quadratic_duration: 30 num_buckets: 30 num_cuts_for_bins_estimate: 10000 @@ -147,7 +145,6 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: false - drop_last: false num_workers: 8 pin_memory: true quadratic_duration: 30 diff --git a/scripts/asr_end_of_utterance/conf/data.yaml b/scripts/asr_end_of_utterance/conf/data.yaml index 97147640d23c..9a9c522fad11 100644 --- a/scripts/asr_end_of_utterance/conf/data.yaml +++ b/scripts/asr_end_of_utterance/conf/data.yaml @@ -21,7 +21,7 @@ data: shuffle_buffer_size: 10000 random_padding: - prob: 0.5 + prob: 1.0 min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds max_total_duration: 30.0 # maximum total duration of the padded audio in seconds @@ -31,9 +31,9 @@ data: augmentor: white_noise: - prob: 0.5 - min_level: -90 - max_level: -46 + prob: 1.0 + min_level: -80 + max_level: -40 gain: prob: 0.5 min_gain_dbfs: -10.0 From 7f8f760a75695dfbbebc0140bffa805bbe4e9248 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 28 May 2025 09:38:42 -0400 Subject: [PATCH 050/107] update cfg Signed-off-by: stevehuang52 --- .../fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml | 2 +- .../fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml | 2 +- .../fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml | 2 +- .../conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml | 2 +- .../asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml index c31c30a1b195..d84ca5a75b27 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml @@ -70,7 +70,7 @@ model: augmentor: white_noise: - prob: 0.5 + prob: 0.9 min_level: -90 max_level: -46 gain: diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml index 3578fc683057..143d5f758e35 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml @@ -70,7 +70,7 @@ model: augmentor: white_noise: - prob: 0.5 + prob: 0.9 min_level: -90 max_level: -46 gain: diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml index 8f61b4295aa9..d1e0caec2efa 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml @@ -56,7 +56,7 @@ model: augmentor: white_noise: - prob: 0.5 + prob: 0.9 min_level: -90 max_level: -46 gain: 
diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 5ef9a1b6557b..fefd84646b06 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -51,7 +51,7 @@ model: augmentor: white_noise: - prob: 0.5 + prob: 0.9 min_level: -90 max_level: -46 gain: diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml index 72253e835bec..ca1a2c82637a 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml @@ -103,7 +103,7 @@ model: augmentor: white_noise: - prob: 0.5 + prob: 0.9 min_level: -90 max_level: -46 gain: From b6d49957a0428f2b890b9fd3f86875af73c4b664 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 30 May 2025 10:41:50 -0400 Subject: [PATCH 051/107] update Signed-off-by: stevehuang52 --- .../fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml | 1 + .../fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml | 1 + .../fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml | 1 + .../conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml | 1 + .../fastconformer_transducer_bpe_streaming_adapter.yaml | 1 + nemo/collections/asr/parts/utils/eou_utils.py | 5 ++++- 6 files changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml index d84ca5a75b27..3d304df59ec7 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml @@ -48,6 +48,7 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: true + drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml index 143d5f758e35..b0f2037e91dd 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml @@ -48,6 +48,7 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: true + drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml index d1e0caec2efa..4e9efa03be47 100644 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml @@ -37,6 +37,7 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: true + drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index fefd84646b06..3244dbdaf69a 100644 --- 
a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -32,6 +32,7 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: true + drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml index ca1a2c82637a..d5bc48bea0cd 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml @@ -84,6 +84,7 @@ model: batch_duration: null # you may disable batch_duration by setting it to `null` batch_size: 16 shuffle: true + drop_last: true num_workers: 8 pin_memory: true quadratic_duration: 30 diff --git a/nemo/collections/asr/parts/utils/eou_utils.py b/nemo/collections/asr/parts/utils/eou_utils.py index 8b41ca6c0b56..b5392b7cb873 100644 --- a/nemo/collections/asr/parts/utils/eou_utils.py +++ b/nemo/collections/asr/parts/utils/eou_utils.py @@ -77,7 +77,10 @@ def evaluate_eou( num_predictions = len(prediction) missing = 0 - predicted_eou = [p for p in prediction if p["eou_prob"] > threshold] + if threshold is not None and threshold > 0: + predicted_eou = [p for p in prediction if p["eou_prob"] > threshold] + else: + predicted_eou = [p for p in prediction if p["eou_pred"]] if do_sorting: predicted_eou = sorted(predicted_eou, key=lambda x: x["start_time"]) From 39d5e257d452cf733ec036c6e0145ed5eca7b4e6 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 30 May 2025 15:33:44 -0400 Subject: [PATCH 052/107] add cfg Signed-off-by: stevehuang52 --- ...ybrid_asr_frame_fc_lstm_eou_streaming.yaml | 425 ++++++++++++++++++ 1 file changed, 425 insertions(+) create mode 100644 examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_lstm_eou_streaming.yaml diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_lstm_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_lstm_eou_streaming.yaml new file mode 100644 index 000000000000..7f000742b2a8 --- /dev/null +++ b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_lstm_eou_streaming.yaml @@ -0,0 +1,425 @@ +# It contains the default values for training a cache-aware streaming FastConformer-Hybrid-Transducer-CTC ASR model, large size (~115M) with sub-word encoding. +# The model would have two decoders: RNNT (Transducer) and CTC + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +# Note: if training loss does not converge, you may increase warm-up to 20K. 
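The config below freezes the hybrid ASR model (`freeze_encoder/ctc/rnnt: true`) and trains only a frame-level EOU branch (`eou_loss_weight: 1.0`). A minimal sketch of the weighted frame-classification loss implied by `num_eou_classes: 4` and `eou_class_weights: [1, 1, 100, 100]`; the class semantics and tensor shapes are assumptions, not NeMo internals:

```python
import torch
import torch.nn.functional as F


def frame_eou_loss(logits: torch.Tensor, labels: torch.Tensor,
                   class_weights=(1.0, 1.0, 100.0, 100.0)) -> torch.Tensor:
    # logits: [B, T, C] from the EOU decoder, labels: [B, T], with C = num_eou_classes = 4.
    # The [1, 1, 100, 100] weights upweight the two rare classes (presumably the EOU/EOB frames).
    weight = torch.tensor(class_weights, device=logits.device)
    return F.cross_entropy(logits.transpose(1, 2), labels, weight=weight)


logits = torch.randn(2, 50, 4)           # batch of 2, 50 frames, 4 frame classes
labels = torch.randint(0, 4, (2, 50))
print(frame_eou_loss(logits, labels))
```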
+ +name: "FastConformer-Hybrid-ASR-Frame-EOU-Streaming" + +model: + token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] + token_init_weight_value: null # only applicable when token_init_method='constant' + token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' + layer_idx_list: [0, -1] # extract features from the first and last layers of ASR encoder + num_eou_classes: 4 + eou_class_weights: [1,1,100,100] + rnnt_loss_weight: 0.0 + ctc_loss_weight: 0.0 + eou_loss_weight: 1.0 + use_ctc_pred: false + freeze_encoder: true + freeze_ctc: true + freeze_rnnt: true + pad_eou_label_secs: 0.0 + + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: false # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + check_tokenizer: false + add_eou_to_text: false + pad_eou_label_secs: ${model.pad_eou_label_secs} + + random_padding: + prob: 0.99 + min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds + max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds + max_total_duration: 40.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' + normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' + + augmentor: + white_noise: + prob: 0.9 + min_level: -90 + max_level: -46 + gain: + prob: 0.5 + min_gain_dbfs: -10.0 + max_gain_dbfs: 10.0 + noise: + prob: 0.6 + manifest_path: ??? + min_snr_db: 0 + max_snr_db: 20 + max_gain_db: 300.0 + + validation_ds: + manifest_filepath: ??? 
+ tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + check_tokenizer: false + add_eou_to_text: false + pad_eou_label_secs: ${model.pad_eou_label_secs} + + test_ds: + manifest_filepath: null + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + check_tokenizer: false + add_eou_to_text: false + pad_eou_label_secs: ${model.pad_eou_label_secs} + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. 
During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. + # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] + att_context_size: [70, 1] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
+ + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # The section which would contain the decoder and decoding configs of the auxiliary CTC decoder + aux_ctc: + ctc_loss_weight: ${model.ctc_loss_weight} # the weight used to combine the CTC loss with the RNNT loss + use_cer: false + ctc_reduction: 'mean_batch' + decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: null + num_classes: -1 + vocabulary: [] + decoding: + strategy: "greedy" + + aggregator: + _target_: nemo.collections.asr.modules.ssl_modules.multi_layer_feat.Aggregator + mode: "weighted_sum" + weights: null + layer_idx_list: ${model.layer_idx_list} + + eou_encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.encoder.d_model} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 2 + d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 1 # NO subsampling + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. 
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] + att_context_size: ${model.encoder.att_context_size} # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: ${model.encoder.pos_emb_max_len} + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + eou_decoder: + _target_: nemo.collections.asr.modules.LSTMDecoder + feat_in: ${model.encoder.d_model} + num_classes: ${model.num_eou_classes} + lstm_hidden_size: 256 + num_layers: 4 + add_blank: false + + eou_loss: + weight: ${model.eou_class_weights} + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + warprnnt_numba_kwargs: + # FastEmit regularization: https://arxiv.org/abs/2010.11148 + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. + + optim: + name: adamw + lr: 0.0005 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: CosineAnnealing + # scheduler config override + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: 100000 # computed at runtime if not set + val_check_interval: 1000 # an int for number of iterations + limit_train_batches: ${trainer.val_check_interval} + accelerator: auto + strategy: + _target_: lightning.pytorch.strategies.DDPStrategy + gradient_as_bucket_view: true + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. 
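The `aggregator`, `eou_encoder`, and `eou_decoder` blocks above feed a small EOU head from selected ASR encoder layers. A rough sketch of the `weighted_sum` aggregation over `layer_idx_list: [0, -1]`; the uniform fallback weights and tensor shapes are assumptions, not the internals of NeMo's `Aggregator`:

```python
import torch


def weighted_sum_aggregate(layer_feats, weights=None):
    # layer_feats: list of [B, D, T] encoder outputs taken at layer_idx_list (here layers 0 and -1).
    stacked = torch.stack(layer_feats, dim=0)                        # [L, B, D, T]
    if weights is None:                                              # weights: null -> uniform
        weights = torch.full((stacked.size(0),), 1.0 / stacked.size(0))
    weights = weights.view(-1, 1, 1, 1).to(stacked)
    return (stacked * weights).sum(dim=0)                            # [B, D, T]


feats = [torch.randn(2, 512, 100), torch.randn(2, 512, 100)]         # first and last layer features
print(weighted_sum_aggregate(feats).shape)                           # torch.Size([2, 512, 100])
```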
+ enable_progress_bar: True + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + use_distributed_sampler: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_eou_macro_acc" + mode: "max" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null From c799139e0eb09d0541a29ac70f2a5caae14ada3e Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 30 May 2025 17:22:27 -0400 Subject: [PATCH 053/107] fix eou metric Signed-off-by: stevehuang52 --- nemo/collections/asr/parts/utils/eou_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/collections/asr/parts/utils/eou_utils.py b/nemo/collections/asr/parts/utils/eou_utils.py index b5392b7cb873..896294c6ab34 100644 --- a/nemo/collections/asr/parts/utils/eou_utils.py +++ b/nemo/collections/asr/parts/utils/eou_utils.py @@ -77,9 +77,11 @@ def evaluate_eou( num_predictions = len(prediction) missing = 0 + predicted_eou = prediction if threshold is not None and threshold > 0: predicted_eou = [p for p in prediction if p["eou_prob"] > threshold] - else: + elif all([hasattr(p, "eou_pred") for p in prediction]): + # If eou_pred is available, use it predicted_eou = [p for p in prediction if p["eou_pred"]] if do_sorting: From 923950b11a50ffe7a38a177a8a27ca3e59f42580 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Sat, 31 May 2025 22:05:13 -0400 Subject: [PATCH 054/107] update adapter Signed-off-by: stevehuang52 --- examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py | 8 ++++++-- .../fastconformer_transducer_bpe_streaming_adapter.yaml | 3 +++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index 9cbf451ba68c..85958a4c821d 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -150,8 +150,12 @@ def setup_adapters(cfg: DictConfig, model: ASRModel): model.set_enabled_adapters(enabled=False) # disable all adapters prior to training model.set_enabled_adapters(adapter_name, enabled=True) # enable just one adapter by name - # First, Freeze all the weights of the model (not just encoder, everything) - model.freeze() + model.freeze() # freeze whole model by default + if not cfg.model.get("freeze_decoder", True): + model.decoder.unfreeze() + if hasattr(model, 'joint') and not cfg.model.get(f"freeze_joint", True): + model.joint.unfreeze() + # Activate dropout() and other modules that depend on train mode. 
model = model.train() # Then, Unfreeze just the adapter weights that were enabled above (no part of encoder/decoder/joint/etc) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml index d5bc48bea0cd..03ffea91d009 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml @@ -74,6 +74,9 @@ model: check_decoder_adapter: True # ASR adapter key, determines whether to check if decoder adapter modules is supported check_joint_adapter: True # ASR adapter key, determines whether to check if joint adapter modules is supported + freeze_decoder: ${model.adapter.global_cfg.check_decoder_adapter} + freeze_joint: ${model.adapter.global_cfg.check_joint_adapter} + train_ds: manifest_filepath: ??? tarred_audio_filepaths: null From 604b86d090820780ca2aef6f26f4ccf24d8aecba Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 5 Jun 2025 10:00:07 -0400 Subject: [PATCH 055/107] add scripts Signed-off-by: stevehuang52 --- nemo/collections/asr/parts/utils/eou_utils.py | 49 ++- .../asr_end_of_utterance/add_eob_labels.py | 211 ++++++++++++ .../asr_end_of_utterance/clean_manifest.py | 303 ++++++++++++++++++ .../eval_eou_with_niva.py | 152 +++++++++ scripts/asr_end_of_utterance/evaluate_eou.py | 5 + 5 files changed, 719 insertions(+), 1 deletion(-) create mode 100644 scripts/asr_end_of_utterance/add_eob_labels.py create mode 100644 scripts/asr_end_of_utterance/clean_manifest.py create mode 100644 scripts/asr_end_of_utterance/eval_eou_with_niva.py diff --git a/nemo/collections/asr/parts/utils/eou_utils.py b/nemo/collections/asr/parts/utils/eou_utils.py index 896294c6ab34..3f48bb2b5822 100644 --- a/nemo/collections/asr/parts/utils/eou_utils.py +++ b/nemo/collections/asr/parts/utils/eou_utils.py @@ -13,7 +13,7 @@ # limitations under the License. from dataclasses import dataclass -from typing import List +from typing import Dict, List import numpy as np @@ -189,3 +189,50 @@ def cal_eou_metrics_from_frame_labels( prediction=pred_seg_lst, reference=ref_seg_lst, threshold=threshold, collar=collar, do_sorting=False ) return eou_metrics + + +def get_percentiles(values: List[float], percentiles: List[float], tag: str = "") -> Dict[str, float]: + """ + Get the percentiles of a list of values. 
+ Args: + values: list of values + percentiles: list of percentiles + Returns: + metrics: Dict of percentiles + """ + if len(values) == 0: + return [0.0] * len(percentiles) + results = np.percentile(values, percentiles).tolist() + metrics = {} + if tag: + tag += "_" + for i, p in enumerate(percentiles): + metrics[f'{tag}p{int(p)}'] = float(results[i]) + return metrics + + +def aggregate_eou_metrics(eou_metrics: List[EOUResult], target_percentiles: List = [50, 90, 95]) -> Dict[str, float]: + # Aggregate EOU metrics + num_eou_utterances = sum([x.num_utterances for x in eou_metrics]) + eou_latency = flatten_nested_list([x.latency for x in eou_metrics]) + eou_early_cutoff = flatten_nested_list([x.early_cutoff for x in eou_metrics]) + + eou_avg_num_early_cutoff = len(eou_early_cutoff) / num_eou_utterances if num_eou_utterances > 0 else 0.0 + if len(eou_latency) == 0: + eou_latency = [0.0] + if len(eou_early_cutoff) == 0: + eou_early_cutoff = [0.0] + + eou_missing = [x.missing for x in eou_metrics] + + metrics = {} + eou_latency_metrics = get_percentiles(eou_latency, target_percentiles, tag='latency') + eou_early_cutoff_metrics = get_percentiles(eou_early_cutoff, target_percentiles, tag='early_cutoff') + + metrics.update(eou_latency_metrics) + metrics.update(eou_early_cutoff_metrics) + + metrics['early_cutoff_rate'] = eou_avg_num_early_cutoff + metrics['miss_rate'] = sum(eou_missing) / num_eou_utterances if num_eou_utterances > 0 else 0.0 + + return metrics diff --git a/scripts/asr_end_of_utterance/add_eob_labels.py b/scripts/asr_end_of_utterance/add_eob_labels.py new file mode 100644 index 000000000000..ba17b7aa98f2 --- /dev/null +++ b/scripts/asr_end_of_utterance/add_eob_labels.py @@ -0,0 +1,211 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Example usage: + +```bash +python add_eob_labels.py /path/to/manifest/dir +``` +where output will be saved in the same directory with `-eob` suffix added to the filename. 
+""" + +import argparse +import json +from pathlib import Path +from string import punctuation + +from tqdm import tqdm + +parser = argparse.ArgumentParser(description="Add `is_backchannel` labels to manifest files.") +parser.add_argument( + "input_manifest", + type=str, + help="Path to the input manifest file to be cleaned.", +) +parser.add_argument( + "-o", + "--output", + type=str, + default=None, + help="Path to the output manifest file after cleaning.", +) +parser.add_argument( + "-p", + "--pattern", + type=str, + default="*.json", + help="Pattern to match files in the input directory.", +) + + +def read_manifest(manifest_path): + manifest = [] + with open(manifest_path, 'r') as f: + for line in f.readlines(): + line = line.strip() + if line: + manifest.append(json.loads(line)) + return manifest + + +def write_manifest(manifest_path, manifest): + with open(manifest_path, 'w') as f: + for item in manifest: + f.write(json.dumps(item) + '\n') + + +def clean_text(text): + text = text.translate(str.maketrans('', '', punctuation)).lower().strip() + valid_chars = "abcdefghijklmnopqrstuvwxyz'" + text = ''.join([c for c in text if c in valid_chars or c.isspace() or c == "'"]) + return " ".join(text.split()).strip() + + +backchannel_phrases = [ + 'absolutely', + 'ah', + 'all right', + 'alright', + 'but yeah', + 'definitely', + 'exactly', + 'go ahead', + 'good', + 'great', + 'great thanks', + 'ha ha', + 'hi', + 'i know', + 'i know right', + 'i see', + 'indeed', + 'interesting', + 'mhmm', + 'mhmm mhmm', + 'mhmm right', + 'mhmm yeah', + 'mhmm yes', + 'nice', + 'of course', + 'oh', + 'oh dear', + 'oh man', + 'oh okay', + 'oh wow', + 'oh yes', + 'ok', + 'ok thanks', + 'okay', + 'okay okay', + 'okay thanks', + 'perfect', + 'really', + 'right', + 'right exactly', + 'right right', + 'right yeah', + 'so yeah', + 'sounds good', + 'sure', + 'thank you', + 'thanks', + "that's awesome", + 'thats right', + 'thats true', + 'true', + 'uh-huh', + 'uh-huh yeah', + 'uhhuh', + 'um-humm', + 'well', + 'what', + 'wow', + 'yeah', + 'yeah i know', + 'yeah i see', + 'yeah mhmm', + 'yeah okay', + 'yeah right', + 'yeah uh-huh', + 'yeah yeah', + 'yep', + 'yes', + 'yes please', + 'yes yes', + 'you know', + "you're right", +] + +backchannel_phrases_nopc = [clean_text(phrase) for phrase in backchannel_phrases] + + +def check_if_backchannel(text): + """ + Check if the text is a backchannel phrase. 
+ """ + # Remove punctuation and convert to lowercase + text = clean_text(text) + # Check if the text is in the list of backchannel phrases + return text in backchannel_phrases_nopc + + +def add_eob_labels(manifest_path): + num_eob = 0 + manifest = read_manifest(manifest_path) + for i, item in enumerate(manifest): + text = item['text'] + # Check if the text is a backchannel phrase + is_backchannel = check_if_backchannel(text) + # Add the EOB label to the text + if is_backchannel: + item['is_backchannel'] = True + num_eob += 1 + else: + item['is_backchannel'] = False + manifest[i] = item + return manifest, num_eob + + +def main(): + args = parser.parse_args() + input_manifest = Path(args.input_manifest) + + if input_manifest.is_dir(): + manifest_list = list(input_manifest.glob(args.pattern)) + if not manifest_list: + raise ValueError(f"No files found in {input_manifest} matching pattern `{args.pattern}`") + else: + manifest_list = [input_manifest] + + if args.output is None: + output_dir = input_manifest if input_manifest.is_dir() else input_manifest.parent + else: + output_dir = Path(args.output) + output_dir.mkdir(parents=True, exist_ok=True) + + total_num_eob = 0 + print(f"Processing {len(manifest_list)} manifest files...") + for manifest_path in tqdm(manifest_list, total=len(manifest_list)): + output_file = output_dir / f"{manifest_path.stem}-eob.json" + new_manifest, num_eob = add_eob_labels(manifest_path) + total_num_eob += num_eob + write_manifest(output_file, new_manifest) + print(f"Processed {manifest_path} and saved to {output_file}. Number of EOB labels added: {num_eob}") + + print(f"Total number of EOB labels added: {total_num_eob}") + + +if __name__ == "__main__": + main() diff --git a/scripts/asr_end_of_utterance/clean_manifest.py b/scripts/asr_end_of_utterance/clean_manifest.py new file mode 100644 index 000000000000..b6b10e7f2ecf --- /dev/null +++ b/scripts/asr_end_of_utterance/clean_manifest.py @@ -0,0 +1,303 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
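A quick usage sketch of the backchannel check added above in `add_eob_labels.py`; the example utterances are made up, and the import assumes the script is importable as a module:

```python
# Illustrative only; assumes add_eob_labels.py is on the Python path.
from add_eob_labels import check_if_backchannel

for text in ["Uh-huh, yeah.", "Sounds good!", "Could you send me the report by Friday?"]:
    label = "backchannel" if check_if_backchannel(text) else "regular utterance"
    print(f"{text!r} -> {label}")

# The first two normalize to phrases in the list ("uhhuh yeah", "sounds good"), so
# add_eob_labels() would tag them with is_backchannel=True; the question would not be tagged.
```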
+ +""" +Example usage: + +```bash +python clean_manifest.py \ + /path/to/manifest/dir \ + -o /path/to/output/dir +``` + +""" + + +import argparse +import re +from pathlib import Path +from string import punctuation + +from num2words import num2words +from whisper_normalizer.english import EnglishTextNormalizer + +from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest + +punctuations = punctuation.replace("'", "") + +text_normalizer = EnglishTextNormalizer() + +parser = argparse.ArgumentParser(description="Clean manifest file by text normalization") +parser.add_argument( + "input_manifest", + type=str, + help="Path to the input manifest file to be cleaned.", +) +parser.add_argument( + "-o", + "--output", + type=str, + default=None, + help="Path to the output manifest file after cleaning.", +) +parser.add_argument( + "-lower", + "--lowercase", + type=bool, + default=False, + help="Whether to convert the text to lowercase.", +) +parser.add_argument( + "-drop", + "--remove_punc", + type=bool, + default=False, + help="Whether to remove punctuation from the text.", +) +parser.add_argument( + "--normalize", + type=bool, + default=False, + help="Whether to normalize the text using Whisper text normalizer.", +) +parser.add_argument( + "-n2w", + "--replace_numbers", + type=bool, + default=True, + help="Whether to replace numbers with words.", +) +parser.add_argument( + "-p", + "--pattern", + type=str, + default="**/*.json", + help="Pattern to match files in the input directory.", +) + + +def convert_to_spoken(text: str) -> str: + # Mapping of metric units to spoken forms + unit_map = { + "kg": "kilograms", + "g": "grams", + "mg": "milligrams", + "l": "liters", + "ml": "milliliters", + "cm": "centimeters", + "mm": "millimeters", + "m": "meters", + "km": "kilometers", + "°c": "degrees celsius", + "°f": "degrees fahrenheit", + "oz": "ounces", + "lb": "pounds", + "lbs": "pounds", + } + + # Replace metric units like "12kg" or "5 ml" + def replace_metric(match): + number = match.group(1) + unit = match.group(2).lower() + spoken_unit = unit_map.get(unit, unit) + return f"{number} {spoken_unit}" + + # Replace time like "5am" or "6PM" + def replace_ampm(match): + hour = match.group(1) + meridiem = match.group(2).lower() + return f"{hour} {'a m' if meridiem == 'am' else 'p m'}" + + # Replace time like "1:30pm" + def replace_colon_time(match): + hour = match.group(1) + minute = match.group(2) + meridiem = match.group(3).lower() + return f"{hour} {minute} {'a m' if meridiem == 'am' else 'p m'}" + + # Convert feet and inches like 5'11" to "5 feet 11 inches" + def replace_feet_inches(match): + feet = match.group(1) + inches = match.group(2) + return f"{feet} feet {inches} inches" + + # Convert just feet (e.g., 6') to "6 feet" + def replace_feet_only(match): + feet = match.group(1) + return f"{feet} feet" + + # Convert just inches (e.g., 10") to "10 inches" + def replace_inches_only(match): + inches = match.group(1) + return f"{inches} inches" + + # Apply replacements + # First: time with colon (e.g., 1:30pm) + text = re.sub(r'\b(\d{1,2}):(\d{2})(am|pm)\b', replace_colon_time, text, flags=re.IGNORECASE) + + # Then: basic am/pm (e.g., 5am) + text = re.sub(r'\b(\d{1,2})(am|pm)\b', replace_ampm, text, flags=re.IGNORECASE) + + # Then: replace 1st, 2nd, 3rd, etc + text = text.replace("1st", "first") + text = text.replace("2nd", "second") + text = text.replace("3rd", "third") + text = text.replace("@", " at ") + + # Finally: metric units + text = re.sub( + 
r'\b(\d+(?:\.\d+)?)\s?(kg|g|mg|l|ml|cm|mm|m|km|°c|°f|oz|lbs?|LB|LBS?)\b', + replace_metric, + text, + flags=re.IGNORECASE, + ) + text = re.sub(r'\b(\d+)\'(\d+)"', replace_feet_inches, text) # e.g., 5'11" + text = re.sub(r'\b(\d+)\'', replace_feet_only, text) # e.g., 6' + text = re.sub(r'(\d+)"', replace_inches_only, text) # e.g., 10" + + return text + + +def replace_numbers_with_words(text): + def convert_number(match): + num_str = match.group() + original = num_str + + # Remove dollar sign + is_dollar = False + if num_str.startswith('$'): + is_dollar = True + num_str = num_str[1:] + + # Remove commas + num_str = num_str.replace(',', '') + + try: + if '.' in num_str: + # Convert decimal number + integer_part, decimal_part = num_str.split('.') + words = num2words(int(integer_part)) + ' point ' + ' '.join(num2words(int(d)) for d in decimal_part) + else: + words = num2words(int(num_str)) + if is_dollar: + words += ' dollars' + return words + " " + except: + return original # Return original if conversion fails + + # Pattern matches: $3,000 or 3,000.45 or 1234 + pattern = re.compile(r'\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?|\$?\d+(?:\.\d+)?') + result = pattern.sub(convert_number, text) + + result = " ".join(result.split()) # Remove extra spaces + return result + + +def drop_punctuations(text): + """ + Clean the text by removing invalid characters and converting to lowercase. + + :param text: Input text. + :return: Cleaned text. + """ + valid_chars = "abcdefghijklmnopqrstuvwxyz'" + text = ''.join([c for c in text if c in valid_chars or c.isspace() or c == "'"]) + text = ' '.join(text.split()) # Remove extra spaces + return text.strip() + + +def clean_label(_str: str) -> str: + """ + Remove unauthorized characters in a string, lower it and remove unneeded spaces + """ + # replace_with_space = [char for char in '/?*\",.:=?_{|}~¨«·»¡¿„…‧‹›≪≫!:;ː→'] + replace_with_blank = [char for char in '`¨´‘’“”`ʻ‘’“"‘”'] + replace_with_apos = [char for char in '‘’ʻ‘’‘'] + _str = _str.strip() + for i in replace_with_blank: + _str = _str.replace(i, "") + for i in replace_with_apos: + _str = _str.replace(i, "'") + + text = _str + text = text.replace("\u2103", "celsius") + text = text.replace("\u2109", "fahrenheit") + text = text.replace("\u00b0", "degrees") + text = text.replace("\u2019", "'") + text = text.replace("\\", ".") + text = text.replace("\n", " ") + text = text.replace("\r", " ") + text = text.replace("\t", " ") + + ret = " ".join(_str.split()) + return ret + + +def main(args): + + manifest_files = Path(args.input_manifest) + if manifest_files.is_dir(): + manifest_files = list(manifest_files.glob(args.pattern)) + elif manifest_files.is_file(): + manifest_files = [manifest_files] + else: + raise ValueError(f"Invalid input manifest path: {args.input_manifest}") + + for manifest_file in manifest_files: + print(f"Processing manifest file: {manifest_file}") + postfix = "-cleaned" + postfix += "_norm" if args.normalize else "" + postfix += "_n2w" if args.replace_numbers else "" + if args.lowercase and args.remove_punc: + postfix += "_noPC" + else: + postfix += "_lc" if args.lowercase else "" + postfix += "_np" if args.remove_punc else "" + + output_manifest = manifest_file.with_name(f"{manifest_file.stem}{postfix}{manifest_file.suffix}") + + if args.output: + if args.output.endswith(".json"): + if len(manifest_files) > 1: + raise ValueError("Output path must be a directory when processing multiple manifest files.") + output_manifest = Path(args.output) + else: + output_dir = Path(args.output) + 
output_dir.mkdir(parents=True, exist_ok=True) + output_manifest = output_dir / output_manifest.name + + manifest = read_manifest(str(manifest_file)) + + for i, item in enumerate(manifest): + text = str(item["text"]) + manifest[i]["origin_text"] = text + if args.normalize: + text = text_normalizer(text) + if args.replace_numbers: + text = convert_to_spoken(text) + text = replace_numbers_with_words(text) + if args.lowercase: + text = text.lower() + if args.remove_punc: + text = text.translate(str.maketrans("", "", punctuations)) + text = drop_punctuations(text) + manifest[i]["text"] = clean_label(text) + + write_manifest(str(output_manifest), manifest) + print(f"Cleaned manifest saved to {output_manifest}") + + +if __name__ == "__main__": + args = parser.parse_args() + main(args) diff --git a/scripts/asr_end_of_utterance/eval_eou_with_niva.py b/scripts/asr_end_of_utterance/eval_eou_with_niva.py new file mode 100644 index 000000000000..1410dbdf27b5 --- /dev/null +++ b/scripts/asr_end_of_utterance/eval_eou_with_niva.py @@ -0,0 +1,152 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Example usage: + +The NIVA_PRED_ROOT and REFERENCE_ROOT directories should have the following structure: + +: +->dataset1/ + eou/ + ctm/ +->dataset2/ + eou/ + ctm/ + +: +->dataset1/ +->dataset2/ + + +```bash +python eval_eou_with_niva.py \ + --prediction $NIVA_PRED_ROOT \ + --reference $REFERENCE_ROOT \ + --multiple +``` + +""" + + +import argparse +import json +from pathlib import Path +from typing import List + +from nemo.collections.asr.parts.utils.eou_utils import EOUResult, aggregate_eou_metrics, evaluate_eou + +parser = argparse.ArgumentParser(description="Evaluate end of utterance predictions against reference labels.") +parser.add_argument( + "-p", + "--prediction", + type=str, + required=True, + help="Path to the directory containing the predictions.", +) +parser.add_argument( + "-r", + "--reference", + type=str, + required=True, + help="Path to the directory containing the groundtruth.", +) +parser.add_argument( + "--eob", + action="store_true", + help="Whether to evaluate end of backchannel predictions.", +) +parser.add_argument( + "--multiple", + action="store_true", + help="Whether to evaluate multiple datasets.", +) + + +def load_segLST(directory: str, use_eob: bool = False) -> dict: + json_files = list(Path(directory).glob("*.json")) + segLST = {} + for json_file in json_files: + key = json_file.stem + with open(json_file, 'r') as f: + data = json.load(f) + is_backchannel = data[0].get("is_backchannel", False) if data else False + if not isinstance(is_backchannel, list): + is_backchannel = [is_backchannel] + if any(is_backchannel) and not use_eob: + continue + segLST[key] = data + return segLST + + +def evaluate_eou_predictions(prediction_dir: str, reference_dir: str, use_eob: bool = False) -> List[EOUResult]: + prediction_dir = Path(prediction_dir) / "eou" + prediction_segLST = load_segLST(prediction_dir, use_eob) + 
reference_segLST = load_segLST(reference_dir, use_eob) + + eou_metrics = [] + for key, reference in reference_segLST.items(): + if key not in prediction_segLST: + raise ValueError(f"Key {key} in reference not found in predictions.") + prediction = prediction_segLST[key] + eou_result = evaluate_eou( + prediction=prediction, reference=reference, threshold=None, collar=0.0, do_sorting=True + ) + eou_metrics.append(eou_result) + + results = aggregate_eou_metrics(eou_metrics) + + # add prefix to the keys of the results + prefix = Path(reference_dir).stem + prefix += "_eob" if use_eob else "_eou" + results = {f"{prefix}_{k}": v for k, v in results.items()} + + return results + + +if __name__ == "__main__": + args = parser.parse_args() + + prediction_dir = Path(args.prediction) + reference_dir = Path(args.reference) + + if not prediction_dir.is_dir(): + raise ValueError(f"Prediction directory {prediction_dir} does not exist or is not a directory.") + if not reference_dir.is_dir(): + raise ValueError(f"Reference directory {reference_dir} does not exist or is not a directory.") + + if args.multiple: + # get all subdirectories in the prediction and reference directories + prediction_dirs = sorted([x for x in prediction_dir.glob("*/") if x.is_dir()]) + reference_dirs = sorted([x for x in reference_dir.glob("*/") if x.is_dir()]) + if len(prediction_dirs) != len(reference_dirs): + raise ValueError( + f"Number of prediction directories {len(prediction_dirs)} must match number of reference directories {len(reference_dirs)}." + ) + else: + prediction_dirs = [prediction_dir] + reference_dirs = [reference_dir] + + for ref_dir, pred_dir in zip(reference_dirs, prediction_dirs): + if args.multiple and ref_dir.stem != pred_dir.stem: + raise ValueError( + f"Reference directory {ref_dir} and prediction directory {pred_dir} must have the same name." + ) + results = evaluate_eou_predictions(prediction_dir=str(pred_dir), reference_dir=str(ref_dir), use_eob=args.eob) + # Print the results + print("==========================================") + print(f"Evaluation Results for: {pred_dir} against {ref_dir}") + for key, value in results.items(): + print(f"{key}: {value:.4f}") + print("==========================================") diff --git a/scripts/asr_end_of_utterance/evaluate_eou.py b/scripts/asr_end_of_utterance/evaluate_eou.py index 85e8c4b79fd4..5f842d7a5f4d 100644 --- a/scripts/asr_end_of_utterance/evaluate_eou.py +++ b/scripts/asr_end_of_utterance/evaluate_eou.py @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +This script is deprecated !!!! 
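+Consider using eval_eou_with_niva.py in this folder for evaluating EOU predictions instead.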
+""" + + import argparse import json from typing import List From 5533de7e8ba02f164acd122cc15c3d873da0890e Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 5 Jun 2025 11:40:15 -0400 Subject: [PATCH 056/107] update docstring Signed-off-by: stevehuang52 --- .../asr/asr_eou/speech_to_text_eou_eval.py | 40 +++++++++++++++++++ .../speech_to_text_hybrid_eou_train.py | 2 +- .../speech_to_text_hybrid_frame_eou_train.py | 9 +---- nemo/collections/asr/models/asr_eou_models.py | 1 - 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/examples/asr/asr_eou/speech_to_text_eou_eval.py b/examples/asr/asr_eou/speech_to_text_eou_eval.py index b0879b227d46..9b0a580b343c 100644 --- a/examples/asr/asr_eou/speech_to_text_eou_eval.py +++ b/examples/asr/asr_eou/speech_to_text_eou_eval.py @@ -12,6 +12,46 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Example usage: + +```bash +NEMO_PATH=/home/heh/codes/nemo-eou +export PYTHONPATH=$NEMO_PATH:$PYTHONPATH + +TEST_MANIFEST="[/path/to/your/test_manifest.json,/path/to/your/test_manifest2.json,...]" +TEST_NAME="[test_name1,test_name2,...]" +TEST_BATCH=32 +NUM_WORKERS=8 + +PRETRAINED_NEMO=/path/to/EOU/model.nemo +SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_eou_eval.py +CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/asr_eou +CONFIG_NAME=fastconformer_transducer_bpe_streaming + +export CUDA_VISIBLE_DEVICES=0 && \ +python $SCRIPT \ + --config-path $CONFIG_PATH \ + --config-name $CONFIG_NAME \ + ++init_from_nemo_model=$PRETRAINED_NEMO \ + ~model.train_ds \ + ~model.validation_ds \ + ++model.test_ds.defer_setup=true \ + ++model.test_ds.sample_rate=16000 \ + ++model.test_ds.manifest_filepath=$TEST_MANIFEST \ + ++model.test_ds.name=$TEST_NAME \ + ++model.test_ds.batch_size=$TEST_BATCH \ + ++model.test_ds.num_workers=$NUM_WORKERS \ + ++model.test_ds.drop_last=false \ + ++model.test_ds.force_finite=true \ + ++model.test_ds.shuffle=false \ + ++model.test_ds.pin_memory=true \ + exp_manager.name=$EXP_NAME-eval \ + exp_manager.create_wandb_logger=false \ +``` + +""" + import lightning.pytorch as pl import torch diff --git a/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py b/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py index 5a123396278e..8fdc780b0342 100644 --- a/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py @@ -45,7 +45,7 @@ EXP_NAME=fastconformer_transducer_bpe_streaming_eou_debug -SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_rnnt_eou.py +SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/fastconformer/cache_aware_streaming CONFIG_NAME=fastconformer_transducer_bpe_streaming diff --git a/examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py b/examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py index ea79980da691..1a7058ef4791 100644 --- a/examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py @@ -45,7 +45,7 @@ EXP_NAME=fastconformer_transducer_bpe_streaming_eou_debug -SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_rnnt_eou.py +SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/fastconformer/cache_aware_streaming CONFIG_NAME=fastconformer_transducer_bpe_streaming @@ -71,17 +71,12 @@ """ - -from typing import Optional - import lightning.pytorch as pl import 
torch from omegaconf import DictConfig, OmegaConf -from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCBPEModel, EncDecRNNTBPEModel +from nemo.collections.asr.models import ASRModel from nemo.collections.asr.models.asr_eou_models import EncDecHybridASRFrameEOUModel -from nemo.collections.asr.modules.conv_asr import ConvASRDecoder -from nemo.collections.asr.modules.rnnt import RNNTDecoder, RNNTJoint from nemo.core.classes import typecheck from nemo.core.config import hydra_runner from nemo.utils import logging diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index d4de081ef557..579dcf73195c 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -1331,7 +1331,6 @@ def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader # Calculate EOU metrics eou_pred, eou_pred_len = self.get_eou_prediction(encoded_all, encoded_len_all, log_probs) - eou_loss = self.get_eou_loss(eou_pred, eou_pred_len, eou_labels, eou_labels_len) tensorboard_logs['val_eou_loss'] = eou_loss From 68aa1ca836b116776eff60a6620ca15ad3be591a Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 5 Jun 2025 19:57:04 -0400 Subject: [PATCH 057/107] update Signed-off-by: stevehuang52 --- nemo/collections/asr/models/asr_eou_models.py | 2 +- .../asr_end_of_utterance/clean_manifest.py | 85 ++++++++++++++++++- 2 files changed, 82 insertions(+), 5 deletions(-) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 579dcf73195c..ef58f46b670b 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -39,7 +39,7 @@ cal_eou_metrics_from_frame_labels, flatten_nested_list, ) -from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest +from nemo.collections.asr.parts.utils.manifest_utils import write_manifest from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.data.utils import move_data_to_device diff --git a/scripts/asr_end_of_utterance/clean_manifest.py b/scripts/asr_end_of_utterance/clean_manifest.py index b6b10e7f2ecf..4ac8c3b40f37 100644 --- a/scripts/asr_end_of_utterance/clean_manifest.py +++ b/scripts/asr_end_of_utterance/clean_manifest.py @@ -38,7 +38,7 @@ text_normalizer = EnglishTextNormalizer() -parser = argparse.ArgumentParser(description="Clean manifest file by text normalization") +parser = argparse.ArgumentParser(description="Clean manifest file by droping PnC") parser.add_argument( "input_manifest", type=str, @@ -85,6 +85,18 @@ default="**/*.json", help="Pattern to match files in the input directory.", ) +parser.add_argument( + "-t", + "--text_field", + type=str, + default="text", + help="Field in the manifest to clean. 
Default is 'text'.", +) +parser.add_argument( + "--auto_pc", + action="store_true", + help="If set, will add auto capitalization and punctuation at the end of the text.", +) def convert_to_spoken(text: str) -> str: @@ -244,8 +256,70 @@ def clean_label(_str: str) -> str: return ret -def main(args): +def ends_with_punctuation(s: str) -> bool: + # Strip trailing whitespace + s = s.rstrip() + + # consider this set to be punctuation that's acceptable to end a sentence with + puncturation_chars = [",", ".", ":", ";", "?", "!", "-", "—", "–", "…"] + + # If string is empty after stripping, return False + if not s: + return False + + # Get the last character + last_char = s[-1] + + # Return True if the last character is punctuation, otherwise False + return last_char in puncturation_chars + + +def add_period_if_needed(text: str) -> str: + """ + Add a period at the end of the text if it does not already end with one. + """ + if not ends_with_punctuation(text): + text += "." + return text.strip() + + +def capitalize_self_i(text): + # Replace standalone lowercase "i" with "I" + # Handles "i", "i.", "i?", "i'll", "i'm", etc. + return re.sub(r'\b(i)(?=[\s.,!?;:\'\"-]|$)', r'I', text) + + +def add_space_after_punctuation(text): + # Add a space after punctuation if it's not already followed by one or by the end of the string + return re.sub(r'([,\.?;:])(?=\S)', r'\1 ', text) + + +def add_auto_capitalization(text): + if text.lower() != text: + # If the text is not all lowercase, we assume it has some capitalization + return text + # Remove space before punctuation (.,!?;:) + text = re.sub(r'\s+([.,!?;:])', r'\1', text) + + # Capitalize the first letter of each sentence + def capitalize_sentences(match): + return match.group(1) + match.group(2).upper() + + # Ensure first character is capitalized + text = text.strip() + if text: + text = text[0].upper() + text[1:] + + text = capitalize_self_i(text) + text = add_space_after_punctuation(text) + # Capitalize after sentence-ending punctuation followed by space(s) + text = re.sub(r'([.!?]\s+)([a-z])', capitalize_sentences, text) + return text + + +def main(args): + text_field = args.text_field manifest_files = Path(args.input_manifest) if manifest_files.is_dir(): manifest_files = list(manifest_files.glob(args.pattern)) @@ -264,6 +338,7 @@ def main(args): else: postfix += "_lc" if args.lowercase else "" postfix += "_np" if args.remove_punc else "" + postfix += "_aPC" if args.auto_pc else "" output_manifest = manifest_file.with_name(f"{manifest_file.stem}{postfix}{manifest_file.suffix}") @@ -280,8 +355,8 @@ def main(args): manifest = read_manifest(str(manifest_file)) for i, item in enumerate(manifest): - text = str(item["text"]) - manifest[i]["origin_text"] = text + text = str(item[text_field]) + manifest[i]["original_text"] = text if args.normalize: text = text_normalizer(text) if args.replace_numbers: @@ -292,6 +367,8 @@ def main(args): if args.remove_punc: text = text.translate(str.maketrans("", "", punctuations)) text = drop_punctuations(text) + if args.auto_pc: + text = add_auto_capitalization(text) manifest[i]["text"] = clean_label(text) write_manifest(str(output_manifest), manifest) From dd99cf45e86c32f282b2b2bfb29e23dd10ca960a Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 6 Jun 2025 20:45:06 -0400 Subject: [PATCH 058/107] update Signed-off-by: stevehuang52 --- ...ormer_transducer_bpe_streaming_augval.yaml | 320 ++++++++++++++++++ .../asr/data/audio_to_eou_label_lhotse.py | 3 + 2 files changed, 323 insertions(+) create mode 100644 
examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_augval.yaml diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_augval.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_augval.yaml new file mode 100644 index 000000000000..d539672a0a53 --- /dev/null +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_augval.yaml @@ -0,0 +1,320 @@ +# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M) with sub-word encoding. + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +name: "FastConformer-Transducer-BPE-Streaming-EOU" + +model: + token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] + token_init_weight_value: null # only applicable when token_init_method='constant' + token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' + + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + random_padding: + prob: 0.99 + min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds + max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds + max_total_duration: 40.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' + normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' + + augmentor: + white_noise: + prob: 0.9 + min_level: -90 + max_level: -46 + gain: + prob: 0.5 + min_gain_dbfs: -10.0 + max_gain_dbfs: 10.0 + noise: + prob: 0.6 + manifest_path: ??? + min_snr_db: 0 + max_snr_db: 20 + max_gain_db: 300.0 + + validation_ds: + manifest_filepath: ??? 
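+    # Unlike train_ds above, this validation set has no augmentor and uses fixed
+    # ('constant') padding (see random_padding below), so the padded inputs are deterministic.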
+ tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + random_padding: + prob: 1.0 + min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds + max_pad_duration: 1.0 # maximum duration of pre/post padding in seconds + max_total_duration: 40.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'constant' # distribution of padding duration, 'uniform' or 'normal' or 'constant' + + test_ds: + manifest_filepath: null + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 80 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = 
att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. + # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] + att_context_size: [70, 1] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
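+    # For reference: with the encoder settings above (att_context_size [70, 1],
+    # subsampling_factor 8, window_stride 0.01s), the right look-ahead is
+    # 1 * 8 * 0.01 = 0.08s and the left context is 70 * 8 * 0.01 = 5.6s.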
+ + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + warprnnt_numba_kwargs: + # FastEmit regularization: https://arxiv.org/abs/2010.11148 + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. + + optim: + name: adamw + lr: 5.0 # 1e-4 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing # NoamAnnealing CosineAnnealing + # scheduler config override + d_model: ${model.encoder.d_model} + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: 100000 # computed at runtime if not set + val_check_interval: 1000 # an int for number of iterations + limit_train_batches: ${trainer.val_check_interval} + accelerator: auto + strategy: + _target_: lightning.pytorch.strategies.DDPStrategy + gradient_as_bucket_view: true + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + use_distributed_sampler: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
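+  # resume_if_exists resumes training from the latest checkpoint in the experiment's checkpoint directory, if one exists.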
+ resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 2fc16d36def0..1c221d7bc785 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -419,6 +419,9 @@ def _random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_ta elif self.padding_cfg.pad_distribution == 'normal': total_padding_duration = np.random.normal(self.padding_cfg.normal_mean, self.padding_cfg.normal_std) total_padding_duration = max(min_padding_duration, min(max_padding_duration, total_padding_duration)) + elif self.padding_cfg.pad_distribution == 'constant': + total_padding_duration = 2 * self.padding_cfg.min_pad_duration + min_padding_duration = 0 else: raise ValueError(f"Unknown padding distribution: {self.padding_cfg.pad_distribution}") From e40459a77aeec1b7228c48658124775210fbbdb1 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 6 Jun 2025 20:51:50 -0400 Subject: [PATCH 059/107] update generate eval data Signed-off-by: stevehuang52 --- .../generate_noisy_eval_data.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py index 3a6612e23607..c231b708abc1 100644 --- a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py +++ b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py @@ -162,6 +162,13 @@ def process_manifest(data_cfg, output_dir): output_audio_dir = output_dir flatten_audio_path = False + if data_cfg.random_padding.pad_distribution == "constant": + is_constant_padding = True + pad_dur = data_cfg.random_padding.min_pad_duration + else: + is_constant_padding = False + pad_dur = None + # Load the dataset tokenizer = parsers.make_parser(labels) # dummy tokenizer dataset = LhotseSpeechToTextBpeEOUDataset(cfg=data_cfg, tokenizer=tokenizer, return_cuts=True) @@ -207,6 +214,15 @@ def process_manifest(data_cfg, output_dir): manifest_item["offset"] = 0 manifest_item["duration"] = audio.shape[0] / dataset.sample_rate + if is_constant_padding: + # Adjust the sou_time and eou_time for constant padding, if they exist + if 'sou_time' in manifest_item and 'eou_time' in manifest_item: + if not isinstance(manifest_item['sou_time'], list): + manifest_item['sou_time'] = manifest_item['sou_time'] + pad_dur + manifest_item['eou_time'] = manifest_item['eou_time'] + pad_dur + else: + manifest_item['sou_time'] = [x + pad_dur for x in manifest_item['sou_time']] + manifest_item['eou_time'] = [x + pad_dur for x in manifest_item['eou_time']] manifest.append(manifest_item) # Write the output manifest From 97c17f12b8abb5d56e90f61f6aac246131312583 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Sun, 8 Jun 2025 19:56:02 -0400 Subject: [PATCH 060/107] update eou val Signed-off-by: stevehuang52 --- ...ormer_transducer_bpe_streaming_augval.yaml | 12 +++++++ .../asr/data/audio_to_eou_label_lhotse.py | 11 +++++-- scripts/asr_end_of_utterance/conf/data.yaml | 14 ++++---- .../generate_noisy_eval_data.py | 32 ++++++++++--------- 4 files changed, 44 insertions(+), 25 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_augval.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_augval.yaml index d539672a0a53..17dc9387c5a4 100644 --- 
a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_augval.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_augval.yaml @@ -90,6 +90,8 @@ model: max_pad_duration: 1.0 # maximum duration of pre/post padding in seconds max_total_duration: 40.0 # maximum total duration of the padded audio in seconds pad_distribution: 'constant' # distribution of padding duration, 'uniform' or 'normal' or 'constant' + pre_pad_duration: 0.2 + post_pad_duration: 3.0 test_ds: manifest_filepath: null @@ -109,6 +111,16 @@ model: bucket_buffer_size: 10000 shuffle_buffer_size: 10000 + random_padding: + prob: 1.0 + min_pad_duration: 0.0 # minimum duration of pre/post padding in seconds + max_pad_duration: 3.0 # maximum duration of pre/post padding in seconds + max_total_duration: 40.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'constant' # distribution of padding duration, 'uniform' or 'normal' or 'constant' + pre_pad_duration: 0.2 + post_pad_duration: 3.0 + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py # We recommend to use vocab size of 1024 with SPE Unigram for most languages tokenizer: diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 1c221d7bc785..8d16657072d5 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -414,18 +414,23 @@ def _random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_ta else: min_padding_duration = 2 * self.padding_cfg.min_pad_duration + pre_padding_duration = None + post_padding_duration = None + if self.padding_cfg.pad_distribution == 'uniform': total_padding_duration = np.random.uniform(min_padding_duration, max_padding_duration) elif self.padding_cfg.pad_distribution == 'normal': total_padding_duration = np.random.normal(self.padding_cfg.normal_mean, self.padding_cfg.normal_std) total_padding_duration = max(min_padding_duration, min(max_padding_duration, total_padding_duration)) elif self.padding_cfg.pad_distribution == 'constant': - total_padding_duration = 2 * self.padding_cfg.min_pad_duration - min_padding_duration = 0 + pass else: raise ValueError(f"Unknown padding distribution: {self.padding_cfg.pad_distribution}") - if min_padding_duration == 0: + if self.padding_cfg.pad_distribution == 'constant': + pre_padding_duration = self.padding_cfg.pre_pad_duration + post_padding_duration = self.padding_cfg.post_pad_duration + elif min_padding_duration == 0: pre_padding_duration = total_padding_duration / 2 post_padding_duration = total_padding_duration / 2 else: diff --git a/scripts/asr_end_of_utterance/conf/data.yaml b/scripts/asr_end_of_utterance/conf/data.yaml index 9a9c522fad11..f7b48251b176 100644 --- a/scripts/asr_end_of_utterance/conf/data.yaml +++ b/scripts/asr_end_of_utterance/conf/data.yaml @@ -21,13 +21,13 @@ data: shuffle_buffer_size: 10000 random_padding: - prob: 1.0 - min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds - max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds - max_total_duration: 30.0 # maximum total duration of the padded audio in seconds - pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' - normal_mean: 0.5 # mean of normal distribution for padding duration - normal_std: 2.0 # standard deviation of normal distribution for padding duration + prob: 1.0 + min_pad_duration: 
0.0 # minimum duration of pre/post padding in seconds + max_pad_duration: 3.0 # maximum duration of pre/post padding in seconds + max_total_duration: 40.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'constant' # distribution of padding duration, 'uniform' or 'normal' or 'constant' + pre_pad_duration: 0.2 + post_pad_duration: 3.0 augmentor: white_noise: diff --git a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py index c231b708abc1..85b760ab04b7 100644 --- a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py +++ b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py @@ -48,7 +48,7 @@ import torch import yaml from lhotse.cut import MixedCut -from omegaconf import OmegaConf, open_dict +from omegaconf import ListConfig, OmegaConf, open_dict from tqdm import tqdm from nemo.collections.asr.data.audio_to_eou_label_lhotse import LhotseSpeechToTextBpeEOUDataset @@ -132,14 +132,17 @@ def main(cfg): yaml.dump(config, f) logging.info(f'Config dumped to {output_dir / "config.yaml"}') - input_manifest_file = Path(cfg.data.manifest_filepath) - if input_manifest_file.is_dir(): - pattern = cfg.data.get('pattern', '*.json') - manifest_list = list(input_manifest_file.glob(pattern)) - if not manifest_list: - raise ValueError(f"No files found in {input_manifest_file} matching pattern `{pattern}`") + if isinstance(cfg.data.manifest_filepath, (list, ListConfig)): + manifest_list = [Path(x) for x in cfg.data.manifest_filepath] else: - manifest_list = [Path(x) for x in str(input_manifest_file).split(",")] + input_manifest_file = Path(cfg.data.manifest_filepath) + if input_manifest_file.is_dir(): + pattern = cfg.data.get('pattern', '*.json') + manifest_list = list(input_manifest_file.glob(pattern)) + if not manifest_list: + raise ValueError(f"No files found in {input_manifest_file} matching pattern `{pattern}`") + else: + manifest_list = [Path(x) for x in str(input_manifest_file).split(",")] logging.info(f'Found {len(manifest_list)} manifest files to process...') @@ -164,10 +167,10 @@ def process_manifest(data_cfg, output_dir): if data_cfg.random_padding.pad_distribution == "constant": is_constant_padding = True - pad_dur = data_cfg.random_padding.min_pad_duration + pre_pad_dur = data_cfg.random_padding.pre_pad_duration else: is_constant_padding = False - pad_dur = None + pre_pad_dur = None # Load the dataset tokenizer = parsers.make_parser(labels) # dummy tokenizer @@ -198,7 +201,6 @@ def process_manifest(data_cfg, output_dir): if k == "dataloading_info": continue manifest_item[k] = v - audio = audio_batch[j][: audio_len_batch[j]] audio_file = cut.recording.sources[0].source @@ -218,11 +220,11 @@ def process_manifest(data_cfg, output_dir): # Adjust the sou_time and eou_time for constant padding, if they exist if 'sou_time' in manifest_item and 'eou_time' in manifest_item: if not isinstance(manifest_item['sou_time'], list): - manifest_item['sou_time'] = manifest_item['sou_time'] + pad_dur - manifest_item['eou_time'] = manifest_item['eou_time'] + pad_dur + manifest_item['sou_time'] = manifest_item['sou_time'] + pre_pad_dur + manifest_item['eou_time'] = manifest_item['eou_time'] + pre_pad_dur else: - manifest_item['sou_time'] = [x + pad_dur for x in manifest_item['sou_time']] - manifest_item['eou_time'] = [x + pad_dur for x in manifest_item['eou_time']] + manifest_item['sou_time'] = [x + pre_pad_dur for x in manifest_item['sou_time']] + manifest_item['eou_time'] = [x + pre_pad_dur for x in 
manifest_item['eou_time']] manifest.append(manifest_item) # Write the output manifest From 152f1b5b6eb892254b47ef76c2b1bb2919ef66b5 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 27 Jun 2025 09:44:34 -0400 Subject: [PATCH 061/107] update Signed-off-by: stevehuang52 --- .../asr/data/audio_to_eou_label_lhotse.py | 2 +- .../asr_end_of_utterance/clean_manifest.py | 20 ++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 8d16657072d5..fbe519ff0b0a 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -317,7 +317,7 @@ def _get_frame_labels(self, cut: Cut, num_samples: int): assert len(sou_time) == len( eou_time - ), f"Number of SOU time and EOU time do not match: SOU ({len(sou_time)}) vs EOU ({len(eou_time)})" + ), f"Number of SOU time and EOU time do not match: SOU ({sou_time}) vs EOU ({eou_time})" if cut.has_custom("is_backchannel"): is_backchannel = cut.custom["is_backchannel"] diff --git a/scripts/asr_end_of_utterance/clean_manifest.py b/scripts/asr_end_of_utterance/clean_manifest.py index 4ac8c3b40f37..b922d5a3213c 100644 --- a/scripts/asr_end_of_utterance/clean_manifest.py +++ b/scripts/asr_end_of_utterance/clean_manifest.py @@ -23,9 +23,9 @@ """ - import argparse import re +import unicodedata from pathlib import Path from string import punctuation @@ -318,6 +318,23 @@ def capitalize_sentences(match): return text +def unicode_to_ascii(text: str) -> str: + """ + Converts text with accented or special Latin characters (e.g., ó, ñ, ū, ō) + into their closest ASCII equivalents. + """ + # Normalize the string to NFKD to separate base characters from diacritics + normalized = unicodedata.normalize('NFKD', text) + + # Encode to ASCII bytes, ignoring characters that can't be converted + ascii_bytes = normalized.encode('ascii', 'ignore') + + # Decode back to string + ascii_text = ascii_bytes.decode('ascii') + + return ascii_text + + def main(args): text_field = args.text_field manifest_files = Path(args.input_manifest) @@ -357,6 +374,7 @@ def main(args): for i, item in enumerate(manifest): text = str(item[text_field]) manifest[i]["original_text"] = text + text = unicode_to_ascii(text) if args.normalize: text = text_normalizer(text) if args.replace_numbers: From 6a599343d0d93af5b1d2564dce06272afaea5fca Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 27 Jun 2025 10:26:33 -0400 Subject: [PATCH 062/107] add drop_pnc=true as default for dataloading Signed-off-by: stevehuang52 --- .../asr/data/audio_to_eou_label_lhotse.py | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index fbe519ff0b0a..2838820aeff3 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -13,6 +13,7 @@ # limitations under the License. import math +import unicodedata from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -54,6 +55,39 @@ class AudioToTextEOUBatch: eou_target_lengths: torch.Tensor | None = None +def unicode_to_ascii(text: str) -> str: + """ + Converts text with accented or special Latin characters (e.g., ó, ñ, ū, ō) + into their closest ASCII equivalents. 
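+    For example, "café" becomes "cafe" and "señor" becomes "senor".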
+ """ + # Normalize the string to NFKD to separate base characters from diacritics + normalized = unicodedata.normalize('NFKD', text) + + # Encode to ASCII bytes, ignoring characters that can't be converted + ascii_bytes = normalized.encode('ascii', 'ignore') + + # Decode back to string + ascii_text = ascii_bytes.decode('ascii') + + return ascii_text + + +def drop_pnc(text: str) -> str: + """ + Clean the text by removing invalid characters and converting to lowercase. + + :param text: Input text. + :return: Cleaned text. + """ + valid_chars = "abcdefghijklmnopqrstuvwxyz'" + text = text.lower() + text = unicode_to_ascii(text) + text = text.replace(":", " ") + text = ''.join([c for c in text if c in valid_chars or c.isspace() or c == "'"]) + text = ' '.join(text.split()).strip() + return text + + class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): """ This dataset processes the audio data and the corresponding text data to generate the ASR labels, @@ -136,7 +170,7 @@ def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, return_cuts: bool self.return_cuts = return_cuts self.eou_string = self.cfg.get('eou_string', EOU_STRING) self.eob_string = self.cfg.get('eob_string', EOB_STRING) - + self.drop_pnc = self.cfg.get('drop_pnc', True) if cfg.get('check_tokenizer', True): self._check_special_tokens(tokenizer) @@ -371,6 +405,8 @@ def _get_text_tokens(self, cut: Cut): if not text: # skip empty utterances continue + if self.drop_pnc: + text = drop_pnc(text) if self.add_eou_to_text: eou_string = self.eob_string if is_backchannel[i] else self.eou_string if self.add_sep_before_eou: From 580156d6d2454afa4fb252e30806c0770fcd293b Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 27 Jun 2025 16:34:01 -0400 Subject: [PATCH 063/107] update Signed-off-by: stevehuang52 --- .../asr_end_of_utterance/clean_manifest.py | 241 ++++++++++++++++-- tools/nemo_forced_aligner/align_eou.py | 23 +- 2 files changed, 244 insertions(+), 20 deletions(-) diff --git a/scripts/asr_end_of_utterance/clean_manifest.py b/scripts/asr_end_of_utterance/clean_manifest.py index b922d5a3213c..d6919f8b1d7d 100644 --- a/scripts/asr_end_of_utterance/clean_manifest.py +++ b/scripts/asr_end_of_utterance/clean_manifest.py @@ -24,11 +24,13 @@ """ import argparse +import datetime import re import unicodedata from pathlib import Path from string import punctuation +import dateutil.parser as date_parser from num2words import num2words from whisper_normalizer.english import EnglishTextNormalizer @@ -97,9 +99,173 @@ action="store_true", help="If set, will add auto capitalization and punctuation at the end of the text.", ) +parser.add_argument( + "--format", + default="asr", + choices=["asr", "conv"], + help="Format of the manifest. 
Default is 'asr'.", +) +parser.add_argument( + "--keep_name", + action="store_true", + help="If set, will keep the original name of the manifest file.", +) + +# Spoken representations + +MONTHS = [ + "", + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", +] + +ORDINALS = { + 1: "first", + 2: "second", + 3: "third", + 4: "fourth", + 5: "fifth", + 6: "sixth", + 7: "seventh", + 8: "eighth", + 9: "ninth", + 10: "tenth", + 11: "eleventh", + 12: "twelfth", + 13: "thirteenth", + 14: "fourteenth", + 15: "fifteenth", + 16: "sixteenth", + 17: "seventeenth", + 18: "eighteenth", + 19: "nineteenth", + 20: "twentieth", + 21: "twenty first", + 22: "twenty second", + 23: "twenty third", + 24: "twenty fourth", + 25: "twenty fifth", + 26: "twenty sixth", + 27: "twenty seventh", + 28: "twenty eighth", + 29: "twenty ninth", + 30: "thirtieth", + 31: "thirty first", +} + + +def speak_year(year: int) -> str: + if 2000 <= year <= 2099: + return f"twenty {speak_number(year % 100)}" + elif 1900 <= year <= 1999: + return f"nineteen {speak_number(year % 100)}" + else: + return str(year) + + +def speak_number(n: int) -> str: + num_words = { + 0: "zero", + 1: "one", + 2: "two", + 3: "three", + 4: "four", + 5: "five", + 6: "six", + 7: "seven", + 8: "eight", + 9: "nine", + 10: "ten", + 11: "eleven", + 12: "twelve", + 13: "thirteen", + 14: "fourteen", + 15: "fifteen", + 16: "sixteen", + 17: "seventeen", + 18: "eighteen", + 19: "nineteen", + 20: "twenty", + 30: "thirty", + 40: "forty", + 50: "fifty", + 60: "sixty", + 70: "seventy", + 80: "eighty", + 90: "ninety", + } + if n <= 20: + return num_words[n] + elif n < 100: + tens, ones = divmod(n, 10) + return f"{num_words[tens * 10]} {num_words[ones]}" if ones else num_words[tens * 10] + else: + return str(n) + + +def parse_with_auto_dayfirst(date_str: str): + try: + # Try both ways + parsed_us = date_parser.parse(date_str, dayfirst=False) + parsed_eu = date_parser.parse(date_str, dayfirst=True) + + # If one of the parses clearly makes more sense, return it + if parsed_us.month > 12: + return parsed_eu + if parsed_eu.month > 12: + return parsed_us + + # If day is greater than 12, it's probably day-first + if parsed_us.day > 12 and parsed_eu.day <= 12: + return parsed_eu + elif parsed_eu.day > 12 and parsed_us.day <= 12: + return parsed_us + + # Default fallback (assumes US style) + return parsed_us + except Exception: + return None + + +def date_to_spoken_string(date_str: str) -> str: + parsed = parse_with_auto_dayfirst(date_str) + if not parsed: + return None + + month = MONTHS[parsed.month] + day = ORDINALS[parsed.day] + spoken = f"{month} {day} {speak_year(parsed.year)}" + + return spoken + + +def replace_dates_in_text(text: str) -> str: + # Regex pattern to match common date formats like: + # 5/22, 05/22/2025, 22/05/2025, 2025-05-22 + date_pattern = r'\b(?:\d{1,4}[-/])?\d{1,2}[-/]\d{1,4}\b' + + def replace_match(match): + date_str = match.group(0) + spoken = date_to_spoken_string(date_str) + return spoken if spoken else date_str + + return re.sub(date_pattern, replace_match, text) def convert_to_spoken(text: str) -> str: + + text = replace_dates_in_text(text) # Convert dates to spoken form + # Mapping of metric units to spoken forms unit_map = { "kg": "kilograms", @@ -235,7 +401,7 @@ def clean_label(_str: str) -> str: """ # replace_with_space = [char for char in '/?*\",.:=?_{|}~¨«·»¡¿„…‧‹›≪≫!:;ː→'] replace_with_blank = [char for char in '`¨´‘’“”`ʻ‘’“"‘”'] - replace_with_apos = 
[char for char in '‘’ʻ‘’‘'] + replace_with_apos = [char for char in '‘’ʻ‘’‘'] + ["\u2019"] _str = _str.strip() for i in replace_with_blank: _str = _str.replace(i, "") @@ -335,6 +501,51 @@ def unicode_to_ascii(text: str) -> str: return ascii_text +def clean_text(text: str, args) -> str: + """ + Clean the text based on the provided arguments. + """ + text = unicode_to_ascii(text) + if args.normalize: + text = text_normalizer(text) + if args.replace_numbers: + text = convert_to_spoken(text) + text = replace_numbers_with_words(text) + if args.lowercase: + text = text.lower() + if args.remove_punc: + text = text.translate(str.maketrans("", "", punctuations)) + text = drop_punctuations(text) + if args.auto_pc: + text = add_auto_capitalization(text) + return clean_label(text) + + +def clean_asr_manifest(manifest, text_field, args): + for i, item in enumerate(manifest): + text = str(item[text_field]) + manifest[i]["origin_text"] = text + manifest[i]["text"] = clean_text(text, args) + return manifest + + +def clean_conv_manifest(manifest, text_field, args): + new_manifest = [] + for i, item in enumerate(manifest): + conversations = [] + for turn in item["conversations"]: + conversations.append( + { + "role": turn["role"], + "value": clean_text(turn["value"], args), + "type": turn.get("type", "text"), + } + ) + item["conversations"] = conversations + new_manifest.append(item) + return manifest + + def main(args): text_field = args.text_field manifest_files = Path(args.input_manifest) @@ -367,27 +578,19 @@ def main(args): else: output_dir = Path(args.output) output_dir.mkdir(parents=True, exist_ok=True) - output_manifest = output_dir / output_manifest.name + if args.keep_name: + output_manifest = output_dir / manifest_file.name + else: + output_manifest = output_dir / output_manifest.name manifest = read_manifest(str(manifest_file)) - for i, item in enumerate(manifest): - text = str(item[text_field]) - manifest[i]["original_text"] = text - text = unicode_to_ascii(text) - if args.normalize: - text = text_normalizer(text) - if args.replace_numbers: - text = convert_to_spoken(text) - text = replace_numbers_with_words(text) - if args.lowercase: - text = text.lower() - if args.remove_punc: - text = text.translate(str.maketrans("", "", punctuations)) - text = drop_punctuations(text) - if args.auto_pc: - text = add_auto_capitalization(text) - manifest[i]["text"] = clean_label(text) + if args.format == "asr": + manifest = clean_asr_manifest(manifest, text_field, args) + elif args.format == "conv": + manifest = clean_conv_manifest(manifest, text_field, args) + else: + raise ValueError(f"Unsupported manifest format: {args.format}") write_manifest(str(output_manifest), manifest) print(f"Cleaned manifest saved to {output_manifest}") diff --git a/tools/nemo_forced_aligner/align_eou.py b/tools/nemo_forced_aligner/align_eou.py index 6bc864ff90fa..aef7dbbb5cf2 100644 --- a/tools/nemo_forced_aligner/align_eou.py +++ b/tools/nemo_forced_aligner/align_eou.py @@ -17,6 +17,7 @@ import math import os import shutil +import unicodedata import uuid from dataclasses import dataclass, field, is_dataclass from pathlib import Path @@ -174,6 +175,23 @@ class AlignmentConfig: gpu_idx: int = 0 # current GPU index +def unicode_to_ascii(text: str) -> str: + """ + Converts text with accented or special Latin characters (e.g., ó, ñ, ū, ō) + into their closest ASCII equivalents. 
+ """ + # Normalize the string to NFKD to separate base characters from diacritics + normalized = unicodedata.normalize('NFKD', text) + + # Encode to ASCII bytes, ignoring characters that can't be converted + ascii_bytes = normalized.encode('ascii', 'ignore') + + # Decode back to string + ascii_text = ascii_bytes.decode('ascii') + + return ascii_text + + def drop_pnc(text): """ Clean the text by removing invalid characters and converting to lowercase. @@ -183,7 +201,10 @@ def drop_pnc(text): """ valid_chars = "abcdefghijklmnopqrstuvwxyz'" text = text.lower() - return ''.join([c for c in text if c in valid_chars or c.isspace() or c == "'"]) + text = unicode_to_ascii(text) + text = text.replace(":", " ") + text = ''.join([c for c in text if c in valid_chars or c.isspace() or c == "'"]) + return " ".join(text.split()).strip() def clean_text(manifest: List[dict]): From 2d1dce5e4d85b756bea7a1b8a7c547eb9fda8d30 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 27 Jun 2025 22:00:15 -0400 Subject: [PATCH 064/107] update cfg Signed-off-by: stevehuang52 --- .../asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 3244dbdaf69a..5a6b518bd3ff 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -302,6 +302,7 @@ exp_manager: monitor: "val_wer" mode: "min" save_top_k: 5 + filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}' always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. resume_if_exists: false From 3d9ae6634eebfbbba0bda9b910d4c3c1a298fb07 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Sat, 28 Jun 2025 11:11:15 -0400 Subject: [PATCH 065/107] update Signed-off-by: stevehuang52 --- nemo/collections/asr/models/asr_eou_models.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index ef58f46b670b..e3b948c68a69 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -599,6 +599,13 @@ def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str = "val"): assert mode in ['val', 'test'], f"Invalid mode: {mode}. Must be 'val' or 'test'." + + if not outputs: + logging.warning( + f"No outputs received for {mode} dataloader {dataloader_idx}. Skipping epoch end processing." 
+ ) + return {} + self._maybe_save_predictions(outputs, mode=mode, dataloader_idx=dataloader_idx) # Aggregate WER metrics From ace403b1ded35647efc7c9eba9a40031d051cf33 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 30 Jun 2025 09:35:21 -0400 Subject: [PATCH 066/107] update Signed-off-by: stevehuang52 --- nemo/collections/asr/data/audio_to_eou_label_lhotse.py | 2 +- scripts/asr_end_of_utterance/clean_manifest.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 2838820aeff3..fdb4d1228a9a 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -170,7 +170,7 @@ def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, return_cuts: bool self.return_cuts = return_cuts self.eou_string = self.cfg.get('eou_string', EOU_STRING) self.eob_string = self.cfg.get('eob_string', EOB_STRING) - self.drop_pnc = self.cfg.get('drop_pnc', True) + self.drop_pnc = self.cfg.get('drop_pnc', False) if cfg.get('check_tokenizer', True): self._check_special_tokens(tokenizer) diff --git a/scripts/asr_end_of_utterance/clean_manifest.py b/scripts/asr_end_of_utterance/clean_manifest.py index d6919f8b1d7d..e263ca108464 100644 --- a/scripts/asr_end_of_utterance/clean_manifest.py +++ b/scripts/asr_end_of_utterance/clean_manifest.py @@ -357,6 +357,9 @@ def convert_number(match): if num_str.startswith('$'): is_dollar = True num_str = num_str[1:] + elif num_str.endswith('$'): + is_dollar = True + num_str = num_str[:-1] # Remove commas num_str = num_str.replace(',', '') @@ -377,7 +380,7 @@ def convert_number(match): # Pattern matches: $3,000 or 3,000.45 or 1234 pattern = re.compile(r'\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?|\$?\d+(?:\.\d+)?') result = pattern.sub(convert_number, text) - + result = result.replace("$", " dollars ") # Handle dollar sign separately result = " ".join(result.split()) # Remove extra spaces return result From 98749b9bebc10700c3e731daaa5c6c7814af6431 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 8 Jul 2025 14:07:38 -0400 Subject: [PATCH 067/107] update Signed-off-by: stevehuang52 --- .../conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml | 4 ++-- .../tokenizers/add_special_tokens_to_sentencepiece.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 5a6b518bd3ff..d6fe9215f708 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -115,7 +115,7 @@ model: window_size: 0.025 window_stride: 0.01 window: "hann" - features: 80 + features: 128 n_fft: 512 frame_splicing: 1 dither: 0.00001 @@ -302,7 +302,7 @@ exp_manager: monitor: "val_wer" mode: "min" save_top_k: 5 - filename: '${name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}' + filename: '${exp_manager.name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}' always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
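For reference, a minimal usage sketch of the spoken-date helpers added to `scripts/asr_end_of_utterance/clean_manifest.py` earlier in this series (`MONTHS`, `ORDINALS`, `speak_year`, `parse_with_auto_dayfirst`, `date_to_spoken_string`, `replace_dates_in_text`). The import path is hypothetical (it assumes the script directory is on `PYTHONPATH` and `python-dateutil` is installed); the expected outputs are inferred from the logic shown in the patch, not from running the script.

```python
# Hypothetical usage of the date-normalization helpers added above.
from clean_manifest import date_to_spoken_string, replace_dates_in_text

print(date_to_spoken_string("05/22/2025"))
# -> "May twenty second twenty twenty five"
print(date_to_spoken_string("22/05/2025"))
# -> same spoken form: parse_with_auto_dayfirst() resolves the day-first variant
print(replace_dates_in_text("call me back on 2025-05-22 after lunch"))
# -> "call me back on May twenty second twenty twenty five after lunch"
```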
resume_if_exists: false diff --git a/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py b/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py index f2c95c20bd4a..1f60eaeea51b 100644 --- a/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py +++ b/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py @@ -54,7 +54,7 @@ "--input_file", type=str, required=True, - help="Path to sentencepiece model file", + help="Path to nemo model file, or sentencepiece model file", ) parser.add_argument( "-o", From 9fb4395c15f808595dd817954a9fe17064aa2c7b Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 9 Jul 2025 10:11:47 -0400 Subject: [PATCH 068/107] fix miss rate Signed-off-by: stevehuang52 --- nemo/collections/asr/parts/utils/eou_utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/nemo/collections/asr/parts/utils/eou_utils.py b/nemo/collections/asr/parts/utils/eou_utils.py index 3f48bb2b5822..4afb8d292a2d 100644 --- a/nemo/collections/asr/parts/utils/eou_utils.py +++ b/nemo/collections/asr/parts/utils/eou_utils.py @@ -76,7 +76,7 @@ def evaluate_eou( num_utterances = len(reference) num_predictions = len(prediction) missing = 0 - + earlycut_ids = set() predicted_eou = prediction if threshold is not None and threshold > 0: predicted_eou = [p for p in prediction if p["eou_prob"] > threshold] @@ -112,17 +112,24 @@ def evaluate_eou( # Correctly predicted EOU true_positives += 1 latency.append(p_end - r_end) + if r_idx in earlycut_ids: + # If this reference was already missed due to early cutoff, we do not count it again + earlycut_ids.remove(r_idx) r_idx += 1 elif r_start <= p_end < r_end - collar: # Early cutoff # current predicted EOU is within the current reference utterance false_positives += 1 early_cutoff.append(r_end - p_end) + earlycut_ids.add(r_idx) elif r_end + collar < p_end: # Late EOU # Current predicted EOU is after the current reference ends false_negatives += 1 latency.append(p_end - r_end) + if r_idx in earlycut_ids: + # If this reference was already missed due to early cutoff, we do not count it again + earlycut_ids.remove(r_idx) r_idx += 1 else: # p_end <= r_start @@ -134,6 +141,7 @@ def evaluate_eou( false_negatives += len(reference) - r_idx missing += len(reference) - r_idx + missing -= len(earlycut_ids) # Remove the references that were missed due to early cutoff return EOUResult( latency=latency, early_cutoff=early_cutoff, From 53e8417aa388f364386880fcd890eb1b077a7fee Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 21 Jul 2025 21:13:48 -0400 Subject: [PATCH 069/107] update Signed-off-by: stevehuang52 --- nemo/collections/asr/models/asr_eou_models.py | 11 ++++++++++- nemo/collections/asr/parts/utils/eou_utils.py | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index e3b948c68a69..1392badf5925 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -317,7 +317,10 @@ def _maybe_save_predictions(self, outputs: List[Dict], mode: str = "val", datalo output_file = Path(self.cfg.save_pred_to_file) output_file.parent.mkdir(parents=True, exist_ok=True) - output_file = output_file.with_suffix(f'.{dataloader_idx}.json') + if self._validation_names: + output_file = output_file.with_name(f"{self._validation_names[dataloader_idx]}_{output_file.name}") + else: + 
output_file = output_file.with_suffix(f'.{dataloader_idx}.json') manifest = [] for output in outputs: @@ -327,9 +330,15 @@ def _maybe_save_predictions(self, outputs: List[Dict], mode: str = "val", datalo "audio_filepath": output[f'{mode}_audio_filepath'][i], "eou_text": output[f'{mode}_text_gt'][i], "eou_pred_text": output[f'{mode}_text_pred'][i], + "is_backchannel": bool(str(output[f'{mode}_text_gt'][i]).endswith(EOB_STRING)), } if f"{mode}_text_pred_ctc" in output: item["eou_pred_text_ctc"] = output[f"{mode}_text_pred_ctc"][i] + + eou_metrics = {f"eou_{k}": v for k, v in output[f"{mode}_eou_metrics"][i].to_dict().items()} + eob_metrics = {f"eob_{k}": v for k, v in output[f"{mode}_eob_metrics"][i].to_dict().items()} + item.update(eou_metrics) + item.update(eob_metrics) manifest.append(item) write_manifest(output_file, manifest) logging.info(f"Predictions saved to {output_file}") diff --git a/nemo/collections/asr/parts/utils/eou_utils.py b/nemo/collections/asr/parts/utils/eou_utils.py index 4afb8d292a2d..6099e2a6fcfe 100644 --- a/nemo/collections/asr/parts/utils/eou_utils.py +++ b/nemo/collections/asr/parts/utils/eou_utils.py @@ -29,6 +29,23 @@ class EOUResult: num_predictions: int missing: int + def to_dict(self) -> Dict[str, float]: + """ + Convert the EOUResult dataclass to a dictionary. + Returns: + Dict: A dictionary representation of the EOUResult. + """ + return { + 'latency': self.latency, + 'early_cutoff': self.early_cutoff, + 'true_positives': self.true_positives, + 'false_negatives': self.false_negatives, + 'false_positives': self.false_positives, + 'num_utterances': self.num_utterances, + 'num_predictions': self.num_predictions, + 'missing': self.missing, + } + def flatten_nested_list(nested_list: List[List[float]]) -> List[float]: """ From 0faa56bf6b5ab2b84cdb95f304e087877830d1d4 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 29 Jul 2025 16:59:09 -0400 Subject: [PATCH 070/107] add ignore_eob_label Signed-off-by: stevehuang52 --- nemo/collections/asr/data/audio_to_eou_label_lhotse.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index fdb4d1228a9a..455535c36a28 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -186,6 +186,7 @@ def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, return_cuts: bool self.add_eou_to_text = self.cfg.get('add_eou_to_text', True) self.pad_eou_label_secs = self.cfg.get('pad_eou_label_secs', 0.0) self.padding_cfg = self.cfg.get('random_padding', None) + self.ignore_eob_label = self.cfg.get('ignore_eob_label', False) self.augmentor = None self.len_augmentor = None if self.cfg.get('augmentor', None) is not None: @@ -338,7 +339,7 @@ def _get_frame_labels(self, cut: Cut, num_samples: int): return torch.zeros(hidden_length).long() eou_targets = torch.ones(hidden_length).long() # speech label eou_targets[-1] = EOU_LABEL # by default it's end of utterance - if cut.has_custom("is_backchannel") and cut.custom["is_backchannel"]: + if cut.has_custom("is_backchannel") and cut.custom["is_backchannel"] and not self.ignore_eob_label: eou_targets[-1] = EOB_LABEL # end of backchannel return eou_targets @@ -373,7 +374,7 @@ def _get_frame_labels(self, cut: Cut, num_samples: int): seg_len = self._audio_len_to_frame_len(int(seg_len_in_secs * self.sample_rate)) eou_targets[sou_idx : sou_idx + seg_len] = SPEECH_LABEL last_idx = min(sou_idx + 
seg_len - 1, hidden_length - 1) - if is_backchannel[i]: + if is_backchannel[i] and not self.ignore_eob_label: eou_targets[last_idx] = EOB_LABEL # end of backchannel else: eou_targets[last_idx] = EOU_LABEL # end of utterance @@ -408,7 +409,7 @@ def _get_text_tokens(self, cut: Cut): if self.drop_pnc: text = drop_pnc(text) if self.add_eou_to_text: - eou_string = self.eob_string if is_backchannel[i] else self.eou_string + eou_string = self.eob_string if is_backchannel[i] and not self.ignore_eob_label else self.eou_string if self.add_sep_before_eou: eou_string = " " + eou_string else: From 59d986e6771b63b9a083795652d36c6a7087d274 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 31 Jul 2025 10:32:21 -0400 Subject: [PATCH 071/107] fix and update Signed-off-by: stevehuang52 --- nemo/collections/asr/models/asr_eou_models.py | 2 +- scripts/asr_end_of_utterance/clean_manifest.py | 4 +++- scripts/asr_end_of_utterance/conf/data.yaml | 8 ++++---- .../asr_end_of_utterance/generate_noisy_eval_data.py | 10 ++++++++-- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 1392badf5925..e762d30ebea2 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -317,7 +317,7 @@ def _maybe_save_predictions(self, outputs: List[Dict], mode: str = "val", datalo output_file = Path(self.cfg.save_pred_to_file) output_file.parent.mkdir(parents=True, exist_ok=True) - if self._validation_names: + if getattr(self, '_validation_names', None): output_file = output_file.with_name(f"{self._validation_names[dataloader_idx]}_{output_file.name}") else: output_file = output_file.with_suffix(f'.{dataloader_idx}.json') diff --git a/scripts/asr_end_of_utterance/clean_manifest.py b/scripts/asr_end_of_utterance/clean_manifest.py index e263ca108464..be34064df8f9 100644 --- a/scripts/asr_end_of_utterance/clean_manifest.py +++ b/scripts/asr_end_of_utterance/clean_manifest.py @@ -40,7 +40,7 @@ text_normalizer = EnglishTextNormalizer() -parser = argparse.ArgumentParser(description="Clean manifest file by droping PnC") +parser = argparse.ArgumentParser(description="Clean manifest file") parser.add_argument( "input_manifest", type=str, @@ -517,6 +517,8 @@ def clean_text(text: str, args) -> str: if args.lowercase: text = text.lower() if args.remove_punc: + text = text.replace("-", " ") + text = text.replace("_", " ") text = text.translate(str.maketrans("", "", punctuations)) text = drop_punctuations(text) if args.auto_pc: diff --git a/scripts/asr_end_of_utterance/conf/data.yaml b/scripts/asr_end_of_utterance/conf/data.yaml index f7b48251b176..c27bf129fa1e 100644 --- a/scripts/asr_end_of_utterance/conf/data.yaml +++ b/scripts/asr_end_of_utterance/conf/data.yaml @@ -31,15 +31,15 @@ data: augmentor: white_noise: - prob: 1.0 - min_level: -80 + prob: 0.0 + min_level: -90 max_level: -40 gain: - prob: 0.5 + prob: 0.0 min_gain_dbfs: -10.0 max_gain_dbfs: 10.0 noise: - prob: 0.99 + prob: 1.0 manifest_path: ??? 
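A brief illustration of why `clean_text` now replaces `-` and `_` with spaces before stripping punctuation. The snippet is self-contained and uses `string.punctuation` as a stand-in for the script's `punctuations` constant, which is assumed here to contain both characters.

```python
# Stripping punctuation without first replacing '-' and '_' glues
# hyphenated words together; the extra replace() calls above avoid this.
import string

text = "real-time end_of_utterance detection"
glued = text.translate(str.maketrans("", "", string.punctuation))
spaced = text.replace("-", " ").replace("_", " ").translate(str.maketrans("", "", string.punctuation))

print(glued)   # realtime endofutterance detection
print(spaced)  # real time end of utterance detection
```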
min_snr_db: 0 max_snr_db: 20 diff --git a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py index 85b760ab04b7..eda952c75db8 100644 --- a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py +++ b/scripts/asr_end_of_utterance/generate_noisy_eval_data.py @@ -42,6 +42,7 @@ from pathlib import Path from shutil import rmtree +import librosa import lightning.pytorch as pl import numpy as np import soundfile as sf @@ -165,7 +166,7 @@ def process_manifest(data_cfg, output_dir): output_audio_dir = output_dir flatten_audio_path = False - if data_cfg.random_padding.pad_distribution == "constant": + if "random_padding" in data_cfg and data_cfg.random_padding.pad_distribution == "constant": is_constant_padding = True pre_pad_dur = data_cfg.random_padding.pre_pad_duration else: @@ -217,7 +218,7 @@ def process_manifest(data_cfg, output_dir): manifest_item["duration"] = audio.shape[0] / dataset.sample_rate if is_constant_padding: - # Adjust the sou_time and eou_time for constant padding, if they exist + # Adjust the sou_time and eou_time for constant padding if 'sou_time' in manifest_item and 'eou_time' in manifest_item: if not isinstance(manifest_item['sou_time'], list): manifest_item['sou_time'] = manifest_item['sou_time'] + pre_pad_dur @@ -225,6 +226,11 @@ def process_manifest(data_cfg, output_dir): else: manifest_item['sou_time'] = [x + pre_pad_dur for x in manifest_item['sou_time']] manifest_item['eou_time'] = [x + pre_pad_dur for x in manifest_item['eou_time']] + else: + # add sou_time and eou_time to the manifest item + manifest_item['sou_time'] = pre_pad_dur + manifest_item['eou_time'] = pre_pad_dur + librosa.get_duration(filename=audio_file) + manifest.append(manifest_item) # Write the output manifest From bf45b3529d34dc73a27eeb9679ccbc312082bb51 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 1 Aug 2025 20:13:40 -0400 Subject: [PATCH 072/107] improve lhotse augmentation Signed-off-by: stevehuang52 --- ...conformer_transducer_bpe_streaming_v2.yaml | 304 ++++++++++++++++++ .../asr/data/audio_to_eou_label_lhotse.py | 157 ++++++++- .../common/data/lhotse/dataloader.py | 8 + 3 files changed, 467 insertions(+), 2 deletions(-) create mode 100644 examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml new file mode 100644 index 000000000000..d6e1f52a0efe --- /dev/null +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml @@ -0,0 +1,304 @@ +# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M) with sub-word encoding. 
+ +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +name: "FastConformer-Transducer-BPE-Streaming-EOU" + +model: + token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] + token_init_weight_value: null # only applicable when token_init_method='constant' + token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' + + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + skip_augment: true + lhotse_eou: true + random_padding: + prob: 0.99 + min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds + max_pad_duration: 3.0 # maximum duration of pre/post padding in seconds + max_total_duration: 40.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' + normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' + + noise_path: ??? + noise_mix_prob: 0.7 + noise_snr: [0, 20.0] + + + validation_ds: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + test_ds: + manifest_filepath: null + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 128 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 17 + d_model: 512 + use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. 
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] + att_context_size: [70, 1] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + + xscaling: true # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
+ + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + warprnnt_numba_kwargs: + # FastEmit regularization: https://arxiv.org/abs/2010.11148 + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. + + optim: + name: adamw + lr: 5.0 # 1e-4 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing # NoamAnnealing CosineAnnealing + # scheduler config override + d_model: ${model.encoder.d_model} + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: 100000 # computed at runtime if not set + val_check_interval: 1000 # an int for number of iterations + limit_train_batches: ${trainer.val_check_interval} + accelerator: auto + strategy: + _target_: lightning.pytorch.strategies.DDPStrategy + gradient_as_bucket_view: true + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + use_distributed_sampler: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + filename: '${exp_manager.name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}' + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
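To make the latency implications of this config concrete, the snippet below applies the look-ahead formula quoted in the encoder comments to the values configured in this file (`att_context_size: [70, 1]`, `subsampling_factor: 8`, `window_stride: 0.01`); the left-context figure follows from the same formula.

```python
# Worked example of the look-ahead formula from the encoder comments above.
window_stride = 0.01        # seconds per feature frame (preprocessor.window_stride)
subsampling_factor = 8      # encoder.subsampling_factor
att_context_size = [70, 1]  # encoder.att_context_size: [left, right] in encoder frames

lookahead_sec = att_context_size[1] * subsampling_factor * window_stride
left_context_sec = att_context_size[0] * subsampling_factor * window_stride

print(f"look-ahead:   {lookahead_sec:.2f} s")   # 0.08 s
print(f"left context: {left_context_sec:.2f} s")  # 5.60 s
```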
+ resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 455535c36a28..9784cfe027ad 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -55,6 +55,19 @@ class AudioToTextEOUBatch: eou_target_lengths: torch.Tensor | None = None +@dataclass +class RandomPaddingConfig: + prob: float = 0.9 # probability of applying padding + min_pad_duration: float = 0.5 # minimum duration of pre/post padding in seconds + max_pad_duration: float = 2.0 # maximum duration of pre/post padding in seconds + max_total_duration: float = 30.0 # maximum total duration of the padded audio in seconds + pad_distribution: str = 'uniform' # distribution of padding duration, 'uniform' or 'normal' or 'constant' + normal_mean: float = 0.5 # mean of normal distribution for padding duration + normal_std: float = 2.0 # standard deviation of normal distribution for padding duration + pre_pad_duration: float = 0.2 # amount of left-padding when pad_distribution='constant' + post_pad_duration: float = 3.0 # amount of right-padding when pad_distribution='constant' + + def unicode_to_ascii(text: str) -> str: """ Converts text with accented or special Latin characters (e.g., ó, ñ, ū, ō) @@ -83,6 +96,7 @@ def drop_pnc(text: str) -> str: text = text.lower() text = unicode_to_ascii(text) text = text.replace(":", " ") + text = text.replace("-", " ") text = ''.join([c for c in text if c in valid_chars or c.isspace() or c == "'"]) text = ' '.join(text.split()).strip() return text @@ -112,9 +126,11 @@ class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds max_pad_duration: 2.0 # maximum duration of pre/post padding in seconds max_total_duration: 30.0 # maximum total duration of the padded audio in seconds - pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' or 'constant' normal_mean: 0.5 # mean of normal distribution for padding duration normal_std: 2.0 # standard deviation of normal distribution for padding duration + pre_pad_duration: 0.2 # amount of left-padding when pad_distribution='constant' + post_pad_duration: 3.0 # amount of right-padding when pad_distribution='constant' ``` Returns: @@ -186,9 +202,13 @@ def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, return_cuts: bool self.add_eou_to_text = self.cfg.get('add_eou_to_text', True) self.pad_eou_label_secs = self.cfg.get('pad_eou_label_secs', 0.0) self.padding_cfg = self.cfg.get('random_padding', None) + if self.padding_cfg is not None: + self.padding_cfg = OmegaConf.to_container(self.padding_cfg, resolve=True) + self.padding_cfg = RandomPaddingConfig(**self.padding_cfg) self.ignore_eob_label = self.cfg.get('ignore_eob_label', False) self.augmentor = None self.len_augmentor = None + self.skip_augment = self.cfg.get("skip_augment", False) if self.cfg.get('augmentor', None) is not None: augmentor = {} len_augmentor = {} @@ -224,7 +244,47 @@ def _check_special_tokens(self, tokenizer: TokenizerSpec): "Please refer to scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py for details." 
) - def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: + def _simple_getitem(self, cuts: CutSet) -> AudioToTextEOUBatch: + audio, audio_lens, cuts = self.load_audio(cuts) + if self.return_cuts: + return audio, audio_lens, cuts + + eou_targets = [] + text_tokens = [] + sample_ids = [] + audio_filepaths = [] + for i in range(len(cuts)): + c = cuts[i] + if isinstance(c, MixedCut): + c = c.first_non_padding_cut + + sample_ids.append(c.id) + audio_filepaths.append(c.recording.sources[0].source) + # Get EOU labels and text tokens + eou_targets_i = self._get_frame_labels(c, audio_lens[i]) + text_tokens_i = self._get_text_tokens(c) + eou_targets.append(eou_targets_i) + text_tokens.append(text_tokens_i) + + eou_target_lens = torch.tensor([t.size(0) for t in eou_targets], dtype=torch.long) + eou_targets = collate_vectors(eou_targets, padding_value=0) + text_token_lens = torch.tensor([t.size(0) for t in text_tokens], dtype=torch.long) + text_tokens = collate_vectors(text_tokens, padding_value=0) + return AudioToTextEOUBatch( + sample_ids=sample_ids, + audio_filepaths=audio_filepaths, + audio_signal=audio, + audio_lengths=audio_lens, + text_tokens=text_tokens, + text_token_lengths=text_token_lens, + eou_targets=eou_targets, + eou_target_lengths=eou_target_lens, + ) + + def __getitem__(self, cuts: CutSet) -> AudioToTextEOUBatch: + if self.skip_augment: + return self._simple_getitem(cuts) + audio, audio_lens, cuts = self.load_audio(cuts) audio_signals = [] audio_lengths = [] @@ -232,6 +292,7 @@ def __getitem__(self, cuts: CutSet) -> Tuple[torch.Tensor, ...]: text_tokens = [] sample_ids = [] audio_filepaths = [] + for i in range(len(cuts)): c = cuts[i] if isinstance(c, MixedCut): @@ -552,3 +613,95 @@ def _maybe_augment_length(self, audio: torch.Tensor, audio_len: torch.Tensor): audio_len = audio.size(0) return audio, audio_len + + +def lhotse_asr_eou_cut_random_pad_transform(config: DictConfig, cut: Cut): + """ + perform random padding to data + """ + padding_cfg = config.get('random_padding', None) + if padding_cfg is not None: + padding_cfg = OmegaConf.to_container(padding_cfg, resolve=True) + padding_cfg = RandomPaddingConfig(**padding_cfg) + p = np.random.rand() + if not padding_cfg or p > padding_cfg.prob: + # do nothing + return cut + + duration = cut.duration + # if already longer than the maximum duration, return the original audio + if duration >= padding_cfg.max_total_duration: + return cut + + if isinstance(cut, MixedCut): + cut = cut.first_non_padding_cut + sou_time = cut.custom.get("sou_time", None) + if sou_time is None: + sou_time = [float(cut.start)] + elif not isinstance(sou_time, list): + sou_time = [sou_time] + + eou_time = cut.custom.get("eou_time", None) + if eou_time is None: + eou_time = [float(cut.start) + duration] + elif not isinstance(eou_time, list): + eou_time = [eou_time] + + cut.custom["origin_sou_time"] = sou_time + cut.custom["origin_eou_time"] = eou_time + + max_padding_duration = max(0, padding_cfg.max_total_duration - duration) + if max_padding_duration <= 2 * padding_cfg.min_pad_duration: + min_padding_duration = 0 + else: + min_padding_duration = 2 * padding_cfg.min_pad_duration + + pre_padding_duration = None + post_padding_duration = None + + if padding_cfg.pad_distribution == 'uniform': + total_padding_duration = np.random.uniform(min_padding_duration, max_padding_duration) + elif padding_cfg.pad_distribution == 'normal': + total_padding_duration = np.random.normal(padding_cfg.normal_mean, padding_cfg.normal_std) + total_padding_duration = 
max(min_padding_duration, min(max_padding_duration, total_padding_duration)) + elif padding_cfg.pad_distribution == 'constant': + pass + else: + raise ValueError( + f"Unknown padding distribution: {padding_cfg.pad_distribution}, choices in ['uniform', 'normal', 'constant]" + ) + + if padding_cfg.pad_distribution == 'constant': + pre_padding_duration = padding_cfg.pre_pad_duration + post_padding_duration = padding_cfg.post_pad_duration + elif min_padding_duration == 0: + pre_padding_duration = total_padding_duration / 2 + post_padding_duration = total_padding_duration / 2 + else: + pre_padding_duration = np.random.uniform(min_padding_duration, total_padding_duration - min_padding_duration) + post_padding_duration = total_padding_duration - pre_padding_duration + + if padding_cfg.max_pad_duration is not None: + pre_padding_duration = min(pre_padding_duration, padding_cfg.max_pad_duration) + post_padding_duration = min(post_padding_duration, padding_cfg.max_pad_duration) + + sou_time = [t + pre_padding_duration for t in sou_time] + eou_time = [t + pre_padding_duration for t in sou_time] + + cut_left_padded = cut.pad(duration=pre_padding_duration + duration, direction="left", preserve_id=True) + cut_both_padded = cut_left_padded.pad( + duration=cut_left_padded.duration + post_padding_duration, direction="right", preserve_id=True + ) + + cut_both_padded.first_non_padding_cut.custom["sou_time"] = sou_time + cut_both_padded.first_non_padding_cut.custom["eou_time"] = eou_time + + return cut_both_padded + + +class LhotseASREOURandomPadding: + def __init__(self, cfg: DictConfig) -> None: + self.cfg = cfg + + def __call__(self, cuts: CutSet) -> CutSet: + return CutSet.from_cuts(lhotse_asr_eou_cut_random_pad_transform(config=self.cfg, cut=cut) for cut in cuts) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index a9d9cd7aa81d..9d38ba41abbd 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -38,6 +38,7 @@ from lhotse.utils import fastcopy, fix_random_seed from omegaconf import DictConfig, OmegaConf +from nemo.collections.asr.data.audio_to_eou_label_lhotse import lhotse_asr_eou_cut_random_pad_transform from nemo.collections.common.data.lhotse.cutset import ( IncompleteConfigError, guess_parse_cutset, @@ -203,6 +204,9 @@ class LhotseDataLoadingConfig: force_map_dataset: bool = False force_iterable_dataset: bool = False + # 6. EOU related options. + random_padding: Any | None = None + def determine_use_iterable_dataset(use_iterable_dataset: bool, config: DictConfig) -> bool: """Determine whether to use iterable dataset for a given configuration.""" @@ -497,6 +501,7 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No # 2.a. Noise mixing. if config.noise_path is not None: noise = guess_parse_cutset(config.noise_path) + noise = noise.resample(config.sample_rate) cuts = cuts.mix( cuts=noise, snr=tuple(config.noise_snr), @@ -544,6 +549,9 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No cuts = cuts.filter(TokenPerSecondFilter(config.min_tps, config.max_tps)) cuts = cuts.filter(TokenPerTokenFilter(config.min_tpt, config.max_tpt)) + if config.get("random_padding", None) is not None: + cuts = cuts.map(partial(lhotse_asr_eou_cut_random_pad_transform, config)) + # Select the strategy customizing Lhotse sampler behaviour. # Provides support for dynamic batch sizes, multimodal dataloading, 2D bucketing, etc. 
bucket_duration_bins = determine_bucket_duration_bins(config) From 6b5a9b366a1691976fa3efc490eb79f94cc89398 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 1 Aug 2025 20:14:16 -0400 Subject: [PATCH 073/107] update cfg Signed-off-by: stevehuang52 --- .../conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml index d6e1f52a0efe..4aadd8cfd757 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml @@ -41,7 +41,6 @@ model: bucket_buffer_size: 10000 shuffle_buffer_size: 10000 skip_augment: true - lhotse_eou: true random_padding: prob: 0.99 min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds From af9756e2e744fa38c713c3ad0a713f189356f529 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 1 Aug 2025 20:18:15 -0400 Subject: [PATCH 074/107] update cfg Signed-off-by: stevehuang52 --- .../conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml index 4aadd8cfd757..e7d0c419be38 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml @@ -44,7 +44,7 @@ model: random_padding: prob: 0.99 min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds - max_pad_duration: 3.0 # maximum duration of pre/post padding in seconds + max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds max_total_duration: 40.0 # maximum total duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' From d1c9b8d34b25589f251d795ee9a8039bcfb735c6 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Sun, 3 Aug 2025 09:40:27 -0400 Subject: [PATCH 075/107] update Signed-off-by: stevehuang52 --- nemo/collections/common/data/lhotse/dataloader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 9d38ba41abbd..f915f7353122 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -498,6 +498,9 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No cuts = cuts.map(partial(tokenize, tokenizer=tokenizer), apply_fn=None) # 2. Optional augmentations. + if config.get("random_padding", None) is not None: + cuts = cuts.map(partial(lhotse_asr_eou_cut_random_pad_transform, config)) + # 2.a. Noise mixing. 
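For readers following the augmentation pipeline, the sketch below distills the uniform branch of `lhotse_asr_eou_cut_random_pad_transform` into a standalone function. It is a simplified restatement rather than the exact implementation; the defaults mirror the v2 config (`min_pad_duration: 1.0`, `max_pad_duration: 10.0`, `max_total_duration: 40.0`).

```python
# Standalone sketch of the uniform padding-duration sampling performed by the
# transform above (simplified restatement, not the exact implementation).
import numpy as np


def sample_uniform_padding(duration, min_pad=1.0, max_pad=10.0, max_total=40.0):
    max_padding = max(0.0, max_total - duration)
    min_padding = 0.0 if max_padding <= 2 * min_pad else 2 * min_pad
    total = np.random.uniform(min_padding, max_padding)
    if min_padding == 0.0:
        pre = post = total / 2
    else:
        pre = np.random.uniform(min_padding, total - min_padding)
        post = total - pre
    # each side is additionally capped by max_pad_duration, as in the transform
    return min(pre, max_pad), min(post, max_pad)


np.random.seed(0)
pre, post = sample_uniform_padding(duration=12.0)
print(f"12.0 s cut -> +{pre:.2f} s front, +{post:.2f} s back, {12.0 + pre + post:.2f} s total")
```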
if config.noise_path is not None: noise = guess_parse_cutset(config.noise_path) @@ -549,9 +552,6 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No cuts = cuts.filter(TokenPerSecondFilter(config.min_tps, config.max_tps)) cuts = cuts.filter(TokenPerTokenFilter(config.min_tpt, config.max_tpt)) - if config.get("random_padding", None) is not None: - cuts = cuts.map(partial(lhotse_asr_eou_cut_random_pad_transform, config)) - # Select the strategy customizing Lhotse sampler behaviour. # Provides support for dynamic batch sizes, multimodal dataloading, 2D bucketing, etc. bucket_duration_bins = determine_bucket_duration_bins(config) From 69d79c207ca5cd40ccbcf2d22975f43022366314 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Sun, 3 Aug 2025 09:44:32 -0400 Subject: [PATCH 076/107] update Signed-off-by: stevehuang52 --- nemo/collections/common/data/lhotse/dataloader.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index f915f7353122..0007abf8aa0d 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -498,8 +498,13 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No cuts = cuts.map(partial(tokenize, tokenizer=tokenizer), apply_fn=None) # 2. Optional augmentations. + import time + + t0 = time.time() if config.get("random_padding", None) is not None: cuts = cuts.map(partial(lhotse_asr_eou_cut_random_pad_transform, config)) + t1 = time.time() + logging.info(f"Random padding time: {t1 - t0} seconds") # 2.a. Noise mixing. if config.noise_path is not None: From f68e8edc2614ea79759397793151baa88f9a653a Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Sun, 3 Aug 2025 22:26:34 -0400 Subject: [PATCH 077/107] add debug info Signed-off-by: stevehuang52 --- ...conformer_transducer_bpe_streaming_v2.yaml | 4 +-- .../asr/data/audio_to_eou_label_lhotse.py | 28 ++++++++++++++----- .../common/data/lhotse/dataloader.py | 14 ++++++++-- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml index e7d0c419be38..7b21b67c2538 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml @@ -42,7 +42,7 @@ model: shuffle_buffer_size: 10000 skip_augment: true random_padding: - prob: 0.99 + prob: 1.0 min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds max_total_duration: 40.0 # maximum total duration of the padded audio in seconds @@ -51,7 +51,7 @@ model: normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' noise_path: ??? 
- noise_mix_prob: 0.7 + noise_mix_prob: 1.0 noise_snr: [0, 20.0] diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 9784cfe027ad..58b38bfc941b 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -22,6 +22,7 @@ from lhotse.cut import Cut, CutSet, MixedCut from lhotse.dataset import AudioSamples from lhotse.dataset.collation import collate_vectors +from lhotse.lazy import Dillable, LazyIteratorChain from omegaconf import DictConfig, OmegaConf from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations @@ -255,6 +256,7 @@ def _simple_getitem(self, cuts: CutSet) -> AudioToTextEOUBatch: audio_filepaths = [] for i in range(len(cuts)): c = cuts[i] + if isinstance(c, MixedCut): c = c.first_non_padding_cut @@ -619,10 +621,8 @@ def lhotse_asr_eou_cut_random_pad_transform(config: DictConfig, cut: Cut): """ perform random padding to data """ - padding_cfg = config.get('random_padding', None) - if padding_cfg is not None: - padding_cfg = OmegaConf.to_container(padding_cfg, resolve=True) - padding_cfg = RandomPaddingConfig(**padding_cfg) + padding_cfg = OmegaConf.to_container(config, resolve=True) + padding_cfg = RandomPaddingConfig(**padding_cfg) p = np.random.rand() if not padding_cfg or p > padding_cfg.prob: # do nothing @@ -699,9 +699,23 @@ def lhotse_asr_eou_cut_random_pad_transform(config: DictConfig, cut: Cut): return cut_both_padded -class LhotseASREOURandomPadding: - def __init__(self, cfg: DictConfig) -> None: +class LazyLhotseEOURandomPadding(Dillable): + def __init__(self, cuts: CutSet, cfg: DictConfig) -> None: + self.source = cuts self.cfg = cfg + def __iter__(self): + for cut in self.source: + yield lhotse_asr_eou_cut_random_pad_transform(config=self.cfg, cut=cut) + + def __len__(self): + return len(self.source) + + def __add__(self, other) -> "LazyIteratorChain": + return LazyIteratorChain(self, other) + + +class LhotseEOURandomPadding(RandomPaddingConfig): def __call__(self, cuts: CutSet) -> CutSet: - return CutSet.from_cuts(lhotse_asr_eou_cut_random_pad_transform(config=self.cfg, cut=cut) for cut in cuts) + config = OmegaConf.create(self.__dict__) + return CutSet(LazyLhotseEOURandomPadding(cuts, config)) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 0007abf8aa0d..327b88c4e440 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -38,7 +38,10 @@ from lhotse.utils import fastcopy, fix_random_seed from omegaconf import DictConfig, OmegaConf -from nemo.collections.asr.data.audio_to_eou_label_lhotse import lhotse_asr_eou_cut_random_pad_transform +from nemo.collections.asr.data.audio_to_eou_label_lhotse import ( + LhotseEOURandomPadding, + lhotse_asr_eou_cut_random_pad_transform, +) from nemo.collections.common.data.lhotse.cutset import ( IncompleteConfigError, guess_parse_cutset, @@ -502,10 +505,17 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No t0 = time.time() if config.get("random_padding", None) is not None: - cuts = cuts.map(partial(lhotse_asr_eou_cut_random_pad_transform, config)) + # random_padding_augmentation = LhotseEOURandomPadding(**config.random_padding) + # cuts = random_padding_augmentation(cuts) + cuts = cuts.map( + partial(lhotse_asr_eou_cut_random_pad_transform, config.random_padding), + ) t1 = time.time() 
logging.info(f"Random padding time: {t1 - t0} seconds") + # cuts = cuts.pad(duration=0.5, direction="left", preserve_id=True) + # cuts = cuts.pad(duration=1.5, direction="right", preserve_id=True) + # 2.a. Noise mixing. if config.noise_path is not None: noise = guess_parse_cutset(config.noise_path) From 01a6f7dbc4dc52cc81ee76906695aff4fe90d21b Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 5 Aug 2025 15:51:13 -0400 Subject: [PATCH 078/107] improve data augmentation Signed-off-by: stevehuang52 --- .../asr/data/audio_to_eou_label_lhotse.py | 13 +++++++++++-- nemo/collections/common/data/lhotse/dataloader.py | 13 ++++++------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 58b38bfc941b..e920fdf92640 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -44,6 +44,16 @@ EOU_PROHIBITED_AUGMENTATIONS = ['random_segment'] +def first_supervised_cut(maybe_mixed_cut): + if isinstance(maybe_mixed_cut, MixedCut): + return [ + t.cut + for t in maybe_mixed_cut.tracks + if len(t.cut.supervisions) > 0 and not t.cut.custom.get("is_mixed_noise") + ][0] + return maybe_mixed_cut + + @dataclass class AudioToTextEOUBatch: sample_ids: List | None = None @@ -256,9 +266,8 @@ def _simple_getitem(self, cuts: CutSet) -> AudioToTextEOUBatch: audio_filepaths = [] for i in range(len(cuts)): c = cuts[i] - if isinstance(c, MixedCut): - c = c.first_non_padding_cut + c = first_supervised_cut(c) sample_ids.append(c.id) audio_filepaths.append(c.recording.sources[0].source) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 327b88c4e440..7e2f57a1f8c2 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -501,25 +501,24 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No cuts = cuts.map(partial(tokenize, tokenizer=tokenizer), apply_fn=None) # 2. Optional augmentations. - import time - t0 = time.time() if config.get("random_padding", None) is not None: # random_padding_augmentation = LhotseEOURandomPadding(**config.random_padding) # cuts = random_padding_augmentation(cuts) cuts = cuts.map( partial(lhotse_asr_eou_cut_random_pad_transform, config.random_padding), ) - t1 = time.time() - logging.info(f"Random padding time: {t1 - t0} seconds") - - # cuts = cuts.pad(duration=0.5, direction="left", preserve_id=True) - # cuts = cuts.pad(duration=1.5, direction="right", preserve_id=True) # 2.a. Noise mixing. 
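Since noise cuts are now mixed in as extra tracks of a `MixedCut`, the dataset has to recover the original speech cut from the mix. The toy snippet below mimics the selection rule in `first_supervised_cut` with plain stand-in objects (no lhotse required); it only illustrates the logic, not the real cut types.

```python
# Toy illustration of the selection rule in first_supervised_cut() above,
# using SimpleNamespace stand-ins instead of real lhotse MixedCut/MonoCut objects.
from types import SimpleNamespace

noise = SimpleNamespace(supervisions=[], custom={"is_mixed_noise": True})
speech = SimpleNamespace(supervisions=["utt-001"], custom={})
mixed_tracks = [SimpleNamespace(cut=speech), SimpleNamespace(cut=noise)]

picked = [
    t.cut
    for t in mixed_tracks
    if len(t.cut.supervisions) > 0 and not t.cut.custom.get("is_mixed_noise")
][0]
assert picked is speech  # the supervised, non-noise track is returned
```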
if config.noise_path is not None: noise = guess_parse_cutset(config.noise_path) noise = noise.resample(config.sample_rate) + + def mark_as_mixed_in_noise(cut): + cut.is_mixed_noise = True + return cut + + noise = noise.map(mark_as_mixed_in_noise) cuts = cuts.mix( cuts=noise, snr=tuple(config.noise_snr), From 096b855a0aa18fb84614e279d83c851f52dace09 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 6 Aug 2025 13:54:15 -0400 Subject: [PATCH 079/107] update utils Signed-off-by: stevehuang52 --- .../asr_end_of_utterance/clean_manifest.py | 4 +-- .../convert_to_tarred_audio_dataset.py | 29 ++++++++++++++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/scripts/asr_end_of_utterance/clean_manifest.py b/scripts/asr_end_of_utterance/clean_manifest.py index be34064df8f9..3b2bbd5e82cd 100644 --- a/scripts/asr_end_of_utterance/clean_manifest.py +++ b/scripts/asr_end_of_utterance/clean_manifest.py @@ -529,8 +529,8 @@ def clean_text(text: str, args) -> str: def clean_asr_manifest(manifest, text_field, args): for i, item in enumerate(manifest): text = str(item[text_field]) - manifest[i]["origin_text"] = text - manifest[i]["text"] = clean_text(text, args) + manifest[i][f"origin_{text_field}"] = text + manifest[i][text_field] = clean_text(text, args) return manifest diff --git a/scripts/speech_recognition/convert_to_tarred_audio_dataset.py b/scripts/speech_recognition/convert_to_tarred_audio_dataset.py index c3b5cef57cbc..20ef6f7bdc93 100644 --- a/scripts/speech_recognition/convert_to_tarred_audio_dataset.py +++ b/scripts/speech_recognition/convert_to_tarred_audio_dataset.py @@ -614,15 +614,42 @@ def create_concatenated_dataset( metadata_yaml = OmegaConf.structured(metadata) OmegaConf.save(metadata_yaml, new_metadata_path, resolve=True) - def _read_manifest(self, manifest_path: str, config: ASRTarredDatasetConfig): + def _read_manifest(self, manifest_path: str | list, config: ASRTarredDatasetConfig): """Read and filters data from the manifest""" + entries = [] + total_duration = 0.0 + filtered_entries = [] + filtered_duration = 0.0 + + if isinstance(manifest_path, str): + manifest_paths = manifest_path.split(",") + else: + manifest_paths = manifest_path + + print(f"Found {len(manifest_paths)} manifest files to be processed") + for manifest_file in manifest_paths: + entries_i, total_dur_i, filtered_ent_i, filtered_dur_i = self._read_single_manifest( + str(manifest_file), config + ) + entries.extend(entries_i) + total_duration += total_dur_i + filtered_entries.extend(filtered_ent_i) + filtered_duration += filtered_dur_i + + return entries, total_duration, filtered_entries, filtered_duration + + def _read_single_manifest(self, manifest_path: str, config: ASRTarredDatasetConfig): # Read the existing manifest entries = [] total_duration = 0.0 filtered_entries = [] filtered_duration = 0.0 + print(f"Reading manifest: {manifest_path}") with open(manifest_path, 'r', encoding='utf-8') as m: for line in m: + line = line.strip() + if not line: + continue entry = json.loads(line) audio_key = "audio_filepath" if "audio_filepath" in entry else "audio_file" if audio_key not in entry: From a96eede287509cba455d01284572d67412bf878b Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 7 Aug 2025 10:31:44 -0400 Subject: [PATCH 080/107] update Signed-off-by: stevehuang52 --- examples/asr/transcribe_speech_distributed.py | 268 ++++++++++++++++++ .../asr_end_of_utterance/clean_manifest.py | 20 ++ 2 files changed, 288 insertions(+) create mode 100644 examples/asr/transcribe_speech_distributed.py diff 
--git a/examples/asr/transcribe_speech_distributed.py b/examples/asr/transcribe_speech_distributed.py new file mode 100644 index 000000000000..5d4a65ff85b4 --- /dev/null +++ b/examples/asr/transcribe_speech_distributed.py @@ -0,0 +1,268 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from copy import deepcopy +from dataclasses import dataclass, field +from math import ceil +from pathlib import Path +from typing import List + +from omegaconf import ListConfig +from tqdm import tqdm +from transcribe_speech import TranscriptionConfig as SingleTranscribeConfig +from transcribe_speech import main as single_transcribe_main + +from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig +from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest +from nemo.core.config import hydra_runner +from nemo.utils import logging + +""" +Transcribe audio file on a single CPU/GPU. Useful for transcription of moderate amounts of audio data. + +# Arguments + model_path: path to .nemo ASR checkpoint + pretrained_name: name of pretrained ASR model (from NGC registry) + audio_dir: path to directory with audio files + dataset_manifest: path to dataset JSON manifest file (in NeMo formats + compute_langs: Bool to request language ID information (if the model supports it) + timestamps: Bool to request greedy time stamp information (if the model supports it) by default None + + (Optionally: You can limit the type of timestamp computations using below overrides) + ctc_decoding.ctc_timestamp_type="all" # (default all, can be [all, char, word, segment]) + rnnt_decoding.rnnt_timestamp_type="all" # (default all, can be [all, char, word, segment]) + + output_filename: Output filename where the transcriptions will be written + batch_size: batch size during inference + presort_manifest: sorts the provided manifest by audio length for faster inference (default: True) + + cuda: Optional int to enable or disable execution of model on certain CUDA device. + allow_mps: Bool to allow using MPS (Apple Silicon M-series GPU) device if available + amp: Bool to decide if Automatic Mixed Precision should be used during inference + audio_type: Str filetype of the audio. Supported = wav, flac, mp3 + + overwrite_transcripts: Bool which when set allows repeated transcriptions to overwrite previous results. + + ctc_decoding: Decoding sub-config for CTC. Refer to documentation for specific values. + rnnt_decoding: Decoding sub-config for RNNT. Refer to documentation for specific values. + + calculate_wer: Bool to decide whether to calculate wer/cer at end of this script + clean_groundtruth_text: Bool to clean groundtruth text + langid: Str used for convert_num_to_words during groundtruth cleaning + use_cer: Bool to use Character Error Rate (CER) or Word Error Rate (WER) + + calculate_rtfx: Bool to calculate the RTFx throughput to transcribe the input dataset. 
+ +# Usage +ASR model can be specified by either "model_path" or "pretrained_name". +Data for transcription can be defined with either "audio_dir" or "dataset_manifest". +append_pred - optional. Allows you to add more than one prediction to an existing .json +pred_name_postfix - optional. The name you want to be written for the current model +Results are returned in a JSON manifest file. + +python transcribe_speech.py \ + model_path=null \ + pretrained_name=null \ + audio_dir="" \ + dataset_manifest="" \ + output_filename="" \ + clean_groundtruth_text=True \ + langid='en' \ + batch_size=32 \ + timestamps=False \ + compute_langs=False \ + cuda=0 \ + amp=True \ + append_pred=False \ + pred_name_postfix="" +""" + + +@dataclass +class ModelChangeConfig: + """ + Sub-config for changes specific to the Conformer Encoder + """ + + conformer: ConformerChangeConfig = field(default_factory=ConformerChangeConfig) + + +@dataclass +class TranscriptionConfig(SingleTranscribeConfig): + """ + Transcription Configuration for audio to text transcription. + """ + + # General configs + pattern: str = "*.json" + output_dir: str = "transcribe_output/" + + # Distributed config + num_nodes: int = 1 + node_idx: int = 0 + num_gpus_per_node: int = 1 + gpu_idx: int = 0 + + # handle long manifest + split_size: int = -1 # -1 means no split + + +def get_unfinished_manifest(manifest_list: List[Path], output_dir: Path): + unfinished = [] + for manifest_file in manifest_list: + output_manifest_file = output_dir / manifest_file.name + if not output_manifest_file.exists(): + unfinished.append(manifest_file) + return sorted(unfinished) + + +def get_manifest_for_current_rank( + manifest_list: List[Path], gpu_id: int = 0, num_gpu: int = 1, node_idx: int = 0, num_node: int = 1 +): + node_manifest_list = [] + assert num_node > 0, f"num_node ({num_node}) must be greater than 0" + assert num_gpu > 0, f"num_gpu ({num_gpu}) must be greater than 0" + assert 0 <= gpu_id < num_gpu, f"gpu_id ({gpu_id}) must be in range [0, {num_gpu})" + assert 0 <= node_idx < num_node, f"node_idx ({node_idx}) must be in range [0, {num_node})" + for i, manifest_file in enumerate(manifest_list): + if (i + node_idx) % num_node == 0: + node_manifest_list.append(manifest_file) + + gpu_manifest_list = [] + for i, manifest_file in enumerate(node_manifest_list): + if (i + gpu_id) % num_gpu == 0: + gpu_manifest_list.append(manifest_file) + return gpu_manifest_list + + +def maybe_split_manifest(manifest_list: List[Path], cfg: TranscriptionConfig) -> List[Path]: + if cfg.split_size is None or cfg.split_size <= 0: + return manifest_list + + all_sharded_manifest_files = [] + sharded_manifest_dir = Path(cfg.output_dir) / "sharded_manifest_todo" + sharded_manifest_dir.mkdir(parents=True, exist_ok=True) + + sharded_manifest_done_dir = Path(cfg.output_dir) / "sharded_manifest_done" + sharded_manifest_done_dir.mkdir(parents=True, exist_ok=True) + cfg.output_dir = sharded_manifest_done_dir + + logging.info(f"Splitting {len(manifest_list)} manifest files by every {cfg.split_size} samples.") + for manifest_file in tqdm(manifest_list, total=len(manifest_list), desc="Splitting manifest files"): + manifest = read_manifest(manifest_file) + + num_chunks = ceil(len(manifest) / cfg.split_size) + for i in range(num_chunks): + chunk_manifest = manifest[i * cfg.split_size : (i + 1) * cfg.split_size] + sharded_manifest_file = sharded_manifest_dir / f"{manifest_file.stem}--tpart_{i}.json" + write_manifest(sharded_manifest_file, chunk_manifest) + 
all_sharded_manifest_files.append(sharded_manifest_file) + + return all_sharded_manifest_files + + +def maybe_merge_manifest(cfg: TranscriptionConfig): + if cfg.split_size is None or cfg.split_size <= 0: + return + + # only merge manifest on the first GPU of the first node + if not cfg.gpu_idx == 0 and cfg.node_idx == 0: + return + + sharded_manifest_dir = Path(cfg.output_dir) + sharded_manifests = list(sharded_manifest_dir.glob("*--tpart_*.json")) + if not sharded_manifests: + logging.info(f"No sharded manifest files found in {sharded_manifest_dir}") + return + + logging.info(f"Merging {len(sharded_manifests)} sharded manifest files.") + manifest_dict = defaultdict(list) + for sharded_manifest in sharded_manifests: + data_name = sharded_manifest.stem.split("--tpart_")[0] + manifest_dict[data_name].append(sharded_manifest) + + output_dir = Path(cfg.output_dir).parent + for data_name, sharded_manifest_list in tqdm( + manifest_dict.items(), total=len(manifest_dict), desc="Merging manifest files" + ): + merged_manifest = [] + for sharded_manifest in sharded_manifest_list: + manifest = read_manifest(sharded_manifest) + merged_manifest.extend(manifest) + output_manifest = output_dir / f"{data_name}.json" + write_manifest(output_manifest, merged_manifest) + logging.info(f"Merged manifest files saved to {output_dir}") + + +@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig) +def run_distributed_transcribe(cfg: TranscriptionConfig): + + logging.info(f"Running distributed transcription with config: {cfg}") + + if isinstance(cfg.dataset_manifest, str) and "," in cfg.dataset_manifest: + manifest_list = cfg.dataset_manifest.split(",") + elif isinstance(cfg.dataset_manifest, (ListConfig, list)): + manifest_list = cfg.dataset_manifest + else: + input_manifest = Path(cfg.dataset_manifest) + if input_manifest.is_dir(): + manifest_list = list(input_manifest.glob(cfg.pattern)) + elif input_manifest.is_file(): + manifest_list = [input_manifest] + else: + raise ValueError(f"Invalid manifest file or directory: {input_manifest}") + + if not manifest_list: + raise ValueError(f"No manifest files found matching pattern: {cfg.pattern} in {input_manifest}") + + manifest_list = maybe_split_manifest(manifest_list, cfg) + + logging.info(f"Found {len(manifest_list)} manifest files.") + + output_dir = Path(cfg.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + unfinished_manifest = get_unfinished_manifest(manifest_list, output_dir=output_dir) + if not unfinished_manifest: + maybe_merge_manifest(cfg) + logging.info("All manifest files have been processed. Exiting.") + return + logging.info(f"Found {len(unfinished_manifest)} unfinished manifest files.") + + manifest_list = get_manifest_for_current_rank( + unfinished_manifest, + gpu_id=cfg.gpu_idx, + num_gpu=cfg.num_gpus_per_node, + node_idx=cfg.node_idx, + num_node=cfg.num_nodes, + ) + if not manifest_list: + logging.info(f"No manifest files found for GPU {cfg.gpu_idx} on node {cfg.node_idx}. 
Exiting.") + return + + logging.info(f"Processing {len(manifest_list)} manifest files with GPU {cfg.gpu_idx} on node {cfg.node_idx}.") + + cfg.cuda = cfg.gpu_idx + for manifest_file in tqdm(manifest_list): + logging.info(f"Processing {manifest_file}...") + output_filename = output_dir / Path(manifest_file).name + curr_cfg = deepcopy(cfg) + curr_cfg.dataset_manifest = str(manifest_file) + curr_cfg.output_filename = str(output_filename) + + single_transcribe_main(curr_cfg) + + +if __name__ == '__main__': + run_distributed_transcribe() # noqa pylint: disable=no-value-for-parameter diff --git a/scripts/asr_end_of_utterance/clean_manifest.py b/scripts/asr_end_of_utterance/clean_manifest.py index 3b2bbd5e82cd..d3cf6372305b 100644 --- a/scripts/asr_end_of_utterance/clean_manifest.py +++ b/scripts/asr_end_of_utterance/clean_manifest.py @@ -381,6 +381,26 @@ def convert_number(match): pattern = re.compile(r'\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?|\$?\d+(?:\.\d+)?') result = pattern.sub(convert_number, text) result = result.replace("$", " dollars ") # Handle dollar sign separately + + def merge_th(text: str) -> str: + # merge th with the preceding digit + candidates = ["four th ", "five th ", "six th ", "seven th ", "eight th ", "nine th "] + for key in candidates: + if key in text: + if "five" in key: + target = "fifth " + else: + target = f"{key.split(' ')[0]}th " + text = text.replace(key, target) + elif text.endswith(key.strip()): + if "five" in key: + target = "fifth" + else: + target = f"{key.split(' ')[0]}th" + text = text.replace(key.strip(), target) + return text + + result = merge_th(result) result = " ".join(result.split()) # Remove extra spaces return result From 632f515960a9572c9f9d7e5ece086bc7693539cd Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 7 Aug 2025 12:02:50 -0400 Subject: [PATCH 081/107] update Signed-off-by: stevehuang52 --- examples/asr/transcribe_speech_distributed.py | 3 ++- nemo/collections/asr/data/audio_to_text_dataset.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/asr/transcribe_speech_distributed.py b/examples/asr/transcribe_speech_distributed.py index 5d4a65ff85b4..3a56d621c3f8 100644 --- a/examples/asr/transcribe_speech_distributed.py +++ b/examples/asr/transcribe_speech_distributed.py @@ -114,6 +114,7 @@ class TranscriptionConfig(SingleTranscribeConfig): node_idx: int = 0 num_gpus_per_node: int = 1 gpu_idx: int = 0 + bind_gpu_to_cuda: bool = False # handle long manifest split_size: int = -1 # -1 means no split @@ -253,7 +254,7 @@ def run_distributed_transcribe(cfg: TranscriptionConfig): logging.info(f"Processing {len(manifest_list)} manifest files with GPU {cfg.gpu_idx} on node {cfg.node_idx}.") - cfg.cuda = cfg.gpu_idx + cfg.cuda = cfg.gpu_idx if cfg.bind_gpu_to_cuda else None for manifest_file in tqdm(manifest_list): logging.info(f"Processing {manifest_file}...") output_filename = output_dir / Path(manifest_file).name diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index ad83d4609126..dd53a14711ee 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -31,6 +31,7 @@ get_hf_audio_to_text_char_dataset, ) from nemo.collections.asr.parts.preprocessing.perturb import AudioAugmentor, process_augmentations +from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common.data.dataset import CodeSwitchedDataset, ConcatDataset from nemo.collections.common.tokenizers import 
TokenizerSpec from nemo.utils import logging @@ -865,6 +866,8 @@ def write_on_batch_end( for sample_id, transcribed_text in prediction: item = {} + if isinstance(transcribed_text, Hypothesis): + transcribed_text = transcribed_text.text if isinstance(sample_id, lhotse.cut.Cut): sample = sample_id if isinstance(sample, lhotse.cut.MixedCut): From c706f75d27b70b2fa80343c049c57998d0802efa Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 8 Aug 2025 17:59:08 -0400 Subject: [PATCH 082/107] update dataloader Signed-off-by: stevehuang52 --- examples/asr/transcribe_speech_distributed.py | 9 ++++++++- nemo/collections/common/data/lhotse/dataloader.py | 10 ++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/examples/asr/transcribe_speech_distributed.py b/examples/asr/transcribe_speech_distributed.py index 3a56d621c3f8..471aca4a79d9 100644 --- a/examples/asr/transcribe_speech_distributed.py +++ b/examples/asr/transcribe_speech_distributed.py @@ -229,7 +229,7 @@ def run_distributed_transcribe(cfg: TranscriptionConfig): raise ValueError(f"No manifest files found matching pattern: {cfg.pattern} in {input_manifest}") manifest_list = maybe_split_manifest(manifest_list, cfg) - + original_manifest_list = list(manifest_list) logging.info(f"Found {len(manifest_list)} manifest files.") output_dir = Path(cfg.output_dir) @@ -264,6 +264,13 @@ def run_distributed_transcribe(cfg: TranscriptionConfig): single_transcribe_main(curr_cfg) + # check if all manifest files have been processed + unfinished_manifest = get_unfinished_manifest(original_manifest_list, output_dir=output_dir) + if not unfinished_manifest: + maybe_merge_manifest(cfg) + logging.info("All manifest files have been processed. Exiting.") + return + if __name__ == '__main__': run_distributed_transcribe() # noqa pylint: disable=no-value-for-parameter diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 7e2f57a1f8c2..b576779f3caa 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -38,10 +38,6 @@ from lhotse.utils import fastcopy, fix_random_seed from omegaconf import DictConfig, OmegaConf -from nemo.collections.asr.data.audio_to_eou_label_lhotse import ( - LhotseEOURandomPadding, - lhotse_asr_eou_cut_random_pad_transform, -) from nemo.collections.common.data.lhotse.cutset import ( IncompleteConfigError, guess_parse_cutset, @@ -503,6 +499,12 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No # 2. Optional augmentations. 
if config.get("random_padding", None) is not None: + # put this here to avoid circular import + from nemo.collections.asr.data.audio_to_eou_label_lhotse import ( + LhotseEOURandomPadding, + lhotse_asr_eou_cut_random_pad_transform, + ) + # random_padding_augmentation = LhotseEOURandomPadding(**config.random_padding) # cuts = random_padding_augmentation(cuts) cuts = cuts.map( From 3bdf00c03df177b16d207bc9b2e7ab1099f4eada Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 8 Aug 2025 18:16:49 -0400 Subject: [PATCH 083/107] update oomptimizer Signed-off-by: stevehuang52 --- scripts/speech_recognition/oomptimizer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/speech_recognition/oomptimizer.py b/scripts/speech_recognition/oomptimizer.py index d46179742ff8..f5c4d4bbf0c5 100755 --- a/scripts/speech_recognition/oomptimizer.py +++ b/scripts/speech_recognition/oomptimizer.py @@ -17,7 +17,6 @@ import math import sys from numbers import Number -from typing import Iterable, Literal import click import lightning.pytorch as pl @@ -375,7 +374,10 @@ def oomptimizer( config_path is None and module_name is None ), "--pretrained-name cannot be used together with --module-name/--config-path" click.echo(f"Intializing ASR model from pretrained checkpoint {pretrained_name}.") - model = ASRModel.from_pretrained(pretrained_name, trainer=trainer).to(device) + if pretrained_name.endswith('.nemo'): + model = ASRModel.from_pretrained(pretrained_name, trainer=trainer).to(device) + else: + model = ASRModel.from_pretrained(pretrained_name, trainer=trainer).to(device) else: assert config_path is not None, "--module-name requires --config-path to be specified as well." assert module_name is not None, "--config-path requires --module-name to be specified as well." 
From da0ab3ed3c90751b7f347949152004eeda07c77e Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 8 Aug 2025 18:20:19 -0400 Subject: [PATCH 084/107] update oomptimizer Signed-off-by: stevehuang52 --- scripts/speech_recognition/oomptimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/speech_recognition/oomptimizer.py b/scripts/speech_recognition/oomptimizer.py index f5c4d4bbf0c5..a6598a56759b 100755 --- a/scripts/speech_recognition/oomptimizer.py +++ b/scripts/speech_recognition/oomptimizer.py @@ -375,7 +375,7 @@ def oomptimizer( ), "--pretrained-name cannot be used together with --module-name/--config-path" click.echo(f"Intializing ASR model from pretrained checkpoint {pretrained_name}.") if pretrained_name.endswith('.nemo'): - model = ASRModel.from_pretrained(pretrained_name, trainer=trainer).to(device) + model = ASRModel.restore_from(pretrained_name, trainer=trainer).to(device) else: model = ASRModel.from_pretrained(pretrained_name, trainer=trainer).to(device) else: From 97417801deead7e2c10094962c5a5cf967426176 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 8 Aug 2025 18:28:07 -0400 Subject: [PATCH 085/107] update eou model Signed-off-by: stevehuang52 --- nemo/collections/asr/models/asr_eou_models.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index e762d30ebea2..ef7e991c42d0 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -46,6 +46,7 @@ from nemo.collections.common.losses import CrossEntropyLoss from nemo.core.classes.common import Serialization from nemo.core.classes.mixins import AccessMixin +from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType from nemo.utils import logging __all__ = ['EncDecRNNTBPEEOUModel', 'EncDecHybridRNNTCTCBPEEOUModel'] @@ -60,6 +61,25 @@ class EOUPrediction: class ASREOUModelMixin: + + @property + def oomptimizer_schema(self) -> dict: + """ + Return a typing schema for optimal batch size calibration for various + sequence lengths using OOMptimizer. 
+ """ + return { + "cls": AudioToTextEOUBatch, + "inputs": [ + {"type": NeuralType(("B", "T"), AudioSignal()), "seq_length": "input", "name": "audio_signal"}, + {"type": NeuralType(("B",), LengthsType()), "seq_length": "input", "name": "audio_lengths"}, + {"type": NeuralType(("B", "T"), LabelsType()), "seq_length": "input", "name": "text_tokens"}, + {"type": NeuralType(("B",), LengthsType()), "seq_length": "input", "name": "text_token_lengths"}, + {"type": NeuralType(("B", "T"), LabelsType()), "seq_length": "input", "name": "eou_targets"}, + {"type": NeuralType(("B",), LengthsType()), "seq_length": "input", "name": "eou_target_lengths"}, + ], + } + def _patch_decoding_cfg(self, cfg: DictConfig): """ Patch the decoding config as needed for EOU computation From f246fd26cdcbf9e5e2efe213fb00489440021434 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 8 Aug 2025 19:25:32 -0400 Subject: [PATCH 086/107] update eou model Signed-off-by: stevehuang52 --- nemo/collections/asr/models/asr_eou_models.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index ef7e991c42d0..ed438713008b 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -62,24 +62,6 @@ class EOUPrediction: class ASREOUModelMixin: - @property - def oomptimizer_schema(self) -> dict: - """ - Return a typing schema for optimal batch size calibration for various - sequence lengths using OOMptimizer. - """ - return { - "cls": AudioToTextEOUBatch, - "inputs": [ - {"type": NeuralType(("B", "T"), AudioSignal()), "seq_length": "input", "name": "audio_signal"}, - {"type": NeuralType(("B",), LengthsType()), "seq_length": "input", "name": "audio_lengths"}, - {"type": NeuralType(("B", "T"), LabelsType()), "seq_length": "input", "name": "text_tokens"}, - {"type": NeuralType(("B",), LengthsType()), "seq_length": "input", "name": "text_token_lengths"}, - {"type": NeuralType(("B", "T"), LabelsType()), "seq_length": "input", "name": "eou_targets"}, - {"type": NeuralType(("B",), LengthsType()), "seq_length": "input", "name": "eou_target_lengths"}, - ], - } - def _patch_decoding_cfg(self, cfg: DictConfig): """ Patch the decoding config as needed for EOU computation @@ -688,6 +670,24 @@ def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='test') + @property + def oomptimizer_schema(self) -> dict: + """ + Return a typing schema for optimal batch size calibration for various + sequence lengths using OOMptimizer. 
+ """ + return { + "cls": AudioToTextEOUBatch, + "inputs": [ + {"type": NeuralType(("B", "T"), AudioSignal()), "seq_length": "input", "name": "audio_signal"}, + {"type": NeuralType(("B",), LengthsType()), "seq_length": "input", "name": "audio_lengths"}, + {"type": NeuralType(("B", "T"), LabelsType()), "seq_length": "input", "name": "text_tokens"}, + {"type": NeuralType(("B",), LengthsType()), "seq_length": "input", "name": "text_token_lengths"}, + {"type": NeuralType(("B", "T"), LabelsType()), "seq_length": "input", "name": "eou_targets"}, + {"type": NeuralType(("B",), LengthsType()), "seq_length": "input", "name": "eou_target_lengths"}, + ], + } + class EncDecHybridRNNTCTCBPEEOUModel(EncDecHybridRNNTCTCBPEModel, ASREOUModelMixin): def __init__(self, cfg: DictConfig, trainer): From 9cf662e507d2c970da21a1de7d2b375380f726d3 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Fri, 8 Aug 2025 19:47:15 -0400 Subject: [PATCH 087/107] update eou model Signed-off-by: stevehuang52 --- nemo/collections/asr/models/asr_eou_models.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index ed438713008b..81b3d8f559fc 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -681,10 +681,20 @@ def oomptimizer_schema(self) -> dict: "inputs": [ {"type": NeuralType(("B", "T"), AudioSignal()), "seq_length": "input", "name": "audio_signal"}, {"type": NeuralType(("B",), LengthsType()), "seq_length": "input", "name": "audio_lengths"}, - {"type": NeuralType(("B", "T"), LabelsType()), "seq_length": "input", "name": "text_tokens"}, - {"type": NeuralType(("B",), LengthsType()), "seq_length": "input", "name": "text_token_lengths"}, - {"type": NeuralType(("B", "T"), LabelsType()), "seq_length": "input", "name": "eou_targets"}, - {"type": NeuralType(("B",), LengthsType()), "seq_length": "input", "name": "eou_target_lengths"}, + { + "type": NeuralType(("B", "T"), LabelsType()), + "seq_length": "output", + "name": "text_tokens", + "vocab_size": self.tokenizer.vocab_size, + }, + {"type": NeuralType(("B",), LengthsType()), "seq_length": "output", "name": "text_token_lengths"}, + { + "type": NeuralType(("B", "T"), LabelsType()), + "seq_length": "output", + "name": "eou_targets", + "vocab_size": 4, + }, + {"type": NeuralType(("B",), LengthsType()), "seq_length": "output", "name": "eou_target_lengths"}, ], } From 109aeb26395e2d8e35d61a65335a06350c0f1eb9 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Sat, 9 Aug 2025 09:39:46 -0400 Subject: [PATCH 088/107] update augmentation Signed-off-by: stevehuang52 --- ...conformer_transducer_bpe_streaming_v2.yaml | 3 ++- .../asr/data/audio_to_eou_label_lhotse.py | 25 +++++++++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml index 7b21b67c2538..09018346017d 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml @@ -49,7 +49,8 @@ model: pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' - + min_post_pad_duration: 3.0 + 
noise_path: ??? noise_mix_prob: 1.0 noise_snr: [0, 20.0] diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index e920fdf92640..29c3b1542199 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -72,6 +72,8 @@ class RandomPaddingConfig: min_pad_duration: float = 0.5 # minimum duration of pre/post padding in seconds max_pad_duration: float = 2.0 # maximum duration of pre/post padding in seconds max_total_duration: float = 30.0 # maximum total duration of the padded audio in seconds + min_pre_pad_duration: float = 0.0 # minimum duration of pre-padding in seconds + min_post_pad_duration: float = 0.0 # minimum duration of post-padding in seconds pad_distribution: str = 'uniform' # distribution of padding duration, 'uniform' or 'normal' or 'constant' normal_mean: float = 0.5 # mean of normal distribution for padding duration normal_std: float = 2.0 # standard deviation of normal distribution for padding duration @@ -521,7 +523,11 @@ def _random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_ta if max_padding_duration <= 2 * self.padding_cfg.min_pad_duration: min_padding_duration = 0 else: - min_padding_duration = 2 * self.padding_cfg.min_pad_duration + min_padding_duration = max( + 2 * self.padding_cfg.min_pad_duration, + self.padding_cfg.min_pre_pad_duration + self.padding_cfg.min_post_pad_duration, + ) + min_padding_duration = min(min_padding_duration, max_padding_duration) pre_padding_duration = None post_padding_duration = None @@ -543,10 +549,10 @@ def _random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_ta pre_padding_duration = total_padding_duration / 2 post_padding_duration = total_padding_duration / 2 else: - pre_padding_duration = np.random.uniform( - min_padding_duration, total_padding_duration - min_padding_duration + post_padding_duration = np.random.uniform( + self.padding_cfg.min_post_pad_duration, total_padding_duration - self.padding_cfg.min_pre_pad_duration ) - post_padding_duration = total_padding_duration - pre_padding_duration + pre_padding_duration = total_padding_duration - post_padding_duration if self.padding_cfg.max_pad_duration is not None: pre_padding_duration = min(pre_padding_duration, self.padding_cfg.max_pad_duration) @@ -663,7 +669,10 @@ def lhotse_asr_eou_cut_random_pad_transform(config: DictConfig, cut: Cut): if max_padding_duration <= 2 * padding_cfg.min_pad_duration: min_padding_duration = 0 else: - min_padding_duration = 2 * padding_cfg.min_pad_duration + min_padding_duration = max( + 2 * padding_cfg.min_pad_duration, padding_cfg.min_pre_pad_duration + padding_cfg.min_post_pad_duration + ) + min_padding_duration = min(min_padding_duration, max_padding_duration) pre_padding_duration = None post_padding_duration = None @@ -687,8 +696,10 @@ def lhotse_asr_eou_cut_random_pad_transform(config: DictConfig, cut: Cut): pre_padding_duration = total_padding_duration / 2 post_padding_duration = total_padding_duration / 2 else: - pre_padding_duration = np.random.uniform(min_padding_duration, total_padding_duration - min_padding_duration) - post_padding_duration = total_padding_duration - pre_padding_duration + post_padding_duration = np.random.uniform( + padding_cfg.min_post_pad_duration, total_padding_duration - padding_cfg.min_pre_pad_duration + ) + pre_padding_duration = total_padding_duration - post_padding_duration if padding_cfg.max_pad_duration is not None: 
pre_padding_duration = min(pre_padding_duration, padding_cfg.max_pad_duration) From ddc4b554b678b243c712cb1b01db4d837b1abeef Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Sat, 9 Aug 2025 09:45:19 -0400 Subject: [PATCH 089/107] update aug Signed-off-by: stevehuang52 --- ...conformer_transducer_bpe_streaming_v2.yaml | 4 +-- .../asr/data/audio_to_eou_label_lhotse.py | 26 ++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml index 09018346017d..19c4ceb50636 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml @@ -43,8 +43,8 @@ model: skip_augment: true random_padding: prob: 1.0 - min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds - max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds + min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds + max_pad_duration: 6.0 # maximum duration of pre/post padding in seconds max_total_duration: 40.0 # maximum total duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 29c3b1542199..19a13fee3c42 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -70,7 +70,7 @@ class AudioToTextEOUBatch: class RandomPaddingConfig: prob: float = 0.9 # probability of applying padding min_pad_duration: float = 0.5 # minimum duration of pre/post padding in seconds - max_pad_duration: float = 2.0 # maximum duration of pre/post padding in seconds + max_pad_duration: float = 5.0 # maximum duration of pre/post padding in seconds max_total_duration: float = 30.0 # maximum total duration of the padded audio in seconds min_pre_pad_duration: float = 0.0 # minimum duration of pre-padding in seconds min_post_pad_duration: float = 0.0 # minimum duration of post-padding in seconds @@ -519,15 +519,18 @@ def _random_pad_audio(self, audio: torch.Tensor, audio_len: torch.Tensor, eou_ta # apply padding audio = audio[:audio_len] + self.padding_cfg.min_pre_pad_duration = max( + self.padding_cfg.min_pre_pad_duration, self.padding_cfg.min_pad_duration + ) + self.padding_cfg.min_post_pad_duration = max( + self.padding_cfg.min_post_pad_duration, self.padding_cfg.min_pad_duration + ) + max_padding_duration = max(0, self.padding_cfg.max_total_duration - duration) - if max_padding_duration <= 2 * self.padding_cfg.min_pad_duration: + if max_padding_duration <= self.padding_cfg.min_pre_pad_duration + self.padding_cfg.min_post_pad_duration: min_padding_duration = 0 else: - min_padding_duration = max( - 2 * self.padding_cfg.min_pad_duration, - self.padding_cfg.min_pre_pad_duration + self.padding_cfg.min_post_pad_duration, - ) - min_padding_duration = min(min_padding_duration, max_padding_duration) + min_padding_duration = self.padding_cfg.min_pre_pad_duration + self.padding_cfg.min_post_pad_duration pre_padding_duration = None post_padding_duration = None @@ -666,13 +669,12 @@ def lhotse_asr_eou_cut_random_pad_transform(config: DictConfig, cut: Cut): cut.custom["origin_eou_time"] = eou_time 
max_padding_duration = max(0, padding_cfg.max_total_duration - duration) - if max_padding_duration <= 2 * padding_cfg.min_pad_duration: + padding_cfg.min_pre_pad_duration = max(padding_cfg.min_pre_pad_duration, padding_cfg.min_pad_duration) + padding_cfg.min_post_pad_duration = max(padding_cfg.min_post_pad_duration, padding_cfg.min_pad_duration) + if max_padding_duration <= padding_cfg.min_pre_pad_duration + padding_cfg.min_post_pad_duration: min_padding_duration = 0 else: - min_padding_duration = max( - 2 * padding_cfg.min_pad_duration, padding_cfg.min_pre_pad_duration + padding_cfg.min_post_pad_duration - ) - min_padding_duration = min(min_padding_duration, max_padding_duration) + min_padding_duration = padding_cfg.min_pre_pad_duration + padding_cfg.min_post_pad_duration pre_padding_duration = None post_padding_duration = None From 9762900afc2449d221ccaec775cd56a7c91d7cc9 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Sat, 9 Aug 2025 19:11:43 -0400 Subject: [PATCH 090/107] update augment Signed-off-by: stevehuang52 --- .../asr_eou/fastconformer_transducer_bpe_streaming.yaml | 8 +++++--- .../fastconformer_transducer_bpe_streaming_v2.yaml | 4 ++-- nemo/collections/asr/data/audio_to_eou_label_lhotse.py | 6 +++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index d6fe9215f708..10310bfa3102 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -43,13 +43,15 @@ model: random_padding: prob: 0.99 - min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds - max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds + min_post_pad_duration: 3.0 + min_pre_pad_duration: 0.0 + max_pad_duration: 6.0 # maximum duration of pre/post padding in seconds max_total_duration: 40.0 # maximum total duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' - + + augmentor: white_noise: prob: 0.9 diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml index 19c4ceb50636..c22507eeb3bb 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml @@ -43,13 +43,13 @@ model: skip_augment: true random_padding: prob: 1.0 - min_pad_duration: 0.5 # minimum duration of pre/post padding in seconds + min_post_pad_duration: 3.0 + min_pre_pad_duration: 0.0 max_pad_duration: 6.0 # maximum duration of pre/post padding in seconds max_total_duration: 40.0 # maximum total duration of the padded audio in seconds pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' - min_post_pad_duration: 3.0 noise_path: ??? 
noise_mix_prob: 1.0 diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 19a13fee3c42..e65180c8a37e 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -69,11 +69,11 @@ class AudioToTextEOUBatch: @dataclass class RandomPaddingConfig: prob: float = 0.9 # probability of applying padding - min_pad_duration: float = 0.5 # minimum duration of pre/post padding in seconds + min_pad_duration: float = 0.0 # minimum duration of pre/post padding in seconds max_pad_duration: float = 5.0 # maximum duration of pre/post padding in seconds - max_total_duration: float = 30.0 # maximum total duration of the padded audio in seconds + max_total_duration: float = 40.0 # maximum total duration of the padded audio in seconds min_pre_pad_duration: float = 0.0 # minimum duration of pre-padding in seconds - min_post_pad_duration: float = 0.0 # minimum duration of post-padding in seconds + min_post_pad_duration: float = 2.0 # minimum duration of post-padding in seconds pad_distribution: str = 'uniform' # distribution of padding duration, 'uniform' or 'normal' or 'constant' normal_mean: float = 0.5 # mean of normal distribution for padding duration normal_std: float = 2.0 # standard deviation of normal distribution for padding duration From 33d1e9e1cb2bdfe42d09ac881de93db82c09e02a Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Sun, 10 Aug 2025 20:12:48 -0400 Subject: [PATCH 091/107] update Signed-off-by: stevehuang52 --- .../conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml | 1 - .../asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml | 3 ++- nemo/collections/asr/data/audio_to_eou_label_lhotse.py | 3 ++- nemo/collections/common/data/lhotse/dataloader.py | 4 +++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 10310bfa3102..51e71a09c7af 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -51,7 +51,6 @@ model: normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' - augmentor: white_noise: prob: 0.9 diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml index c22507eeb3bb..ed817bc08b5f 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml @@ -41,8 +41,9 @@ model: bucket_buffer_size: 10000 shuffle_buffer_size: 10000 skip_augment: true + use_dataloader_augment: true random_padding: - prob: 1.0 + prob: 0.99 min_post_pad_duration: 3.0 min_pre_pad_duration: 0.0 max_pad_duration: 6.0 # maximum duration of pre/post padding in seconds diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index e65180c8a37e..fa9b6f42b831 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -110,7 +110,7 @@ def drop_pnc(text: str) -> str: text = unicode_to_ascii(text) text = text.replace(":", " ") text = text.replace("-", " ") - 
text = ''.join([c for c in text if c in valid_chars or c.isspace() or c == "'"]) + text = ''.join([c for c in text if c in valid_chars or c.isspace()]) text = ' '.join(text.split()).strip() return text @@ -222,6 +222,7 @@ def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, return_cuts: bool self.augmentor = None self.len_augmentor = None self.skip_augment = self.cfg.get("skip_augment", False) + logging.info(f"EOU dataset with skip augmentations = {self.skip_augment}") if self.cfg.get('augmentor', None) is not None: augmentor = {} len_augmentor = {} diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index b576779f3caa..424a40cdeacf 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -205,6 +205,7 @@ class LhotseDataLoadingConfig: # 6. EOU related options. random_padding: Any | None = None + use_dataloader_augment: bool = False def determine_use_iterable_dataset(use_iterable_dataset: bool, config: DictConfig) -> bool: @@ -498,8 +499,9 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No # 2. Optional augmentations. - if config.get("random_padding", None) is not None: + if config.get("random_padding", None) is not None and config.get("use_dataloader_augment", False): # put this here to avoid circular import + logging.info("Using dataloader augmentations for EOU random padding.") from nemo.collections.asr.data.audio_to_eou_label_lhotse import ( LhotseEOURandomPadding, lhotse_asr_eou_cut_random_pad_transform, From 218b88afda76251f27574432b6044989b405e2bc Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Mon, 11 Aug 2025 09:48:46 -0400 Subject: [PATCH 092/107] update drop pnc func Signed-off-by: stevehuang52 --- .../asr/data/audio_to_eou_label_lhotse.py | 1 + .../asr_end_of_utterance/clean_manifest.py | 29 ++++++++++++++++--- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index fa9b6f42b831..6e672d82ce3f 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -110,6 +110,7 @@ def drop_pnc(text: str) -> str: text = unicode_to_ascii(text) text = text.replace(":", " ") text = text.replace("-", " ") + text = text.replace("_", " ") text = ''.join([c for c in text if c in valid_chars or c.isspace()]) text = ' '.join(text.split()).strip() return text diff --git a/scripts/asr_end_of_utterance/clean_manifest.py b/scripts/asr_end_of_utterance/clean_manifest.py index d3cf6372305b..99a90d397714 100644 --- a/scripts/asr_end_of_utterance/clean_manifest.py +++ b/scripts/asr_end_of_utterance/clean_manifest.py @@ -405,7 +405,24 @@ def merge_th(text: str) -> str: return result -def drop_punctuations(text): +def unicode_to_ascii(text: str) -> str: + """ + Converts text with accented or special Latin characters (e.g., ó, ñ, ū, ō) + into their closest ASCII equivalents. + """ + # Normalize the string to NFKD to separate base characters from diacritics + normalized = unicodedata.normalize('NFKD', text) + + # Encode to ASCII bytes, ignoring characters that can't be converted + ascii_bytes = normalized.encode('ascii', 'ignore') + + # Decode back to string + ascii_text = ascii_bytes.decode('ascii') + + return ascii_text + + +def drop_punctuations(text: str) -> str: """ Clean the text by removing invalid characters and converting to lowercase. 
@@ -413,9 +430,13 @@ def drop_punctuations(text): :return: Cleaned text. """ valid_chars = "abcdefghijklmnopqrstuvwxyz'" - text = ''.join([c for c in text if c in valid_chars or c.isspace() or c == "'"]) - text = ' '.join(text.split()) # Remove extra spaces - return text.strip() + text = text.lower() + text = unicode_to_ascii(text) + text = text.replace(":", " ") + text = text.replace("-", " ") + text = ''.join([c for c in text if c in valid_chars or c.isspace()]) + text = ' '.join(text.split()).strip() + return text def clean_label(_str: str) -> str: From 922dfddca49fb231744dc3e4907f4e9685529fba Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 19 Aug 2025 20:59:46 -0400 Subject: [PATCH 093/107] update eou finetune Signed-off-by: stevehuang52 --- .../asr/asr_eou/speech_to_text_rnnt_eou_train.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index 85958a4c821d..a9797281a974 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -188,10 +188,18 @@ def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_pat if pretrained_model_path.endswith('.nemo'): pretrained_model = ASRModel.restore_from(restore_path=pretrained_model_path) # type: EncDecRNNTBPEModel else: - try: - pretrained_model = ASRModel.from_pretrained(pretrained_model_path) # type: EncDecRNNTBPEModel - except Exception as e: - raise ValueError(f"Could not load pretrained model from {pretrained_model_path}.") from e + pretrained_model = ASRModel.from_pretrained(pretrained_model_path) # type: EncDecRNNTBPEModel + + try: + model.load_state_dict(pretrained_model.state_dict(), strict=True) + logging.info( + f"Pretrained model from {pretrained_model_path} has exactly the same model structure, skip further loading." + ) + return + except Exception as e: + logging.warning( + f"Pretrained model {pretrained_model_path} has different model structure, try loading weights separately and add EOU/EOB classes." 
+ ) if not isinstance(pretrained_model, (EncDecRNNTBPEModel, EncDecHybridRNNTCTCBPEModel)): raise ValueError( From ca4055a4c72608ab27af39c5f8c13ef890d31f75 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 3 Sep 2025 14:08:08 -0400 Subject: [PATCH 094/107] update transcribe Signed-off-by: stevehuang52 --- examples/asr/transcribe_speech.py | 2 +- examples/asr/transcribe_speech_distributed.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index 8e9ae6befeeb..d46ab7714c99 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -305,7 +305,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis if cfg.decoder_type and cfg.decoder_type != 'rnnt': raise ValueError('RNNT model only support rnnt decoding!') - if cfg.decoder_type and hasattr(asr_model.encoder, 'set_default_att_context_size'): + if cfg.att_context_size and hasattr(asr_model.encoder, 'set_default_att_context_size'): asr_model.encoder.set_default_att_context_size(cfg.att_context_size) # Setup decoding strategy diff --git a/examples/asr/transcribe_speech_distributed.py b/examples/asr/transcribe_speech_distributed.py index 471aca4a79d9..90f73dfa1661 100644 --- a/examples/asr/transcribe_speech_distributed.py +++ b/examples/asr/transcribe_speech_distributed.py @@ -72,7 +72,7 @@ pred_name_postfix - optional. The name you want to be written for the current model Results are returned in a JSON manifest file. -python transcribe_speech.py \ +CUDA_VISIBLE_DEVICES=1 python transcribe_speech_distributed.py \ model_path=null \ pretrained_name=null \ audio_dir="" \ @@ -86,7 +86,12 @@ cuda=0 \ amp=True \ append_pred=False \ - pred_name_postfix="" + pred_name_postfix="" \ + split_size=10000 \ + num_nodes=1 \ + node_idx=0 \ + num_gpus_per_node=1 \ + gpu_idx=0 """ From 6c3aff2a74f2b4e6fda9654d7d374e13bae5e08d Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 11 Sep 2025 11:07:24 -0400 Subject: [PATCH 095/107] update cfg Signed-off-by: stevehuang52 --- ...astconformer_transducer_bpe_streaming.yaml | 4 +- ...conformer_transducer_bpe_streaming_xl.yaml | 315 ++++++++++++++++++ 2 files changed, 317 insertions(+), 2 deletions(-) create mode 100644 examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 51e71a09c7af..78e939cbdc5b 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -135,7 +135,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 - use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + use_bias: false # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding @@ -164,7 +164,7 @@ model: att_context_style: chunked_limited # regular or chunked_limited att_context_probs: null - xscaling: true # scales up the input embeddings by sqrt(d_model) + xscaling: false # scales up the input embeddings by sqrt(d_model) pos_emb_max_len: 5000 # Convolution module's params diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml 
b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml new file mode 100644 index 000000000000..abc845802856 --- /dev/null +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml @@ -0,0 +1,315 @@ +# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M) with sub-word encoding. + +# You may find more detail: +# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer +# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer +# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml + +name: "FastConformer-Transducer-BPE-Streaming-EOU" + +model: + token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] + token_init_weight_value: null # only applicable when token_init_method='constant' + token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' + + sample_rate: 16000 + compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. + log_prediction: true # enables logging sample predictions in the output during training + skip_nan_grad: false + + model_defaults: + enc_hidden: ${model.encoder.d_model} + pred_hidden: 640 + joint_hidden: 640 + + train_ds: + manifest_filepath: ??? + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + random_padding: + prob: 0.99 + min_post_pad_duration: 3.0 + min_pre_pad_duration: 0.0 + max_pad_duration: 6.0 # maximum duration of pre/post padding in seconds + max_total_duration: 40.0 # maximum total duration of the padded audio in seconds + pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' + normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' + normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' + + augmentor: + white_noise: + prob: 0.9 + min_level: -90 + max_level: -46 + gain: + prob: 0.5 + min_gain_dbfs: -10.0 + max_gain_dbfs: 10.0 + noise: + prob: 0.6 + manifest_path: ??? + min_snr_db: 0 + max_snr_db: 20 + max_gain_db: 300.0 + + validation_ds: + manifest_filepath: ??? 
+ tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + test_ds: + manifest_filepath: null + tarred_audio_filepaths: null + sample_rate: ${model.sample_rate} + max_duration: 30 # you may need to update it for your dataset + min_duration: 0.1 + defer_setup: true + batch_duration: null # you may disable batch_duration by setting it to `null` + batch_size: 16 + shuffle: false + num_workers: 8 + pin_memory: true + quadratic_duration: 30 + num_buckets: 30 + num_cuts_for_bins_estimate: 10000 + bucket_buffer_size: 10000 + shuffle_buffer_size: 10000 + + # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py + # We recommend to use vocab size of 1024 with SPE Unigram for most languages + tokenizer: + dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) + type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) + + preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: ${model.sample_rate} + normalize: "NA" # No normalization for mel-spectogram makes streaming easier + window_size: 0.025 + window_stride: 0.01 + window: "hann" + features: 128 + n_fft: 512 + frame_splicing: 1 + dither: 0.00001 + pad_to: 0 + + spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 # set to zero to disable it + time_masks: 10 # set to zero to disable it + freq_width: 27 + time_width: 0.05 + + encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: ${model.preprocessor.features} + feat_out: -1 # you may set it if you need different output size other than the default d_model + n_layers: 24 + d_model: 1024 + use_bias: false # whether to apply bias in the feedforward, MHA and convolution modules + + # Sub-sampling parameters + subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding + subsampling_factor: 8 # must be power of 2 for striding and vggnet + subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model + causal_downsampling: true + + # Feed forward module's params + ff_expansion_factor: 4 + + # Multi-headed Attention Module's params + self_attention_model: rel_pos # rel_pos or abs_pos + n_heads: 8 # may need to be lower for smaller d_models + + # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention + # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large + # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one + # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s + + # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. + # The first item in the list would be the default during test/validation/inference. 
+ # An example of settings for multi-lookahead: + # att_context_size: [[70,13],[70,6],[70,1],[70,0]] + # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] + att_context_size: [70, 1] # -1 means unlimited context + att_context_style: chunked_limited # regular or chunked_limited + att_context_probs: null + + xscaling: false # scales up the input embeddings by sqrt(d_model) + pos_emb_max_len: 5000 + + # Convolution module's params + conv_kernel_size: 9 + conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) + + # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size + # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] + # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly + conv_context_size: causal + + ### regularization + dropout: 0.1 # The dropout used in most of the Conformer Modules + dropout_pre_encoder: 0.1 # The dropout used before the encoder + dropout_emb: 0.0 # The dropout used for embeddings + dropout_att: 0.1 # The dropout for multi-headed attention modules + + # set to non-zero to enable stochastic depth + stochastic_depth_drop_prob: 0.0 + stochastic_depth_mode: linear # linear or uniform + stochastic_depth_start_layer: 1 + + decoder: + _target_: nemo.collections.asr.modules.RNNTDecoder + normalization_mode: null # Currently only null is supported for export. + random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf + blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. + + prednet: + pred_hidden: ${model.model_defaults.pred_hidden} + pred_rnn_layers: 1 + t_max: null + dropout: 0.2 + + joint: + _target_: nemo.collections.asr.modules.RNNTJoint + log_softmax: null # 'null' would set it automatically according to CPU/GPU device + preserve_memory: false # dramatically slows down training, but might preserve some memory + + # Fuses the computation of prediction net + joint net + loss + WER calculation + # to be run on sub-batches of size `fused_batch_size`. + # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. + # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. + # Using small values here will preserve a lot of memory during training, but will make training slower as well. + # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. + # However, to preserve memory, this ratio can be 1:8 or even 1:16. + # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. + fuse_loss_wer: true + fused_batch_size: 4 + + jointnet: + joint_hidden: ${model.model_defaults.joint_hidden} + activation: "relu" + dropout: 0.2 + + decoding: + strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
+ + # greedy strategy config + greedy: + max_symbols: 10 + + # beam strategy config + beam: + beam_size: 2 + return_best_hypothesis: False + score_norm: true + tsd_max_sym_exp: 50 # for Time Synchronous Decoding + alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding + + # config for InterCTC loss: https://arxiv.org/abs/2102.03216 + # specify loss weights and which layers to use for InterCTC + # e.g., to reproduce the paper results, set loss_weights: [0.3] + # and apply_at_layers: [8] (assuming 18 layers). Note that final + # layer loss coefficient is automatically adjusted (to 0.7 in above example) + interctc: + loss_weights: [] + apply_at_layers: [] + + loss: + loss_name: "default" + warprnnt_numba_kwargs: + # FastEmit regularization: https://arxiv.org/abs/2010.11148 + # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming + # You may set it to lower values like 1e-3 for models with larger right context + fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. + + optim: + name: adamw + lr: 5.0 # 1e-4 + # optimizer arguments + betas: [0.9, 0.98] + weight_decay: 1e-3 + + # scheduler setup + sched: + name: NoamAnnealing # NoamAnnealing CosineAnnealing + # scheduler config override + d_model: ${model.encoder.d_model} + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1e-6 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: 100000 # computed at runtime if not set + val_check_interval: 1000 # an int for number of iterations + limit_train_batches: ${trainer.val_check_interval} + accelerator: auto + strategy: + _target_: lightning.pytorch.strategies.DDPStrategy + gradient_as_bucket_view: true + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + precision: 32 # 16, 32, or bf16 + log_every_n_steps: 10 # Interval of logging. + enable_progress_bar: True + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: False # Provided by exp_manager + logger: false # Provided by exp_manager + benchmark: false # needs to be false for models with variable-length speech input as it slows down training + use_distributed_sampler: false + +exp_manager: + exp_dir: null + name: ${name} + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: "val_wer" + mode: "min" + save_top_k: 5 + filename: '${exp_manager.name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}' + always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
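Editor's note: in the `optim` block above, `lr: 5.0` combined with `NoamAnnealing` is a scale factor rather than the actual step size; the effective learning rate is modulated by `d_model` and the warmup schedule. A minimal sketch, assuming the standard Noam formula with a `min_lr` floor (NeMo's scheduler implementation may differ in detail):

```python
# Assumed standard Noam formula, shown only to illustrate that `lr: 5.0` above
# acts as a scale factor; NeMo's NoamAnnealing may differ in detail.
def noam_lr(step, scale=5.0, d_model=1024, warmup_steps=10000, min_lr=1e-6):
    step = max(step, 1)
    lr = scale * d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    return max(lr, min_lr)


for step in (100, 1_000, 10_000, 100_000):
    print(step, f"{noam_lr(step):.2e}")
```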
+ resume_if_exists: false + resume_ignore_no_checkpoint: false + + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null From f0018e97b5efa86bd11cd78eb444993da65467ef Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Thu, 11 Sep 2025 14:46:49 -0400 Subject: [PATCH 096/107] fix cfg Signed-off-by: stevehuang52 --- .../conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml index abc845802856..17d3af8d4a8f 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml @@ -195,7 +195,7 @@ model: prednet: pred_hidden: ${model.model_defaults.pred_hidden} - pred_rnn_layers: 1 + pred_rnn_layers: 2 t_max: null dropout: 0.2 From b9ab27701c15d26c27226d5c3d72a86c7705fe8e Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 16 Sep 2025 11:31:23 -0400 Subject: [PATCH 097/107] clean up for PR Signed-off-by: stevehuang52 --- .../speech_to_text_hybrid_eou_train.py | 331 ----------- .../speech_to_text_hybrid_frame_eou_train.py | 147 ----- .../asr_eou/speech_to_text_rnnt_eou_train.py | 37 +- ...mer_hybrid_asr_frame_fc_eou_streaming.yaml | 424 -------------- ...ybrid_asr_frame_fc_lstm_eou_streaming.yaml | 425 -------------- ...r_hybrid_asr_frame_lstm_eou_streaming.yaml | 369 ------------ ...r_hybrid_transducer_ctc_bpe_streaming.yaml | 331 ----------- ...astconformer_transducer_bpe_streaming.yaml | 9 +- ...rmer_transducer_bpe_streaming_adapter.yaml | 13 +- ...ormer_transducer_bpe_streaming_augval.yaml | 332 ----------- ...conformer_transducer_bpe_streaming_v2.yaml | 305 ---------- ...conformer_transducer_bpe_streaming_xl.yaml | 315 ---------- .../asr/data/audio_to_eou_label_lhotse.py | 179 +----- nemo/collections/asr/models/asr_eou_models.py | 547 +----------------- .../common/data/lhotse/dataloader.py | 18 - 15 files changed, 63 insertions(+), 3719 deletions(-) delete mode 100644 examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py delete mode 100644 examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py delete mode 100644 examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml delete mode 100644 examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_lstm_eou_streaming.yaml delete mode 100644 examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml delete mode 100644 examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml delete mode 100644 examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_augval.yaml delete mode 100644 examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml delete mode 100644 examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml diff --git a/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py b/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py deleted file mode 100644 index 8fdc780b0342..000000000000 --- a/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py +++ /dev/null @@ -1,331 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Example usage: - -0. Prepare dataset based on /nemo/collections/asr/data/audio_to_eou_label_lhotse.py - -1. Add special tokens and to the tokenizer of pretrained model, by refering to the script - /scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py - -2. If pretrained model is HybridRNNTCTCBPEModel, convert it to RNNT using the script - /examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py - -3. Run the following command to train the ASR-EOU model: -```bash -#!/bin/bash - -NEMO_PATH=/home/heh/codes/nemo-eou -export PYTHONPATH=$NEMO_PATH:$PYTHONPATH - -TRAIN_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json -VAL_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json -NOISE_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/noise_manifest.json - -PRETRAINED_NEMO=/media/data3/pretrained_models/nemo_asr/stt_en_fastconformer_hybrid_large_streaming_80ms_rnnt.nemo -TOKENIZER_DIR=/media/data3/pretrained_models/nemo_asr/tokenizers/stt_en_fastconformer_hybrid_large_streaming_80ms_eou - -BATCH_DURATION=30 -NUM_WORKERS=0 -LIMIT_TRAIN_BATCHES=100 -VAL_CHECK_INTERVAL=100 -MAX_STEPS=1000000 - -EXP_NAME=fastconformer_transducer_bpe_streaming_eou_debug - -SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_hybrid_eou_train.py -CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/fastconformer/cache_aware_streaming -CONFIG_NAME=fastconformer_transducer_bpe_streaming - -CUDA_VISIBLE_DEVICES=0 python $SCRIPT \ - --config-path $CONFIG_PATH \ - --config-name $CONFIG_NAME \ - ++init_from_nemo_model=$PRETRAINED_NEMO \ - model.encoder.att_context_size="[70,1]" \ - model.tokenizer.dir=$TOKENIZER_DIR \ - model.train_ds.manifest_filepath=$TRAIN_MANIFEST \ - model.train_ds.augmentor.noise.manifest_path=$NOISE_MANIFEST \ - model.validation_ds.manifest_filepath=$VAL_MANIFEST \ - model.train_ds.batch_duration=$BATCH_DURATION \ - model.train_ds.num_workers=$NUM_WORKERS \ - model.validation_ds.batch_duration=$BATCH_DURATION \ - model.validation_ds.num_workers=$NUM_WORKERS \ - ~model.test_ds \ - trainer.limit_train_batches=$LIMIT_TRAIN_BATCHES \ - trainer.val_check_interval=$VAL_CHECK_INTERVAL \ - trainer.max_steps=$MAX_STEPS \ - exp_manager.name=$EXP_NAME -``` - -""" - - -from typing import Optional - -import lightning.pytorch as pl -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCBPEModel, EncDecRNNTBPEModel -from nemo.collections.asr.models.asr_eou_models import EncDecHybridRNNTCTCBPEEOUModel, EncDecRNNTBPEEOUModel -from nemo.collections.asr.modules.conv_asr import ConvASRDecoder -from nemo.collections.asr.modules.rnnt import RNNTDecoder, RNNTJoint -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager -from nemo.utils.trainer_utils import resolve_trainer_cfg - - -def get_pretrained_model_name(cfg: DictConfig) -> Optional[str]: - if hasattr(cfg, 'init_from_ptl_ckpt') and cfg.init_from_ptl_ckpt is not None: - raise NotImplementedError( 
- "Currently for simplicity of single script for all model types, we only support `init_from_nemo_model` and `init_from_pretrained_model`" - ) - nemo_model_path = cfg.get('init_from_nemo_model', None) - pretrained_name = cfg.get('init_from_pretrained_model', None) - if nemo_model_path is not None and pretrained_name is not None: - raise ValueError("Only pass `init_from_nemo_model` or `init_from_pretrained_model` but not both") - elif nemo_model_path is None and pretrained_name is None: - return None - - if nemo_model_path: - return nemo_model_path - if pretrained_name: - return pretrained_name - - -def init_from_pretrained_nemo(model: EncDecHybridRNNTCTCBPEModel, pretrained_model_path: str, cfg: DictConfig): - """ - load the pretrained model from a .nemo file, taking into account the joint network - """ - if pretrained_model_path.endswith('.nemo'): - pretrained_model = ASRModel.restore_from(restore_path=pretrained_model_path) # type: EncDecRNNTBPEModel - else: - try: - pretrained_model = ASRModel.from_pretrained(pretrained_model_path) # type: EncDecRNNTBPEModel - except Exception as e: - raise ValueError(f"Could not load pretrained model from {pretrained_model_path}.") from e - - if not isinstance(pretrained_model, (EncDecRNNTBPEModel, EncDecHybridRNNTCTCBPEModel)): - raise ValueError( - f"Pretrained model {pretrained_model.__class__} is not EncDecRNNTBPEModel or EncDecHybridRNNTCTCBPEModel." - ) - - # Load encoder state dict into the model - model.encoder.load_state_dict(pretrained_model.encoder.state_dict(), strict=True) - logging.info(f"Encoder weights loaded from {pretrained_model_path}.") - - # Load decoder state dict into the model - decoder = model.decoder # type: RNNTDecoder - pretrained_decoder = pretrained_model.decoder # type: RNNTDecoder - if not isinstance(decoder, RNNTDecoder) or not isinstance(pretrained_decoder, RNNTDecoder): - raise ValueError( - f"Decoder {decoder.__class__} is not RNNTDecoder or pretrained decoder {pretrained_decoder.__class__} is not RNNTDecoder." - ) - - decoder.prediction["dec_rnn"].load_state_dict(pretrained_decoder.prediction["dec_rnn"].state_dict(), strict=True) - - decoder_embed_states = decoder.prediction["embed"].state_dict()['weight'] # shape: [num_classes+2, hid_dim] - pretrained_decoder_embed_states = pretrained_decoder.prediction["embed"].state_dict()[ - 'weight' - ] # shape: [num_classes, hid_dim] - if decoder_embed_states.shape[0] != pretrained_decoder_embed_states.shape[0] + 2: - raise ValueError( - f"Size mismatched between pretrained ({pretrained_decoder_embed_states.shape[0]}+2) and current model ({decoder_embed_states.shape[0]}), skip loading decoder embedding." - ) - - decoder_embed_states[:-3, :] = pretrained_decoder_embed_states[:-1, :] # everything except EOU, EOB and blank - decoder_embed_states[-1, :] = pretrained_decoder_embed_states[-1, :] # blank class - decoder.prediction["embed"].load_state_dict({"weight": decoder_embed_states}, strict=True) - logging.info(f"Decoder weights loaded from {pretrained_model_path}.") - - # Load joint network weights if new model's joint network has two more classes than the pretrained model - joint_network = model.joint # type: RNNTJoint - pretrained_joint_network = pretrained_model.joint # type: RNNTJoint - assert isinstance(joint_network, RNNTJoint), f"Joint network {joint_network.__class__} is not RNNTJoint." - assert isinstance( - pretrained_joint_network, RNNTJoint - ), f"Pretrained joint network {pretrained_joint_network.__class__} is not RNNTJoint." 
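Editor's note: the removed helper above copies a pretrained prediction-network embedding table into a model whose vocabulary gained two special tokens (EOU and EOB) while keeping blank as the last index. A standalone sketch of that row remapping, with hypothetical shapes:

```python
import torch

# Standalone illustration of the row remapping performed by the removed helper
# above when the vocabulary grows by two special tokens (EOU, EOB) placed just
# before the blank row. Shapes are hypothetical.
old_embed = torch.randn(1025, 640)    # [old_num_classes + blank, pred_hidden]
new_embed = torch.zeros(1027, 640)    # two extra rows for EOU and EOB

new_embed[:-3] = old_embed[:-1]       # regular tokens: everything except EOU/EOB/blank
new_embed[-1] = old_embed[-1]         # blank stays in the last row
# new_embed[-3] (EOU) and new_embed[-2] (EOB) keep their fresh initialization,
# or are filled according to token_init_method ('min'/'max'/'mean'/'constant').
```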
- joint_network.pred.load_state_dict(pretrained_joint_network.pred.state_dict(), strict=True) - joint_network.enc.load_state_dict(pretrained_joint_network.enc.state_dict(), strict=True) - - if joint_network.num_classes_with_blank != pretrained_joint_network.num_classes_with_blank + 2: - raise ValueError( - f"Size mismatched between pretrained ({pretrained_joint_network.num_classes_with_blank}+2) and current model ({joint_network.num_classes_with_blank}), skip loading joint network." - ) - - # Load the joint network weights - pretrained_joint_state = pretrained_joint_network.joint_net.state_dict() - joint_state = joint_network.joint_net.state_dict() - pretrained_joint_clf_weight = pretrained_joint_state['2.weight'] # shape: [num_classes, hid_dim] - pretrained_joint_clf_bias = pretrained_joint_state['2.bias'] if '2.bias' in pretrained_joint_state else None - - token_init_method = cfg.model.get('token_init_method', 'constant') - # Copy the weights and biases from the pretrained model to the new model - # shape: [num_classes+2, hid_dim] - joint_state['2.weight'][:-3, :] = pretrained_joint_clf_weight[:-1, :] # everything except EOU, EOB and blank - joint_state['2.weight'][-1, :] = pretrained_joint_clf_weight[-1, :] # blank class - - value = None - if token_init_method == 'min': - # set the EOU and EOB class to the minimum value of the pretrained model - value = pretrained_joint_clf_weight.min(dim=0)[0] - elif token_init_method == 'max': - # set the EOU and EOB class to the maximum value of the pretrained model - value = pretrained_joint_clf_weight.max(dim=0)[0] - elif token_init_method == 'mean': - # set the EOU and EOB class to the mean value of the pretrained model - value = pretrained_joint_clf_weight.mean(dim=0) - elif token_init_method == 'constant': - value = cfg.model.get('token_init_weight_value', None) - elif token_init_method: - raise ValueError(f"Unknown token_init_method: {token_init_method}.") - - if value is not None: - joint_state['2.weight'][-2, :] = value # EOB class - joint_state['2.weight'][-3, :] = value # EOU class - - if pretrained_joint_clf_bias is not None and '2.bias' in joint_state: - joint_state['2.bias'][:-3] = pretrained_joint_clf_bias[:-1] # everything except EOU, EOB and blank - joint_state['2.bias'][-1] = pretrained_joint_clf_bias[-1] # blank class - value = None - if token_init_method == 'constant': - value = cfg.model.get('token_init_bias_value', None) - elif token_init_method == 'min': - # set the EOU and EOB class to the minimum value of the pretrained model - value = pretrained_joint_clf_bias.min() - elif token_init_method == 'max': - # set the EOU and EOB class to the maximum value of the pretrained model - value = pretrained_joint_clf_bias.max() - elif token_init_method == 'mean': - # set the EOU and EOB class to the mean value of the pretrained model - value = pretrained_joint_clf_bias.mean() - elif token_init_method: - raise ValueError(f"Unknown token_init_method: {token_init_method}.") - - if value is not None: - joint_state['2.bias'][-2] = value # EOB class - joint_state['2.bias'][-3] = value # EOU class - - # Load the joint network weights - joint_network.joint_net.load_state_dict(joint_state, strict=True) - logging.info(f"Joint network weights loaded from {pretrained_model_path}.") - - # Load the CTC decoder weights if the model is EncDecHybridRNNTCTCBPEEOUModel - if not hasattr(model, 'ctc_decoder') or not isinstance(model, EncDecHybridRNNTCTCBPEEOUModel): - return - if not hasattr(pretrained_model, 'ctc_decoder') or not isinstance(pretrained_model, 
EncDecHybridRNNTCTCBPEModel): - raise ValueError( - f"CTC decoder {model.ctc_decoder.__class__} is not EncDecHybridRNNTCTCBPEEOUModel or pretrained CTC decoder {pretrained_model.ctc_decoder.__class__} is not EncDecHybridRNNTCTCBPEModel." - ) - - ctc_decoder = model.ctc_decoder # type: ConvASRDecoder - pretrained_ctc_decoder = pretrained_model.ctc_decoder # type: ConvASRDecoder - assert isinstance(ctc_decoder, ConvASRDecoder), f"CTC decoder {ctc_decoder.__class__} is not ConvASRDecoder." - assert isinstance( - pretrained_ctc_decoder, ConvASRDecoder - ), f"Pretrained CTC decoder {pretrained_ctc_decoder.__class__} is not ConvASRDecoder." - - ctc_decoder_state = ctc_decoder.state_dict() - pretrained_ctc_decoder_state = pretrained_ctc_decoder.state_dict() - - if ctc_decoder._num_classes == pretrained_ctc_decoder._num_classes: - logging.info("CTC decoder weights loaded from pretrained model with same shape.") - ctc_decoder.load_state_dict(pretrained_ctc_decoder_state, strict=True) - return - elif ctc_decoder._num_classes != pretrained_ctc_decoder._num_classes + 2: - raise ValueError( - f"Size mismatched between pretrained ({pretrained_ctc_decoder._num_classes}+2) and current model ({ctc_decoder._num_classes}), skip loading CTC decoder." - ) - - pretrained_weight = pretrained_ctc_decoder_state['decoder_layers.0.weight'] # shape: [num_classes, hid_dim, 1] - pretrained_bias = ( - pretrained_ctc_decoder_state['decoder_layers.0.bias'] - if 'decoder_layers.0.bias' in pretrained_ctc_decoder_state - else None - ) # shape: [num_classes] - - # Copy the weights and biases from the pretrained model to the new model - ctc_decoder_state['decoder_layers.0.weight'][:-3, :, :] = pretrained_weight[ - :-1, :, : - ] # everything except EOU, EOB and blank - ctc_decoder_state['decoder_layers.0.weight'][-1, :, :] = pretrained_weight[-1, :, :] # blank class - value = None - if token_init_method == 'min': - # set the EOU and EOB class to the minimum value of the pretrained model - value = pretrained_weight.min(dim=0)[0] - elif token_init_method == 'max': - # set the EOU and EOB class to the maximum value of the pretrained model - value = pretrained_weight.max(dim=0)[0] - elif token_init_method == 'mean': - # set the EOU and EOB class to the mean value of the pretrained model - value = pretrained_weight.mean(dim=0) - elif token_init_method == 'constant': - value = cfg.model.get('token_init_weight_value', None) - elif token_init_method: - raise ValueError(f"Unknown token_init_method: {token_init_method}.") - - if value is not None: - ctc_decoder_state['decoder_layers.0.weight'][-2, :] = value # EOB class - ctc_decoder_state['decoder_layers.0.weight'][-3, :] = value # EOU class - - if pretrained_bias is not None and 'decoder_layers.0.bias' in ctc_decoder_state: - ctc_decoder_state['decoder_layers.0.bias'][:-3] = pretrained_bias[:-1] # everything except EOU, EOB and blank - ctc_decoder_state['decoder_layers.0.bias'][-1] = pretrained_bias[-1] # blank class - value = None - if token_init_method == 'constant': - value = cfg.model.get('token_init_bias_value', None) - elif token_init_method == 'min': - # set the EOU and EOB class to the minimum value of the pretrained model - value = pretrained_bias.min() - elif token_init_method == 'max': - # set the EOU and EOB class to the maximum value of the pretrained model - value = pretrained_bias.max() - elif token_init_method == 'mean': - # set the EOU and EOB class to the mean value of the pretrained model - value = pretrained_bias.mean() - elif token_init_method: - raise 
ValueError(f"Unknown token_init_method: {token_init_method}.") - if value is not None: - ctc_decoder_state['decoder_layers.0.bias'][-2] = value - ctc_decoder_state['decoder_layers.0.bias'][-3] = value - - # Load the CTC decoder weights - model.ctc_decoder.load_state_dict(ctc_decoder_state, strict=True) - logging.info(f"CTC decoder weights loaded from {pretrained_model_path}.") - return - - -@hydra_runner(config_path="../conf/asr_eou", config_name="fastconformer_hybrid_transducer_ctc_bpe_streaming") -def main(cfg): - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - - trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) - exp_manager(trainer, cfg.get("exp_manager", None)) - - asr_model = EncDecHybridRNNTCTCBPEEOUModel(cfg=cfg.model, trainer=trainer) - - init_from_model = get_pretrained_model_name(cfg) - if init_from_model: - init_from_pretrained_nemo(asr_model, init_from_model, cfg) - - trainer.fit(asr_model) - - if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: - if asr_model.prepare_test(trainer): - trainer.test(asr_model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py b/examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py deleted file mode 100644 index 1a7058ef4791..000000000000 --- a/examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Example usage: - -0. Prepare dataset based on /nemo/collections/asr/data/audio_to_eou_label_lhotse.py - -1. Add special tokens and to the tokenizer of pretrained model, by refering to the script - /scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py - -2. If pretrained model is HybridRNNTCTCBPEModel, convert it to RNNT using the script - /examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py - -3. 
Run the following command to train the ASR-EOU model: -```bash -#!/bin/bash - -NEMO_PATH=/home/heh/codes/nemo-eou -export PYTHONPATH=$NEMO_PATH:$PYTHONPATH - -TRAIN_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json -VAL_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json -NOISE_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/noise_manifest.json - -PRETRAINED_NEMO=/media/data3/pretrained_models/nemo_asr/stt_en_fastconformer_hybrid_large_streaming_80ms_rnnt.nemo -TOKENIZER_DIR=/media/data3/pretrained_models/nemo_asr/tokenizers/stt_en_fastconformer_hybrid_large_streaming_80ms_eou - -BATCH_DURATION=30 -NUM_WORKERS=0 -LIMIT_TRAIN_BATCHES=100 -VAL_CHECK_INTERVAL=100 -MAX_STEPS=1000000 - -EXP_NAME=fastconformer_transducer_bpe_streaming_eou_debug - -SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_hybrid_frame_eou_train.py -CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/fastconformer/cache_aware_streaming -CONFIG_NAME=fastconformer_transducer_bpe_streaming - -CUDA_VISIBLE_DEVICES=0 python $SCRIPT \ - --config-path $CONFIG_PATH \ - --config-name $CONFIG_NAME \ - ++init_from_nemo_model=$PRETRAINED_NEMO \ - model.encoder.att_context_size="[70,1]" \ - model.tokenizer.dir=$TOKENIZER_DIR \ - model.train_ds.manifest_filepath=$TRAIN_MANIFEST \ - model.train_ds.augmentor.noise.manifest_path=$NOISE_MANIFEST \ - model.validation_ds.manifest_filepath=$VAL_MANIFEST \ - model.train_ds.batch_duration=$BATCH_DURATION \ - model.train_ds.num_workers=$NUM_WORKERS \ - model.validation_ds.batch_duration=$BATCH_DURATION \ - model.validation_ds.num_workers=$NUM_WORKERS \ - ~model.test_ds \ - trainer.limit_train_batches=$LIMIT_TRAIN_BATCHES \ - trainer.val_check_interval=$VAL_CHECK_INTERVAL \ - trainer.max_steps=$MAX_STEPS \ - exp_manager.name=$EXP_NAME -``` - -""" - -import lightning.pytorch as pl -import torch -from omegaconf import DictConfig, OmegaConf - -from nemo.collections.asr.models import ASRModel -from nemo.collections.asr.models.asr_eou_models import EncDecHybridASRFrameEOUModel -from nemo.core.classes import typecheck -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import exp_manager -from nemo.utils.trainer_utils import resolve_trainer_cfg - -typecheck.set_typecheck_enabled(False) - - -def load_from_pretrained_model(model: ASRModel, cfg: DictConfig) -> ASRModel: - args = [ - 'init_from_nemo_model', - 'init_from_pretrained_model', - 'init_from_ptl_ckpt', - ] - arg_matches = [(1 if arg in cfg and arg is not None else 0) for arg in args] - - if sum(arg_matches) == 0: - # model weights do not need to be restored - return model - - if sum(arg_matches) > 1: - raise ValueError( - f"Cannot pass more than one model initialization arguments to config!\n" - f"Found : {[args[idx] for idx, arg_present in enumerate(arg_matches) if arg_present]}" - ) - - if cfg.get('init_from_nemo_model', None) is not None: - logging.info(f"Loading pretrained model from local: {cfg.init_from_nemo_model}") - pretrained_model = ASRModel.restore_from(cfg.init_from_nemo_model, map_location='cpu') - pretrained_state_dict = pretrained_model.state_dict() - elif cfg.get('init_from_pretrained_model', None) is not None: - logging.info(f"Loading pretrained model from remote: {cfg.init_from_pretrained_model}") - pretrained_model = ASRModel.from_pretrained(cfg.init_from_pretrained_model, map_location='cpu') - pretrained_state_dict = pretrained_model.state_dict() - elif cfg.get('init_from_ptl_ckpt', 
None) is not None: - logging.info(f"Loading pretrained PTL checkpoint from local: {cfg.init_from_ptl_ckpt}") - pretrained_state_dict = torch.load(cfg.init_from_ptl_ckpt, map_location='cpu', weights_only=False)[ - 'state_dict' - ] - - # Load the pretrained model state dict into the current model - encoder_states = {k: v for k, v in pretrained_state_dict.items() if k.startswith("encoder.")} - model.encoder.load_state_dict(encoder_states, strict=True) - model.load_state_dict(pretrained_state_dict, strict=False) - return model - - -@hydra_runner(config_path="../conf/asr_eou", config_name="fastconformer_hybrid_asr_frame_eou_streaming") -def main(cfg): - logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') - - trainer = pl.Trainer(**resolve_trainer_cfg(cfg.trainer)) - exp_manager(trainer, cfg.get("exp_manager", None)) - - asr_model = EncDecHybridASRFrameEOUModel(cfg=cfg.model, trainer=trainer) - - asr_model = load_from_pretrained_model(asr_model, cfg) - - trainer.fit(asr_model) - - if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: - if asr_model.prepare_test(trainer): - trainer.test(asr_model) - - -if __name__ == '__main__': - main() # noqa pylint: disable=no-value-for-parameter diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index a9797281a974..2350412ffb90 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -27,26 +27,22 @@ ```bash #!/bin/bash -NEMO_PATH=/home/heh/codes/nemo-eou -export PYTHONPATH=$NEMO_PATH:$PYTHONPATH +TRAIN_MANIFEST=/path/to/train_manifest.json +VAL_MANIFEST=/path/to/val_manifest.json +NOISE_MANIFEST=/path/to/noise_manifest.json -TRAIN_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json -VAL_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/turnGPT_TTS_data/daily_dialogue_test_tts.json -NOISE_MANIFEST=/home/heh/codes/nemo-eou/nemo_experiments/noise_manifest.json +PRETRAINED_NEMO=/path/to/pretrained_model.nemo +TOKENIZER_DIR=/path/to/tokenizer_dir -PRETRAINED_NEMO=/media/data3/pretrained_models/nemo_asr/stt_en_fastconformer_hybrid_large_streaming_80ms_rnnt.nemo -TOKENIZER_DIR=/media/data3/pretrained_models/nemo_asr/tokenizers/stt_en_fastconformer_hybrid_large_streaming_80ms_eou - -BATCH_DURATION=30 -NUM_WORKERS=0 -LIMIT_TRAIN_BATCHES=100 -VAL_CHECK_INTERVAL=100 +BATCH_SIZE=16 +NUM_WORKERS=8 +LIMIT_TRAIN_BATCHES=1000 +VAL_CHECK_INTERVAL=1000 MAX_STEPS=1000000 -EXP_NAME=fastconformer_transducer_bpe_streaming_eou_debug - -SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_rnnt_eou.py -CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/fastconformer/cache_aware_streaming +EXP_NAME=fastconformer_transducer_bpe_streaming_eou +SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/asr_eou CONFIG_NAME=fastconformer_transducer_bpe_streaming CUDA_VISIBLE_DEVICES=0 python $SCRIPT \ @@ -58,9 +54,9 @@ model.train_ds.manifest_filepath=$TRAIN_MANIFEST \ model.train_ds.augmentor.noise.manifest_path=$NOISE_MANIFEST \ model.validation_ds.manifest_filepath=$VAL_MANIFEST \ - model.train_ds.batch_duration=$BATCH_DURATION \ + model.train_ds.batch_size=$BATCH_SIZE \ model.train_ds.num_workers=$NUM_WORKERS \ - model.validation_ds.batch_duration=$BATCH_DURATION \ + model.validation_ds.batch_size=$BATCH_SIZE \ model.validation_ds.num_workers=$NUM_WORKERS \ ~model.test_ds \ 
trainer.limit_train_batches=$LIMIT_TRAIN_BATCHES \ @@ -122,7 +118,6 @@ def setup_adapters(cfg: DictConfig, model: ASRModel): adapter_name = cfg.model.adapter.pop("adapter_name") adapter_type = cfg.model.adapter.pop("adapter_type") adapter_module_name = cfg.model.adapter.pop("adapter_module_name", None) - adapter_state_dict_name = cfg.model.adapter.pop("adapter_state_dict_name", None) # Resolve the config of the specified `adapter_type` if adapter_type not in cfg.model.adapter.keys(): @@ -202,7 +197,7 @@ def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_pat ) if not isinstance(pretrained_model, (EncDecRNNTBPEModel, EncDecHybridRNNTCTCBPEModel)): - raise ValueError( + raise TypeError( f"Pretrained model {pretrained_model.__class__} is not EncDecRNNTBPEModel or EncDecHybridRNNTCTCBPEModel." ) @@ -214,7 +209,7 @@ def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_pat decoder = model.decoder # type: RNNTDecoder pretrained_decoder = pretrained_model.decoder # type: RNNTDecoder if not isinstance(decoder, RNNTDecoder) or not isinstance(pretrained_decoder, RNNTDecoder): - raise ValueError( + raise TypeError( f"Decoder {decoder.__class__} is not RNNTDecoder or pretrained decoder {pretrained_decoder.__class__} is not RNNTDecoder." ) diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml deleted file mode 100644 index 3d304df59ec7..000000000000 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_eou_streaming.yaml +++ /dev/null @@ -1,424 +0,0 @@ -# It contains the default values for training a cache-aware streaming FastConformer-Hybrid-Transducer-CTC ASR model, large size (~115M) with sub-word encoding. -# The model would have two decoders: RNNT (Transducer) and CTC - -# You may find more detail: -# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer -# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc -# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer -# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml -# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml - -# Note: if training loss does not converge, you may increase warm-up to 20K. - -name: "FastConformer-Hybrid-ASR-Frame-EOU-Streaming" - -model: - token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] - token_init_weight_value: null # only applicable when token_init_method='constant' - token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' - layer_idx_list: [0, -1] # extract features from the first and last layers of ASR encoder - num_eou_classes: 4 - eou_class_weights: [1,1,100,100] - rnnt_loss_weight: 0.0 - ctc_loss_weight: 0.0 - eou_loss_weight: 1.0 - use_ctc_pred: false - freeze_encoder: true - freeze_ctc: true - freeze_rnnt: true - pad_eou_label_secs: 0.0 - - sample_rate: 16000 - compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. 
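Editor's note: the removed frame-level config above upweights the rare EOU/EOB classes heavily (`eou_class_weights: [1,1,100,100]`). A minimal sketch of how such per-class weights enter a frame-wise cross-entropy; names and shapes here are illustrative, not the removed model's actual loss module.

```python
import torch
import torch.nn.functional as F

# Illustration of how per-class weights such as eou_class_weights: [1, 1, 100, 100]
# from the removed config above enter a frame-wise cross-entropy.
class_weights = torch.tensor([1.0, 1.0, 100.0, 100.0])

logits = torch.randn(8, 200, 4)           # [batch, frames, num_eou_classes], hypothetical
labels = torch.randint(0, 4, (8, 200))    # per-frame EOU class ids

loss = F.cross_entropy(logits.flatten(0, 1), labels.flatten(), weight=class_weights)
print(loss.item())
```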
- log_prediction: false # enables logging sample predictions in the output during training - skip_nan_grad: false - - model_defaults: - enc_hidden: ${model.encoder.d_model} - pred_hidden: 640 - joint_hidden: 640 - - train_ds: - manifest_filepath: ??? - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: true - drop_last: true - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - check_tokenizer: false - add_eou_to_text: false - pad_eou_label_secs: ${model.pad_eou_label_secs} - - random_padding: - prob: 0.99 - min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds - max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds - max_total_duration: 40.0 # maximum total duration of the padded audio in seconds - pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' - normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' - normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' - - augmentor: - white_noise: - prob: 0.9 - min_level: -90 - max_level: -46 - gain: - prob: 0.5 - min_gain_dbfs: -10.0 - max_gain_dbfs: 10.0 - noise: - prob: 0.6 - manifest_path: ??? - min_snr_db: 0 - max_snr_db: 20 - max_gain_db: 300.0 - - validation_ds: - manifest_filepath: ??? - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - check_tokenizer: false - add_eou_to_text: false - pad_eou_label_secs: ${model.pad_eou_label_secs} - - test_ds: - manifest_filepath: null - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - check_tokenizer: false - add_eou_to_text: false - pad_eou_label_secs: ${model.pad_eou_label_secs} - - # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py - # We recommend to use vocab size of 1024 with SPE Unigram for most languages - tokenizer: - dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "NA" # No normalization for mel-spectogram makes streaming easier - window_size: 0.025 - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - pad_to: 0 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 17 - d_model: 512 - use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules - - # Sub-sampling parameters - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 8 # must be power of 2 for striding and vggnet - subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model - causal_downsampling: true - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large - # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s - - # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. - # The first item in the list would be the default during test/validation/inference. 
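Editor's note: the `random_padding` block of the removed training config above pads utterances with extra silence so the model sees realistic pre- and post-utterance audio. A rough sketch of that sampling logic, assuming the uniform variant and a simple duration budget; the actual dataset implementation may differ.

```python
import random
import torch

# Rough sketch of the pre/post silence padding described by the `random_padding`
# block of the removed training config above (uniform variant only).
def random_pad(audio, sr=16000, prob=0.99, min_pad=1.0, max_pad=10.0, max_total=40.0):
    if random.random() > prob:
        return audio
    budget = max(0.0, max_total - audio.shape[-1] / sr)
    pre = min(random.uniform(min_pad, max_pad), budget)
    post = min(random.uniform(min_pad, max_pad), max(0.0, budget - pre))
    return torch.cat([torch.zeros(int(pre * sr)), audio, torch.zeros(int(post * sr))])


print(random_pad(torch.randn(16000 * 5)).shape)
```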
- # An example of settings for multi-lookahead: - # att_context_size: [[70,13],[70,6],[70,1],[70,0]] - # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] - att_context_size: [70, 1] # -1 means unlimited context - att_context_style: chunked_limited # regular or chunked_limited - att_context_probs: null - - xscaling: true # scales up the input embeddings by sqrt(d_model) - pos_emb_max_len: 5000 - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly - conv_context_size: causal - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - decoder: - _target_: nemo.collections.asr.modules.RNNTDecoder - normalization_mode: null # Currently only null is supported for export. - random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf - blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. - - prednet: - pred_hidden: ${model.model_defaults.pred_hidden} - pred_rnn_layers: 1 - t_max: null - dropout: 0.2 - - joint: - _target_: nemo.collections.asr.modules.RNNTJoint - log_softmax: null # 'null' would set it automatically according to CPU/GPU device - preserve_memory: false # dramatically slows down training, but might preserve some memory - - # Fuses the computation of prediction net + joint net + loss + WER calculation - # to be run on sub-batches of size `fused_batch_size`. - # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. - # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. - # Using small values here will preserve a lot of memory during training, but will make training slower as well. - # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. - # However, to preserve memory, this ratio can be 1:8 or even 1:16. - # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. - fuse_loss_wer: true - fused_batch_size: 4 - - jointnet: - joint_hidden: ${model.model_defaults.joint_hidden} - activation: "relu" - dropout: 0.2 - - decoding: - strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
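Editor's note: the `fuse_loss_wer` comments above describe running the prediction net, joint net and loss on sub-batches of size `fused_batch_size` to save memory while the encoder still sees the full batch. A minimal sketch of just that batching arithmetic (not NeMo's fused implementation):

```python
import torch

# Illustration of the sub-batching idea behind fuse_loss_wer / fused_batch_size:
# the encoder output is sliced into chunks of fused_batch_size utterances before
# the prediction net + joint + loss are applied.
def fused_sub_batches(encoder_out, fused_batch_size=4):
    for start in range(0, encoder_out.shape[0], fused_batch_size):
        yield encoder_out[start:start + fused_batch_size]


enc = torch.randn(16, 100, 512)   # [B, T, D], hypothetical encoder output
print([chunk.shape[0] for chunk in fused_sub_batches(enc)])  # [4, 4, 4, 4]
```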
- - # greedy strategy config - greedy: - max_symbols: 10 - - # beam strategy config - beam: - beam_size: 2 - return_best_hypothesis: False - score_norm: true - tsd_max_sym_exp: 50 # for Time Synchronous Decoding - alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding - - # The section which would contain the decoder and decoding configs of the auxiliary CTC decoder - aux_ctc: - ctc_loss_weight: ${model.ctc_loss_weight} # the weight used to combine the CTC loss with the RNNT loss - use_cer: false - ctc_reduction: 'mean_batch' - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: null - num_classes: -1 - vocabulary: [] - decoding: - strategy: "greedy" - - aggregator: - _target_: nemo.collections.asr.modules.ssl_modules.multi_layer_feat.Aggregator - mode: "weighted_sum" - weights: null - layer_idx_list: ${model.layer_idx_list} - - eou_encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.encoder.d_model} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 2 - d_model: 512 - use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules - - # Sub-sampling parameters - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 1 # NO subsampling - subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model - causal_downsampling: true - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large - # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s - - # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. - # The first item in the list would be the default during test/validation/inference. 
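Editor's note: the `aggregator` above combines hidden states from the ASR-encoder layers named in `layer_idx_list` via a weighted sum before they reach the EOU encoder. A minimal sketch of that combination, assuming softmax-normalized learnable weights (NeMo's Aggregator module may differ):

```python
import torch
import torch.nn as nn

# Sketch of a "weighted_sum" aggregation over the encoder layers selected by
# layer_idx_list (here [0, -1], i.e. the first and last layer).
class WeightedSum(nn.Module):
    def __init__(self, num_layers):
        super().__init__()
        self.weights = nn.Parameter(torch.zeros(num_layers))

    def forward(self, layer_feats):   # list of [B, D, T] tensors, one per layer
        w = torch.softmax(self.weights, dim=0)
        return sum(wi * f for wi, f in zip(w, layer_feats))


feats = [torch.randn(2, 512, 100), torch.randn(2, 512, 100)]
print(WeightedSum(len(feats))(feats).shape)   # torch.Size([2, 512, 100])
```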
- # An example of settings for multi-lookahead: - # att_context_size: [[70,13],[70,6],[70,1],[70,0]] - # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] - att_context_size: ${model.encoder.att_context_size} # -1 means unlimited context - att_context_style: chunked_limited # regular or chunked_limited - att_context_probs: null - - xscaling: true # scales up the input embeddings by sqrt(d_model) - pos_emb_max_len: ${model.encoder.pos_emb_max_len} - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly - conv_context_size: causal - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - eou_decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: ${model.eou_encoder.d_model} - num_classes: ${model.num_eou_classes} - add_blank: false - - eou_loss: - weight: ${model.eou_class_weights} - - # config for InterCTC loss: https://arxiv.org/abs/2102.03216 - # specify loss weights and which layers to use for InterCTC - # e.g., to reproduce the paper results, set loss_weights: [0.3] - # and apply_at_layers: [8] (assuming 18 layers). Note that final - # layer loss coefficient is automatically adjusted (to 0.7 in above example) - interctc: - loss_weights: [] - apply_at_layers: [] - - loss: - loss_name: "default" - warprnnt_numba_kwargs: - # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming - # You may set it to lower values like 1e-3 for models with larger right context - fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. - clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - - optim: - name: adamw - lr: 5.0 - # optimizer arguments - betas: [0.9, 0.98] - weight_decay: 1e-3 - - # scheduler setup - sched: - name: NoamAnnealing - d_model: ${model.encoder.d_model} - # scheduler config override - warmup_steps: 10000 - warmup_ratio: null - min_lr: 1e-6 - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: -1 - max_steps: 100000 # computed at runtime if not set - val_check_interval: 1000 # an int for number of iterations - limit_train_batches: ${trainer.val_check_interval} - accelerator: auto - strategy: - _target_: lightning.pytorch.strategies.DDPStrategy - gradient_as_bucket_view: true - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - precision: 32 # 16, 32, or bf16 - log_every_n_steps: 10 # Interval of logging. 
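Editor's note: the `eou_decoder` above is a convolutional frame classifier over the EOU-encoder features (512-dim features to 4 classes, no blank). A minimal sketch of that kind of head as a pointwise Conv1d; illustrative only, not the ConvASRDecoder implementation.

```python
import torch
import torch.nn as nn

# Sketch of a frame-level EOU head in the spirit of the eou_decoder above: a
# pointwise Conv1d mapping encoder features to num_eou_classes logits per frame.
# NeMo's ConvASRDecoder adds more on top (e.g. a log-softmax output).
head = nn.Conv1d(in_channels=512, out_channels=4, kernel_size=1)

enc = torch.randn(2, 512, 100)          # [B, D, T], hypothetical EOU-encoder output
logits = head(enc).transpose(1, 2)      # [B, T, 4] per-frame class logits
print(logits.shape)
```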
- enable_progress_bar: True - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - use_distributed_sampler: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_eou_macro_acc" - mode: "max" - save_top_k: 5 - always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - resume_if_exists: false - resume_ignore_no_checkpoint: false - - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_lstm_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_lstm_eou_streaming.yaml deleted file mode 100644 index 7f000742b2a8..000000000000 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_fc_lstm_eou_streaming.yaml +++ /dev/null @@ -1,425 +0,0 @@ -# It contains the default values for training a cache-aware streaming FastConformer-Hybrid-Transducer-CTC ASR model, large size (~115M) with sub-word encoding. -# The model would have two decoders: RNNT (Transducer) and CTC - -# You may find more detail: -# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer -# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc -# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer -# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml -# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml - -# Note: if training loss does not converge, you may increase warm-up to 20K. - -name: "FastConformer-Hybrid-ASR-Frame-EOU-Streaming" - -model: - token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] - token_init_weight_value: null # only applicable when token_init_method='constant' - token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' - layer_idx_list: [0, -1] # extract features from the first and last layers of ASR encoder - num_eou_classes: 4 - eou_class_weights: [1,1,100,100] - rnnt_loss_weight: 0.0 - ctc_loss_weight: 0.0 - eou_loss_weight: 1.0 - use_ctc_pred: false - freeze_encoder: true - freeze_ctc: true - freeze_rnnt: true - pad_eou_label_secs: 0.0 - - sample_rate: 16000 - compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. 
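Editor's note: checkpoints in the removed config above are selected by `val_eou_macro_acc` with mode `max`. The metric code is not shown here, so the sketch below is only one plausible macro-accuracy definition (per-class accuracy averaged over the classes present); the model's actual metric may differ.

```python
import torch

# Assumed definition of a macro-averaged accuracy over the 4 EOU classes,
# matching the spirit of the `val_eou_macro_acc` monitor above.
def macro_accuracy(preds, labels, num_classes=4):
    per_class = []
    for c in range(num_classes):
        mask = labels == c
        if mask.any():
            per_class.append((preds[mask] == c).float().mean())
    return torch.stack(per_class).mean()


preds = torch.randint(0, 4, (1000,))
labels = torch.randint(0, 4, (1000,))
print(macro_accuracy(preds, labels))
```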
- log_prediction: false # enables logging sample predictions in the output during training - skip_nan_grad: false - - model_defaults: - enc_hidden: ${model.encoder.d_model} - pred_hidden: 640 - joint_hidden: 640 - - train_ds: - manifest_filepath: ??? - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: true - drop_last: true - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - check_tokenizer: false - add_eou_to_text: false - pad_eou_label_secs: ${model.pad_eou_label_secs} - - random_padding: - prob: 0.99 - min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds - max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds - max_total_duration: 40.0 # maximum total duration of the padded audio in seconds - pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' - normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' - normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' - - augmentor: - white_noise: - prob: 0.9 - min_level: -90 - max_level: -46 - gain: - prob: 0.5 - min_gain_dbfs: -10.0 - max_gain_dbfs: 10.0 - noise: - prob: 0.6 - manifest_path: ??? - min_snr_db: 0 - max_snr_db: 20 - max_gain_db: 300.0 - - validation_ds: - manifest_filepath: ??? - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - check_tokenizer: false - add_eou_to_text: false - pad_eou_label_secs: ${model.pad_eou_label_secs} - - test_ds: - manifest_filepath: null - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - check_tokenizer: false - add_eou_to_text: false - pad_eou_label_secs: ${model.pad_eou_label_secs} - - # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py - # We recommend to use vocab size of 1024 with SPE Unigram for most languages - tokenizer: - dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "NA" # No normalization for mel-spectogram makes streaming easier - window_size: 0.025 - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - pad_to: 0 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 17 - d_model: 512 - use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules - - # Sub-sampling parameters - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 8 # must be power of 2 for striding and vggnet - subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model - causal_downsampling: true - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large - # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s - - # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. - # The first item in the list would be the default during test/validation/inference. 
- # An example of settings for multi-lookahead: - # att_context_size: [[70,13],[70,6],[70,1],[70,0]] - # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] - att_context_size: [70, 1] # -1 means unlimited context - att_context_style: chunked_limited # regular or chunked_limited - att_context_probs: null - - xscaling: true # scales up the input embeddings by sqrt(d_model) - pos_emb_max_len: 5000 - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly - conv_context_size: causal - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - decoder: - _target_: nemo.collections.asr.modules.RNNTDecoder - normalization_mode: null # Currently only null is supported for export. - random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf - blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. - - prednet: - pred_hidden: ${model.model_defaults.pred_hidden} - pred_rnn_layers: 1 - t_max: null - dropout: 0.2 - - joint: - _target_: nemo.collections.asr.modules.RNNTJoint - log_softmax: null # 'null' would set it automatically according to CPU/GPU device - preserve_memory: false # dramatically slows down training, but might preserve some memory - - # Fuses the computation of prediction net + joint net + loss + WER calculation - # to be run on sub-batches of size `fused_batch_size`. - # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. - # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. - # Using small values here will preserve a lot of memory during training, but will make training slower as well. - # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. - # However, to preserve memory, this ratio can be 1:8 or even 1:16. - # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. - fuse_loss_wer: true - fused_batch_size: 4 - - jointnet: - joint_hidden: ${model.model_defaults.joint_hidden} - activation: "relu" - dropout: 0.2 - - decoding: - strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
- - # greedy strategy config - greedy: - max_symbols: 10 - - # beam strategy config - beam: - beam_size: 2 - return_best_hypothesis: False - score_norm: true - tsd_max_sym_exp: 50 # for Time Synchronous Decoding - alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding - - # The section which would contain the decoder and decoding configs of the auxiliary CTC decoder - aux_ctc: - ctc_loss_weight: ${model.ctc_loss_weight} # the weight used to combine the CTC loss with the RNNT loss - use_cer: false - ctc_reduction: 'mean_batch' - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: null - num_classes: -1 - vocabulary: [] - decoding: - strategy: "greedy" - - aggregator: - _target_: nemo.collections.asr.modules.ssl_modules.multi_layer_feat.Aggregator - mode: "weighted_sum" - weights: null - layer_idx_list: ${model.layer_idx_list} - - eou_encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.encoder.d_model} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 2 - d_model: 512 - use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules - - # Sub-sampling parameters - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 1 # NO subsampling - subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model - causal_downsampling: true - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large - # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s - - # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. - # The first item in the list would be the default during test/validation/inference. 
- # An example of settings for multi-lookahead: - # att_context_size: [[70,13],[70,6],[70,1],[70,0]] - # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] - att_context_size: ${model.encoder.att_context_size} # -1 means unlimited context - att_context_style: chunked_limited # regular or chunked_limited - att_context_probs: null - - xscaling: true # scales up the input embeddings by sqrt(d_model) - pos_emb_max_len: ${model.encoder.pos_emb_max_len} - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly - conv_context_size: causal - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - eou_decoder: - _target_: nemo.collections.asr.modules.LSTMDecoder - feat_in: ${model.encoder.d_model} - num_classes: ${model.num_eou_classes} - lstm_hidden_size: 256 - num_layers: 4 - add_blank: false - - eou_loss: - weight: ${model.eou_class_weights} - - # config for InterCTC loss: https://arxiv.org/abs/2102.03216 - # specify loss weights and which layers to use for InterCTC - # e.g., to reproduce the paper results, set loss_weights: [0.3] - # and apply_at_layers: [8] (assuming 18 layers). Note that final - # layer loss coefficient is automatically adjusted (to 0.7 in above example) - interctc: - loss_weights: [] - apply_at_layers: [] - - loss: - loss_name: "default" - warprnnt_numba_kwargs: - # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming - # You may set it to lower values like 1e-3 for models with larger right context - fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. - clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - - optim: - name: adamw - lr: 0.0005 - # optimizer arguments - betas: [0.9, 0.98] - weight_decay: 1e-3 - - # scheduler setup - sched: - name: CosineAnnealing - # scheduler config override - warmup_steps: 10000 - warmup_ratio: null - min_lr: 1e-6 - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: -1 - max_steps: 100000 # computed at runtime if not set - val_check_interval: 1000 # an int for number of iterations - limit_train_batches: ${trainer.val_check_interval} - accelerator: auto - strategy: - _target_: lightning.pytorch.strategies.DDPStrategy - gradient_as_bucket_view: true - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - precision: 32 # 16, 32, or bf16 - log_every_n_steps: 10 # Interval of logging. 
- enable_progress_bar: True - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - use_distributed_sampler: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_eou_macro_acc" - mode: "max" - save_top_k: 5 - always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - resume_if_exists: false - resume_ignore_no_checkpoint: false - - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml deleted file mode 100644 index b0f2037e91dd..000000000000 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_asr_frame_lstm_eou_streaming.yaml +++ /dev/null @@ -1,369 +0,0 @@ -# It contains the default values for training a cache-aware streaming FastConformer-Hybrid-Transducer-CTC ASR model, large size (~115M) with sub-word encoding. -# The model would have two decoders: RNNT (Transducer) and CTC - -# You may find more detail: -# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer -# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc -# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer -# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml -# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml - -# Note: if training loss does not converge, you may increase warm-up to 20K. - -name: "FastConformer-Hybrid-ASR-Frame-EOU-Streaming" - -model: - token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] - token_init_weight_value: null # only applicable when token_init_method='constant' - token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' - layer_idx_list: [0, -1] # extract features from the first and last layers of ASR encoder - num_eou_classes: 4 - eou_class_weights: [1,1,100,100] - rnnt_loss_weight: 0.0 - ctc_loss_weight: 0.0 - eou_loss_weight: 1.0 - use_ctc_pred: false - freeze_encoder: true - freeze_ctc: true - freeze_rnnt: true - pad_eou_label_secs: 0.0 - - sample_rate: 16000 - compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. 
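The model block above up-weights the rare boundary classes via eou_class_weights: [1, 1, 100, 100] over num_eou_classes: 4. A minimal sketch of how such a frame-level weighted cross-entropy could look (this is not the NeMo implementation, and the meaning of the four classes is not spelled out in this config):

```python
# Sketch of a class-weighted frame-level EOU loss using the weights above.
import torch
import torch.nn.functional as F

num_eou_classes = 4
class_weights = torch.tensor([1.0, 1.0, 100.0, 100.0])  # eou_class_weights from the config

# Dummy frame-level logits and labels: (batch, time, classes) and (batch, time)
logits = torch.randn(2, 50, num_eou_classes)
labels = torch.randint(0, num_eou_classes, (2, 50))

# Weighted cross-entropy up-weights the rare boundary frames so they are not
# drowned out by the dominant "no boundary" frames.
loss = F.cross_entropy(logits.flatten(0, 1), labels.flatten(), weight=class_weights)
print(loss.item())
```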
- log_prediction: false # enables logging sample predictions in the output during training - skip_nan_grad: false - - model_defaults: - enc_hidden: ${model.encoder.d_model} - pred_hidden: 640 - joint_hidden: 640 - - train_ds: - manifest_filepath: ??? - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: true - drop_last: true - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - check_tokenizer: false - add_eou_to_text: false - pad_eou_label_secs: ${model.pad_eou_label_secs} - - random_padding: - prob: 0.99 - min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds - max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds - max_total_duration: 40.0 # maximum total duration of the padded audio in seconds - pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' - normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' - normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' - - augmentor: - white_noise: - prob: 0.9 - min_level: -90 - max_level: -46 - gain: - prob: 0.5 - min_gain_dbfs: -10.0 - max_gain_dbfs: 10.0 - noise: - prob: 0.6 - manifest_path: ??? - min_snr_db: 0 - max_snr_db: 20 - max_gain_db: 300.0 - - validation_ds: - manifest_filepath: ??? - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - check_tokenizer: false - add_eou_to_text: false - pad_eou_label_secs: ${model.pad_eou_label_secs} - - test_ds: - manifest_filepath: null - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - check_tokenizer: false - add_eou_to_text: false - pad_eou_label_secs: ${model.pad_eou_label_secs} - - # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py - # We recommend to use vocab size of 1024 with SPE Unigram for most languages - tokenizer: - dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "NA" # No normalization for mel-spectogram makes streaming easier - window_size: 0.025 - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - pad_to: 0 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 17 - d_model: 512 - use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules - - # Sub-sampling parameters - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 8 # must be power of 2 for striding and vggnet - subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model - causal_downsampling: true - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large - # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s - - # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. - # The first item in the list would be the default during test/validation/inference. 
- # An example of settings for multi-lookahead: - # att_context_size: [[70,13],[70,6],[70,1],[70,0]] - # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] - att_context_size: [70, 1] # -1 means unlimited context - att_context_style: chunked_limited # regular or chunked_limited - att_context_probs: null - - xscaling: true # scales up the input embeddings by sqrt(d_model) - pos_emb_max_len: 5000 - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly - conv_context_size: causal - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - decoder: - _target_: nemo.collections.asr.modules.RNNTDecoder - normalization_mode: null # Currently only null is supported for export. - random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf - blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. - - prednet: - pred_hidden: ${model.model_defaults.pred_hidden} - pred_rnn_layers: 1 - t_max: null - dropout: 0.2 - - joint: - _target_: nemo.collections.asr.modules.RNNTJoint - log_softmax: null # 'null' would set it automatically according to CPU/GPU device - preserve_memory: false # dramatically slows down training, but might preserve some memory - - # Fuses the computation of prediction net + joint net + loss + WER calculation - # to be run on sub-batches of size `fused_batch_size`. - # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. - # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. - # Using small values here will preserve a lot of memory during training, but will make training slower as well. - # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. - # However, to preserve memory, this ratio can be 1:8 or even 1:16. - # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. - fuse_loss_wer: true - fused_batch_size: 4 - - jointnet: - joint_hidden: ${model.model_defaults.joint_hidden} - activation: "relu" - dropout: 0.2 - - decoding: - strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
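The conv_context_size convention described in the comments above (null means symmetric context, 'causal' means [kernel_size-1, 0], and an explicit [left, right] must satisfy left + right + 1 == conv_kernel_size) can be summarized with a small helper; this is a sketch of the stated convention, not the NeMo code:

```python
# Helper illustrating the conv_context_size convention from the config comments.

def resolve_conv_context(conv_context_size, kernel_size: int):
    """Return the [left, right] context of the depthwise convolution."""
    if conv_context_size is None:
        # null -> symmetric context: [(kernel_size - 1) // 2, (kernel_size - 1) // 2]
        return [(kernel_size - 1) // 2, (kernel_size - 1) // 2]
    if conv_context_size == "causal":
        # 'causal' -> look only at the past: [kernel_size - 1, 0]
        return [kernel_size - 1, 0]
    left, right = conv_context_size
    assert left + right + 1 == kernel_size, "context must cover the kernel"
    return [left, right]

# With conv_kernel_size: 9 and conv_context_size: causal (as in this config):
print(resolve_conv_context("causal", 9))  # [8, 0] -> no extra look-ahead from the conv module
print(resolve_conv_context(None, 9))      # [4, 4] -> 4 frames of look-ahead per conv layer
```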
- - # greedy strategy config - greedy: - max_symbols: 10 - - # beam strategy config - beam: - beam_size: 2 - return_best_hypothesis: False - score_norm: true - tsd_max_sym_exp: 50 # for Time Synchronous Decoding - alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding - - # The section which would contain the decoder and decoding configs of the auxiliary CTC decoder - aux_ctc: - ctc_loss_weight: ${model.ctc_loss_weight} # the weight used to combine the CTC loss with the RNNT loss - use_cer: false - ctc_reduction: 'mean_batch' - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: null - num_classes: -1 - vocabulary: [] - decoding: - strategy: "greedy" - - aggregator: - _target_: nemo.collections.asr.modules.ssl_modules.multi_layer_feat.Aggregator - mode: "weighted_sum" - weights: null - layer_idx_list: ${model.layer_idx_list} - - eou_encoder: null - - eou_decoder: - _target_: nemo.collections.asr.modules.LSTMDecoder - feat_in: ${model.encoder.d_model} - num_classes: ${model.num_eou_classes} - lstm_hidden_size: 256 - num_layers: 4 - add_blank: false - - eou_loss: - weight: ${model.eou_class_weights} - - # config for InterCTC loss: https://arxiv.org/abs/2102.03216 - # specify loss weights and which layers to use for InterCTC - # e.g., to reproduce the paper results, set loss_weights: [0.3] - # and apply_at_layers: [8] (assuming 18 layers). Note that final - # layer loss coefficient is automatically adjusted (to 0.7 in above example) - interctc: - loss_weights: [] - apply_at_layers: [] - - loss: - loss_name: "default" - warprnnt_numba_kwargs: - # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming - # You may set it to lower values like 1e-3 for models with larger right context - fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. - clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - - optim: - name: adamw - lr: 0.0005 - # optimizer arguments - betas: [0.9, 0.98] - weight_decay: 1e-3 - - # scheduler setup - sched: - name: CosineAnnealing - # scheduler config override - warmup_steps: 10000 - warmup_ratio: null - min_lr: 1e-6 - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: -1 - max_steps: 100000 # computed at runtime if not set - val_check_interval: 1000 # an int for number of iterations - limit_train_batches: ${trainer.val_check_interval} - accelerator: auto - strategy: - _target_: lightning.pytorch.strategies.DDPStrategy - gradient_as_bucket_view: true - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - precision: 32 # 16, 32, or bf16 - log_every_n_steps: 10 # Interval of logging. 
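The aggregator block above mixes ASR encoder layers selected by layer_idx_list (here ${model.layer_idx_list}, i.e. [0, -1]: the first and last encoder layers) with a weighted sum. A minimal sketch of that idea, assuming learnable softmax-normalized weights; the actual NeMo Aggregator module may differ:

```python
# Sketch of "weighted_sum" aggregation over selected encoder layers.
import torch

layer_outputs = [torch.randn(2, 100, 512) for _ in range(17)]  # 17 conformer layers, (B, T, D)
selected = [layer_outputs[i] for i in (0, -1)]                  # layer_idx_list: [0, -1]

# Learnable mixing weights, normalized with softmax before combining the layers.
weights = torch.nn.Parameter(torch.zeros(len(selected)))
mix = torch.softmax(weights, dim=0)
aggregated = sum(w * feat for w, feat in zip(mix, selected))
print(aggregated.shape)  # torch.Size([2, 100, 512])
```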
- enable_progress_bar: True - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - use_distributed_sampler: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_eou_macro_acc" - mode: "max" - save_top_k: 5 - always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - resume_if_exists: false - resume_ignore_no_checkpoint: false - - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null diff --git a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml deleted file mode 100644 index 4e9efa03be47..000000000000 --- a/examples/asr/conf/asr_eou/fastconformer_hybrid_transducer_ctc_bpe_streaming.yaml +++ /dev/null @@ -1,331 +0,0 @@ -# It contains the default values for training a cache-aware streaming FastConformer-Hybrid-Transducer-CTC ASR model, large size (~115M) with sub-word encoding. -# The model would have two decoders: RNNT (Transducer) and CTC - -# You may find more detail: -# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer -# Hybrid ASR: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#hybrid-transducer-ctc -# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer -# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml -# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml - -# Note: if training loss does not converge, you may increase warm-up to 20K. - -name: "FastConformer-Hybrid-Transducer-CTC-BPE-Streaming-EOU" - -model: - token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] - token_init_weight_value: null # only applicable when token_init_method='constant' - token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' - - sample_rate: 16000 - compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. - log_prediction: true # enables logging sample predictions in the output during training - skip_nan_grad: false - - model_defaults: - enc_hidden: ${model.encoder.d_model} - pred_hidden: 640 - joint_hidden: 640 - - train_ds: - manifest_filepath: ??? 
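The checkpoint callback above selects models on "val_eou_macro_acc" with mode: max. Assuming this is the usual macro-averaged accuracy, i.e. the unweighted mean of per-class accuracy over the EOU classes (the metric implementation is not part of this config), a toy computation looks like this:

```python
# Toy macro-averaged accuracy: rare classes count as much as frequent ones.
import torch

def macro_accuracy(preds: torch.Tensor, labels: torch.Tensor, num_classes: int) -> float:
    per_class = []
    for c in range(num_classes):
        mask = labels == c
        if mask.any():
            per_class.append((preds[mask] == c).float().mean())
    return torch.stack(per_class).mean().item()

preds = torch.tensor([0, 0, 1, 2, 3, 3, 0, 2])
labels = torch.tensor([0, 1, 1, 2, 3, 2, 0, 2])
print(macro_accuracy(preds, labels, num_classes=4))
```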
- tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: true - drop_last: true - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - - random_padding: - prob: 0.99 - min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds - max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds - max_total_duration: 40.0 # maximum total duration of the padded audio in seconds - pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' - normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' - normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' - - augmentor: - white_noise: - prob: 0.9 - min_level: -90 - max_level: -46 - gain: - prob: 0.5 - min_gain_dbfs: -10.0 - max_gain_dbfs: 10.0 - noise: - prob: 0.6 - manifest_path: ??? - min_snr_db: 0 - max_snr_db: 20 - max_gain_db: 300.0 - - validation_ds: - manifest_filepath: ??? - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - - test_ds: - manifest_filepath: null - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - - # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py - # We recommend to use vocab size of 1024 with SPE Unigram for most languages - tokenizer: - dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "NA" # No normalization for mel-spectogram makes streaming easier - window_size: 0.025 - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - pad_to: 0 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 17 - d_model: 512 - use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules - - # Sub-sampling parameters - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 8 # must be power of 2 for striding and vggnet - subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model - causal_downsampling: true - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large - # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s - - # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. - # The first item in the list would be the default during test/validation/inference. 
- # An example of settings for multi-lookahead: - # att_context_size: [[70,13],[70,6],[70,1],[70,0]] - # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] - att_context_size: [70, 1] # -1 means unlimited context - att_context_style: chunked_limited # regular or chunked_limited - att_context_probs: null - - xscaling: true # scales up the input embeddings by sqrt(d_model) - pos_emb_max_len: 5000 - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly - conv_context_size: causal - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - decoder: - _target_: nemo.collections.asr.modules.RNNTDecoder - normalization_mode: null # Currently only null is supported for export. - random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf - blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. - - prednet: - pred_hidden: ${model.model_defaults.pred_hidden} - pred_rnn_layers: 1 - t_max: null - dropout: 0.2 - - joint: - _target_: nemo.collections.asr.modules.RNNTJoint - log_softmax: null # 'null' would set it automatically according to CPU/GPU device - preserve_memory: false # dramatically slows down training, but might preserve some memory - - # Fuses the computation of prediction net + joint net + loss + WER calculation - # to be run on sub-batches of size `fused_batch_size`. - # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. - # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. - # Using small values here will preserve a lot of memory during training, but will make training slower as well. - # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. - # However, to preserve memory, this ratio can be 1:8 or even 1:16. - # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. - fuse_loss_wer: true - fused_batch_size: 4 - - jointnet: - joint_hidden: ${model.model_defaults.joint_hidden} - activation: "relu" - dropout: 0.2 - - decoding: - strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
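The fuse_loss_wer / fused_batch_size comments above describe running the prediction net, joint net and transducer loss on sub-batches of size fused_batch_size while the encoder processes the full *_ds batch. A rough illustration of that sub-batching (placeholder loss; not the NeMo implementation):

```python
# Illustration of fused sub-batching: the encoder sees the full batch, while the
# joint-side computation is chunked so the large (B, T, U, V) joint tensor never
# exists for the whole batch at once.
import torch

batch_size = 16        # *_ds.batch_size: encoder-level batch size
fused_batch_size = 4   # model.joint.fused_batch_size

encoder_out = torch.randn(batch_size, 100, 512)  # dummy encoder output (B, T, D)

losses = []
for start in range(0, batch_size, fused_batch_size):
    sub = encoder_out[start:start + fused_batch_size]
    # ... prediction net + joint net + RNNT loss would run on `sub` only ...
    losses.append(sub.mean())  # placeholder for the per-sub-batch loss

loss = torch.stack(losses).mean()
print(loss.shape)  # scalar
```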
- - # greedy strategy config - greedy: - max_symbols: 10 - - # beam strategy config - beam: - beam_size: 2 - return_best_hypothesis: False - score_norm: true - tsd_max_sym_exp: 50 # for Time Synchronous Decoding - alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding - - # The section which would contain the decoder and decoding configs of the auxiliary CTC decoder - aux_ctc: - ctc_loss_weight: 0.3 # the weight used to combine the CTC loss with the RNNT loss - use_cer: false - ctc_reduction: 'mean_batch' - decoder: - _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: null - num_classes: -1 - vocabulary: [] - decoding: - strategy: "greedy" - - # config for InterCTC loss: https://arxiv.org/abs/2102.03216 - # specify loss weights and which layers to use for InterCTC - # e.g., to reproduce the paper results, set loss_weights: [0.3] - # and apply_at_layers: [8] (assuming 18 layers). Note that final - # layer loss coefficient is automatically adjusted (to 0.7 in above example) - interctc: - loss_weights: [] - apply_at_layers: [] - - loss: - loss_name: "default" - warprnnt_numba_kwargs: - # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming - # You may set it to lower values like 1e-3 for models with larger right context - fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. - clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - - optim: - name: adamw - lr: 5.0 - # optimizer arguments - betas: [0.9, 0.98] - weight_decay: 1e-3 - - # scheduler setup - sched: - name: NoamAnnealing - d_model: ${model.encoder.d_model} - # scheduler config override - warmup_steps: 10000 - warmup_ratio: null - min_lr: 1e-6 - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: -1 - max_steps: 100000 # computed at runtime if not set - val_check_interval: 1000 # an int for number of iterations - limit_train_batches: ${trainer.val_check_interval} - accelerator: auto - strategy: - _target_: lightning.pytorch.strategies.DDPStrategy - gradient_as_bucket_view: true - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - precision: 32 # 16, 32, or bf16 - log_every_n_steps: 10 # Interval of logging. - enable_progress_bar: True - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - use_distributed_sampler: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_wer" - mode: "min" - save_top_k: 5 - always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
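With optim.lr: 5.0 and a NoamAnnealing schedule tied to d_model: 512, the configured lr acts as a scale factor rather than a literal learning rate. Assuming the standard Noam formula (NeMo's NoamAnnealing may differ in details such as the min_lr floor), the effective learning rate evolves as sketched below:

```python
# Effective learning rate under the standard Noam schedule:
#   lr(step) = base_lr * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)
# Values taken from this config: lr 5.0, d_model 512, warmup_steps 10000.

def noam_lr(step: int, base_lr: float = 5.0, d_model: int = 512, warmup: int = 10000) -> float:
    step = max(step, 1)
    return base_lr * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

for step in (100, 1000, 10000, 100000):
    print(step, f"{noam_lr(step):.2e}")
# Peaks around ~2.2e-3 at the end of warm-up (step 10000), then decays as 1/sqrt(step).
```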
- resume_if_exists: false - resume_ignore_no_checkpoint: false - - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 78e939cbdc5b..3a6be9f336b3 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -41,6 +41,7 @@ model: bucket_buffer_size: 10000 shuffle_buffer_size: 10000 + ignore_eob_label: true # ignore backchannel and treat them the same as EOU random_padding: prob: 0.99 min_post_pad_duration: 3.0 @@ -57,11 +58,11 @@ model: min_level: -90 max_level: -46 gain: - prob: 0.5 + prob: 0.2 min_gain_dbfs: -10.0 max_gain_dbfs: 10.0 noise: - prob: 0.6 + prob: 0.9 manifest_path: ??? min_snr_db: 0 max_snr_db: 20 @@ -84,6 +85,7 @@ model: num_cuts_for_bins_estimate: 10000 bucket_buffer_size: 10000 shuffle_buffer_size: 10000 + ignore_eob_label: true # ignore backchannel and treat them the same as EOU test_ds: manifest_filepath: null @@ -102,6 +104,7 @@ model: num_cuts_for_bins_estimate: 10000 bucket_buffer_size: 10000 shuffle_buffer_size: 10000 + ignore_eob_label: true # ignore backchannel and treat them the same as EOU # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py # We recommend to use vocab size of 1024 with SPE Unigram for most languages @@ -250,7 +253,7 @@ model: # FastEmit regularization: https://arxiv.org/abs/2010.11148 # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming # You may set it to lower values like 1e-3 for models with larger right context - fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + fastemit_lambda: 3e-2 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. optim: diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml index 03ffea91d009..ef249bb401da 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml @@ -96,6 +96,7 @@ model: bucket_buffer_size: 10000 shuffle_buffer_size: 10000 + ignore_eob_label: true # ignore backchannel and treat them the same as EOU random_padding: prob: 0.99 min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds @@ -111,11 +112,11 @@ model: min_level: -90 max_level: -46 gain: - prob: 0.5 + prob: 0.2 min_gain_dbfs: -10.0 max_gain_dbfs: 10.0 noise: - prob: 0.6 + prob: 0.9 manifest_path: ??? 
min_snr_db: 0 max_snr_db: 20 @@ -138,6 +139,7 @@ model: num_cuts_for_bins_estimate: 10000 bucket_buffer_size: 10000 shuffle_buffer_size: 10000 + ignore_eob_label: true # ignore backchannel and treat them the same as EOU test_ds: manifest_filepath: null @@ -156,6 +158,7 @@ model: num_cuts_for_bins_estimate: 10000 bucket_buffer_size: 10000 shuffle_buffer_size: 10000 + ignore_eob_label: true # ignore backchannel and treat them the same as EOU # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py # We recommend to use vocab size of 1024 with SPE Unigram for most languages @@ -189,7 +192,7 @@ model: feat_out: -1 # you may set it if you need different output size other than the default d_model n_layers: 17 d_model: 512 - use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules + use_bias: false # whether to apply bias in the feedforward, MHA and convolution modules # Sub-sampling parameters subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding @@ -218,7 +221,7 @@ model: att_context_style: chunked_limited # regular or chunked_limited att_context_probs: null - xscaling: true # scales up the input embeddings by sqrt(d_model) + xscaling: false # scales up the input embeddings by sqrt(d_model) pos_emb_max_len: 5000 # Convolution module's params @@ -304,7 +307,7 @@ model: # FastEmit regularization: https://arxiv.org/abs/2010.11148 # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming # You may set it to lower values like 1e-3 for models with larger right context - fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. + fastemit_lambda: 3e-2 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. optim: diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_augval.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_augval.yaml deleted file mode 100644 index 17dc9387c5a4..000000000000 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_augval.yaml +++ /dev/null @@ -1,332 +0,0 @@ -# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M) with sub-word encoding. - -# You may find more detail: -# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer -# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer -# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml - -name: "FastConformer-Transducer-BPE-Streaming-EOU" - -model: - token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] - token_init_weight_value: null # only applicable when token_init_method='constant' - token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' - - sample_rate: 16000 - compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. 
- log_prediction: true # enables logging sample predictions in the output during training - skip_nan_grad: false - - model_defaults: - enc_hidden: ${model.encoder.d_model} - pred_hidden: 640 - joint_hidden: 640 - - train_ds: - manifest_filepath: ??? - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: true - drop_last: true - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - - random_padding: - prob: 0.99 - min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds - max_pad_duration: 10.0 # maximum duration of pre/post padding in seconds - max_total_duration: 40.0 # maximum total duration of the padded audio in seconds - pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' - normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' - normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' - - augmentor: - white_noise: - prob: 0.9 - min_level: -90 - max_level: -46 - gain: - prob: 0.5 - min_gain_dbfs: -10.0 - max_gain_dbfs: 10.0 - noise: - prob: 0.6 - manifest_path: ??? - min_snr_db: 0 - max_snr_db: 20 - max_gain_db: 300.0 - - validation_ds: - manifest_filepath: ??? - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - - random_padding: - prob: 1.0 - min_pad_duration: 1.0 # minimum duration of pre/post padding in seconds - max_pad_duration: 1.0 # maximum duration of pre/post padding in seconds - max_total_duration: 40.0 # maximum total duration of the padded audio in seconds - pad_distribution: 'constant' # distribution of padding duration, 'uniform' or 'normal' or 'constant' - pre_pad_duration: 0.2 - post_pad_duration: 3.0 - - test_ds: - manifest_filepath: null - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - - random_padding: - prob: 1.0 - min_pad_duration: 0.0 # minimum duration of pre/post padding in seconds - max_pad_duration: 3.0 # maximum duration of pre/post padding in seconds - max_total_duration: 40.0 # maximum total duration of the padded audio in seconds - pad_distribution: 'constant' # distribution of padding duration, 'uniform' or 'normal' or 'constant' - pre_pad_duration: 0.2 - post_pad_duration: 3.0 - - - # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py - # We recommend to use vocab size of 1024 with SPE Unigram for most languages - tokenizer: - dir: ??? 
# path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "NA" # No normalization for mel-spectogram makes streaming easier - window_size: 0.025 - window_stride: 0.01 - window: "hann" - features: 80 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - pad_to: 0 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 17 - d_model: 512 - use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules - - # Sub-sampling parameters - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 8 # must be power of 2 for striding and vggnet - subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model - causal_downsampling: true - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large - # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s - - # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. - # The first item in the list would be the default during test/validation/inference. 
- # An example of settings for multi-lookahead: - # att_context_size: [[70,13],[70,6],[70,1],[70,0]] - # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] - att_context_size: [70, 1] # -1 means unlimited context - att_context_style: chunked_limited # regular or chunked_limited - att_context_probs: null - - xscaling: true # scales up the input embeddings by sqrt(d_model) - pos_emb_max_len: 5000 - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly - conv_context_size: causal - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - decoder: - _target_: nemo.collections.asr.modules.RNNTDecoder - normalization_mode: null # Currently only null is supported for export. - random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf - blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. - - prednet: - pred_hidden: ${model.model_defaults.pred_hidden} - pred_rnn_layers: 1 - t_max: null - dropout: 0.2 - - joint: - _target_: nemo.collections.asr.modules.RNNTJoint - log_softmax: null # 'null' would set it automatically according to CPU/GPU device - preserve_memory: false # dramatically slows down training, but might preserve some memory - - # Fuses the computation of prediction net + joint net + loss + WER calculation - # to be run on sub-batches of size `fused_batch_size`. - # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. - # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. - # Using small values here will preserve a lot of memory during training, but will make training slower as well. - # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. - # However, to preserve memory, this ratio can be 1:8 or even 1:16. - # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. - fuse_loss_wer: true - fused_batch_size: 4 - - jointnet: - joint_hidden: ${model.model_defaults.joint_hidden} - activation: "relu" - dropout: 0.2 - - decoding: - strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
- - # greedy strategy config - greedy: - max_symbols: 10 - - # beam strategy config - beam: - beam_size: 2 - return_best_hypothesis: False - score_norm: true - tsd_max_sym_exp: 50 # for Time Synchronous Decoding - alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding - - # config for InterCTC loss: https://arxiv.org/abs/2102.03216 - # specify loss weights and which layers to use for InterCTC - # e.g., to reproduce the paper results, set loss_weights: [0.3] - # and apply_at_layers: [8] (assuming 18 layers). Note that final - # layer loss coefficient is automatically adjusted (to 0.7 in above example) - interctc: - loss_weights: [] - apply_at_layers: [] - - loss: - loss_name: "default" - warprnnt_numba_kwargs: - # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming - # You may set it to lower values like 1e-3 for models with larger right context - fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. - clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - - optim: - name: adamw - lr: 5.0 # 1e-4 - # optimizer arguments - betas: [0.9, 0.98] - weight_decay: 1e-3 - - # scheduler setup - sched: - name: NoamAnnealing # NoamAnnealing CosineAnnealing - # scheduler config override - d_model: ${model.encoder.d_model} - warmup_steps: 10000 - warmup_ratio: null - min_lr: 1e-6 - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: -1 - max_steps: 100000 # computed at runtime if not set - val_check_interval: 1000 # an int for number of iterations - limit_train_batches: ${trainer.val_check_interval} - accelerator: auto - strategy: - _target_: lightning.pytorch.strategies.DDPStrategy - gradient_as_bucket_view: true - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - precision: 32 # 16, 32, or bf16 - log_every_n_steps: 10 # Interval of logging. - enable_progress_bar: True - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - use_distributed_sampler: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_wer" - mode: "min" - save_top_k: 5 - always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
- resume_if_exists: false - resume_ignore_no_checkpoint: false - - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml deleted file mode 100644 index ed817bc08b5f..000000000000 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_v2.yaml +++ /dev/null @@ -1,305 +0,0 @@ -# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M) with sub-word encoding. - -# You may find more detail: -# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer -# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer -# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml - -name: "FastConformer-Transducer-BPE-Streaming-EOU" - -model: - token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] - token_init_weight_value: null # only applicable when token_init_method='constant' - token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' - - sample_rate: 16000 - compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. - log_prediction: true # enables logging sample predictions in the output during training - skip_nan_grad: false - - model_defaults: - enc_hidden: ${model.encoder.d_model} - pred_hidden: 640 - joint_hidden: 640 - - train_ds: - manifest_filepath: ??? - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: true - drop_last: true - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - skip_augment: true - use_dataloader_augment: true - random_padding: - prob: 0.99 - min_post_pad_duration: 3.0 - min_pre_pad_duration: 0.0 - max_pad_duration: 6.0 # maximum duration of pre/post padding in seconds - max_total_duration: 40.0 # maximum total duration of the padded audio in seconds - pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' - normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' - normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' - - noise_path: ??? - noise_mix_prob: 1.0 - noise_snr: [0, 20.0] - - - validation_ds: - manifest_filepath: ??? 
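The random_padding block above pads utterances with extra silence so the model sees audio that keeps running past the end of speech, which matters for end-of-utterance training. The exact behaviour lives in the dataset code rather than in this config, so the sketch below (field names copied from the config, semantics assumed) is only an illustration:

```python
# Assumed behaviour of random_padding: with probability `prob`, prepend/append
# random amounts of silence, capped so the example stays under max_total_duration.
import random

import numpy as np

def random_pad(audio: np.ndarray, sr: int, prob=0.99, min_pre_pad_duration=0.0,
               min_post_pad_duration=3.0, max_pad_duration=6.0,
               max_total_duration=40.0) -> np.ndarray:
    if random.random() > prob:
        return audio
    pre = random.uniform(min_pre_pad_duration, max_pad_duration)
    post = random.uniform(min_post_pad_duration, max_pad_duration)
    # keep the padded example under max_total_duration seconds
    budget = max(max_total_duration - len(audio) / sr - pre, 0.0)
    post = min(post, budget)
    return np.concatenate([np.zeros(int(pre * sr), dtype=audio.dtype),
                           audio,
                           np.zeros(int(post * sr), dtype=audio.dtype)])

padded = random_pad(np.zeros(16000, dtype=np.float32), sr=16000)
print(len(padded) / 16000, "seconds after padding")
```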
- tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - - test_ds: - manifest_filepath: null - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - - # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py - # We recommend to use vocab size of 1024 with SPE Unigram for most languages - tokenizer: - dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "NA" # No normalization for mel-spectogram makes streaming easier - window_size: 0.025 - window_stride: 0.01 - window: "hann" - features: 128 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - pad_to: 0 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 17 - d_model: 512 - use_bias: True # whether to apply bias in the feedforward, MHA and convolution modules - - # Sub-sampling parameters - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 8 # must be power of 2 for striding and vggnet - subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model - causal_downsampling: true - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large - # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s - - # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. - # The first item in the list would be the default during test/validation/inference. 
- # An example of settings for multi-lookahead: - # att_context_size: [[70,13],[70,6],[70,1],[70,0]] - # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] - att_context_size: [70, 1] # -1 means unlimited context - att_context_style: chunked_limited # regular or chunked_limited - att_context_probs: null - - xscaling: true # scales up the input embeddings by sqrt(d_model) - pos_emb_max_len: 5000 - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly - conv_context_size: causal - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - decoder: - _target_: nemo.collections.asr.modules.RNNTDecoder - normalization_mode: null # Currently only null is supported for export. - random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf - blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. - - prednet: - pred_hidden: ${model.model_defaults.pred_hidden} - pred_rnn_layers: 1 - t_max: null - dropout: 0.2 - - joint: - _target_: nemo.collections.asr.modules.RNNTJoint - log_softmax: null # 'null' would set it automatically according to CPU/GPU device - preserve_memory: false # dramatically slows down training, but might preserve some memory - - # Fuses the computation of prediction net + joint net + loss + WER calculation - # to be run on sub-batches of size `fused_batch_size`. - # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. - # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. - # Using small values here will preserve a lot of memory during training, but will make training slower as well. - # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. - # However, to preserve memory, this ratio can be 1:8 or even 1:16. - # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. - fuse_loss_wer: true - fused_batch_size: 4 - - jointnet: - joint_hidden: ${model.model_defaults.joint_hidden} - activation: "relu" - dropout: 0.2 - - decoding: - strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
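The attention-context comments in this encoder block give the streaming look-ahead formula, `att_context_size[1] * subsampling_factor * window_stride`. A quick sketch of that arithmetic with the values used here (subsampling_factor=8, window_stride=0.01), comparing the candidate right-context settings listed in the multi-lookahead example:

```python
def lookahead_secs(att_context_size, subsampling_factor=8, window_stride=0.01):
    """Right-context (look-ahead) in seconds introduced by chunked self-attention."""
    _, right = att_context_size
    return right * subsampling_factor * window_stride


for ctx in ([70, 13], [70, 6], [70, 1], [70, 0]):
    print(ctx, f"-> {lookahead_secs(ctx):.2f} s")
# [70, 13] -> 1.04 s   [70, 6] -> 0.48 s   [70, 1] -> 0.08 s   [70, 0] -> 0.00 s
```

This config keeps `att_context_size: [70, 1]`, i.e. roughly 80 ms of look-ahead per chunk.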
- - # greedy strategy config - greedy: - max_symbols: 10 - - # beam strategy config - beam: - beam_size: 2 - return_best_hypothesis: False - score_norm: true - tsd_max_sym_exp: 50 # for Time Synchronous Decoding - alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding - - # config for InterCTC loss: https://arxiv.org/abs/2102.03216 - # specify loss weights and which layers to use for InterCTC - # e.g., to reproduce the paper results, set loss_weights: [0.3] - # and apply_at_layers: [8] (assuming 18 layers). Note that final - # layer loss coefficient is automatically adjusted (to 0.7 in above example) - interctc: - loss_weights: [] - apply_at_layers: [] - - loss: - loss_name: "default" - warprnnt_numba_kwargs: - # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming - # You may set it to lower values like 1e-3 for models with larger right context - fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. - clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - - optim: - name: adamw - lr: 5.0 # 1e-4 - # optimizer arguments - betas: [0.9, 0.98] - weight_decay: 1e-3 - - # scheduler setup - sched: - name: NoamAnnealing # NoamAnnealing CosineAnnealing - # scheduler config override - d_model: ${model.encoder.d_model} - warmup_steps: 10000 - warmup_ratio: null - min_lr: 1e-6 - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: -1 - max_steps: 100000 # computed at runtime if not set - val_check_interval: 1000 # an int for number of iterations - limit_train_batches: ${trainer.val_check_interval} - accelerator: auto - strategy: - _target_: lightning.pytorch.strategies.DDPStrategy - gradient_as_bucket_view: true - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - precision: 32 # 16, 32, or bf16 - log_every_n_steps: 10 # Interval of logging. - enable_progress_bar: True - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - use_distributed_sampler: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_wer" - mode: "min" - save_top_k: 5 - filename: '${exp_manager.name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}' - always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
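The InterCTC comment in this config notes that the final-layer coefficient is adjusted automatically (0.7 when `loss_weights: [0.3]`), and the hybrid EOU model later in this patch mixes RNNT and CTC as `(1 - ctc_loss_weight) * rnnt + ctc_loss_weight * ctc`. A small arithmetic sketch of that weighting; the concrete loss values are made up purely for illustration:

```python
def combined_loss(rnnt_loss, ctc_loss, interctc_losses,
                  ctc_loss_weight=0.3, interctc_weights=(0.1,)):
    """Weighted sum in the style described by the config comments and model code:
    main  = (1 - ctc_w) * rnnt + ctc_w * ctc
    total = (1 - sum(interctc_w)) * main + sum(w_i * interctc_i)
    """
    main = (1.0 - ctc_loss_weight) * rnnt_loss + ctc_loss_weight * ctc_loss
    inter = sum(w * l for w, l in zip(interctc_weights, interctc_losses))
    return (1.0 - sum(interctc_weights)) * main + inter


# dummy values: rnnt=2.0, ctc=3.0, one intermediate CTC layer at 3.5
print(f"{combined_loss(2.0, 3.0, [3.5]):.2f}")  # 0.9 * (0.7*2.0 + 0.3*3.0) + 0.1*3.5 = 2.42
```

In the EOU models later in this patch, the EOU cross-entropy is additionally folded in with `eou_loss_weight` before the InterCTC adjustment is applied.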
- resume_if_exists: false - resume_ignore_no_checkpoint: false - - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml deleted file mode 100644 index 17d3af8d4a8f..000000000000 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_xl.yaml +++ /dev/null @@ -1,315 +0,0 @@ -# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M) with sub-word encoding. - -# You may find more detail: -# FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer -# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer -# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml - -name: "FastConformer-Transducer-BPE-Streaming-EOU" - -model: - token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] - token_init_weight_value: null # only applicable when token_init_method='constant' - token_init_bias_value: -1000.0 # only applicable when token_init_method='constant' - - sample_rate: 16000 - compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag. - log_prediction: true # enables logging sample predictions in the output during training - skip_nan_grad: false - - model_defaults: - enc_hidden: ${model.encoder.d_model} - pred_hidden: 640 - joint_hidden: 640 - - train_ds: - manifest_filepath: ??? - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: true - drop_last: true - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - - random_padding: - prob: 0.99 - min_post_pad_duration: 3.0 - min_pre_pad_duration: 0.0 - max_pad_duration: 6.0 # maximum duration of pre/post padding in seconds - max_total_duration: 40.0 # maximum total duration of the padded audio in seconds - pad_distribution: 'uniform' # distribution of padding duration, 'uniform' or 'normal' - normal_mean: 0.5 # mean of normal distribution used when pad_distribution='normal' - normal_std: 2.0 # standard deviation of normal distribution used when pad_distribution='normal' - - augmentor: - white_noise: - prob: 0.9 - min_level: -90 - max_level: -46 - gain: - prob: 0.5 - min_gain_dbfs: -10.0 - max_gain_dbfs: 10.0 - noise: - prob: 0.6 - manifest_path: ??? - min_snr_db: 0 - max_snr_db: 20 - max_gain_db: 300.0 - - validation_ds: - manifest_filepath: ??? 
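The `augmentor` block above lists each perturbation with its own `prob`, and the level/gain/SNR values are all in dB. As a quick reference (plain arithmetic, independent of NeMo's perturbation classes), here is how those dB figures map to linear amplitude factors:

```python
def db_to_amplitude(db: float) -> float:
    """Convert a dB / dBFS value to a linear amplitude factor (20*log10 convention)."""
    return 10.0 ** (db / 20.0)


print(f"{db_to_amplitude(-90):.1e}")   # ~3.2e-05: quietest white-noise level above
print(f"{db_to_amplitude(-46):.4f}")   # ~0.0050: loudest white-noise level above
print(f"{db_to_amplitude(10.0):.2f}")  # ~3.16: a +10 dBFS gain perturbation
# SNR 0 dB  -> noise as loud as speech; SNR 20 dB -> noise ~10x smaller in amplitude
```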
- tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - - test_ds: - manifest_filepath: null - tarred_audio_filepaths: null - sample_rate: ${model.sample_rate} - max_duration: 30 # you may need to update it for your dataset - min_duration: 0.1 - defer_setup: true - batch_duration: null # you may disable batch_duration by setting it to `null` - batch_size: 16 - shuffle: false - num_workers: 8 - pin_memory: true - quadratic_duration: 30 - num_buckets: 30 - num_cuts_for_bins_estimate: 10000 - bucket_buffer_size: 10000 - shuffle_buffer_size: 10000 - - # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py - # We recommend to use vocab size of 1024 with SPE Unigram for most languages - tokenizer: - dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe) - type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) - - preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: ${model.sample_rate} - normalize: "NA" # No normalization for mel-spectogram makes streaming easier - window_size: 0.025 - window_stride: 0.01 - window: "hann" - features: 128 - n_fft: 512 - frame_splicing: 1 - dither: 0.00001 - pad_to: 0 - - spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 # set to zero to disable it - time_masks: 10 # set to zero to disable it - freq_width: 27 - time_width: 0.05 - - encoder: - _target_: nemo.collections.asr.modules.ConformerEncoder - feat_in: ${model.preprocessor.features} - feat_out: -1 # you may set it if you need different output size other than the default d_model - n_layers: 24 - d_model: 1024 - use_bias: false # whether to apply bias in the feedforward, MHA and convolution modules - - # Sub-sampling parameters - subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding - subsampling_factor: 8 # must be power of 2 for striding and vggnet - subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model - causal_downsampling: true - - # Feed forward module's params - ff_expansion_factor: 4 - - # Multi-headed Attention Module's params - self_attention_model: rel_pos # rel_pos or abs_pos - n_heads: 8 # may need to be lower for smaller d_models - - # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention - # for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple-layers may increase the effective right context too large - # for att_context_style=chunked_limited, the left context need to be dividable by the right context plus one - # look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s - - # For multi-lookahead models, you may specify a list of context sizes. During the training, different context sizes would be used randomly with the distribution specified by att_context_probs. - # The first item in the list would be the default during test/validation/inference. 
- # An example of settings for multi-lookahead: - # att_context_size: [[70,13],[70,6],[70,1],[70,0]] - # att_context_probs: [0.25, 0.25, 0.25, 0.25, 0.25] - att_context_size: [70, 1] # -1 means unlimited context - att_context_style: chunked_limited # regular or chunked_limited - att_context_probs: null - - xscaling: false # scales up the input embeddings by sqrt(d_model) - pos_emb_max_len: 5000 - - # Convolution module's params - conv_kernel_size: 9 - conv_norm_type: 'layer_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups) - - # conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size - # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0] - # Recommend to use causal convolutions as it would increase the effective right context and therefore the look-ahead significantly - conv_context_size: causal - - ### regularization - dropout: 0.1 # The dropout used in most of the Conformer Modules - dropout_pre_encoder: 0.1 # The dropout used before the encoder - dropout_emb: 0.0 # The dropout used for embeddings - dropout_att: 0.1 # The dropout for multi-headed attention modules - - # set to non-zero to enable stochastic depth - stochastic_depth_drop_prob: 0.0 - stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 1 - - decoder: - _target_: nemo.collections.asr.modules.RNNTDecoder - normalization_mode: null # Currently only null is supported for export. - random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf - blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference. - - prednet: - pred_hidden: ${model.model_defaults.pred_hidden} - pred_rnn_layers: 2 - t_max: null - dropout: 0.2 - - joint: - _target_: nemo.collections.asr.modules.RNNTJoint - log_softmax: null # 'null' would set it automatically according to CPU/GPU device - preserve_memory: false # dramatically slows down training, but might preserve some memory - - # Fuses the computation of prediction net + joint net + loss + WER calculation - # to be run on sub-batches of size `fused_batch_size`. - # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size. - # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss. - # Using small values here will preserve a lot of memory during training, but will make training slower as well. - # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1. - # However, to preserve memory, this ratio can be 1:8 or even 1:16. - # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow. - fuse_loss_wer: true - fused_batch_size: 4 - - jointnet: - joint_hidden: ${model.model_defaults.joint_hidden} - activation: "relu" - dropout: 0.2 - - decoding: - strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd. 
- - # greedy strategy config - greedy: - max_symbols: 10 - - # beam strategy config - beam: - beam_size: 2 - return_best_hypothesis: False - score_norm: true - tsd_max_sym_exp: 50 # for Time Synchronous Decoding - alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding - - # config for InterCTC loss: https://arxiv.org/abs/2102.03216 - # specify loss weights and which layers to use for InterCTC - # e.g., to reproduce the paper results, set loss_weights: [0.3] - # and apply_at_layers: [8] (assuming 18 layers). Note that final - # layer loss coefficient is automatically adjusted (to 0.7 in above example) - interctc: - loss_weights: [] - apply_at_layers: [] - - loss: - loss_name: "default" - warprnnt_numba_kwargs: - # FastEmit regularization: https://arxiv.org/abs/2010.11148 - # You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming - # You may set it to lower values like 1e-3 for models with larger right context - fastemit_lambda: 5e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start. - clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only. - - optim: - name: adamw - lr: 5.0 # 1e-4 - # optimizer arguments - betas: [0.9, 0.98] - weight_decay: 1e-3 - - # scheduler setup - sched: - name: NoamAnnealing # NoamAnnealing CosineAnnealing - # scheduler config override - d_model: ${model.encoder.d_model} - warmup_steps: 10000 - warmup_ratio: null - min_lr: 1e-6 - -trainer: - devices: -1 # number of GPUs, -1 would use all available GPUs - num_nodes: 1 - max_epochs: -1 - max_steps: 100000 # computed at runtime if not set - val_check_interval: 1000 # an int for number of iterations - limit_train_batches: ${trainer.val_check_interval} - accelerator: auto - strategy: - _target_: lightning.pytorch.strategies.DDPStrategy - gradient_as_bucket_view: true - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - precision: 32 # 16, 32, or bf16 - log_every_n_steps: 10 # Interval of logging. - enable_progress_bar: True - num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it - check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs - sync_batchnorm: true - enable_checkpointing: False # Provided by exp_manager - logger: false # Provided by exp_manager - benchmark: false # needs to be false for models with variable-length speech input as it slows down training - use_distributed_sampler: false - -exp_manager: - exp_dir: null - name: ${name} - create_tensorboard_logger: true - create_checkpoint_callback: true - checkpoint_callback_params: - # in case of multiple validation sets, first one is used - monitor: "val_wer" - mode: "min" - save_top_k: 5 - filename: '${exp_manager.name}--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}' - always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
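The optimizer block pairs a large nominal `lr` with the `NoamAnnealing` scheduler, which rescales it by `d_model` and the warmup schedule. Assuming the standard Noam formula, `lr * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)` (an approximation for illustration, not necessarily NeMo's exact implementation), the effective learning rate is easy to inspect:

```python
def noam_lr(step, base_lr=5.0, d_model=1024, warmup_steps=10000, min_lr=1e-6):
    """Approximate Noam schedule: warmup, then inverse-sqrt decay, floored at min_lr."""
    step = max(step, 1)
    scale = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    return max(base_lr * scale, min_lr)


for s in (100, 1000, 10000, 100000):
    print(s, f"{noam_lr(s):.2e}")
# peaks near step == warmup_steps at ~1.6e-3 for d_model=1024, then decays as 1/sqrt(step)
```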
- resume_if_exists: false - resume_ignore_no_checkpoint: false - - create_wandb_logger: false - wandb_logger_kwargs: - name: null - project: null diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 6e672d82ce3f..89e2cebf7f18 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -13,16 +13,14 @@ # limitations under the License. import math -import unicodedata from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional import numpy as np import torch.utils.data from lhotse.cut import Cut, CutSet, MixedCut from lhotse.dataset import AudioSamples from lhotse.dataset.collation import collate_vectors -from lhotse.lazy import Dillable, LazyIteratorChain from omegaconf import DictConfig, OmegaConf from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations @@ -45,6 +43,13 @@ def first_supervised_cut(maybe_mixed_cut): + """ + Get the first supervised cut from a mixed cut, skip the noise cut in case the noise cut has supervision. + Args: + maybe_mixed_cut: Cut or MixedCut + Returns: + Cut: The first supervised cut from the mixed cut + """ if isinstance(maybe_mixed_cut, MixedCut): return [ t.cut @@ -56,6 +61,10 @@ def first_supervised_cut(maybe_mixed_cut): @dataclass class AudioToTextEOUBatch: + """ + Data class for ASR-EOU batch. + """ + sample_ids: List | None = None audio_filepaths: List | None = None audio_signal: torch.Tensor | None = None @@ -81,41 +90,6 @@ class RandomPaddingConfig: post_pad_duration: float = 3.0 # amount of right-padding when pad_distribution='constant' -def unicode_to_ascii(text: str) -> str: - """ - Converts text with accented or special Latin characters (e.g., ó, ñ, ū, ō) - into their closest ASCII equivalents. - """ - # Normalize the string to NFKD to separate base characters from diacritics - normalized = unicodedata.normalize('NFKD', text) - - # Encode to ASCII bytes, ignoring characters that can't be converted - ascii_bytes = normalized.encode('ascii', 'ignore') - - # Decode back to string - ascii_text = ascii_bytes.decode('ascii') - - return ascii_text - - -def drop_pnc(text: str) -> str: - """ - Clean the text by removing invalid characters and converting to lowercase. - - :param text: Input text. - :return: Cleaned text. - """ - valid_chars = "abcdefghijklmnopqrstuvwxyz'" - text = text.lower() - text = unicode_to_ascii(text) - text = text.replace(":", " ") - text = text.replace("-", " ") - text = text.replace("_", " ") - text = ''.join([c for c in text if c in valid_chars or c.isspace()]) - text = ' '.join(text.split()).strip() - return text - - class LhotseSpeechToTextBpeEOUDataset(torch.utils.data.Dataset): """ This dataset processes the audio data and the corresponding text data to generate the ASR labels, @@ -200,7 +174,6 @@ def __init__(self, cfg: DictConfig, tokenizer: TokenizerSpec, return_cuts: bool self.return_cuts = return_cuts self.eou_string = self.cfg.get('eou_string', EOU_STRING) self.eob_string = self.cfg.get('eob_string', EOB_STRING) - self.drop_pnc = self.cfg.get('drop_pnc', False) if cfg.get('check_tokenizer', True): self._check_special_tokens(tokenizer) @@ -260,6 +233,9 @@ def _check_special_tokens(self, tokenizer: TokenizerSpec): ) def _simple_getitem(self, cuts: CutSet) -> AudioToTextEOUBatch: + """ + Simple getitem function when skipping all augmentations. 
+ """ audio, audio_lens, cuts = self.load_audio(cuts) if self.return_cuts: return audio, audio_lens, cuts @@ -406,6 +382,14 @@ def _repeat_eou_labels(self, eou_targets: torch.Tensor) -> torch.Tensor: return eou_targets def _get_frame_labels(self, cut: Cut, num_samples: int): + """ + Get the frame-level EOU labels for a single audio segment. + Args: + cut: Cut object + num_samples: int, the number of samples in the audio segment + Returns: + eou_targets: torch.Tensor of EOU labels, shape [T] + """ hidden_length = self._audio_len_to_frame_len(num_samples) if not "sou_time" in cut.custom or not "eou_time" in cut.custom: # assume only single speech segment @@ -458,6 +442,13 @@ def _get_frame_labels(self, cut: Cut, num_samples: int): return eou_targets def _get_text_tokens(self, cut: Cut): + """ + Add EOU labels to the text and get the text tokens for a single audio segment. + Args: + cut: Cut object + Returns: + text_tokens: torch.Tensor of text tokens, shape [T] + """ if not cut.has_custom("sou_time") or not cut.has_custom("eou_time") or not cut.has_custom("utterances"): # assume only single speech segment utterances = [cut.supervisions[0].text] @@ -482,8 +473,6 @@ def _get_text_tokens(self, cut: Cut): if not text: # skip empty utterances continue - if self.drop_pnc: - text = drop_pnc(text) if self.add_eou_to_text: eou_string = self.eob_string if is_backchannel[i] and not self.ignore_eob_label else self.eou_string if self.add_sep_before_eou: @@ -635,111 +624,3 @@ def _maybe_augment_length(self, audio: torch.Tensor, audio_len: torch.Tensor): audio_len = audio.size(0) return audio, audio_len - - -def lhotse_asr_eou_cut_random_pad_transform(config: DictConfig, cut: Cut): - """ - perform random padding to data - """ - padding_cfg = OmegaConf.to_container(config, resolve=True) - padding_cfg = RandomPaddingConfig(**padding_cfg) - p = np.random.rand() - if not padding_cfg or p > padding_cfg.prob: - # do nothing - return cut - - duration = cut.duration - # if already longer than the maximum duration, return the original audio - if duration >= padding_cfg.max_total_duration: - return cut - - if isinstance(cut, MixedCut): - cut = cut.first_non_padding_cut - sou_time = cut.custom.get("sou_time", None) - if sou_time is None: - sou_time = [float(cut.start)] - elif not isinstance(sou_time, list): - sou_time = [sou_time] - - eou_time = cut.custom.get("eou_time", None) - if eou_time is None: - eou_time = [float(cut.start) + duration] - elif not isinstance(eou_time, list): - eou_time = [eou_time] - - cut.custom["origin_sou_time"] = sou_time - cut.custom["origin_eou_time"] = eou_time - - max_padding_duration = max(0, padding_cfg.max_total_duration - duration) - padding_cfg.min_pre_pad_duration = max(padding_cfg.min_pre_pad_duration, padding_cfg.min_pad_duration) - padding_cfg.min_post_pad_duration = max(padding_cfg.min_post_pad_duration, padding_cfg.min_pad_duration) - if max_padding_duration <= padding_cfg.min_pre_pad_duration + padding_cfg.min_post_pad_duration: - min_padding_duration = 0 - else: - min_padding_duration = padding_cfg.min_pre_pad_duration + padding_cfg.min_post_pad_duration - - pre_padding_duration = None - post_padding_duration = None - - if padding_cfg.pad_distribution == 'uniform': - total_padding_duration = np.random.uniform(min_padding_duration, max_padding_duration) - elif padding_cfg.pad_distribution == 'normal': - total_padding_duration = np.random.normal(padding_cfg.normal_mean, padding_cfg.normal_std) - total_padding_duration = max(min_padding_duration, min(max_padding_duration, 
total_padding_duration)) - elif padding_cfg.pad_distribution == 'constant': - pass - else: - raise ValueError( - f"Unknown padding distribution: {padding_cfg.pad_distribution}, choices in ['uniform', 'normal', 'constant]" - ) - - if padding_cfg.pad_distribution == 'constant': - pre_padding_duration = padding_cfg.pre_pad_duration - post_padding_duration = padding_cfg.post_pad_duration - elif min_padding_duration == 0: - pre_padding_duration = total_padding_duration / 2 - post_padding_duration = total_padding_duration / 2 - else: - post_padding_duration = np.random.uniform( - padding_cfg.min_post_pad_duration, total_padding_duration - padding_cfg.min_pre_pad_duration - ) - pre_padding_duration = total_padding_duration - post_padding_duration - - if padding_cfg.max_pad_duration is not None: - pre_padding_duration = min(pre_padding_duration, padding_cfg.max_pad_duration) - post_padding_duration = min(post_padding_duration, padding_cfg.max_pad_duration) - - sou_time = [t + pre_padding_duration for t in sou_time] - eou_time = [t + pre_padding_duration for t in sou_time] - - cut_left_padded = cut.pad(duration=pre_padding_duration + duration, direction="left", preserve_id=True) - cut_both_padded = cut_left_padded.pad( - duration=cut_left_padded.duration + post_padding_duration, direction="right", preserve_id=True - ) - - cut_both_padded.first_non_padding_cut.custom["sou_time"] = sou_time - cut_both_padded.first_non_padding_cut.custom["eou_time"] = eou_time - - return cut_both_padded - - -class LazyLhotseEOURandomPadding(Dillable): - def __init__(self, cuts: CutSet, cfg: DictConfig) -> None: - self.source = cuts - self.cfg = cfg - - def __iter__(self): - for cut in self.source: - yield lhotse_asr_eou_cut_random_pad_transform(config=self.cfg, cut=cut) - - def __len__(self): - return len(self.source) - - def __add__(self, other) -> "LazyIteratorChain": - return LazyIteratorChain(self, other) - - -class LhotseEOURandomPadding(RandomPaddingConfig): - def __call__(self, cuts: CutSet) -> CutSet: - config = OmegaConf.create(self.__dict__) - return CutSet(LazyLhotseEOURandomPadding(cuts, config)) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 81b3d8f559fc..e4d29e868f9a 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
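The random-padding transform deleted just above draws a total padding duration from a uniform or normal distribution, clamps it against the min/max limits, and splits it into pre- and post-padding. A condensed, self-contained sketch of that sampling step, with defaults following the `random_padding` block of the configs removed earlier in this patch (an illustration of the logic, not a drop-in replacement for the removed Lhotse transform):

```python
import numpy as np


def sample_padding(duration, prob=0.99, min_pre=0.0, min_post=3.0, max_pad=6.0,
                   max_total=40.0, distribution="uniform",
                   normal_mean=0.5, normal_std=2.0, rng=None):
    """Return (pre_pad, post_pad) in seconds for a cut of `duration` seconds."""
    rng = rng or np.random.default_rng()
    if rng.random() > prob or duration >= max_total:
        return 0.0, 0.0                                   # leave the cut untouched
    max_padding = max(0.0, max_total - duration)
    min_padding = min_pre + min_post if max_padding > min_pre + min_post else 0.0
    if distribution == "uniform":
        total = rng.uniform(min_padding, max_padding)
    elif distribution == "normal":
        total = float(np.clip(rng.normal(normal_mean, normal_std), min_padding, max_padding))
    else:
        raise ValueError(f"unknown pad_distribution: {distribution}")
    if min_padding == 0.0:
        pre = post = total / 2                            # too little room: split evenly
    else:
        post = rng.uniform(min_post, total - min_pre)     # random split honoring the minimums
        pre = total - post
    return min(pre, max_pad), min(post, max_pad)          # per-side cap (max_pad_duration)


print(sample_padding(duration=12.0))  # e.g. (5.7, 6.0) for a 12 s cut; values vary per draw
```

The removed transform then updates the cut's `sou_time`/`eou_time` to account for the pre-padding before padding the Lhotse cut on both sides.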
@@ -19,8 +19,7 @@ import numpy as np import torch from lightning.pytorch.utilities import rank_zero_only -from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict -from torchmetrics import Accuracy +from omegaconf import DictConfig, OmegaConf, open_dict from nemo.collections.asr.data.audio_to_eou_label_lhotse import ( EOB_LABEL, @@ -32,7 +31,6 @@ ) from nemo.collections.asr.metrics.wer import WER from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel, EncDecRNNTBPEModel -from nemo.collections.asr.modules.conformer_encoder import ConformerMultiLayerFeatureExtractor from nemo.collections.asr.parts.mixins import TranscribeConfig from nemo.collections.asr.parts.utils.eou_utils import ( EOUResult, @@ -43,10 +41,8 @@ from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.data.utils import move_data_to_device -from nemo.collections.common.losses import CrossEntropyLoss -from nemo.core.classes.common import Serialization from nemo.core.classes.mixins import AccessMixin -from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType +from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, NeuralType from nemo.utils import logging __all__ = ['EncDecRNNTBPEEOUModel', 'EncDecHybridRNNTCTCBPEEOUModel'] @@ -635,35 +631,6 @@ def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str return {**loss_log, 'log': tensorboard_logs} - # def test_step(self, batch: AudioToTextEOUBatch, batch_idx, dataloader_idx=0): - # # logs = self.validation_pass(batch, batch_idx, dataloader_idx=dataloader_idx) - # # test_logs = {name.replace("val_", "test_"): value for name, value in logs.items()} - - # signal = batch.audio_signal - # signal_len = batch.audio_lengths - # transcript = batch.text_tokens - # transcript_len = batch.text_token_lengths - - # # forward() only performs encoder forward - # encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len) - # del signal - - # tensorboard_logs = {} - # hypotheses = self.decoding.rnnt_decoder_predictions_tensor( - # encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=True - # ) - # eou_predictions = self._get_eou_predictions_from_hypotheses(hypotheses, batch) - # eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) - # tensorboard_logs['test_eou_metrics'] = eou_metrics_list - # tensorboard_logs['test_eob_metrics'] = eob_metrics_list - - # test_logs = tensorboard_logs - # if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: - # self.test_step_outputs[dataloader_idx].append(test_logs) - # else: - # self.test_step_outputs.append(test_logs) - # return test_logs - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='val') @@ -953,511 +920,3 @@ def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='test') - - -class EncDecHybridASRFrameEOUModel(EncDecHybridRNNTCTCBPEModel, ASREOUModelMixin): - def __init__(self, cfg: DictConfig, trainer): - super().__init__(cfg=cfg, trainer=trainer) - self.frame_len_in_secs = self.cfg.preprocessor.window_stride * self.cfg.encoder.subsampling_factor - 
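`frame_len_in_secs` above ties each encoder output frame to wall-clock time (`window_stride * subsampling_factor`, i.e. 0.08 s per frame for these configs). As a rough, hypothetical illustration of how `sou_time`/`eou_time` annotations can be turned into per-frame EOU targets in the spirit of `_get_frame_labels` — the label ids and the single-frame marking below are assumptions for the sketch, not the dataset's exact behavior:

```python
import math

import torch

NON_EOU, EOU = 0, 1  # illustrative label ids, not the module's EOU_LABEL constants


def frame_eou_targets(sou_time, eou_time, audio_dur, frame_len=0.08):
    """Mark the frame at each end-of-utterance time with the EOU label."""
    num_frames = math.ceil(audio_dur / frame_len)
    targets = torch.full((num_frames,), NON_EOU, dtype=torch.long)
    for _, eou in zip(sou_time, eou_time):
        idx = min(int(eou / frame_len), num_frames - 1)
        targets[idx] = EOU
    return targets


# two utterances ending at 2.4 s and 6.1 s in an 8 s padded clip
print(frame_eou_targets([0.5, 3.0], [2.4, 6.1], audio_dur=8.0).nonzero().squeeze(-1))
# tensor([30, 76])
```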
self.layer_idx_list = self.cfg.get('layer_idx_list', []) - assert isinstance(self.layer_idx_list, (list, ListConfig)), "cfg.layer_idx_list must be a list" - num_encoder_layers = len(self.encoder.layers) - if -1 not in self.layer_idx_list and num_encoder_layers - 1 not in self.layer_idx_list: - self.layer_idx_list.append(num_encoder_layers - 1) - self.encoder = ConformerMultiLayerFeatureExtractor(self.encoder, self.layer_idx_list) - self.aggregator = Serialization.from_config_dict(cfg.aggregator) - self.eou_encoder = Serialization.from_config_dict(cfg.eou_encoder) if cfg.eou_encoder is not None else None - self.eou_decoder = Serialization.from_config_dict(cfg.eou_decoder) - self.num_eou_classes = cfg.num_eou_classes - self.rnnt_loss_weight = cfg.rnnt_loss_weight - self.ctc_loss_weight = cfg.ctc_loss_weight - self.eou_loss_weight = cfg.eou_loss_weight - self.use_ctc_pred = cfg.get('use_ctc_pred', False) - self.eou_loss = self._setup_eou_loss() - - if cfg.freeze_encoder: - self.encoder.freeze() - if cfg.freeze_rnnt: - self.decoder.freeze() - self.joint.freeze() - if cfg.freeze_ctc: - self.ctc_decoder.freeze() - - self.macro_accuracy = Accuracy(num_classes=self.num_eou_classes, average='macro', task="multiclass") - - def _setup_eou_loss(self): - if "eou_loss" in self.cfg: - weight = self.cfg.eou_loss.get("weight", None) - if weight in [None, "none", "None"]: - weight = [1.0] * self.num_eou_classes - elif len(weight) != self.num_eou_classes: - raise ValueError( - f"Length of weight must match the number of classes {self.num_eou_classes}, but got {weight}" - ) - logging.info(f"Using cross-entropy with weights: {weight}") - else: - weight = [1.0] * self.num_eou_classes - return CrossEntropyLoss(logits_ndim=3, weight=weight) - - def get_label_masks(self, labels: torch.Tensor, labels_len: torch.Tensor) -> torch.Tensor: - mask = torch.arange(labels.size(1))[None, :].to(labels.device) < labels_len[:, None] - return mask.to(labels.device, dtype=bool) - - def _setup_dataloader_from_config(self, config: Optional[Dict]): - cfg = OmegaConf.create(config) if not isinstance(config, DictConfig) else config - dataset = LhotseSpeechToTextBpeEOUDataset( - cfg=cfg, tokenizer=self.tokenizer, return_cuts=config.get("do_transcribe", False) - ) - return get_lhotse_dataloader_from_config( - config, - # During transcription, the model is initially loaded on the CPU. - # To ensure the correct global_rank and world_size are set, - # these values must be passed from the configuration. - global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), - world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), - dataset=dataset, - tokenizer=self.tokenizer, - ) - - def forward( - self, input_signal=None, input_signal_length=None, processed_signal=None, processed_signal_length=None - ): - has_input_signal = input_signal is not None and input_signal_length is not None - has_processed_signal = processed_signal is not None and processed_signal_length is not None - if (has_input_signal ^ has_processed_signal) is False: - raise ValueError( - f"{self} Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive " - " with ``processed_signal`` and ``processed_signal_len`` arguments." 
- ) - - if not has_processed_signal: - processed_signal, processed_signal_length = self.preprocessor( - input_signal=input_signal, - length=input_signal_length, - ) - - # Spec augment is not applied during evaluation/testing - if self.spec_augmentation is not None and self.training: - processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) - - encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) - return encoded, encoded_len - - def get_eou_prediction( - self, - encoded_all: List[torch.Tensor], - encoded_len_all: List[torch.Tensor], - ctc_pred: Optional[torch.Tensor] = None, - ): - if ctc_pred is not None and self.use_ctc_pred: - encoded_all[-1] = ctc_pred - eou_encoded, eou_encoded_len = self.aggregator(encoded_all, encoded_len_all) - if self.eou_encoder is not None: - eou_encoded, eou_encoded_len = self.eou_encoder(eou_encoded, eou_encoded_len) - eou_pred = self.eou_decoder(eou_encoded) - return eou_pred, eou_encoded_len - - def trim_eou_preds_labels( - self, - eou_pred: torch.Tensor, - eou_pred_len: torch.Tensor, - eou_labels: torch.Tensor, - eou_labels_len: torch.Tensor, - ): - seq_len = eou_pred.size(1) - if eou_labels.size(1) > seq_len: - eou_labels = eou_labels[:, :seq_len] - eou_labels_len = eou_labels_len.clamp(max=seq_len) - elif eou_labels.size(1) < seq_len: - seq_len = eou_labels.size(1) - eou_pred = eou_pred[:, :seq_len] - eou_pred_len = eou_pred_len.clamp(max=seq_len) - - # get the min between the eou_encoded_len and eou_labels_len - eou_valid_len = torch.min(eou_pred_len, eou_labels_len) - - return eou_pred, eou_labels, eou_valid_len - - def get_eou_loss( - self, - eou_pred: torch.Tensor, - eou_pred_len: torch.Tensor, - eou_labels: torch.Tensor, - eou_labels_len: torch.Tensor, - ): - eou_pred, eou_labels, eou_valid_len = self.trim_eou_preds_labels( - eou_pred, eou_pred_len, eou_labels, eou_labels_len - ) - eou_loss = self.eou_loss( - logits=eou_pred, - labels=eou_labels, - loss_mask=self.get_label_masks(eou_labels, eou_valid_len), - ) - return eou_loss - - def training_step(self, batch: AudioToTextEOUBatch, batch_nb): - signal = batch.audio_signal - signal_len = batch.audio_lengths - transcript = batch.text_tokens - transcript_len = batch.text_token_lengths - eou_labels = batch.eou_targets - eou_labels_len = batch.eou_target_lengths - - # Reset access registry - if AccessMixin.is_access_enabled(self.model_guid): - AccessMixin.reset_registry(self) - - encoded_all, encoded_len_all = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - encoded = encoded_all[-1] - encoded_len = encoded_len_all[-1] - - # During training, loss must be computed, so decoder forward is necessary - decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) - - if hasattr(self, '_trainer') and self._trainer is not None: - log_every_n_steps = self._trainer.log_every_n_steps - sample_id = self._trainer.global_step - else: - log_every_n_steps = 1 - sample_id = batch_nb - - if (sample_id + 1) % log_every_n_steps == 0: - compute_wer = True - else: - compute_wer = False - - tensorboard_logs = { - 'learning_rate': self._optimizer.param_groups[0]['lr'], - 'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32), - } - - loss_value = None - if self.rnnt_loss_weight > 0: - # If fused Joint-Loss-WER is not used - if not self.joint.fuse_loss_wer: - # Compute full joint and loss - joint = self.joint(encoder_outputs=encoded, 
decoder_outputs=decoder) - loss_value = self.loss( - log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length - ) - - # Add auxiliary losses, if registered - loss_value = self.add_auxiliary_losses(loss_value) - - if compute_wer: - self.wer.update( - predictions=encoded, - predictions_lengths=encoded_len, - targets=transcript, - targets_lengths=transcript_len, - ) - _, scores, words = self.wer.compute() - self.wer.reset() - tensorboard_logs.update({'training_batch_wer': scores.float() / words}) - - else: # If fused Joint-Loss-WER is used - # Fused joint step - loss_value, wer, _, _ = self.joint( - encoder_outputs=encoded, - decoder_outputs=decoder, - encoder_lengths=encoded_len, - transcripts=transcript, - transcript_lengths=transcript_len, - compute_wer=compute_wer, - ) - - # Add auxiliary losses, if registered - loss_value = self.add_auxiliary_losses(loss_value) - - if compute_wer: - tensorboard_logs.update({'training_batch_wer': wer}) - - if self.ctc_loss_weight > 0: - log_probs = self.ctc_decoder(encoder_output=encoded) - ctc_log_probs = log_probs - ctc_loss = self.ctc_loss( - log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len - ) - tensorboard_logs['train_rnnt_loss'] = loss_value - tensorboard_logs['train_ctc_loss'] = ctc_loss - loss_value = (1 - self.ctc_loss_weight) * loss_value + self.ctc_loss_weight * ctc_loss - if compute_wer: - self.ctc_wer.update( - predictions=log_probs, - targets=transcript, - targets_lengths=transcript_len, - predictions_lengths=encoded_len, - ) - ctc_wer, _, _ = self.ctc_wer.compute() - self.ctc_wer.reset() - tensorboard_logs.update({'training_batch_wer_ctc': ctc_wer}) - elif self.use_ctc_pred: - ctc_log_probs = self.ctc_decoder(encoder_output=encoded) - else: - ctc_log_probs = None - - eou_pred, eou_pred_len = self.get_eou_prediction(encoded_all, encoded_len_all, ctc_log_probs) - eou_loss = self.get_eou_loss(eou_pred, eou_pred_len, eou_labels, eou_labels_len) - loss_value = loss_value + self.eou_loss_weight * eou_loss if loss_value is not None else eou_loss - tensorboard_logs['train_eou_loss'] = eou_loss - - # note that we want to apply interctc independent of whether main ctc - # loss is used or not (to allow rnnt + interctc training). 
- # assuming ``ctc_loss_weight=0.3`` and interctc is applied to a single - # layer with weight of ``0.1``, the total loss will be - # ``loss = 0.9 * (0.3 * ctc_loss + 0.7 * rnnt_loss) + 0.1 * interctc_loss`` - loss_value, additional_logs = self.add_interctc_losses( - loss_value, transcript, transcript_len, compute_wer=compute_wer - ) - tensorboard_logs.update(additional_logs) - tensorboard_logs['train_loss'] = loss_value - # Reset access registry - if AccessMixin.is_access_enabled(self.model_guid): - AccessMixin.reset_registry(self) - - # Log items - self.log_dict(tensorboard_logs) - - # Preserve batch acoustic model T and language model U parameters if normalizing - if self._optim_normalize_joint_txu: - self._optim_normalize_txu = [encoded_len.max(), transcript_len.max()] - - return {'loss': loss_value} - - def predict_step(self, batch: AudioToTextEOUBatch, batch_idx, dataloader_idx=0): - signal = batch.audio_signal - signal_len = batch.audio_lengths - sample_ids = batch.sample_ids - - encoded_all, encoded_len_all = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - best_hyp_text = self.decoding.rnnt_decoder_predictions_tensor( - encoder_output=encoded_all[-1], encoded_lengths=encoded_len_all[-1], return_hypotheses=False - ) - if isinstance(sample_ids, torch.Tensor): - sample_ids = sample_ids.cpu().detach().numpy() - - eou_pred, eou_pred_len = self.get_eou_prediction(encoded_all, encoded_len_all) - eou_predictions = [eou_pred[i][: eou_pred_len[i]] for i in range(len(eou_pred))] - return zip(sample_ids, best_hyp_text, eou_predictions) - - def validation_pass(self, batch: AudioToTextEOUBatch, batch_idx: int, dataloader_idx: int = 0): - signal = batch.audio_signal - signal_len = batch.audio_lengths - transcript = batch.text_tokens - transcript_len = batch.text_token_lengths - eou_labels = batch.eou_targets - eou_labels_len = batch.eou_target_lengths - - # forward() only performs encoder forward - encoded_all, encoded_len_all = self.forward(input_signal=signal, input_signal_length=signal_len) - del signal - - tensorboard_logs = {} - - if self.cfg.get('save_pred_to_file', None): - text_gt = self._get_text_from_tokens(transcript, transcript_len) - tensorboard_logs['val_sample_id'] = batch.sample_ids - tensorboard_logs['val_audio_filepath'] = batch.audio_filepaths - tensorboard_logs['val_text_gt'] = text_gt - - loss_value = None - encoded = encoded_all[-1] - encoded_len = encoded_len_all[-1] - # If experimental fused Joint-Loss-WER is not used - if not self.joint.fuse_loss_wer: - if self.compute_eval_loss: - decoder, target_length, states = self.decoder(targets=transcript, target_length=transcript_len) - joint = self.joint(encoder_outputs=encoded, decoder_outputs=decoder) - - loss_value = self.loss( - log_probs=joint, targets=transcript, input_lengths=encoded_len, target_lengths=target_length - ) - tensorboard_logs['val_loss'] = loss_value - - self.wer.update( - predictions=encoded, - predictions_lengths=encoded_len, - targets=transcript, - targets_lengths=transcript_len, - ) - - if self.cfg.get('save_pred_to_file', None): - hypotheses = self.wer.get_hypotheses() - text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) - tensorboard_logs['val_text_pred'] = text_pred - - wer, wer_num, wer_denom = self.wer.compute() - self.wer.reset() - - tensorboard_logs['val_wer_num'] = wer_num - tensorboard_logs['val_wer_denom'] = wer_denom - tensorboard_logs['val_wer'] = wer - else: - # If experimental fused Joint-Loss-WER is used - compute_wer = True - - if 
self.compute_eval_loss: - decoded, target_len, states = self.decoder(targets=transcript, target_length=transcript_len) - else: - decoded = None - target_len = transcript_len - - # Fused joint step - loss_value, wer, wer_num, wer_denom = self.joint( - encoder_outputs=encoded, - decoder_outputs=decoded, - encoder_lengths=encoded_len, - transcripts=transcript, - transcript_lengths=target_len, - compute_wer=compute_wer, - keep_hypotheses=True, - ) - if self.cfg.get('save_pred_to_file', None): - hypotheses = self.joint.get_hypotheses() - text_pred = self._get_text_from_tokens([x.y_sequence for x in hypotheses]) - tensorboard_logs['val_text_pred'] = text_pred - - if loss_value is not None: - tensorboard_logs['val_loss'] = loss_value - - tensorboard_logs['val_wer_num'] = wer_num - tensorboard_logs['val_wer_denom'] = wer_denom - tensorboard_logs['val_wer'] = wer - - log_probs = self.ctc_decoder(encoder_output=encoded) - if self.compute_eval_loss: - ctc_loss = self.ctc_loss( - log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len - ) - tensorboard_logs['val_ctc_loss'] = ctc_loss - tensorboard_logs['val_rnnt_loss'] = loss_value - loss_value = (1 - self.ctc_loss_weight) * loss_value + self.ctc_loss_weight * ctc_loss - tensorboard_logs['val_loss'] = loss_value - self.ctc_wer.update( - predictions=log_probs, - targets=transcript, - targets_lengths=transcript_len, - predictions_lengths=encoded_len, - ) - - if self.cfg.get('save_pred_to_file', None): - hypotheses_ctc = self.ctc_wer.get_hypotheses() - text_pred_ctc = self._get_text_from_tokens([x.y_sequence for x in hypotheses_ctc]) - tensorboard_logs['val_text_pred_ctc'] = text_pred_ctc - - ctc_wer, ctc_wer_num, ctc_wer_denom = self.ctc_wer.compute() - self.ctc_wer.reset() - tensorboard_logs['val_wer_num_ctc'] = ctc_wer_num - tensorboard_logs['val_wer_denom_ctc'] = ctc_wer_denom - tensorboard_logs['val_wer_ctc'] = ctc_wer - - self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) - - loss_value, additional_logs = self.add_interctc_losses( - loss_value, - transcript, - transcript_len, - compute_wer=True, - compute_loss=self.compute_eval_loss, - log_wer_num_denom=True, - log_prefix="val_", - ) - if self.compute_eval_loss: - # overriding total loss value. 
Note that the previous - # rnnt + ctc loss is available in metrics as "val_final_loss" now - tensorboard_logs['val_loss'] = loss_value - tensorboard_logs.update(additional_logs) - # Reset access registry - if AccessMixin.is_access_enabled(self.model_guid): - AccessMixin.reset_registry(self) - - # Calculate EOU metrics - eou_pred, eou_pred_len = self.get_eou_prediction(encoded_all, encoded_len_all, log_probs) - eou_loss = self.get_eou_loss(eou_pred, eou_pred_len, eou_labels, eou_labels_len) - tensorboard_logs['val_eou_loss'] = eou_loss - - eou_pred, eou_labels, eou_valid_len = self.trim_eou_preds_labels( - eou_pred, eou_pred_len, eou_labels, eou_labels_len - ) - - for i in range(eou_pred.size(0)): - self.macro_accuracy.update(preds=eou_pred[i][: eou_valid_len[i]], target=eou_labels[i][: eou_valid_len[i]]) - stats = self.macro_accuracy._final_state() - tensorboard_logs['val_eou_acc_stats'] = stats - self.macro_accuracy.reset() - - eou_predictions = self._get_eou_predictions_from_frames(eou_pred, eou_valid_len) - eou_metrics_list, eob_metrics_list = self._calculate_eou_metrics(eou_predictions, batch) - - tensorboard_logs['val_eou_metrics'] = eou_metrics_list - tensorboard_logs['val_eob_metrics'] = eob_metrics_list - - return tensorboard_logs - - def _get_eou_predictions_from_frames( - self, eou_pred: torch.Tensor, eou_pred_len: torch.Tensor - ) -> List[EOUPrediction]: - eou_predictions = [] - for i in range(eou_pred.size(0)): - eou_logits_i = eou_pred[i][: eou_pred_len[i]] # [time, num_classes] - eou_probs = eou_logits_i[:, EOU_LABEL].detach().cpu().numpy().tolist() - eob_probs = eou_logits_i[:, EOB_LABEL].detach().cpu().numpy().tolist() - eou_frame_prediction = eou_logits_i.argmax(dim=-1).cpu().numpy().tolist() - eou_preds = [int(x == EOU_LABEL) for x in eou_frame_prediction] - eob_preds = [int(x == EOB_LABEL) for x in eou_frame_prediction] - eou_predictions.append( - EOUPrediction( - eou_probs=eou_probs, - eob_probs=eob_probs, - eou_preds=eou_preds, - eob_preds=eob_preds, - ) - ) - return eou_predictions - - def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str = "val"): - assert mode in ['val', 'test'], f"Invalid mode: {mode}. Must be 'val' or 'test'." 
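`_get_eou_predictions_from_frames` above reduces the frame classifier output to per-frame 0/1 EOU and EOB decisions. For latency-style metrics those decisions ultimately need to become timestamps; the helper below is a hypothetical sketch of that conversion (the rising-edge convention and the function name are assumptions, not the project's `eou_utils` implementation), using the 0.08 s frame length of these models:

```python
def frames_to_eou_times(eou_preds, frame_len_in_secs=0.08):
    """Convert per-frame 0/1 EOU decisions into predicted EOU timestamps (seconds).

    A timestamp is emitted at every 0 -> 1 transition, i.e. the first frame
    of each predicted end-of-utterance region.
    """
    times, prev = [], 0
    for idx, pred in enumerate(eou_preds):
        if pred == 1 and prev == 0:
            times.append(round(idx * frame_len_in_secs, 3))
        prev = pred
    return times


print(frames_to_eou_times([0, 0, 0, 1, 1, 0, 0, 1]))  # [0.24, 0.56]
```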
- self._maybe_save_predictions(outputs, mode=mode, dataloader_idx=dataloader_idx) - - # Aggregate WER metrics - if self.compute_eval_loss: - loss_mean = torch.stack([x[f'{mode}_loss'] for x in outputs]).mean() - loss_log = {f'{mode}_loss': loss_mean} - else: - loss_log = {} - - eou_loss_mean = torch.stack([x[f'{mode}_eou_loss'] for x in outputs]).mean() - loss_log[f'{mode}_eou_loss'] = eou_loss_mean - - wer_num = torch.stack([x[f'{mode}_wer_num'] for x in outputs]).sum() - wer_denom = torch.stack([x[f'{mode}_wer_denom'] for x in outputs]).sum() - tensorboard_logs = {**loss_log, f'{mode}_wer': wer_num.float() / wer_denom} - - if self.ctc_loss_weight > 0: - ctc_wer_num = torch.stack([x[f'{mode}_wer_num_ctc'] for x in outputs]).sum() - ctc_wer_denom = torch.stack([x[f'{mode}_wer_denom_ctc'] for x in outputs]).sum() - tensorboard_logs['val_wer_ctc'] = ctc_wer_num.float() / ctc_wer_denom - - eou_metrics = self._aggregate_eou_metrics(outputs, mode) - tensorboard_logs.update(eou_metrics) - - self.macro_accuracy.reset() - self.macro_accuracy.tp = torch.stack([x[f'{mode}_eou_acc_stats'][0] for x in outputs]).sum(axis=0) - self.macro_accuracy.fp = torch.stack([x[f'{mode}_eou_acc_stats'][1] for x in outputs]).sum(axis=0) - self.macro_accuracy.tn = torch.stack([x[f'{mode}_eou_acc_stats'][2] for x in outputs]).sum(axis=0) - self.macro_accuracy.fn = torch.stack([x[f'{mode}_eou_acc_stats'][3] for x in outputs]).sum(axis=0) - macro_accuracy_score = self.macro_accuracy.compute() - self.macro_accuracy.reset() - tensorboard_logs[f'{mode}_eou_macro_acc'] = macro_accuracy_score - - return {**loss_log, 'log': tensorboard_logs} - - def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): - return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='val') - - def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): - return self.multi_inference_epoch_end(outputs, dataloader_idx, mode='test') diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 7745fe380f87..bc8624e687bc 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -204,10 +204,6 @@ class LhotseDataLoadingConfig: force_map_dataset: bool = False force_iterable_dataset: bool = False - # 6. EOU related options. - random_padding: Any | None = None - use_dataloader_augment: bool = False - def determine_use_iterable_dataset(use_iterable_dataset: bool, config: DictConfig) -> bool: """Determine whether to use iterable dataset for a given configuration.""" @@ -500,20 +496,6 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No # 2. Optional augmentations. - if config.get("random_padding", None) is not None and config.get("use_dataloader_augment", False): - # put this here to avoid circular import - logging.info("Using dataloader augmentations for EOU random padding.") - from nemo.collections.asr.data.audio_to_eou_label_lhotse import ( - LhotseEOURandomPadding, - lhotse_asr_eou_cut_random_pad_transform, - ) - - # random_padding_augmentation = LhotseEOURandomPadding(**config.random_padding) - # cuts = random_padding_augmentation(cuts) - cuts = cuts.map( - partial(lhotse_asr_eou_cut_random_pad_transform, config.random_padding), - ) - # 2.a. Noise mixing. 
if config.noise_path is not None: noise = guess_parse_cutset(config.noise_path) From 5ef8ceb96c6ef4205c1fac9b3324f692fdda74d8 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 16 Sep 2025 12:15:49 -0400 Subject: [PATCH 098/107] clean up Signed-off-by: stevehuang52 --- .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 14 +- .../collections/asr/models/rnnt_bpe_models.py | 14 +- .../asr/modules/conformer_encoder.py | 91 ------------ ...l_eou_with_niva.py => eval_eou_metrics.py} | 0 scripts/asr_end_of_utterance/evaluate_eou.py | 140 ------------------ 5 files changed, 8 insertions(+), 251 deletions(-) rename scripts/asr_end_of_utterance/{eval_eou_with_niva.py => eval_eou_metrics.py} (100%) delete mode 100644 scripts/asr_end_of_utterance/evaluate_eou.py diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index 1d3dbc979598..e47945addaf0 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -21,7 +21,6 @@ from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_eou_label_lhotse import LhotseSpeechToTextBpeEOUDataset from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset @@ -139,15 +138,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): - - if config.get("use_eou", False): - cfg = OmegaConf.create(config) if not isinstance(config, DictConfig) else config - dataset = LhotseSpeechToTextBpeEOUDataset(cfg=cfg, tokenizer=self.tokenizer) - else: - dataset = LhotseSpeechToTextBpeDataset( - tokenizer=self.tokenizer, - return_cuts=config.get("do_transcribe", False), - ) + dataset = LhotseSpeechToTextBpeDataset( + tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), + ) return get_lhotse_dataloader_from_config( config, diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index 0faed26c24f6..31c8e7ce7c90 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -21,7 +21,6 @@ from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict from nemo.collections.asr.data import audio_to_text_dataset -from nemo.collections.asr.data.audio_to_eou_label_lhotse import LhotseSpeechToTextBpeEOUDataset from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset @@ -508,15 +507,10 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig, verbose: bool = Tru def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): - - if config.get("use_eou", False): - cfg = OmegaConf.create(config) if not isinstance(config, DictConfig) else config - dataset = LhotseSpeechToTextBpeEOUDataset(cfg=cfg, tokenizer=self.tokenizer) - else: - dataset = LhotseSpeechToTextBpeDataset( - tokenizer=self.tokenizer, - return_cuts=config.get("do_transcribe", False), - ) + dataset = LhotseSpeechToTextBpeDataset( + tokenizer=self.tokenizer, 
+ return_cuts=config.get("do_transcribe", False), + ) return get_lhotse_dataloader_from_config( config, diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index fdc4bfdd6b4b..0c6be0719e7a 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -1265,97 +1265,6 @@ def get_accepted_adapter_types( return types -class ConformerMultiLayerFeatureExtractor(NeuralModule, Exportable, AccessMixin): - """ - A wrapper module that extracts features from multiple layers of a ConformerEncoder, - by reusing existing mechanisim for interctc loss. - To use it, set `layer_idx_list` to specify the indices of layers to extract from. - Also, you can specify an `aggretator` module to aggregate the features from different layers, - default not aggregating. - """ - - def __init__( - self, - encoder: ConformerEncoder, - layer_idx_list: List[int], - aggregator: NeuralModule = None, - detach: bool = False, - convert_to_cpu: bool = False, - ): - super().__init__() - self.encoder = encoder - self.num_layers = len(encoder.layers) - self.layer_idx_list = [] - if not layer_idx_list: - layer_idx_list = list(range(self.num_layers)) - for lid in layer_idx_list: - if lid < -self.num_layers or lid >= self.num_layers: - raise ValueError(f"Invalid layer index {lid} for ConformerEncoder with {self.num_layers} layers.") - if lid < 0: - lid = self.num_layers + lid - self.layer_idx_list.append(lid) - self.layer_idx_list.sort() - self.enc_access_cfg = { - "interctc": { - "capture_layers": self.layer_idx_list, - }, - "detach": detach, - "convert_to_cpu": convert_to_cpu, - } - self.aggregator = aggregator - - def forward( - self, audio_signal, length, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None - ) -> Tuple[torch.Tensor, torch.Tensor]: - # pylint: disable=missing-function-docstring - old_access_flag = self.is_access_enabled(guid=getattr(self, "model_guid", None)) - self.update_access_cfg(self.enc_access_cfg, guid=getattr(self, "model_guid", None)) - self.set_access_enabled(access_enabled=True, guid=getattr(self, "model_guid", None)) - - _ = self.encoder( - audio_signal=audio_signal, - length=length, - cache_last_channel=cache_last_channel, - cache_last_time=cache_last_time, - cache_last_channel_len=cache_last_channel_len, - ) - - # Chunk of code adapted from ConformerEncoder.forward_internal() - total_registry = {} - for module_registry in self.get_module_registry(self.encoder).values(): - for key in module_registry: - if key.startswith("interctc/") and key in total_registry: - raise RuntimeError(f"layer {key} has been logged multiple times!") - total_registry.update(module_registry) - - encoded_list = [] - encoded_len_list = [] - for layer_idx in self.layer_idx_list: - if layer_idx < 0: - layer_idx = self.num_layers + layer_idx - try: - layer_outputs = total_registry[f"interctc/layer_output_{layer_idx}"] - layer_lengths = total_registry[f"interctc/layer_length_{layer_idx}"] - except KeyError: - raise RuntimeError( - f"Intermediate layer {layer_idx} was not captured! " - "Check the layer index and the number of ConformerEncoder layers." 
- ) - if len(layer_outputs) > 1 or len(layer_lengths) > 1: - raise RuntimeError("Make sure encoder.forward is called exactly one time") - encoded_list.append(layer_outputs[0]) # [B, D, T] - encoded_len_list.append(layer_lengths[0]) # [B] - - self.encoder.reset_registry() - self.set_access_enabled(access_enabled=old_access_flag, guid=getattr(self, "model_guid", None)) - # End of the adapted chunk - - if self.aggregator is not None: - return self.aggregator(encoded_list, encoded_len_list) # Tensor[B,D*L,T], Tensor[B] - else: - return encoded_list, encoded_len_list # List[Tensor[B,D,T]], List[Tensor[B]] - - # Register any additional information if adapter_mixins.get_registered_adapter(ConformerEncoder) is None: adapter_mixins.register_adapter(base_class=ConformerEncoder, adapter_class=ConformerEncoderAdapter) diff --git a/scripts/asr_end_of_utterance/eval_eou_with_niva.py b/scripts/asr_end_of_utterance/eval_eou_metrics.py similarity index 100% rename from scripts/asr_end_of_utterance/eval_eou_with_niva.py rename to scripts/asr_end_of_utterance/eval_eou_metrics.py diff --git a/scripts/asr_end_of_utterance/evaluate_eou.py b/scripts/asr_end_of_utterance/evaluate_eou.py deleted file mode 100644 index 5f842d7a5f4d..000000000000 --- a/scripts/asr_end_of_utterance/evaluate_eou.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script is deprecated !!!! 
-""" - - -import argparse -import json -from typing import List - -import numpy as np - -from nemo.collections.asr.parts.utils.eou_utils import evaluate_eou - -parser = argparse.ArgumentParser(description="Evaluate end of utterance predictions against reference labels.") -parser.add_argument( - "-p", - "--predictions", - type=str, - required=True, - help="Path to the JSON file containing the predictions.", -) -parser.add_argument( - "-r", - "--references", - type=str, - required=True, - help="Path to the JSON file containing the reference labels.", -) -parser.add_argument( - "-t", - "--threshold", - type=float, - default=0.5, - help="Threshold for considering a prediction as EOU.", -) -parser.add_argument( - "--drop_prefix", - default="", - type=str, - help="Prefix to drop from the audio_filepath in the JSON file.", -) -parser.add_argument( - "-c", - "--collar", - type=float, - default=0.1, - help="Collar time in seconds for matching predictions to references.", -) -parser.add_argument( - "-o", - "--output_dir", - type=str, - default="eou_results/", - help="Output directory to save the evaluation results.", -) - - -def load_json(file_path: str, drop_prefix: str = "") -> List[dict]: - """Load a JSON file, then clean the audio_filepath.""" - with open(file_path, "r") as f: - data = json.load(f) - - cleaned_data = [] - for item in data: - audio_filepath = item["audio_filepath"] - if drop_prefix and audio_filepath.startswith(drop_prefix): - audio_filepath = audio_filepath[len(drop_prefix) :] - elif audio_filepath.startswith("./"): - audio_filepath = audio_filepath[2:] - item["audio_filepath"] = audio_filepath - - cleaned_data.append(item) - return cleaned_data - - -def main(): - args = parser.parse_args() - - predictions = load_json(args.predictions, args.drop_prefix) - references = load_json(args.references, args.drop_prefix) - results = evaluate_eou( - prediction=predictions, - reference=references, - threshold=args.threshold, - collar=args.collar, - ) - - f1_score = ( - (2 * results.true_positives / (2 * results.true_positives + results.false_negatives + results.false_positives)) - if (results.true_positives + results.false_negatives + results.false_positives) > 0 - else 0 - ) - - avg_cutoffs = len(results.early_cutoff) / len(results.num_utterances) if len(results.num_utterances) > 0 else 0 - - p80_cutoff = np.percentile(results.early_cutoff, 80) if len(results.early_cutoff) > 0 else 0 - p90_cutoff = np.percentile(results.early_cutoff, 90) if len(results.early_cutoff) > 0 else 0 - p95_cutoff = np.percentile(results.early_cutoff, 95) if len(results.early_cutoff) > 0 else 0 - p99_cutoff = np.percentile(results.early_cutoff, 99) if len(results.early_cutoff) > 0 else 0 - - p80_latency = np.percentile(results.latency, 80) if len(results.latency) > 0 else 0 - p90_latency = np.percentile(results.latency, 90) if len(results.latency) > 0 else 0 - p95_latency = np.percentile(results.latency, 95) if len(results.latency) > 0 else 0 - p99_latency = np.percentile(results.latency, 99) if len(results.latency) > 0 else 0 - - # Print the results - print("======================================") - print("Evaluation Results:") - print(f"Number of utterances: {results.num_utterances}") - print(f"Number of predictions: {results.num_predictions}") - print(f"F1 Score: {f1_score:.4f}") - print("======================================") - print(f"Early cutoff rate: {avg_cutoffs:.4f}") - print(f"Early cutoff P80: {p80_cutoff:.4f} seconds") - print(f"Early cutoff P90: {p90_cutoff:.4f} seconds") - print(f"Early cutoff 
P95: {p95_cutoff:.4f} seconds") - print(f"Early cutoff P99: {p99_cutoff:.4f} seconds") - print("======================================") - print(f"P80 Latency: {p80_latency:.4f} seconds") - print(f"P90 Latency: {p90_latency:.4f} seconds") - print(f"P95 Latency: {p95_latency:.4f} seconds") - print(f"P99 Latency: {p99_latency:.4f} seconds") - - -if __name__ == "__main__": - main() From 551ac6817e208514391e7d5e8908f98daec1ae88 Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Tue, 16 Sep 2025 12:17:48 -0400 Subject: [PATCH 099/107] Potential fix for code scanning alert no. 16191: Explicit returns mixed with implicit (fall through) returns Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> --- examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index 2350412ffb90..0a379ea14e08 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -174,7 +174,7 @@ def get_pretrained_model_name(cfg: DictConfig) -> Optional[str]: return nemo_model_path if pretrained_name: return pretrained_name - + return None def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_path: str, cfg: DictConfig): """ From e504d6cb6d36a766dcb4a0f9565263595320807d Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Tue, 16 Sep 2025 12:18:09 -0400 Subject: [PATCH 100/107] Potential fix for code scanning alert no. 16190: Explicit returns mixed with implicit (fall through) returns Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> --- nemo/collections/asr/models/asr_eou_models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index e4d29e868f9a..f6d2941df267 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -301,16 +301,18 @@ def _aggregate_eou_metrics(self, outputs: List[dict], mode: str, is_ctc: bool = return tensorboard_logs @rank_zero_only - def _maybe_save_predictions(self, outputs: List[Dict], mode: str = "val", dataloader_idx: int = 0): + def _maybe_save_predictions(self, outputs: List[Dict], mode: str = "val", dataloader_idx: int = 0) -> Optional[Path]: """ Save predictions to disk. Args: outputs: list of outputs mode: mode of the model, either 'val' or 'test' + Returns: + Path object if predictions are saved, None otherwise. 
""" if not self.cfg.get('save_pred_to_file', None): - return + return None output_file = Path(self.cfg.save_pred_to_file) output_file.parent.mkdir(parents=True, exist_ok=True) From f11ef31e0c905df21bbcc87e946a14accd071055 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 16 Sep 2025 16:19:04 +0000 Subject: [PATCH 101/107] Apply isort and black reformatting Signed-off-by: stevehuang52 --- examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py | 1 + nemo/collections/asr/models/asr_eou_models.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index 0a379ea14e08..bf31ceba9bc0 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -176,6 +176,7 @@ def get_pretrained_model_name(cfg: DictConfig) -> Optional[str]: return pretrained_name return None + def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_path: str, cfg: DictConfig): """ load the pretrained model from a .nemo file, taking into account the joint network diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index f6d2941df267..1238cb81cf9f 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -301,7 +301,9 @@ def _aggregate_eou_metrics(self, outputs: List[dict], mode: str, is_ctc: bool = return tensorboard_logs @rank_zero_only - def _maybe_save_predictions(self, outputs: List[Dict], mode: str = "val", dataloader_idx: int = 0) -> Optional[Path]: + def _maybe_save_predictions( + self, outputs: List[Dict], mode: str = "val", dataloader_idx: int = 0 + ) -> Optional[Path]: """ Save predictions to disk. Args: From 84e8baad9ebaaf01db061c19baff0dd27b65d1fd Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Tue, 16 Sep 2025 12:20:05 -0400 Subject: [PATCH 102/107] Potential fix for code scanning alert no. 
16185: File is not always closed Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> --- tools/nemo_forced_aligner/align_eou.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/nemo_forced_aligner/align_eou.py b/tools/nemo_forced_aligner/align_eou.py index aef7dbbb5cf2..953b7cbe4c6d 100644 --- a/tools/nemo_forced_aligner/align_eou.py +++ b/tools/nemo_forced_aligner/align_eou.py @@ -524,12 +524,13 @@ def process_single_manifest(cfg: AlignmentConfig, model, buffered_chunk_params, continue # get sou/eou time - lines = [line.split() for line in open(item['segments_level_ctm_filepath'])] - start_time = min([float(line[2]) for line in lines]) - end_time = max([float(line[2]) + float(line[3]) for line in lines]) - input_manifest_lines[i]['sou_time'] = start_time - input_manifest_lines[i]['eou_time'] = end_time - output_manifest_lines.append(input_manifest_lines[i]) + with open(item['segments_level_ctm_filepath']) as f: + lines = [line.split() for line in f] + start_time = min([float(line[2]) for line in lines]) + end_time = max([float(line[2]) + float(line[3]) for line in lines]) + input_manifest_lines[i]['sou_time'] = start_time + input_manifest_lines[i]['eou_time'] = end_time + output_manifest_lines.append(input_manifest_lines[i]) with open(cfg.output_manifest_filepath, 'w') as f: for item in output_manifest_lines: From 389060669d585f8ed99657b5b45e124d6c214524 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Tue, 16 Sep 2025 12:39:39 -0400 Subject: [PATCH 103/107] clean up Signed-off-by: stevehuang52 --- nemo/collections/asr/models/asr_eou_models.py | 1 + tools/nemo_forced_aligner/align_eou.py | 25 ++++++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 1238cb81cf9f..08eda768173d 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -629,6 +629,7 @@ def multi_inference_epoch_end(self, outputs, dataloader_idx: int = 0, mode: str wer_denom = torch.stack([x[f'{mode}_wer_denom'] for x in outputs]).sum() tensorboard_logs = {**loss_log, f'{mode}_wer': wer_num.float() / wer_denom} + eou_metrics = {} if self.cfg.get('calculate_eou_metrics', True): eou_metrics = self._aggregate_eou_metrics(outputs, mode=mode) tensorboard_logs.update(eou_metrics) diff --git a/tools/nemo_forced_aligner/align_eou.py b/tools/nemo_forced_aligner/align_eou.py index 953b7cbe4c6d..54e7bce8ff6f 100644 --- a/tools/nemo_forced_aligner/align_eou.py +++ b/tools/nemo_forced_aligner/align_eou.py @@ -27,9 +27,7 @@ import torch from omegaconf import OmegaConf from utils.data_prep import ( - add_t_start_end_to_utt_obj, get_batch_starts_ends, - get_batch_variables, get_manifest_lines_batch, is_entry_in_all_lines, is_entry_in_any_lines, @@ -37,7 +35,6 @@ from utils.make_ass_files import make_ass_files from utils.make_ctm_files import make_ctm_files from utils.make_output_manifest import write_manifest_out_line -from utils.viterbi_decoding import viterbi_decoding from nemo.collections.asr.models.ctc_models import EncDecCTCModel from nemo.collections.asr.models.hybrid_rnnt_ctc_models import EncDecHybridRNNTCTCModel @@ -46,6 +43,21 @@ from nemo.core.config import hydra_runner from nemo.utils import logging +try: + from nemo.collections.asr.parts.utils.aligner_utils import ( + 
add_t_start_end_to_utt_obj, + get_batch_variables, + viterbi_decoding, + ) +except ImportError: + raise ImportError( + "Missing required dependency for NFA. " + "Install NeMo with NFA utilities support:\n" + " pip install 'nemo_toolkit[all]>=2.5.0'\n" + "Or install the latest development version:\n" + " pip install git+https://github.com/NVIDIA/NeMo.git" + ) + """ Align the utterances in manifest_filepath. Results are saved in ctm files in output_dir as well as json manifest in output_manifest_filepath. @@ -508,7 +520,12 @@ def process_single_manifest(cfg: AlignmentConfig, model, buffered_chunk_params, f_manifest_out.close() # adding eou processing here - input_manifest_lines = [json.loads(line) for line in open(cfg.manifest_filepath)] + input_manifest_lines = [] + with open(cfg.manifest_filepath, 'r') as f: + for line in f.readlines(): + if line.strip(): + input_manifest_lines.append(json.loads(line)) + output_manifest_lines = [] with open(tgt_manifest_filepath, 'r') as f: for i, line in enumerate(f.readlines()): From 26270ceef736641f3585e144ca0a0a12865a0df8 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 8 Oct 2025 15:35:47 -0400 Subject: [PATCH 104/107] clean up Signed-off-by: stevehuang52 --- .../asr/asr_eou/speech_to_text_eou_eval.py | 12 +-- .../asr_eou/speech_to_text_rnnt_eou_train.py | 39 +++++--- ...astconformer_transducer_bpe_streaming.yaml | 2 +- ...rmer_transducer_bpe_streaming_adapter.yaml | 4 +- ...transcribe_speech_manifest_distributed.py} | 50 +++++++--- .../asr/data/audio_to_eou_label_lhotse.py | 2 +- nemo/collections/asr/models/asr_eou_models.py | 22 +++++ .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 10 +- .../collections/asr/models/rnnt_bpe_models.py | 10 +- nemo/collections/asr/modules/__init__.py | 7 +- .../asr/modules/conformer_encoder.py | 91 +++++++++++++++++- .../asr/modules/ssl_modules/__init__.py | 5 +- .../modules/ssl_modules/multi_layer_feat.py | 92 ++----------------- .../asr/parts/submodules/rnnt_decoding.py | 14 +-- .../common/data/lhotse/dataloader.py | 7 +- .../common/data/lhotse/nemo_adapters.py | 1 - .../add_eob_labels.py | 0 .../clean_manifest.py | 5 +- .../conf/data.yaml | 0 .../eval_eou_metrics.py | 61 ++++++++---- .../generate_noisy_eval_data.py | 34 +------ .../add_special_tokens_to_sentencepiece.py | 10 +- .../tokenizers/sentencepiece_model_pb2.py | 0 .../convert_to_tarred_audio_dataset.py | 4 +- 24 files changed, 267 insertions(+), 215 deletions(-) rename examples/asr/{transcribe_speech_distributed.py => transcribe_speech_manifest_distributed.py} (86%) rename scripts/{asr_end_of_utterance => asr_eou}/add_eob_labels.py (100%) rename scripts/{asr_end_of_utterance => asr_eou}/clean_manifest.py (99%) rename scripts/{asr_end_of_utterance => asr_eou}/conf/data.yaml (100%) rename scripts/{asr_end_of_utterance => asr_eou}/eval_eou_metrics.py (70%) rename scripts/{asr_end_of_utterance => asr_eou}/generate_noisy_eval_data.py (95%) rename scripts/{asr_end_of_utterance => asr_eou}/tokenizers/add_special_tokens_to_sentencepiece.py (96%) rename scripts/{asr_end_of_utterance => asr_eou}/tokenizers/sentencepiece_model_pb2.py (100%) diff --git a/examples/asr/asr_eou/speech_to_text_eou_eval.py b/examples/asr/asr_eou/speech_to_text_eou_eval.py index 9b0a580b343c..17d00a385be4 100644 --- a/examples/asr/asr_eou/speech_to_text_eou_eval.py +++ b/examples/asr/asr_eou/speech_to_text_eou_eval.py @@ -16,22 +16,15 @@ Example usage: ```bash -NEMO_PATH=/home/heh/codes/nemo-eou -export PYTHONPATH=$NEMO_PATH:$PYTHONPATH - 
TEST_MANIFEST="[/path/to/your/test_manifest.json,/path/to/your/test_manifest2.json,...]" TEST_NAME="[test_name1,test_name2,...]" TEST_BATCH=32 NUM_WORKERS=8 PRETRAINED_NEMO=/path/to/EOU/model.nemo -SCRIPT=${NEMO_PATH}/examples/asr/asr_eou/speech_to_text_eou_eval.py -CONFIG_PATH=${NEMO_PATH}/examples/asr/conf/asr_eou CONFIG_NAME=fastconformer_transducer_bpe_streaming -export CUDA_VISIBLE_DEVICES=0 && \ -python $SCRIPT \ - --config-path $CONFIG_PATH \ +python speech_to_text_eou_eval.py \ --config-name $CONFIG_NAME \ ++init_from_nemo_model=$PRETRAINED_NEMO \ ~model.train_ds \ @@ -46,8 +39,7 @@ ++model.test_ds.force_finite=true \ ++model.test_ds.shuffle=false \ ++model.test_ds.pin_memory=true \ - exp_manager.name=$EXP_NAME-eval \ - exp_manager.create_wandb_logger=false \ + exp_manager.create_wandb_logger=false ``` """ diff --git a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py index bf31ceba9bc0..81249f703412 100644 --- a/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py +++ b/examples/asr/asr_eou/speech_to_text_rnnt_eou_train.py @@ -15,13 +15,22 @@ """ Example usage: -0. Prepare dataset based on /nemo/collections/asr/data/audio_to_eou_label_lhotse.py - -1. Add special tokens and to the tokenizer of pretrained model, by refering to the script - /scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py - -2. If pretrained model is HybridRNNTCTCBPEModel, convert it to RNNT using the script - /examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py +1. Prepare dataset based on /nemo/collections/asr/data/audio_to_eou_label_lhotse.py + Specifically, each sample in the jsonl manifest should have the following fields: + { + "audio_filepath": "/path/to/audio.wav", + "text": "The text of the audio." + "offset": 0.0, # offset of the audio, in seconds + "duration": 3.0, # duration of the audio, in seconds + "sou_time": 0.2, # start of utterance time, in seconds + "eou_time": 1.5, # end of utterance time, in seconds + } + +2. If using a normal ASR model as initialization: + - Add special tokens and to the tokenizer of pretrained model, by refering to the script + /scripts/asr_eou/tokenizers/add_special_tokens_to_sentencepiece.py + - If pretrained model is HybridRNNTCTCBPEModel, convert it to RNNT using the script + /examples/asr/asr_hybrid_transducer_ctc/helpers/convert_nemo_asr_hybrid_to_ctc.py 3. Run the following command to train the ASR-EOU model: ```bash @@ -179,29 +188,31 @@ def get_pretrained_model_name(cfg: DictConfig) -> Optional[str]: def init_from_pretrained_nemo(model: EncDecRNNTBPEEOUModel, pretrained_model_path: str, cfg: DictConfig): """ - load the pretrained model from a .nemo file, taking into account the joint network + Load the pretrained model from a .nemo file or remote checkpoint. If the pretrained model has exactly + the same vocabulary size as the current model, the whole model will be loaded directly. Otherwise, + the encoder and decoder weights will be loaded separately and the EOU/EOB classes will be handled separately. 
""" if pretrained_model_path.endswith('.nemo'): pretrained_model = ASRModel.restore_from(restore_path=pretrained_model_path) # type: EncDecRNNTBPEModel else: pretrained_model = ASRModel.from_pretrained(pretrained_model_path) # type: EncDecRNNTBPEModel + if not isinstance(pretrained_model, (EncDecRNNTBPEModel, EncDecHybridRNNTCTCBPEModel)): + raise TypeError( + f"Pretrained model {pretrained_model.__class__} is not EncDecRNNTBPEModel or EncDecHybridRNNTCTCBPEModel." + ) + try: model.load_state_dict(pretrained_model.state_dict(), strict=True) logging.info( f"Pretrained model from {pretrained_model_path} has exactly the same model structure, skip further loading." ) return - except Exception as e: + except Exception: logging.warning( f"Pretrained model {pretrained_model_path} has different model structure, try loading weights separately and add EOU/EOB classes." ) - if not isinstance(pretrained_model, (EncDecRNNTBPEModel, EncDecHybridRNNTCTCBPEModel)): - raise TypeError( - f"Pretrained model {pretrained_model.__class__} is not EncDecRNNTBPEModel or EncDecHybridRNNTCTCBPEModel." - ) - # Load encoder state dict into the model model.encoder.load_state_dict(pretrained_model.encoder.state_dict(), strict=True) logging.info(f"Encoder weights loaded from {pretrained_model_path}.") diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml index 3a6be9f336b3..6ec564245b8c 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming.yaml @@ -1,4 +1,4 @@ -# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M) with sub-word encoding. +# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR+EOU model, large size (~115M) with sub-word encoding. # You may find more detail: # FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer diff --git a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml index ef249bb401da..7cc70cf00378 100644 --- a/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml +++ b/examples/asr/conf/asr_eou/fastconformer_transducer_bpe_streaming_adapter.yaml @@ -1,11 +1,11 @@ -# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR model, large size (~115M) with sub-word encoding. +# It contains the default values for training a cache-aware streaming FastConformer-Transducer ASR+EOU model, large size (~115M) with sub-word encoding. 
# You may find more detail: # FastConformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#fast-conformer # Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer # FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml -name: "FastConformer-Transducer-BPE-Streaming-EOU" +name: "FastConformer-Transducer-BPE-Streaming-EOU-adapter" model: token_init_method: "constant" # choices=['min', 'max', 'mean', 'constant'] diff --git a/examples/asr/transcribe_speech_distributed.py b/examples/asr/transcribe_speech_manifest_distributed.py similarity index 86% rename from examples/asr/transcribe_speech_distributed.py rename to examples/asr/transcribe_speech_manifest_distributed.py index 90f73dfa1661..9ecde83b4a8e 100644 --- a/examples/asr/transcribe_speech_distributed.py +++ b/examples/asr/transcribe_speech_manifest_distributed.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -30,13 +30,19 @@ from nemo.utils import logging """ -Transcribe audio file on a single CPU/GPU. Useful for transcription of moderate amounts of audio data. +Transcribe audio manifests on distributed GPUs. Useful for transcription of moderate amounts of audio data. +This script also supports splitting the manifest into chunks and merging the results back together. +This script is a modified version of `transcribe_speech_distributed.py` that only takes manifest files as input. +It is useful for transcribing a large amount of audio data that does not fit into a single job. # Arguments model_path: path to .nemo ASR checkpoint pretrained_name: name of pretrained ASR model (from NGC registry) - audio_dir: path to directory with audio files - dataset_manifest: path to dataset JSON manifest file (in NeMo formats + dataset_manifest: path to dataset JSON manifest file (in NeMo formats), can be a comma-separated list of manifest files + or a directory containing manifest files + pattern: pattern to glob the manifest files if `dataset_manifest` is a directory + output_dir: directory to write the transcriptions + compute_langs: Bool to request language ID information (if the model supports it) timestamps: Bool to request greedy time stamp information (if the model supports it) by default None @@ -67,23 +73,21 @@ # Usage ASR model can be specified by either "model_path" or "pretrained_name". -Data for transcription can be defined with either "audio_dir" or "dataset_manifest". append_pred - optional. Allows you to add more than one prediction to an existing .json pred_name_postfix - optional. The name you want to be written for the current model Results are returned in a JSON manifest file. 
+```bash CUDA_VISIBLE_DEVICES=1 python transcribe_speech_distributed.py \ - model_path=null \ - pretrained_name=null \ - audio_dir="" \ + model_path= \ dataset_manifest="" \ + output_dir="" \ output_filename="" \ clean_groundtruth_text=True \ langid='en' \ batch_size=32 \ timestamps=False \ compute_langs=False \ - cuda=0 \ amp=True \ append_pred=False \ pred_name_postfix="" \ @@ -92,6 +96,16 @@ node_idx=0 \ num_gpus_per_node=1 \ gpu_idx=0 +``` + +If you use Slurm, you can use this params to configure the script: +```bash + gpu_idx=\$SLURM_LOCALID \ + num_gpus_per_node=\$SLURM_GPUS_ON_NODE \ + num_nodes=\$SLURM_JOB_NUM_NODES \ + node_idx=\$SLURM_NODEID +``` + """ @@ -115,14 +129,16 @@ class TranscriptionConfig(SingleTranscribeConfig): output_dir: str = "transcribe_output/" # Distributed config - num_nodes: int = 1 - node_idx: int = 0 - num_gpus_per_node: int = 1 - gpu_idx: int = 0 - bind_gpu_to_cuda: bool = False + num_nodes: int = 1 # total number of nodes + node_idx: int = 0 # index of the current node + num_gpus_per_node: int = 1 # number of GPUs per node + gpu_idx: int = 0 # index of the current GPU + bind_gpu_to_cuda: bool = ( + False # If False, the script will just do .cuda() on the model, otherwise it will do .to(f"cuda:{gpu_idx}") + ) # handle long manifest - split_size: int = -1 # -1 means no split + split_size: int = -1 # -1 means no split, otherwise split the manifest into chunks of this size def get_unfinished_manifest(manifest_list: List[Path], output_dir: Path): @@ -217,6 +233,10 @@ def run_distributed_transcribe(cfg: TranscriptionConfig): logging.info(f"Running distributed transcription with config: {cfg}") + if cfg.dataset_manifest is None: + raise ValueError("`dataset_manifest` is required") + + # load the manifest if isinstance(cfg.dataset_manifest, str) and "," in cfg.dataset_manifest: manifest_list = cfg.dataset_manifest.split(",") elif isinstance(cfg.dataset_manifest, (ListConfig, list)): diff --git a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py index 89e2cebf7f18..2283325d9cd4 100644 --- a/nemo/collections/asr/data/audio_to_eou_label_lhotse.py +++ b/nemo/collections/asr/data/audio_to_eou_label_lhotse.py @@ -54,7 +54,7 @@ def first_supervised_cut(maybe_mixed_cut): return [ t.cut for t in maybe_mixed_cut.tracks - if len(t.cut.supervisions) > 0 and not t.cut.custom.get("is_mixed_noise") + if len(t.cut.supervisions) > 0 and not t.cut.custom.get("is_mixed_noise", False) ][0] return maybe_mixed_cut diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 08eda768173d..97b6d69af2a1 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -57,6 +57,25 @@ class EOUPrediction: class ASREOUModelMixin: + def __init__(self): + if not hasattr(self, 'tokenizer'): + self.tokenizer = None + if not hasattr(self, 'eou_token'): + self.eou_token = None + if not hasattr(self, 'eob_token'): + self.eob_token = None + if not hasattr(self, 'frame_len_in_secs'): + self.frame_len_in_secs = None + + def setup_eou_mixin(self, eou_token: int, eob_token: int, frame_len_in_secs: float, tokenizer): + if getattr(self, 'eou_token', None) is None: + self.eou_token = eou_token + if getattr(self, 'eob_token', None) is None: + self.eob_token = eob_token + if getattr(self, 'frame_len_in_secs', None) is None: + self.frame_len_in_secs = frame_len_in_secs + if getattr(self, 'tokenizer', None) is None: + self.tokenizer = tokenizer def 
_patch_decoding_cfg(self, cfg: DictConfig): """ @@ -357,6 +376,8 @@ def __init__(self, cfg: DictConfig, trainer): self.eob_token = self.tokenizer.token_to_id(EOB_STRING) self.frame_len_in_secs = self.cfg.preprocessor.window_stride * self.cfg.encoder.subsampling_factor + self.setup_eou_mixin(self.eou_token, self.eob_token, self.frame_len_in_secs, self.tokenizer) + self.wer = WER( decoding=self.decoding, batch_dim_index=0, @@ -684,6 +705,7 @@ def __init__(self, cfg: DictConfig, trainer): self.eou_token = self.tokenizer.token_to_id(EOU_STRING) self.eob_token = self.tokenizer.token_to_id(EOB_STRING) self.frame_len_in_secs = self.cfg.preprocessor.window_stride * self.cfg.encoder.subsampling_factor + self.setup_eou_mixin(self.eou_token, self.eob_token, self.frame_len_in_secs, self.tokenizer) self.wer = WER( decoding=self.decoding, diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index e47945addaf0..cd04a5ad2462 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -138,11 +138,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): - dataset = LhotseSpeechToTextBpeDataset( - tokenizer=self.tokenizer, - return_cuts=config.get("do_transcribe", False), - ) - return get_lhotse_dataloader_from_config( config, # During transcription, the model is initially loaded on the CPU. @@ -150,7 +145,10 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # these values must be passed from the configuration. global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), - dataset=dataset, + dataset=LhotseSpeechToTextBpeDataset( + tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), + ), tokenizer=self.tokenizer, ) diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index 31c8e7ce7c90..cd8667f2f0fe 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -507,11 +507,6 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig, verbose: bool = Tru def _setup_dataloader_from_config(self, config: Optional[Dict]): if config.get("use_lhotse"): - dataset = LhotseSpeechToTextBpeDataset( - tokenizer=self.tokenizer, - return_cuts=config.get("do_transcribe", False), - ) - return get_lhotse_dataloader_from_config( config, # During transcription, the model is initially loaded on the CPU. @@ -519,7 +514,10 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # these values must be passed from the configuration. 
global_rank=self.global_rank if not config.get("do_transcribe", False) else config.get("global_rank"), world_size=self.world_size if not config.get("do_transcribe", False) else config.get("world_size"), - dataset=dataset, + dataset=LhotseSpeechToTextBpeDataset( + tokenizer=self.tokenizer, + return_cuts=config.get("do_transcribe", False), + ), tokenizer=self.tokenizer, ) diff --git a/nemo/collections/asr/modules/__init__.py b/nemo/collections/asr/modules/__init__.py index 14abdd0d2776..097d3e29d8de 100644 --- a/nemo/collections/asr/modules/__init__.py +++ b/nemo/collections/asr/modules/__init__.py @@ -20,7 +20,11 @@ SpectrogramAugmentation, ) from nemo.collections.asr.modules.beam_search_decoder import BeamSearchDecoderWithLM -from nemo.collections.asr.modules.conformer_encoder import ConformerEncoder, ConformerEncoderAdapter +from nemo.collections.asr.modules.conformer_encoder import ( + ConformerEncoder, + ConformerEncoderAdapter, + ConformerMultiLayerFeatureExtractor, +) from nemo.collections.asr.modules.conv_asr import ( ConvASRDecoder, ConvASRDecoderClassification, @@ -45,7 +49,6 @@ ) from nemo.collections.asr.modules.squeezeformer_encoder import SqueezeformerEncoder, SqueezeformerEncoderAdapter from nemo.collections.asr.modules.ssl_modules import ( - ConformerMultiLayerFeatureExtractor, ConformerMultiLayerFeaturePreprocessor, ConvFeatureMaksingWrapper, MultiSoftmaxDecoder, diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index 0c6be0719e7a..fd99da58df52 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -16,7 +16,7 @@ import random from collections import OrderedDict from dataclasses import dataclass -from typing import List, Optional, Set, Tuple +from typing import Callable, List, Optional, Set, Tuple import torch import torch.distributed @@ -56,7 +56,7 @@ ) from nemo.utils import logging -__all__ = ['ConformerEncoder'] +__all__ = ['ConformerEncoder', 'ConformerMultiLayerFeatureExtractor'] class ConformerEncoder(NeuralModule, StreamingEncoder, Exportable, AccessMixin): @@ -1289,3 +1289,90 @@ class ConformerChangeConfig: # corresponding to left and right context, or -1 for full context. # If None is provided, the attention context size isn't changed. att_context_size: Optional[List[int]] = None + + +class ConformerMultiLayerFeatureExtractor(NeuralModule, Exportable, AccessMixin): + def __init__( + self, + encoder: ConformerEncoder, + aggregator: Optional[Callable] = None, + layer_idx_list: Optional[List[int]] = None, + ): + """ + This class is used to extract features from different layers of the ConformerEncoder. + Args: + encoder: ConformerEncoder instance. + aggregator: Aggregator instance. If None, the features are returned as a list. + layer_idx_list: List of layer indices to extract features from. If None, all layers are extracted. 
+ """ + super().__init__() + self.encoder = encoder + self.aggregator = aggregator + self.num_layers = len(encoder.layers) + self.layer_idx_list = [] + if not layer_idx_list: + layer_idx_list = list(range(self.num_layers)) + for lid in layer_idx_list: + if lid < -self.num_layers or lid >= self.num_layers: + raise ValueError(f"Invalid layer index {lid} for ConformerEncoder with {self.num_layers} layers.") + if lid < 0: + lid = self.num_layers + lid + self.layer_idx_list.append(lid) + self.layer_idx_list.sort() + logging.info(f"Extracting features from layers: {self.layer_idx_list}") + self.access_cfg = { + "interctc": { + "capture_layers": self.layer_idx_list, + }, + "detach": False, + "convert_to_cpu": False, + } + self._is_access_enabled = False + + def forward( + self, audio_signal, length, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + same interface as ConformerEncoder.forward() + Returns: + tuple of aggregated features of shape [B, D, T] and lengths of shape [B] + """ + self.encoder.update_access_cfg(self.access_cfg, guid=getattr(self, "model_guid", None)) + self.encoder.set_access_enabled(access_enabled=True, guid=getattr(self, "model_guid", None)) + + _ = self.encoder( + audio_signal=audio_signal, + length=length, + cache_last_channel=cache_last_channel, + cache_last_time=cache_last_time, + cache_last_channel_len=cache_last_channel_len, + ) + + total_registry = {} + for module_registry in self.encoder.get_module_registry(self.encoder).values(): + for key in module_registry: + if key.startswith("interctc/") and key in total_registry: + raise RuntimeError(f"layer {key} has been logged multiple times!") + total_registry.update(module_registry) + + encoded_list = [] + encoded_len_list = [] + for layer_idx in self.layer_idx_list: + try: + layer_outputs = total_registry[f"interctc/layer_output_{layer_idx}"] + layer_lengths = total_registry[f"interctc/layer_length_{layer_idx}"] + except KeyError: + raise RuntimeError( + f"Intermediate layer {layer_idx} was not captured! Check the layer index and the number of " + "ConformerEncoder layers." 
+ ) + if len(layer_outputs) > 1 or len(layer_lengths) > 1: + raise RuntimeError("Make sure encoder.forward is called exactly one time") + encoded_list.append(layer_outputs[0]) # [B, D, T] + encoded_len_list.append(layer_lengths[0]) # [B] + + self.encoder.reset_registry() + if self.aggregator is None: + return encoded_list, encoded_len_list + return self.aggregator(encoded_list, encoded_len_list) diff --git a/nemo/collections/asr/modules/ssl_modules/__init__.py b/nemo/collections/asr/modules/ssl_modules/__init__.py index dcfefd54fa73..629c68653772 100644 --- a/nemo/collections/asr/modules/ssl_modules/__init__.py +++ b/nemo/collections/asr/modules/ssl_modules/__init__.py @@ -17,9 +17,6 @@ SpeakerNoiseAugmentation, ) from nemo.collections.asr.modules.ssl_modules.masking import ConvFeatureMaksingWrapper, RandomBlockMasking -from nemo.collections.asr.modules.ssl_modules.multi_layer_feat import ( - ConformerMultiLayerFeatureExtractor, - ConformerMultiLayerFeaturePreprocessor, -) +from nemo.collections.asr.modules.ssl_modules.multi_layer_feat import ConformerMultiLayerFeaturePreprocessor from nemo.collections.asr.modules.ssl_modules.multi_softmax_decoder import MultiSoftmaxDecoder from nemo.collections.asr.modules.ssl_modules.quantizers import RandomProjectionVectorQuantizer diff --git a/nemo/collections/asr/modules/ssl_modules/multi_layer_feat.py b/nemo/collections/asr/modules/ssl_modules/multi_layer_feat.py index b9af8a7105b1..40f16c5a0be1 100644 --- a/nemo/collections/asr/modules/ssl_modules/multi_layer_feat.py +++ b/nemo/collections/asr/modules/ssl_modules/multi_layer_feat.py @@ -18,7 +18,11 @@ import torch.distributed import torch.nn as nn -from nemo.collections.asr.modules import AudioToMelSpectrogramPreprocessor, ConformerEncoder +from nemo.collections.asr.modules import ( + AudioToMelSpectrogramPreprocessor, + ConformerEncoder, + ConformerMultiLayerFeatureExtractor, +) from nemo.core.classes import Exportable, NeuralModule from nemo.core.classes.mixins import AccessMixin from nemo.utils import logging @@ -81,88 +85,12 @@ def forward( raise ValueError(f"Unknown mode {self.mode}") -class ConformerMultiLayerFeatureExtractor(NeuralModule, Exportable): - def __init__(self, encoder, aggregator: Optional[Callable] = None, layer_idx_list: Optional[List[int]] = None): - """ - Args: - encoder: ConformerEncoder instance. - aggregator: Aggregator instance. - layer_idx_list: List of layer indices to extract features from. 
- """ - super().__init__() - self.encoder = encoder - self.aggregator = aggregator - self.num_layers = len(encoder.layers) - self.layer_idx_list = [] - if not layer_idx_list: - layer_idx_list = list(range(self.num_layers)) - for lid in layer_idx_list: - if lid < -self.num_layers or lid >= self.num_layers: - raise ValueError(f"Invalid layer index {lid} for ConformerEncoder with {self.num_layers} layers.") - if lid < 0: - lid = self.num_layers + lid - self.layer_idx_list.append(lid) - self.layer_idx_list.sort() - logging.info(f"Extracting features from layers {self.layer_idx_list}") - self.access_cfg = { - "interctc": { - "capture_layers": self.layer_idx_list, - }, - "detach": False, - "convert_to_cpu": False, - } - self._is_access_enabled = False - - def forward( - self, audio_signal, length, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None - ) -> Tuple[torch.Tensor, torch.Tensor]: - """ - Args: - same interface as ConformerEncoder.forward() - Returns: - tuple of aggregated features of shape [B, D, T] and lengths of shape [B] - """ - self.encoder.update_access_cfg(self.access_cfg, guid=getattr(self, "model_guid", None)) - self.encoder.set_access_enabled(access_enabled=True, guid=getattr(self, "model_guid", None)) - - _ = self.encoder( - audio_signal=audio_signal, - length=length, - cache_last_channel=cache_last_channel, - cache_last_time=cache_last_time, - cache_last_channel_len=cache_last_channel_len, - ) - - total_registry = {} - for module_registry in self.encoder.get_module_registry(self.encoder).values(): - for key in module_registry: - if key.startswith("interctc/") and key in total_registry: - raise RuntimeError(f"layer {key} has been logged multiple times!") - total_registry.update(module_registry) - - encoded_list = [] - encoded_len_list = [] - for layer_idx in self.layer_idx_list: - try: - layer_outputs = total_registry[f"interctc/layer_output_{layer_idx}"] - layer_lengths = total_registry[f"interctc/layer_length_{layer_idx}"] - except KeyError: - raise RuntimeError( - f"Intermediate layer {layer_idx} was not captured! Check the layer index and the number of " - "ConformerEncoder layers." - ) - if len(layer_outputs) > 1 or len(layer_lengths) > 1: - raise RuntimeError("Make sure encoder.forward is called exactly one time") - encoded_list.append(layer_outputs[0]) # [B, D, T] - encoded_len_list.append(layer_lengths[0]) # [B] - - self.encoder.reset_registry() - if self.aggregator is None: - return encoded_list, encoded_len_list - return self.aggregator(encoded_list, encoded_len_list) - - class ConformerMultiLayerFeaturePreprocessor(NeuralModule, Exportable, AccessMixin): + """ + This class is used to replace the AudioToMelSpectrogramPreprocessor such that + the input to the actual model encoder is the multi-layer features from a pre-trained ConformerEncoder. + """ + def __init__( self, aggregator: nn.Module, diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index 0b279dfa8648..50f0f2eb7f62 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -1038,15 +1038,15 @@ def compute_rnnt_timestamps(self, hypothesis: Hypothesis, timestamp_type: str = # Assert number of offsets and hypothesis tokens are 1:1 match. 
num_flattened_tokens = 0 for t in range(len(char_offsets)): - # Subtract one here for the extra RNNT BLANK token emitted to designate "End of timestep" - num_flattened_tokens += len(char_offsets[t]['char']) - 1 - if char_offsets[t]['char'][-1] != self.blank_id: - num_flattened_tokens += 1 # Add one back if it reaches max steps without blank token + # Count all tokens except for RNNT BLANK token emitted to designate "End of timestep" + num_flattened_tokens += len([c for c in char_offsets[t]['char'] if c != self.blank_id]) if num_flattened_tokens != len(hypothesis.text): - raise RuntimeError( - f"Number of tokens in hypothesis ({len(hypothesis.text)}) does not match the number of offsets " - f"({num_flattened_tokens}). Please check the hypothesis and offsets manually." + raise ValueError( + f"`char_offsets`: {char_offsets} and `processed_tokens`: {hypothesis.text}" + " have to be of the same length, but are: " + f"`len(offsets)`: {num_flattened_tokens} and `len(processed_tokens)`:" + f" {len(hypothesis.text)}" ) encoded_char_offsets = copy.deepcopy(char_offsets) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index bc8624e687bc..4db49ee9cd39 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -495,7 +495,6 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No cuts = cuts.map(partial(tokenize, tokenizer=tokenizer), apply_fn=None) # 2. Optional augmentations. - # 2.a. Noise mixing. if config.noise_path is not None: noise = guess_parse_cutset(config.noise_path) @@ -505,6 +504,12 @@ def mark_as_mixed_in_noise(cut): cut.is_mixed_noise = True return cut + # In current lhotse implementation, if padding is applied before noise augmentation, + # and your noise manifest has dummy text field like `"text": "-"`, + # the call to MixCut.first_non_padding_cut will return the noise cut + # instead of the speech cut. 
We mark the noise cut with ``is_mixed_noise`` flag + # to avoid this issue, and the speech cut can be obtained by: + # `cut =[t.cut for t in cut.tracks if len(t.cut.supervisions) > 0 and not t.cut.custom.get("is_mixed_noise", False)][0]` noise = noise.map(mark_as_mixed_in_noise) cuts = cuts.mix( cuts=noise, diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index e18d52765597..eeca509874f3 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -119,7 +119,6 @@ def __iter__(self) -> Generator[Cut, None, None]: cut = self._create_cut( audio_path=audio_path, offset=offset, duration=duration, sampling_rate=data.pop("sampling_rate", None) ) - assert isinstance(cut, MonoCut) # Note that start=0 and not start=offset because supervision's start if relative to the # start of the cut; and cut.start is already set to offset cut.supervisions.append( diff --git a/scripts/asr_end_of_utterance/add_eob_labels.py b/scripts/asr_eou/add_eob_labels.py similarity index 100% rename from scripts/asr_end_of_utterance/add_eob_labels.py rename to scripts/asr_eou/add_eob_labels.py diff --git a/scripts/asr_end_of_utterance/clean_manifest.py b/scripts/asr_eou/clean_manifest.py similarity index 99% rename from scripts/asr_end_of_utterance/clean_manifest.py rename to scripts/asr_eou/clean_manifest.py index 99a90d397714..54766c4f321e 100644 --- a/scripts/asr_end_of_utterance/clean_manifest.py +++ b/scripts/asr_eou/clean_manifest.py @@ -24,7 +24,6 @@ """ import argparse -import datetime import re import unicodedata from pathlib import Path @@ -374,7 +373,7 @@ def convert_number(match): if is_dollar: words += ' dollars' return words + " " - except: + except Exception: return original # Return original if conversion fails # Pattern matches: $3,000 or 3,000.45 or 1234 @@ -462,7 +461,7 @@ def clean_label(_str: str) -> str: text = text.replace("\r", " ") text = text.replace("\t", " ") - ret = " ".join(_str.split()) + ret = " ".join(text.split()) return ret diff --git a/scripts/asr_end_of_utterance/conf/data.yaml b/scripts/asr_eou/conf/data.yaml similarity index 100% rename from scripts/asr_end_of_utterance/conf/data.yaml rename to scripts/asr_eou/conf/data.yaml diff --git a/scripts/asr_end_of_utterance/eval_eou_metrics.py b/scripts/asr_eou/eval_eou_metrics.py similarity index 70% rename from scripts/asr_end_of_utterance/eval_eou_metrics.py rename to scripts/asr_eou/eval_eou_metrics.py index 1410dbdf27b5..9c96afff98b7 100644 --- a/scripts/asr_end_of_utterance/eval_eou_metrics.py +++ b/scripts/asr_eou/eval_eou_metrics.py @@ -15,28 +15,45 @@ """ Example usage: -The NIVA_PRED_ROOT and REFERENCE_ROOT directories should have the following structure: +The PREDICTION_ROOT and REFERENCE_ROOT directories should have the following structure: -: +: ->dataset1/ eou/ - ctm/ + -> sample1.json + -> sample2.json ->dataset2/ eou/ - ctm/ + -> sample1.json + -> sample2.json : ->dataset1/ + -> sample1.json + -> sample2.json ->dataset2/ - - + -> sample1.json + -> sample2.json + + +each sample.json should contain a list of dictionaries with the following fields: +{ + "session_id": str, + "start_time": float, # start time in seconds + "end_time": float, # end time in seconds + "words": str, # transcription of the utterance + "audio_filepath": str, # only in prediction + "eou_prob": float, # only in prediction, probability of EOU in range [0.1] + "eou_pred": bool, # only in prediction + "full_text": str, # only 
in prediction, which is the full transcription up to the end_time +} + ```bash python eval_eou_with_niva.py \ - --prediction $NIVA_PRED_ROOT \ + --prediction $PREDICTION_ROOT \ --reference $REFERENCE_ROOT \ --multiple ``` - """ @@ -67,6 +84,11 @@ action="store_true", help="Whether to evaluate end of backchannel predictions.", ) +parser.add_argument( + "--ignore_eob", + action="store_true", + help="Whether to ignore end of backchannel predictions.", +) parser.add_argument( "--multiple", action="store_true", @@ -74,26 +96,27 @@ ) -def load_segLST(directory: str, use_eob: bool = False) -> dict: +def load_segLST(directory: str, use_eob: bool = False, ignore_eob: bool = False) -> dict: json_files = list(Path(directory).glob("*.json")) segLST = {} for json_file in json_files: key = json_file.stem with open(json_file, 'r') as f: data = json.load(f) - is_backchannel = data[0].get("is_backchannel", False) if data else False - if not isinstance(is_backchannel, list): - is_backchannel = [is_backchannel] - if any(is_backchannel) and not use_eob: - continue + assert isinstance(data, list), f"Data in {json_file} is not a list." + if not ignore_eob: + # get the data with the correct eob label + data = [x for x in data if (x.get("is_backchannel", False) == use_eob)] segLST[key] = data return segLST -def evaluate_eou_predictions(prediction_dir: str, reference_dir: str, use_eob: bool = False) -> List[EOUResult]: +def evaluate_eou_predictions( + prediction_dir: str, reference_dir: str, use_eob: bool = False, ignore_eob: bool = False +) -> List[EOUResult]: prediction_dir = Path(prediction_dir) / "eou" - prediction_segLST = load_segLST(prediction_dir, use_eob) - reference_segLST = load_segLST(reference_dir, use_eob) + prediction_segLST = load_segLST(prediction_dir, use_eob, ignore_eob) + reference_segLST = load_segLST(reference_dir, use_eob, ignore_eob) eou_metrics = [] for key, reference in reference_segLST.items(): @@ -143,7 +166,9 @@ def evaluate_eou_predictions(prediction_dir: str, reference_dir: str, use_eob: b raise ValueError( f"Reference directory {ref_dir} and prediction directory {pred_dir} must have the same name." 
) - results = evaluate_eou_predictions(prediction_dir=str(pred_dir), reference_dir=str(ref_dir), use_eob=args.eob) + results = evaluate_eou_predictions( + prediction_dir=str(pred_dir), reference_dir=str(ref_dir), use_eob=args.eob, ignore_eob=args.ignore_eob + ) # Print the results print("==========================================") print(f"Evaluation Results for: {pred_dir} against {ref_dir}") diff --git a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py b/scripts/asr_eou/generate_noisy_eval_data.py similarity index 95% rename from scripts/asr_end_of_utterance/generate_noisy_eval_data.py rename to scripts/asr_eou/generate_noisy_eval_data.py index eda952c75db8..daf3f807d4aa 100644 --- a/scripts/asr_end_of_utterance/generate_noisy_eval_data.py +++ b/scripts/asr_eou/generate_noisy_eval_data.py @@ -59,38 +59,6 @@ from nemo.core.config import hydra_runner from nemo.utils import logging -# Dummy labels for the dummy tokenizer -labels = [ - " ", - "a", - "b", - "c", - "d", - "e", - "f", - "g", - "h", - "i", - "j", - "k", - "l", - "m", - "n", - "o", - "p", - "q", - "r", - "s", - "t", - "u", - "v", - "w", - "x", - "y", - "z", - "'", -] - @hydra_runner(config_path="conf/", config_name="data") def main(cfg): @@ -174,7 +142,7 @@ def process_manifest(data_cfg, output_dir): pre_pad_dur = None # Load the dataset - tokenizer = parsers.make_parser(labels) # dummy tokenizer + tokenizer = parsers.make_parser() # dummy tokenizer dataset = LhotseSpeechToTextBpeEOUDataset(cfg=data_cfg, tokenizer=tokenizer, return_cuts=True) dataloader = get_lhotse_dataloader_from_config( diff --git a/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py b/scripts/asr_eou/tokenizers/add_special_tokens_to_sentencepiece.py similarity index 96% rename from scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py rename to scripts/asr_eou/tokenizers/add_special_tokens_to_sentencepiece.py index 1f60eaeea51b..cc4f86d0a71b 100644 --- a/scripts/asr_end_of_utterance/tokenizers/add_special_tokens_to_sentencepiece.py +++ b/scripts/asr_eou/tokenizers/add_special_tokens_to_sentencepiece.py @@ -107,7 +107,8 @@ def edit_spt_model(input_file, output_dir, tokens, is_userdefined, extract_only= token_type = 4 model = spt.ModelProto() - model.ParseFromString(open(input_file, 'rb').read()) + with open(input_file, 'rb') as f: + model.ParseFromString(f.read()) if not extract_only: for token in tokens: @@ -126,7 +127,7 @@ def edit_spt_model(input_file, output_dir, tokens, is_userdefined, extract_only= id = sp.piece_to_id(token) logging.info(f"Created token '{token}' at ID {id}") logging.info(f"New tokenizer vocab size: {sp.get_piece_size()}") - except: + except Exception: logging.error( "Could not appropriately configure new tokenizer. Verify if the special tokens already exist." 
) @@ -155,9 +156,8 @@ def edit_spt_model(input_file, output_dir, tokens, is_userdefined, extract_only= # skip special tokens continue token = piece[1:] if piece.startswith("▁") else f"##{piece}" - if len(token) == 0: - tokens = piece[0] - f.write(f"{token}\n") # Format follows the original vocab format + if len(token) > 0: + f.write(f"{token}\n") # Format follows the original vocab format logging.info(f"Created new tokenizer vocab at: {vocab_txt_file}") diff --git a/scripts/asr_end_of_utterance/tokenizers/sentencepiece_model_pb2.py b/scripts/asr_eou/tokenizers/sentencepiece_model_pb2.py similarity index 100% rename from scripts/asr_end_of_utterance/tokenizers/sentencepiece_model_pb2.py rename to scripts/asr_eou/tokenizers/sentencepiece_model_pb2.py diff --git a/scripts/speech_recognition/convert_to_tarred_audio_dataset.py b/scripts/speech_recognition/convert_to_tarred_audio_dataset.py index eaa7f1f37bba..50c0de65985b 100644 --- a/scripts/speech_recognition/convert_to_tarred_audio_dataset.py +++ b/scripts/speech_recognition/convert_to_tarred_audio_dataset.py @@ -85,7 +85,7 @@ from dataclasses import dataclass, field from datetime import datetime from io import BytesIO -from typing import Any, List, Optional +from typing import Any, List, Optional, Union import numpy as np import soundfile @@ -563,7 +563,7 @@ def create_concatenated_dataset( metadata_yaml = OmegaConf.structured(metadata) OmegaConf.save(metadata_yaml, new_metadata_path, resolve=True) - def _read_manifest(self, manifest_path: str | list, config: ASRTarredDatasetConfig): + def _read_manifest(self, manifest_path: Union[str, List[str]], config: ASRTarredDatasetConfig): """Read and filters data from the manifest""" entries = [] total_duration = 0.0 From e8844701d160ae5fd4dbc7b24ff6234de7294fe6 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 8 Oct 2025 15:47:14 -0400 Subject: [PATCH 105/107] fix pylint&flake8 Signed-off-by: stevehuang52 --- nemo/collections/asr/data/audio_to_text_dataset.py | 3 --- nemo/collections/asr/models/asr_eou_models.py | 2 +- .../asr_eou/tokenizers/sentencepiece_model_pb2.py | 14 ++++++++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index 968f78ddd08c..afb7a86a5f0e 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -32,7 +32,6 @@ get_hf_audio_to_text_char_dataset, ) from nemo.collections.asr.parts.preprocessing.perturb import AudioAugmentor, process_augmentations -from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common.data.dataset import CodeSwitchedDataset, ConcatDataset from nemo.collections.common.tokenizers import TokenizerSpec from nemo.utils import logging @@ -867,8 +866,6 @@ def write_on_batch_end( for sample_id, hypotheses in prediction: item = {} - if isinstance(transcribed_text, Hypothesis): - transcribed_text = transcribed_text.text if isinstance(sample_id, lhotse.cut.Cut): sample = sample_id if isinstance(sample, lhotse.cut.MixedCut): diff --git a/nemo/collections/asr/models/asr_eou_models.py b/nemo/collections/asr/models/asr_eou_models.py index 97b6d69af2a1..2a42bbb85410 100644 --- a/nemo/collections/asr/models/asr_eou_models.py +++ b/nemo/collections/asr/models/asr_eou_models.py @@ -184,7 +184,7 @@ def _pad_to_same_length(self, eou_labels: List[float], eou_preds: List[float]) - def _calculate_eou_metrics( self, eou_predictions: List[EOUPrediction], 
batch: AudioToTextEOUBatch - ) -> Tuple[List, List]: + ) -> Tuple[List[EOUResult], List[EOUResult]]: """ Calculate EOU metrics. Args: diff --git a/scripts/asr_eou/tokenizers/sentencepiece_model_pb2.py b/scripts/asr_eou/tokenizers/sentencepiece_model_pb2.py index 0ea467f28d15..cb97411349aa 100644 --- a/scripts/asr_eou/tokenizers/sentencepiece_model_pb2.py +++ b/scripts/asr_eou/tokenizers/sentencepiece_model_pb2.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Generated by the protocol buffer compiler. DO NOT EDIT! # source: sentencepiece_model.proto From 38f5b93f83a5bc02dde0bba259cc4f4d5980a201 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 8 Oct 2025 15:56:28 -0400 Subject: [PATCH 106/107] fix pylint Signed-off-by: stevehuang52 --- nemo/collections/asr/modules/__init__.py | 38 +++++++++++++++++++ .../asr/modules/ssl_modules/__init__.py | 10 +++++ .../modules/ssl_modules/multi_layer_feat.py | 3 +- 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/modules/__init__.py b/nemo/collections/asr/modules/__init__.py index 097d3e29d8de..310b89bf9f8c 100644 --- a/nemo/collections/asr/modules/__init__.py +++ b/nemo/collections/asr/modules/__init__.py @@ -55,3 +55,41 @@ RandomBlockMasking, RandomProjectionVectorQuantizer, ) + + +__all__ = [ + 'AudioToMelSpectrogramPreprocessor', + 'AudioToMFCCPreprocessor', + 'CropOrPadSpectrogramAugmentation', + 'MaskedPatchAugmentation', + 'SpectrogramAugmentation', + 'BeamSearchDecoderWithLM', + 'ConformerEncoder', + 'ConformerEncoderAdapter', + 'ConformerMultiLayerFeatureExtractor', + 'ConvASRDecoder', + 'ConvASRDecoderClassification', + 'ConvASRDecoderReconstruction', + 'ConvASREncoder', + 'ConvASREncoderAdapter', + 'ECAPAEncoder', + 'ParallelConvASREncoder', + 'SpeakerDecoder', + 'ViterbiDecoderWithGraph', + 'HATJoint', + 'LSTMDecoder', + 'MSDD_module', + 'RNNEncoder', + 'RNNTDecoder', + 'RNNTDecoderJointSSL', + 'RNNTJoint', + 'SampledRNNTJoint', + 'StatelessTransducerDecoder', + 'SqueezeformerEncoder', + 'SqueezeformerEncoderAdapter', + 'ConformerMultiLayerFeaturePreprocessor', + 'ConvFeatureMaksingWrapper', + 'MultiSoftmaxDecoder', + 'RandomBlockMasking', + 'RandomProjectionVectorQuantizer', +] diff --git a/nemo/collections/asr/modules/ssl_modules/__init__.py b/nemo/collections/asr/modules/ssl_modules/__init__.py index 629c68653772..f33127bd7d85 100644 --- a/nemo/collections/asr/modules/ssl_modules/__init__.py +++ b/nemo/collections/asr/modules/ssl_modules/__init__.py @@ -20,3 +20,13 @@ from nemo.collections.asr.modules.ssl_modules.multi_layer_feat import ConformerMultiLayerFeaturePreprocessor from nemo.collections.asr.modules.ssl_modules.multi_softmax_decoder import MultiSoftmaxDecoder from nemo.collections.asr.modules.ssl_modules.quantizers import RandomProjectionVectorQuantizer + +__all__ = [ + 'MultiSpeakerNoiseAugmentation', + 'SpeakerNoiseAugmentation', + 'ConvFeatureMaksingWrapper', + 'RandomBlockMasking', + 
'ConformerMultiLayerFeaturePreprocessor', + 'MultiSoftmaxDecoder', + 'RandomProjectionVectorQuantizer', +] diff --git a/nemo/collections/asr/modules/ssl_modules/multi_layer_feat.py b/nemo/collections/asr/modules/ssl_modules/multi_layer_feat.py index 40f16c5a0be1..73ca41438437 100644 --- a/nemo/collections/asr/modules/ssl_modules/multi_layer_feat.py +++ b/nemo/collections/asr/modules/ssl_modules/multi_layer_feat.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable, List, Optional, Tuple +from typing import List, Optional, Tuple import torch import torch.distributed @@ -25,7 +25,6 @@ ) from nemo.core.classes import Exportable, NeuralModule from nemo.core.classes.mixins import AccessMixin -from nemo.utils import logging class Aggregator(nn.Module): From ab77c22d624ad6e6963106a5a7640daba8815e09 Mon Sep 17 00:00:00 2001 From: stevehuang52 Date: Wed, 8 Oct 2025 16:03:12 -0400 Subject: [PATCH 107/107] refactor Signed-off-by: stevehuang52 --- .../asr/modules/conformer_encoder.py | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index fd99da58df52..36371e6cf6a7 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -1265,32 +1265,6 @@ def get_accepted_adapter_types( return types -# Register any additional information -if adapter_mixins.get_registered_adapter(ConformerEncoder) is None: - adapter_mixins.register_adapter(base_class=ConformerEncoder, adapter_class=ConformerEncoderAdapter) - - -@dataclass -class ConformerChangeConfig: - """ - Change self_attention_model for Conformer. - - Options: - 'rel_pos': relative positional embedding and Transformer-XL - 'rel_pos_local_attn': relative positional embedding and Transformer-XL with local attention using - overlapping chunks. Attention context is determined by att_context_size parameter. - 'abs_pos': absolute positional embedding and Transformer - """ - - # If None is provided, self_attention_model is not changed. - self_attention_model: Optional[str] = None - - # Change the attention context size by providing 2 integers, - # corresponding to left and right context, or -1 for full context. - # If None is provided, the attention context size isn't changed. - att_context_size: Optional[List[int]] = None - - class ConformerMultiLayerFeatureExtractor(NeuralModule, Exportable, AccessMixin): def __init__( self, @@ -1376,3 +1350,29 @@ def forward( if self.aggregator is None: return encoded_list, encoded_len_list return self.aggregator(encoded_list, encoded_len_list) + + +# Register any additional information +if adapter_mixins.get_registered_adapter(ConformerEncoder) is None: + adapter_mixins.register_adapter(base_class=ConformerEncoder, adapter_class=ConformerEncoderAdapter) + + +@dataclass +class ConformerChangeConfig: + """ + Change self_attention_model for Conformer. + + Options: + 'rel_pos': relative positional embedding and Transformer-XL + 'rel_pos_local_attn': relative positional embedding and Transformer-XL with local attention using + overlapping chunks. Attention context is determined by att_context_size parameter. + 'abs_pos': absolute positional embedding and Transformer + """ + + # If None is provided, self_attention_model is not changed. 
+ self_attention_model: Optional[str] = None + + # Change the attention context size by providing 2 integers, + # corresponding to left and right context, or -1 for full context. + # If None is provided, the attention context size isn't changed. + att_context_size: Optional[List[int]] = None
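
For context on the two fields that `ConformerChangeConfig` carries, the sketch below shows how they are typically applied to a trained Conformer model at inference time. This is a minimal, hedged example and not part of the patch series: the checkpoint name is illustrative only, and the `change_attention_model` helper is assumed to be available on Conformer-based NeMo ASR models (its exact signature may differ).

```python
# Hypothetical usage sketch -- not part of this patch series.
# Assumes a Conformer-based NeMo ASR checkpoint and the change_attention_model
# helper; the model name below is illustrative.
import nemo.collections.asr as nemo_asr

asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_fastconformer_transducer_large")

# Mirrors ConformerChangeConfig(self_attention_model="rel_pos_local_attn",
# att_context_size=[128, 128]): switch to chunked local attention with a
# 128-frame context on each side (-1 would keep full context).
asr_model.change_attention_model(
    self_attention_model="rel_pos_local_attn",
    att_context_size=[128, 128],
)
```

Leaving either field as `None` keeps the encoder's current setting, which is why both dataclass fields default to `None` and the config can act as a sparse "change only what you specify" override.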