From 4d287d53311d4c5598d68b7249c79f14666dc061 Mon Sep 17 00:00:00 2001 From: Sasha Meister Date: Mon, 5 May 2025 05:49:07 -0700 Subject: [PATCH 01/90] yodas2 config is added Signed-off-by: Sasha Meister --- .../multilingual/granary/yodas2.yaml | 385 ++++++++++++++++++ 1 file changed, 385 insertions(+) create mode 100644 dataset_configs/multilingual/granary/yodas2.yaml diff --git a/dataset_configs/multilingual/granary/yodas2.yaml b/dataset_configs/multilingual/granary/yodas2.yaml new file mode 100644 index 00000000..f81aab68 --- /dev/null +++ b/dataset_configs/multilingual/granary/yodas2.yaml @@ -0,0 +1,385 @@ +documentation: | + YODAS2 + ############ + Documentation is in progress. + +params: + source_lang: en + source_lang_full: English + min_audio_lid_probability: 0.7 + min_audio_duration: 0.1 + max_audio_duration: 40.0 #Add specific processor! + translation: + target_lang: it + target_lang_full: Italian + max_len_diff_ratio: 4 + min_hist_token_ratio: 0.8 + min_text_lid_probability: 0.3 + min_qe_score: 0.75 + save_disk_space: True + +processors_to_run: "9" +workspace_dir: ??? +install_requirements: True + +processors: + - _target_: sdp.processors.datasets.yodas2.ListYodas2Data + output_manifest_file: ${workspace_dir}/manifest_00.json + use_metadata: True + + - _target_: sdp.processors.LambdaExpression + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_01.json + new_field: src_lang + expression: entry.lang_subset[:2] + + - _target_: sdp.processors.PreserveByValue + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_02.json + input_value_key: src_lang + target_value: ${params.source_lang} + + - _target_: sdp.processors.datasets.yodas2.DownloadYodas2Data + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_03.json + local_dir: ${workspace_dir}/${params.source_lang}/ + max_workers: 8 + + - _target_: sdp.processors.ExtractTar + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_04.json + field_to_tar_filepath: 'local_audio' + extraction_dir: ${workspace_dir}/${params.source_lang} + remove_source_tar: ${params.save_disk_space} + filepath_prefix_field: 'lang_subset' + output_filepath_field: 'extracted_audios' + get_extracted_filepaths: True + + - _target_: sdp.processors.datasets.yodas2.CreateInitialManifest + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_05.json + field_with_list: 'extracted_audios' + output_field: 'source_audio_filepath' + fields_to_save: + - lang_subset + - shard_id + - src_lang + + - _target_: sdp.processors.LambdaExpression + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_06.json + new_field: 'audio_filepath' + expression: "'${workspace_dir}/${params.source_lang}/converted/' + entry.lang_subset + '/' + entry.shard_id + '/' + entry.yodas_id" + + - _target_: sdp.processors.FfmpegConvert + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_07.json + input_file_key: 'source_audio_filepath' + output_file_key: 'audio_filepath' + id_key: 'audio_filepath' + converted_audio_dir: '/' + target_samplerate: 16000 + target_nchannels: 1 + + - _target_: sdp.processors.RemoveFiles + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_08.json + filepath_field: 'source_audio_filepath' + should_run: ${params.save_disk_space} + + # Lang ID + - _target_: sdp.processors.FasterWhisperInference + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_09.json + model_size_or_path: 'base' + num_devices: -1 + output_dir: 
${workspace_dir}/${params.source_lang}/manifest_09 + language_detection_only: True + inference: + language_detection_segments: 7 + chunk_length: 30 + save_timestamps_separately: False + skip_corrupted_audios: True + + - _target_: sdp.processors.LambdaExpression + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_10.json + new_field: 'lid_verified' + expression: (entry.language == "${params.source_lang}") & (entry.language_probability >= ${params.min_audio_lid_probability}) + filter: True + + - _target_: sdp.processors.DropSpecificFields + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_11.json + fields_to_drop: + - language + - language_probability + - lid_verified + + # Inference on long audio + - _target_: sdp.processors.FasterWhisperInference + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_12.json + model_size_or_path: 'base' + output_dir: ${workspace_dir}/${params.source_lang}/manifest_12 + inference: + batch_size: 16 + language: ${params.source_lang} + save_timestamps_separately: False + skip_corrupted: True + + - _target_: sdp.processors.ListToEntries + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_13.json + field_with_list: 'segments' + + - _target_: sdp.processors.LambdaExpression + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_14.json + new_field: 'duration' + expression: entry.end - entry.start + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_15.json + rename_fields: + start: offset + id: segment_id + + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_16.json + fields_to_keep: + - lang_subset + - shard_id + - yodas_id + - src_lang + - audio_filepath + - segment_id + - offset + - duration + + - _target_: sdp.processors.FasterWhisperInference + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_17.json + model_size_or_path: 'base' + output_dir: ${workspace_dir}/${params.source_lang}/manifest_17 + inference: + batch_size: 16 + language: ${params.source_lang} + save_timestamps_separately: False + skip_corrupted: True + slice_by_offset: True + + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_18.json + fields_to_keep: + - lang_subset + - shard_id + - yodas_id + - src_lang + - audio_filepath + - segment_id + - offset + - duration + - pred_text + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_19.json + rename_fields: + pred_text: text + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_20.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.WhisperHallucinationFeatures + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_21.json + text_field: text + + - _target_: sdp.processors.LambdaExpression + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_22.json + new_field: is_hallucinated + expression: (not entry.hall_repeated_ngrams) & (not entry.hall_long_word) & (not entry.hall_frequent_single_word) + filter: True + + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_23.json + fields_to_keep: + - lang_subset + - shard_id + - yodas_id + - src_lang + - audio_filepath + - 
segment_id + - offset + - duration + - text + + - _target_: sdp.processors.vLLMInference + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_24.json + generation_field: src_text + prompt_file: ./dataset_configs/multilingual/yodas2/partials/pr_recovery_prompts/${params.source_lang}.yaml + model: + model: "Qwen/Qwen2.5-7B-Instruct-1M" + tensor_parallel_size: 2 + max_model_len: 2048 + enable_chunked_prefill: True + max_num_batched_tokens: 1024 + enforce_eager: True + dtype: float16 + gpu_memory_utilization: 0.95 + max_num_seqs: 16 + inference: + temperature: 0.7 + top_p: 0.8 + repetition_penalty: 1.05 + max_tokens: 2048 + apply_chat_template: + tokenize: False + add_generation_prompt: True + + - _target_: sdp.processors.QwenGenerationFiltering + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_25.json + text_field: text + generation_field: src_text + + - _target_: sdp.processors.SubRegex + text_key: src_text + regex_params_yaml: ./dataset_configs/multilingual/yodas2/partials/subregex_params.yaml + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_26.json + + - _target_: sdp.processors.DropSpecificFields + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_27.json + fields_to_drop: + - text + + # AST + - _target_: sdp.processors.AddConstantFields + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_28.json + fields: + tgt_lang: ${params.translation.target_lang} + + - _target_: sdp.processors.vLLMInference + generation_field: tgt_text + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_29.json + prompt: + system: "" + user: | + Translate the following ${params.source_lang_full} source text to ${params.translation.target_lang_full}: + ${params.source_lang_full}: {src_text} + ${params.translation.target_lang_full}: + model: + model: "utter-project/EuroLLM-9B-Instruct" + dtype: float16 + inference: + best_of: 1 + temperature: 0.0 + top_p: 1.0 + max_tokens: 1280 + apply_chat_template: + max_length: 512 + tokenize: False + add_generation_prompt: True + + ## num_words and len_diff_ratio filtering + - _target_: sdp.processors.CountNumWords + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_30.json + text_key: src_text + num_words_key: num_words_src + + - _target_: sdp.processors.CountNumWords + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_31.json + text_key: tgt_text + num_words_key: num_words_tgt + + - _target_: sdp.processors.LambdaExpression + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_32.json + new_field: num_words_filter + expression: (entry.num_words_src > 1) & (entry.num_words_tgt > 1) + filter: True + + - _target_: sdp.processors.LambdaExpression + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_33.json + new_field: len_diff_ratio + expression: max(entry.num_words_src / entry.num_words_tgt, entry.num_words_tgt / entry.num_words_src) + + - _target_: sdp.processors.PreserveByValue + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_34.json + input_value_key: len_diff_ratio + operator: lt + target_value: ${params.translation.max_len_diff_ratio} + + - _target_: sdp.processors.DropSpecificFields + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_35.json + fields_to_drop: + - num_words_src + - num_words_tgt + - num_words_filter + - len_diff_ratio + + ## filtering based on character histograms + - _target_: sdp.processors.CharacterHistograms + 
output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_36.json + text_field: src_text + lang: ${params.source_lang} + output_score_field: src_hist_token_ratio + cache_dir: /data3/sdp_test/cache/histograms + + - _target_: sdp.processors.CharacterHistograms + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_37.json + text_field: tgt_text + lang: ${params.translation.target_lang} + output_score_field: tgt_hist_token_ratio + cache_dir: /data3/sdp_test/cache/histograms + + - _target_: sdp.processors.LambdaExpression + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_38.json + new_field: len_diff_ratio_filter + expression: (entry.src_hist_token_ratio > ${params.translation.min_hist_token_ratio}) & (entry.tgt_hist_token_ratio > ${params.translation.min_hist_token_ratio}) + filter: True + + - _target_: sdp.processors.DropSpecificFields + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_39.json + fields_to_drop: + - src_hist_token_ratio + - tgt_hist_token_ratio + - len_diff_ratio_filter + + ## filtering based on Fasttext LID + - _target_: sdp.processors.FastTextClassifier + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_40.json + text_field: src_text + output_field: src_lid + model_name_or_path: lid.176.bin + cache_dir: /data3/sdp_test/cache + + - _target_: sdp.processors.FastTextClassifier + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_41.json + text_field: tgt_text + output_field: tgt_lid + model_name_or_path: lid.176.bin + cache_dir: /data3/sdp_test/cache + + - _target_: sdp.processors.LambdaExpression + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_42.json + new_field: lid_filter + expression: (entry.src_lid == '${params.source_lang}') & (entry.src_lid_prob > ${params.translation.min_text_lid_probability}) & (entry.tgt_lid == '${params.translation.target_lang}') & (entry.tgt_lid_prob > ${params.translation.min_text_lid_probability}) + filter: True + + - _target_: sdp.processors.DropSpecificFields + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_43.json + fields_to_drop: + - src_lid + - src_lid_prob + - tgt_lid + - tgt_lid_prob + - lid_filter + + ## filtering based on Cometoid QE + - _target_: sdp.processors.CometoidWMTQualityEstimation + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_44.json + source_text_field: src_text + target_text_field: tgt_text + model_name_or_path: cometoid-wmt23 + device_type: gpu + num_devices: 4 + chunksize: 10 + + - _target_: sdp.processors.PreserveByValue + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_45.json + input_value_key: cometoid_score + operator: gt + target_value: ${params.translation.min_qe_score} + + - _target_: sdp.processors.DropSpecificFields + output_manifest_file: ${workspace_dir}/${params.source_lang}/manifest_46.json + fields_to_drop: + - cometoid_score + + # - _target_: sdp.processors.ConvertToTarredAudioDataset \ No newline at end of file From dc6143250147b1e9ae6c7e6acbc5108545404a29 Mon Sep 17 00:00:00 2001 From: Sasha Meister Date: Mon, 5 May 2025 07:37:08 -0700 Subject: [PATCH 02/90] ListToEntries is added Signed-off-by: Sasha Meister --- build/lib/sdp/__init__.py | 13 + build/lib/sdp/logging.py | 18 + build/lib/sdp/processors/__init__.py | 134 ++ build/lib/sdp/processors/base_processor.py | 502 +++++++ build/lib/sdp/processors/datasets/__init__.py | 0 .../datasets/commoncrawl/__init__.py | 15 + 
.../datasets/commoncrawl/commoncrawl.py | 99 ++ .../datasets/commoncrawl/harv_utils.py | 45 + .../sdp/processors/datasets/coraa/__init__.py | 0 .../datasets/coraa/create_initial_manifest.py | 125 ++ .../processors/datasets/coraal/__init__.py | 16 + .../coraal/create_initial_manifest.py | 218 +++ .../processors/datasets/coraal/data_splits.py | 130 ++ .../processors/datasets/fleurs/__init__.py | 0 .../fleurs/create_initial_manifest.py | 150 ++ .../sdp/processors/datasets/ksc2/__init__.py | 0 .../datasets/ksc2/create_initial_manifest.py | 150 ++ build/lib/sdp/processors/datasets/lhotse.py | 83 ++ .../datasets/librispeech/__init__.py | 0 .../librispeech/create_initial_manifest.py | 140 ++ .../sdp/processors/datasets/masc/__init__.py | 18 + .../datasets/masc/aggregate_segments.py | 131 ++ .../masc/apply_reg_exp_on_vtt_entries.py | 74 + .../datasets/masc/create_initial_manifest.py | 174 +++ .../masc/get_caption_file_segments.py | 62 + .../lib/sdp/processors/datasets/masc/utils.py | 77 ++ .../sdp/processors/datasets/mcv/__init__.py | 0 .../datasets/mcv/create_initial_manifest.py | 142 ++ .../datasets/mediaspeech/__init__.py | 13 + .../mediaspeech/create_initial_manifest.py | 145 ++ .../sdp/processors/datasets/mls/__init__.py | 0 .../datasets/mls/create_initial_manifest.py | 180 +++ .../sdp/processors/datasets/mls/restore_pc.py | 606 ++++++++ .../sdp/processors/datasets/mtedx/__init__.py | 0 .../datasets/mtedx/create_initial_manifest.py | 84 ++ .../processors/datasets/slr102/__init__.py | 0 .../slr102/create_initial_manifest.py | 122 ++ .../processors/datasets/slr140/__init__.py | 0 .../slr140/create_initial_manifest.py | 213 +++ .../sdp/processors/datasets/slr83/__init__.py | 0 .../datasets/slr83/create_initial_manifest.py | 261 ++++ .../datasets/uzbekvoice/__init__.py | 13 + .../uzbekvoice/create_initial_manifest.py | 120 ++ .../processors/datasets/voxpopuli/__init__.py | 0 .../voxpopuli/create_initial_manifest.py | 155 +++ .../voxpopuli/normalize_from_non_pc_text.py | 170 +++ .../sdp/processors/huggingface/__init__.py | 0 .../huggingface/create_initial_manifest.py | 92 ++ .../huggingface/speech_recognition.py | 145 ++ build/lib/sdp/processors/langs/__init__.py | 13 + build/lib/sdp/processors/langs/arabic.py | 183 +++ build/lib/sdp/processors/langs/armenian.py | 95 ++ build/lib/sdp/processors/langs/kazakh.py | 67 + .../processors/modify_manifest/__init__.py | 13 + .../sdp/processors/modify_manifest/common.py | 403 ++++++ .../modify_manifest/create_manifest.py | 93 ++ .../modify_manifest/data_to_data.py | 1227 +++++++++++++++++ .../modify_manifest/data_to_dropbool.py | 907 ++++++++++++ .../make_letters_uppercase_after_period.py | 80 ++ build/lib/sdp/processors/nemo/__init__.py | 0 .../lib/sdp/processors/nemo/asr_inference.py | 78 ++ build/lib/sdp/processors/nemo/pc_inference.py | 111 ++ .../sdp/processors/nemo/transcribe_speech.py | 417 ++++++ build/lib/sdp/processors/toloka/__init__.py | 13 + build/lib/sdp/processors/toloka/accept_if.py | 155 +++ .../lib/sdp/processors/toloka/create_pool.py | 150 ++ .../sdp/processors/toloka/create_project.py | 128 ++ .../processors/toloka/create_sentence_set.py | 56 + .../sdp/processors/toloka/create_task_set.py | 160 +++ .../processors/toloka/download_responses.py | 244 ++++ build/lib/sdp/processors/toloka/reject_if.py | 160 +++ build/lib/sdp/run_processors.py | 253 ++++ build/lib/sdp/utils/__init__.py | 16 + build/lib/sdp/utils/bootstrap_estimates.py | 273 ++++ build/lib/sdp/utils/common.py | 111 ++ build/lib/sdp/utils/edit_spaces.py | 41 + 
build/lib/sdp/utils/get_diff.py | 81 ++ build/lib/sdp/utils/import_manager.py | 138 ++ build/lib/sdp/utils/metrics_computation.py | 63 + docs/src/sdp/api.rst | 3 + sdp/processors/__init__.py | 1 + .../yodas2/create_initial_manifest.py | 47 + sdp/processors/datasets/yodas2/download.py | 161 +++ .../modify_manifest/data_to_data.py | 154 +++ tests/test_data_to_data.py | 46 +- 85 files changed, 10966 insertions(+), 10 deletions(-) create mode 100644 build/lib/sdp/__init__.py create mode 100644 build/lib/sdp/logging.py create mode 100644 build/lib/sdp/processors/__init__.py create mode 100644 build/lib/sdp/processors/base_processor.py create mode 100644 build/lib/sdp/processors/datasets/__init__.py create mode 100644 build/lib/sdp/processors/datasets/commoncrawl/__init__.py create mode 100644 build/lib/sdp/processors/datasets/commoncrawl/commoncrawl.py create mode 100644 build/lib/sdp/processors/datasets/commoncrawl/harv_utils.py create mode 100644 build/lib/sdp/processors/datasets/coraa/__init__.py create mode 100644 build/lib/sdp/processors/datasets/coraa/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/coraal/__init__.py create mode 100644 build/lib/sdp/processors/datasets/coraal/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/coraal/data_splits.py create mode 100644 build/lib/sdp/processors/datasets/fleurs/__init__.py create mode 100644 build/lib/sdp/processors/datasets/fleurs/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/ksc2/__init__.py create mode 100644 build/lib/sdp/processors/datasets/ksc2/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/lhotse.py create mode 100644 build/lib/sdp/processors/datasets/librispeech/__init__.py create mode 100644 build/lib/sdp/processors/datasets/librispeech/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/masc/__init__.py create mode 100644 build/lib/sdp/processors/datasets/masc/aggregate_segments.py create mode 100644 build/lib/sdp/processors/datasets/masc/apply_reg_exp_on_vtt_entries.py create mode 100644 build/lib/sdp/processors/datasets/masc/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/masc/get_caption_file_segments.py create mode 100644 build/lib/sdp/processors/datasets/masc/utils.py create mode 100644 build/lib/sdp/processors/datasets/mcv/__init__.py create mode 100644 build/lib/sdp/processors/datasets/mcv/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/mediaspeech/__init__.py create mode 100644 build/lib/sdp/processors/datasets/mediaspeech/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/mls/__init__.py create mode 100644 build/lib/sdp/processors/datasets/mls/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/mls/restore_pc.py create mode 100644 build/lib/sdp/processors/datasets/mtedx/__init__.py create mode 100644 build/lib/sdp/processors/datasets/mtedx/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/slr102/__init__.py create mode 100644 build/lib/sdp/processors/datasets/slr102/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/slr140/__init__.py create mode 100644 build/lib/sdp/processors/datasets/slr140/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/slr83/__init__.py create mode 100644 build/lib/sdp/processors/datasets/slr83/create_initial_manifest.py create mode 100644 
build/lib/sdp/processors/datasets/uzbekvoice/__init__.py create mode 100644 build/lib/sdp/processors/datasets/uzbekvoice/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/voxpopuli/__init__.py create mode 100644 build/lib/sdp/processors/datasets/voxpopuli/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/datasets/voxpopuli/normalize_from_non_pc_text.py create mode 100644 build/lib/sdp/processors/huggingface/__init__.py create mode 100644 build/lib/sdp/processors/huggingface/create_initial_manifest.py create mode 100644 build/lib/sdp/processors/huggingface/speech_recognition.py create mode 100644 build/lib/sdp/processors/langs/__init__.py create mode 100644 build/lib/sdp/processors/langs/arabic.py create mode 100644 build/lib/sdp/processors/langs/armenian.py create mode 100644 build/lib/sdp/processors/langs/kazakh.py create mode 100644 build/lib/sdp/processors/modify_manifest/__init__.py create mode 100644 build/lib/sdp/processors/modify_manifest/common.py create mode 100644 build/lib/sdp/processors/modify_manifest/create_manifest.py create mode 100644 build/lib/sdp/processors/modify_manifest/data_to_data.py create mode 100644 build/lib/sdp/processors/modify_manifest/data_to_dropbool.py create mode 100644 build/lib/sdp/processors/modify_manifest/make_letters_uppercase_after_period.py create mode 100644 build/lib/sdp/processors/nemo/__init__.py create mode 100644 build/lib/sdp/processors/nemo/asr_inference.py create mode 100644 build/lib/sdp/processors/nemo/pc_inference.py create mode 100644 build/lib/sdp/processors/nemo/transcribe_speech.py create mode 100644 build/lib/sdp/processors/toloka/__init__.py create mode 100644 build/lib/sdp/processors/toloka/accept_if.py create mode 100644 build/lib/sdp/processors/toloka/create_pool.py create mode 100644 build/lib/sdp/processors/toloka/create_project.py create mode 100644 build/lib/sdp/processors/toloka/create_sentence_set.py create mode 100644 build/lib/sdp/processors/toloka/create_task_set.py create mode 100644 build/lib/sdp/processors/toloka/download_responses.py create mode 100644 build/lib/sdp/processors/toloka/reject_if.py create mode 100644 build/lib/sdp/run_processors.py create mode 100644 build/lib/sdp/utils/__init__.py create mode 100644 build/lib/sdp/utils/bootstrap_estimates.py create mode 100644 build/lib/sdp/utils/common.py create mode 100644 build/lib/sdp/utils/edit_spaces.py create mode 100644 build/lib/sdp/utils/get_diff.py create mode 100644 build/lib/sdp/utils/import_manager.py create mode 100644 build/lib/sdp/utils/metrics_computation.py create mode 100644 sdp/processors/datasets/yodas2/create_initial_manifest.py create mode 100644 sdp/processors/datasets/yodas2/download.py diff --git a/build/lib/sdp/__init__.py b/build/lib/sdp/__init__.py new file mode 100644 index 00000000..2db92b25 --- /dev/null +++ b/build/lib/sdp/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
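The ``ListToEntries`` processor named in this patch's subject is what the yodas2 config from the previous patch uses to expand the ``segments`` list returned by ``FasterWhisperInference`` into one manifest entry per segment. Judging by the diffstat it lands in ``sdp/processors/modify_manifest/data_to_data.py``; below is a minimal sketch of the intended semantics (illustrative only, not the shipped implementation):

    # Illustrative sketch: expand one manifest entry into many, one per item
    # of a list-valued field (e.g. field_with_list='segments').
    def list_to_entries(entry: dict, field_with_list: str) -> list[dict]:
        items = entry.pop(field_with_list)
        out = []
        for item in items:
            new_entry = dict(entry)      # keep the remaining top-level fields
            if isinstance(item, dict):
                new_entry.update(item)   # merge the segment's own keys (start, end, text, ...)
            else:
                new_entry[field_with_list] = item
            out.append(new_entry)
        return out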
diff --git a/build/lib/sdp/logging.py b/build/lib/sdp/logging.py new file mode 100644 index 00000000..10d6b14f --- /dev/null +++ b/build/lib/sdp/logging.py @@ -0,0 +1,18 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging + +# overriding with the library specific logger, so that it's possible to +# customize in any downstream applications +logger = logging.getLogger("sdp") diff --git a/build/lib/sdp/processors/__init__.py b/build/lib/sdp/processors/__init__.py new file mode 100644 index 00000000..6788c88f --- /dev/null +++ b/build/lib/sdp/processors/__init__.py @@ -0,0 +1,134 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# let's import all supported processors here to simplify target specification + +from sdp.processors.datasets.coraa.create_initial_manifest import ( + CreateInitialManifestCORAA, +) +from sdp.processors.datasets.coraal import ( + CreateInitialManifestCORAAL, + TrainDevTestSplitCORAAL, +) +from sdp.processors.datasets.fleurs.create_initial_manifest import ( + CreateInitialManifestFleurs, +) +from sdp.processors.datasets.uzbekvoice.create_initial_manifest import ( + CreateInitialManifestUzbekvoice, +) +from sdp.processors.datasets.ksc2.create_initial_manifest import ( + CreateInitialManifestKSC2, +) +from sdp.processors.datasets.lhotse import LhotseImport +from sdp.processors.datasets.librispeech.create_initial_manifest import ( + CreateInitialManifestLibrispeech, +) +from sdp.processors.datasets.masc import ( + CreateInitialManifestMASC, + AggregateSegments, + RegExpVttEntries, + GetCaptionFileSegments +) +from sdp.processors.datasets.mediaspeech.create_initial_manifest import CreateInitialManifestMediaSpeech +from sdp.processors.datasets.mcv.create_initial_manifest import CreateInitialManifestMCV +from sdp.processors.datasets.mls.create_initial_manifest import CreateInitialManifestMLS +from sdp.processors.datasets.mls.restore_pc import RestorePCForMLS +from sdp.processors.datasets.mtedx.create_initial_manifest import ( + CreateInitialManifestMTEDX, +) +from sdp.processors.datasets.slr83.create_initial_manifest import ( + CreateInitialManifestSLR83, + CustomDataSplitSLR83, +) +from sdp.processors.datasets.slr102.create_initial_manifest import ( + CreateInitialManifestSLR102, +) +from sdp.processors.datasets.slr140.create_initial_manifest import ( + CreateInitialManifestSLR140, + CustomDataSplitSLR140, +) +from sdp.processors.datasets.voxpopuli.create_initial_manifest import ( + CreateInitialManifestVoxpopuli, +) +from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import ( + NormalizeFromNonPCTextVoxpopuli, +) +from sdp.processors.huggingface.speech_recognition import ASRTransformers +from sdp.processors.huggingface.create_initial_manifest import CreateInitialManifestHuggingFace + +from sdp.processors.modify_manifest.common import ( + AddConstantFields, + ApplyInnerJoin, + ChangeToRelativePath, + CombineSources, + DuplicateFields, + KeepOnlySpecifiedFields, + RenameFields, + SortManifest, + SplitOnFixedDuration, +) +from sdp.processors.modify_manifest.create_manifest import ( + CreateCombinedManifests, + CreateInitialManifestByExt, +) +from sdp.processors.modify_manifest.data_to_data import ( + ASRFileCheck, + CopyManifestData, + CountNumWords, + ExtractFromBrackets, + FfmpegConvert, + GetAudioDuration, + GetWER, + InsIfASRInsertion, + InverseNormalizeText, + NormalizeText, + MakeSentence, + ReadDocxLines, + ReadTxtLines, + SoxConvert, + SplitLineBySentence, + SubIfASRSubstitution, + SubMakeLowercase, + SubRegex, +) +from sdp.processors.modify_manifest.data_to_dropbool import ( + DropASRError, + DropASRErrorBeginningEnd, + DropDuplicates, + DropHighCER, + DropHighLowCharrate, + DropHighLowDuration, + DropHighLowWordrate, + DropHighWER, + DropIfNoneOfRegexMatch, + DropIfRegexMatch, + DropIfSubstringInInsertion, + DropLowWordMatchRate, + DropNonAlphabet, + DropOnAttribute, + PreserveByValue, + DropRepeatedFields, +) +from sdp.processors.modify_manifest.make_letters_uppercase_after_period import ( + MakeLettersUppercaseAfterPeriod, +) +from sdp.processors.nemo.asr_inference import ASRInference +from sdp.processors.nemo.pc_inference import PCInference +from 
sdp.processors.toloka.accept_if import AcceptIfWERLess
+from sdp.processors.toloka.create_pool import CreateTolokaPool
+from sdp.processors.toloka.create_project import CreateTolokaProject
+from sdp.processors.toloka.create_sentence_set import CreateSentenceSet
+from sdp.processors.toloka.create_task_set import CreateTolokaTaskSet
+from sdp.processors.toloka.download_responses import GetTolokaResults
+from sdp.processors.toloka.reject_if import RejectIfBanned
diff --git a/build/lib/sdp/processors/base_processor.py b/build/lib/sdp/processors/base_processor.py
new file mode 100644
index 00000000..6fc22ee8
--- /dev/null
+++ b/build/lib/sdp/processors/base_processor.py
@@ -0,0 +1,502 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+import json
+import multiprocessing
+import os
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from itertools import chain
+from typing import Any, Dict, List, Optional, Union
+
+from tqdm import tqdm
+from tqdm.contrib.concurrent import process_map
+
+from sdp.logging import logger
+
+
+@dataclass
+class DataEntry:
+    """A wrapper for data entry + any additional metrics."""
+
+    data: Optional[Dict] # can be None to drop the entry
+    metrics: Any = None
+
+
+class BaseProcessor(ABC):
+    """Abstract class for SDP processors.
+
+    All processor classes inherit from the ``BaseProcessor`` class.
+    This is a simple abstract class which has 2 empty methods: :meth:`process`
+    and :meth:`test`.
+
+    These serve to remind us that SDP essentially just runs ``.test()`` on all
+    processors (to implement :ref:`run-time tests `),
+    and then ``.process()`` on all processors.
+
+    Args:
+        output_manifest_file (str): path of where the output manifest file will
+            be located. Cannot have the same value as ``input_manifest_file``.
+        input_manifest_file (str): path of where the input manifest file is
+            located. This arg is optional - some processors may not take in
+            an input manifest because they need to create an initial manifest
+            from scratch (ie from some transcript file that is in a format
+            different to the NeMo manifest format). Cannot have the same value
+            as ``output_manifest_file``.
+ """ + + def __init__(self, output_manifest_file: str, input_manifest_file: Optional[str] = None, **kwargs): + + if output_manifest_file and input_manifest_file and (output_manifest_file == input_manifest_file): + # we cannot have the same input and output manifest file specified because we need to be able to + # read from the input_manifest_file and write to the output_manifest_file at the same time + raise ValueError("A processor's specified input_manifest_file and output_manifest_file cannot be the same") + + self.output_manifest_file = output_manifest_file + self.input_manifest_file = input_manifest_file + + @abstractmethod + def process(self): + """Should be overriden by the child classes to implement some data processing.""" + pass + + def test(self): + """This method can be used to perform "runtime" tests. + + This can be any kind of self-consistency tests, but are usually + in the form of checking that provided input test data entries match + provided output test data entries. + + There are not tests by default. + """ + +class BaseParallelProcessor(BaseProcessor): + """ + A processor that performs per-entry processing in parallel (using Dask or multiprocessing). + + Args: + input_manifest_file (str): Path to the input manifest file. + output_manifest_file (str): Path where the output manifest file will be written. + max_workers (int): Maximum number of workers. + chunksize (int): Chunk size used for parallel routines. + in_memory_chunksize (int): Maximum number of entries to load at once. + test_cases (list[dict]): Optional list of test cases. + use_dask (bool): If True, use Dask for parallelization; otherwise, use multiprocessing. + dask_client: (Optional) An existing Dask client. + """ + + def __getstate__(self): + state = self.__dict__.copy() + # Remove the Dask client from state (it is not picklable) + if 'dask_client' in state: + state['dask_client'] = None + return state + + def __init__( + self, + input_manifest_file: Optional[str] = None, + output_manifest_file: Optional[str] = None, + max_workers: int = -1, + chunksize: int = 100, + in_memory_chunksize: int = 100000, + test_cases: Optional[List[Dict]] = None, + use_dask: bool = True, + dask_client=None, + **kwargs, + ): + kwargs.pop("use_dask", None) # + super().__init__(input_manifest_file=input_manifest_file, output_manifest_file=output_manifest_file, **kwargs) + if max_workers == -1: + max_workers = os.cpu_count() + self.max_workers = max_workers + self.chunksize = chunksize + self.in_memory_chunksize = in_memory_chunksize + self.number_of_entries = 0 + self.total_duration = 0 + self.start_time = time.time() + self.test_cases = test_cases or [] + self.use_dask = use_dask + self.dask_client = dask_client + + def prepare(self): + """Can be used in derived classes to prepare the processing. 
+ + """ + pass + + def process(self): + """A fork in the road to pick dask or classic processing + + """ + os.environ.setdefault("PATH", os.defpath) + + self.prepare() + + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + metrics = [] + + #Ability to work sa legacy and as dask + if self.use_dask: + self._process_with_dask(metrics) + else: + self._process_with_multiprocessing(metrics) + self.finalize(metrics) + + def _process_with_dask(self, metrics): + import dask.bag as db + from dask.distributed import Client + + if self.dask_client is None: + self.dask_client = Client() + client = self.dask_client + from sdp.logging import logger + logger.info(f"Using Dask client with dashboard at: {client.dashboard_link}") + + # Delegate manifest reading to read_manifest() which returns a Dask bag. + bag = self.read_manifest() + + if not isinstance(bag, db.Bag): + bag = db.from_sequence(bag) + total_entries = bag.count().compute() + + if total_entries == 0: + logger.info("No entries found in the manifest input. Proceeding to create an empty output manifest.") + results = [] + else: + processed_bag = bag.map(lambda entry: self.process_dataset_entry(entry)).flatten() + results = processed_bag.compute() + + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + for entry in results: + metrics.append(entry.metrics) + if entry.data is not None: + json.dump(entry.data, fout, ensure_ascii=False) + fout.write("\n") + self.number_of_entries += 1 + self.total_duration += entry.data.get("duration", 0) + logger.info(f"Processed {total_entries} entries using Dask.") + + def _process_with_multiprocessing(self, metrics): + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + for manifest_chunk in self._chunk_manifest(): + data = itertools.chain( + *process_map( + self.process_dataset_entry, + manifest_chunk, + max_workers=self.max_workers, + chunksize=self.chunksize, + ) + ) + for data_entry in tqdm(data): + metrics.append(data_entry.metrics) + if data_entry.data is None: + continue + json.dump(data_entry.data, fout, ensure_ascii=False) + fout.write("\n") + self.number_of_entries += 1 + self.total_duration += data_entry.data.get("duration", 0) + + def _chunk_manifest(self): + """Splits the input manifest into chunks of in_memory_chunksize size. + Only used in non-Dask (multiprocessing) mode. + """ + manifest_chunk = [] + # When use_dask is False, read_manifest() returns an iterator. + for idx, data_entry in enumerate(self.read_manifest(), 1): + manifest_chunk.append(data_entry) + if idx % self.in_memory_chunksize == 0: + yield manifest_chunk + manifest_chunk = [] + if manifest_chunk: + yield manifest_chunk + + def read_manifest(self): + """ + Reads entries from the input manifest. + + Behavior depends on the parallelization mode: + - When use_dask is True: + If the input_manifest_file exists and is non-empty, returns a Dask bag (reading in 256KB blocks). + Otherwise, logs the condition and returns an empty Dask bag. + - When use_dask is False: + If the input_manifest_file does not exist or is empty, logs the condition and returns an empty iterator. + Otherwise, opens the file in text mode, strips each line, and yields the parsed JSON from non-empty lines. + + This unified behavior lets the processor run even in manifest-creation mode. 
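+
+        In non-Dask mode the returned iterator can be consumed directly, for
+        example:
+
+            for entry in self.read_manifest():
+                ...  # each entry is one parsed JSON manifest line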
+ + """ + from sdp.logging import logger + if self.use_dask: + import dask.bag as db + if self.input_manifest_file and os.path.exists(self.input_manifest_file) and os.path.getsize(self.input_manifest_file) > 0: + bag = db.read_text(self.input_manifest_file, blocksize=2**18).map(json.loads) + return bag + else: + logger.info("No input manifest file provided or file is empty. Returning an empty Dask bag for manifest creation.") + return db.from_sequence([]) + else: + if not self.input_manifest_file or not os.path.exists(self.input_manifest_file): + logger.info("No input manifest file provided or file does not exist. Continuing with an empty manifest.") + return iter([]) + else: + #if use_dask = False, we get here + def generator(): #Reading manifest line by line, adding only non emply lines + with open(self.input_manifest_file, "rt", encoding="utf8") as fin: + for line in fin: + if line: + yield json.loads(line) + return generator() + + @abstractmethod + def process_dataset_entry(self, data_entry) -> List[Any]: + """ + Must be implemented in derived classes. + For each data entry, return a list of DataEntry objects. + """ + raise NotImplementedError("Derived classes must implement process_dataset_entry.") + + def finalize(self, metrics: List[Any]): + """Outputs metrics about the processed data.""" + from sdp.logging import logger + logger.info("Total number of entries after processing: %d", self.number_of_entries) + if self.total_duration: + logger.info("Total audio duration (hours) after processing: %.2f", self.total_duration / 3600) + else: + logger.info("Unable to calculate total audio duration (hours). Ensure that the manifest file includes a 'duration' key.") + elapsed = time.time() - self.start_time + logger.info("Processor completed in (seconds): %.2f", elapsed) + + def test(self): + """Applies processing to each test case and raises an error if the output does not match expected output.""" + for test_case in self.test_cases: + input_data = test_case["input"].copy() if isinstance(test_case["input"], dict) else test_case["input"] + generated_outputs = self.process_dataset_entry(input_data) + expected_outputs = [test_case["output"]] if not isinstance(test_case["output"], list) else test_case["output"] + for gen_out, exp_out in zip(generated_outputs, expected_outputs): + gen_data = gen_out.data if hasattr(gen_out, "data") else gen_out + if gen_data != exp_out: + raise RuntimeError( + "Runtime test failed.\nTest input: {}\nGenerated output: {}\nExpected output: {}" + .format(test_case["input"], gen_data, exp_out) + ) + + + +# ------------------ Legacy Parallel Processor ------------------ #Just for reference +class LegacyParallelProcessor(BaseProcessor): + """ + A legacy parallel processor implementation using multiprocessing and process_map. + + This class processes the manifest in chunks (using process_map) and is provided for compatibility. + Child classes must implement process_dataset_entry(). + + Args: + max_workers (int): maximum number of workers that will be spawned + during the parallel processing. + chunksize (int): the size of the chunks that will be sent to worker processes + during the parallel processing. + in_memory_chunksize (int): the maximum number of input data entries that will + be read, processed and saved at a time. + test_cases (list[dict]): an optional list of dicts containing test + cases for checking that the processor makes the changes that we + are expecting. 
+ + The dicts must have a key ``input``, the value of which is a dictionary + containing data which is our test's input manifest line, and a key + ``output``, the value of which is a dictionary containing data which is + the expected output manifest line. + """ + def __init__( + self, + max_workers: int = -1, + chunksize: int = 100, + in_memory_chunksize: int = 100000, + test_cases: Optional[List[Dict]] = None, + **kwargs, + ): + kwargs.pop("use_dask", None) # + super().__init__(**kwargs) + if max_workers == -1: + max_workers = multiprocessing.cpu_count() + self.max_workers = max_workers + self.chunksize = chunksize + self.in_memory_chunksize = in_memory_chunksize + self.number_of_entries = 0 + self.total_duration = 0 + self.start_time = time.time() + self.test_cases = test_cases or [] + + def process(self): + """Parallelized implementation of the data processing. + The execution flow of this method is the following. + 1. :meth:`prepare` is called. It's empty by default but can be used to + e.g. download the initial data files or compute some aggregates + required for subsequent processing. + 2. A for-loop begins that loops over all ``manifest_chunk`` lists yielded + by the :meth:`_chunk_manifest` method. :meth:`_chunk_manifest` reads data + entries yielded by :meth:`read_manifest` and yields lists containing + ``in_memory_chunksize`` data entries. + Inside the for-loop: + a) :meth:`process_dataset_entry` is called **in parallel** on each element + of the ``manifest_chunk`` list. + b) All metrics are aggregated. + c) All output data-entries are added to the contents of ``output_manifest_file``. + Note: + * The default implementation of :meth:`read_manifest` reads an input manifest file + and returns a list of dictionaries for each line (we assume a standard NeMo format + of one json per line). + * :meth:`process_dataset_entry` is called **in parallel** on each element + of the list created in the previous step. Note that you cannot create + any new counters or modify the attributes of this class in any way + inside that function as this will lead to an undefined behavior. + Each call to the :meth:`process_dataset_entry` returns a list of + ``DataEntry`` objects that are then aggregated together. ``DataEntry`` + simply defines a ``data`` and ``metrics`` keys. + * If ``data`` is set to None, the objects are ignored (metrics are still collected). + 3. All ``metrics`` keys that were collected in the for-loop above are passed over to + :meth:`finalize` for any desired metric aggregation and reporting. + Here is a diagram outlining the execution flow of this method: + .. can only be viewed in the online documentation + .. raw:: html +
+
+
+ """ + self.prepare() + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + metrics = [] + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + for manifest_chunk in self._chunk_manifest(): + # this will unroll all inner lists + data = itertools.chain( + *process_map( + self.process_dataset_entry, + manifest_chunk, + max_workers=self.max_workers, + chunksize=self.chunksize, + ) + ) + for data_entry in tqdm(data): + if data_entry.metrics is not None: + pass # optionally accumulate metrics here + if data_entry.data is None: + continue + json.dump(data_entry.data, fout, ensure_ascii=False) + self.number_of_entries += 1 + self.total_duration += data_entry.data.get("duration", 0) + fout.write("\n") + self.finalize(self.test_cases) + + def prepare(self): + """Can be used in derived classes to prepare the processing in any way. + E.g., download data or compute some aggregates. Will be called before + starting processing the data. + """ + + def _chunk_manifest(self): + """Splits the manifest into smaller chunks defined by ``in_memory_chunksize``.""" + manifest_chunk = [] + for idx, data_entry in enumerate(self.read_manifest(), 1): + manifest_chunk.append(data_entry) + if idx % self.in_memory_chunksize == 0: + yield manifest_chunk + manifest_chunk = [] + if manifest_chunk: + yield manifest_chunk + + def read_manifest(self): + """Reading the input manifest file. + .. note:: + This function should be overridden in the "initial" class creating + manifest to read from the original source of data. + """ + if not self.input_manifest_file: + raise NotImplementedError("Override this method if no input manifest file is used") + with open(self.input_manifest_file, "rt", encoding="utf8") as fin: + for line in fin: + yield json.loads(line) + + @abstractmethod + def process_dataset_entry(self, data_entry) -> List[DataEntry]: + """Needs to be implemented in the derived classes. + Each returned value should be a ``DataEntry`` object that will hold + a dictionary (or anything else that can be json-serialized) with + the actual data + any additional metrics required for statistics + reporting. Those metrics can be used in :meth:`finalize` to + prepare for final reporting. + ``DataEntry`` is a simple dataclass defined in the following way:: + @dataclass + class DataEntry: + # can be None to drop the entry + data: Optional[Dict] + # anything - you'd need to aggregate all + # values in the finalize method manually + metrics: Any = None + .. note:: + This method should always return a list of objects to allow a + one-to-many mapping. E.g., if you want to cut an utterance into + multiple smaller parts, you can return a list of all the produced + utterances and they will be handled correctly. + The many-to-one mapping is not currently supported by design of + this method (but can still be done if you don't inherit from + this class and process the data sequentially). + Args: + data_entry: most often, ``data_entry`` will be a dictionary + containing items which represent the JSON manifest entry. + Sometimes, such as in :class:`sdp.processors.CreateInitialManifestMLS`, + it will be a string containing a line for that utterance + from the original raw MLS transcript. In general it is an element + of the list returned from the :meth:`read_manifest` method. + """ + # TODO: it would be more straightforward to use a generator here, but + # seems that it's not supported with multiprocessing. Is there a + # way to make it work? 
+        raise NotImplementedError("Derived classes must implement `process_dataset_entry`.")
+
+    def finalize(self, metrics):
+        """Can be used to output statistics about the processed data.
+        By default outputs new number of entries/hours.
+
+        Args:
+            metrics (list): a list containing all ``metrics`` keys from the
+                data entries returned from the :meth:`process_dataset_entry`
+                method.
+        """
+        logger.info("Total number of entries after processing (legacy): %d", self.number_of_entries)
+        if self.total_duration:
+            logger.info("Total audio duration (hours) after processing (legacy): %.2f", self.total_duration / 3600)
+        else:
+            logger.info("Unable to calculate total audio duration (legacy). Please ensure that the manifest file includes a 'duration' key.")
+        elapsed = time.time() - self.start_time
+        logger.info("Legacy processor completed in (seconds): %.2f", elapsed)
+
+    def test(self):
+        """Applies processing to "test_cases" and raises an error in case of mismatch."""
+        for test_case in self.test_cases:
+            generated_outputs = self.process_dataset_entry(test_case["input"].copy())
+            expected_outputs = (
+                [test_case["output"]] if not isinstance(test_case["output"], list) else test_case["output"]
+            )
+
+            for generated_output, expected_output in zip(generated_outputs, expected_outputs):
+                generated_output = generated_output.data
+
+                if generated_output != expected_output:
+                    raise RuntimeError(
+                        "Runtime test failed.\n"
+                        f"Test input: {test_case['input']}\n"
+                        f"Generated output: {generated_output}\n"
+                        f"Expected output: {expected_output}"
+                    )
\ No newline at end of file
diff --git a/build/lib/sdp/processors/datasets/__init__.py b/build/lib/sdp/processors/datasets/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/build/lib/sdp/processors/datasets/commoncrawl/__init__.py b/build/lib/sdp/processors/datasets/commoncrawl/__init__.py
new file mode 100644
index 00000000..c3909eaa
--- /dev/null
+++ b/build/lib/sdp/processors/datasets/commoncrawl/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .commoncrawl import SplitByVttSentence
diff --git a/build/lib/sdp/processors/datasets/commoncrawl/commoncrawl.py b/build/lib/sdp/processors/datasets/commoncrawl/commoncrawl.py
new file mode 100644
index 00000000..8a5cc2c6
--- /dev/null
+++ b/build/lib/sdp/processors/datasets/commoncrawl/commoncrawl.py
@@ -0,0 +1,99 @@
+import os
+from typing import List
+
+import soundfile as sf
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+from sdp.processors.datasets.commoncrawl.harv_utils import split_by_vtt
+
+
+class SplitByVttSentence(BaseParallelProcessor):
+    """
+    A class for splitting audio files based on VTT (WebVTT) sentence-level segmentation in a dataset.
+
+    Args:
+        splited_audio_dir (str): The directory to store the split audio files.
+        source_audio_field (str): The field in the dataset containing the path to the source audio files.
+        target_audio_field (str): The field to store the paths of the split audio files.
+        duration_field (str): The field to store the duration of each split audio segment.
+        text_field (str): The field to store the transcriptions corresponding to each split audio segment.
+        vtt_field (str): The field in the dataset containing the path to the VTT (WebVTT) files for segmentation.
+        additional_fields (List[str], optional): List of additional fields to copy from the original data entry to the split entries.
+            Defaults to an empty list.
+        duration_threshold (float, optional): The duration threshold in seconds for each split audio segment. Defaults to 10.0.
+    """
+
+    def __init__(
+        self,
+        splited_audio_dir: str,
+        source_audio_field: str,
+        target_audio_field: str,
+        duration_field: str,
+        text_field: str,
+        vtt_field: str,
+        additional_fields: List[str] = [],
+        duration_threshold: float = 10.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.splited_audio_dir = splited_audio_dir
+        self.source_audio_field = source_audio_field
+        self.target_audio_field = target_audio_field
+        self.duration_field = duration_field
+        self.text_field = text_field
+        self.vtt_field = vtt_field
+        self.duration_threshold = duration_threshold
+        self.additional_fields = additional_fields
+
+    def prepare(self):
+        os.makedirs(self.splited_audio_dir, exist_ok=True)
+
+    def process_dataset_entry(self, data_entry):
+        vtt_file = data_entry[self.vtt_field]
+        source_audio = data_entry[self.source_audio_field]
+        res_list = []
+
+        if os.path.isfile(source_audio):
+            data, samplerate = sf.read(source_audio)
+            text_list, start_s, end_s = split_by_vtt(vtt_file, samplerate)
+            text_c = ''
+            start_c, end_c = 0, 0
+            if text_list:
+                for text, start_sr, end_sr in zip(text_list, start_s, end_s):
+                    text_c += " " + text
+                    if start_c == 0:
+                        start_c = start_sr
+                    else:
+                        pass
+                    end_c = end_sr
+                    if len(text_c) > 0 and (
+                            end_c - start_c > self.duration_threshold * samplerate or
+                            text_c[-1] == "."
or text_c[-1] == "?"): + res_list.append( + self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) + text_c = '' + start_c, end_c = 0, 0 + else: + pass + if len(text_c) > 0 and start_c != 0: + res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) + + return res_list + + def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, start_c, end_c): + data_sample = data[start_c:end_c] + wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), + str(int(start_c / (samplerate / 1000))) + "-" + str( + int(end_c / (samplerate / 1000))) + ".wav") + if not os.path.isfile(wav_save_file): + os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) + sf.write(wav_save_file, data_sample, samplerate) + + data = {self.target_audio_field: wav_save_file, + self.duration_field: data_sample.shape[0] / samplerate, + self.text_field: text_c.strip(), + } + for field in self.additional_fields: + data[field] = data_entry[field] + return DataEntry(data=data) + diff --git a/build/lib/sdp/processors/datasets/commoncrawl/harv_utils.py b/build/lib/sdp/processors/datasets/commoncrawl/harv_utils.py new file mode 100644 index 00000000..24efe80e --- /dev/null +++ b/build/lib/sdp/processors/datasets/commoncrawl/harv_utils.py @@ -0,0 +1,45 @@ +import webvtt # pip install webvtt-py +from datetime import datetime +from sdp.logging import logger + + +def parse_hours(inp): + inp_list = inp.split(":") + if len(inp_list) == 3 and int(inp_list[0]) >= 24: + hours = int(inp_list[0]) % 24 + days = int(inp_list[0]) // 24 + if days < 31: + inp = str(1 + days) + ":" + str(hours) + ":" + ":".join(inp_list[1:]) + return datetime.strptime(inp, '%d:%H:%M:%S.%f') + else: + months = days // 31 + days = days % 31 + inp = str(1 + months) + "/" + str(1 + days) + " " + str(hours) + ":" + ":".join(inp_list[1:]) + return datetime.strptime(inp, '%m/%d %H:%M:%S.%f') + else: + return datetime.strptime(inp, '%H:%M:%S.%f') + + +def split_by_vtt(vtt_file, samplerate): + try: + _begin = datetime.strptime('00:00:00.000', '%H:%M:%S.%f') + text_list, start_s, end_s = [], [], [] + for caption in webvtt.read(vtt_file): + text = ' '.join(caption.text.split('\n')) + + _start = parse_hours(caption.start) + start = (_start - _begin).total_seconds() + start_sr = int(start * samplerate) + + _end = parse_hours(caption.end) + end = (_end - _begin).total_seconds() + end_sr = int(end * samplerate) + + text_list.append(text.strip()) + start_s.append(start_sr) + end_s.append(end_sr) + return text_list, start_s, end_s + except Exception as e: + logger.warning(str(e) + vtt_file) + return None, None, None + diff --git a/build/lib/sdp/processors/datasets/coraa/__init__.py b/build/lib/sdp/processors/datasets/coraa/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/build/lib/sdp/processors/datasets/coraa/create_initial_manifest.py b/build/lib/sdp/processors/datasets/coraa/create_initial_manifest.py new file mode 100644 index 00000000..5be1a8ab --- /dev/null +++ b/build/lib/sdp/processors/datasets/coraa/create_initial_manifest.py @@ -0,0 +1,125 @@ +import glob +import os +from pathlib import Path +from typing import List +import pandas as pd + +import rarfile #Needs to be installed +import sox +from sox import Transformer + +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry +from sdp.utils.common import extract_archive + +class CreateInitialManifestCORAA(BaseParallelProcessor): + """ + Processor 
to create the initial manifest file for the CORAA ASR dataset
+
+    Dataset link: https://github.com/nilc-nlp/CORAA
+
+    Args:
+        raw_data_dir (str): the path to the directory in which all the data will be downloaded.
+        extract_archive_dir (str): directory where the extracted data will be saved.
+        data_split (str): "train", "dev" or "test".
+        resampled_audio_dir (str): the directory where the resampled wav files will be stored.
+        already_extracted (bool): if True, we will not try to extract the raw data.
+            Defaults to False.
+        already_downloaded (bool): if True, we will not try to download files.
+            Defaults to False.
+        target_samplerate (int): sample rate (Hz) to use for resampling.
+            Defaults to 16000.
+        target_nchannels (int): number of channels to create during resampling process.
+            Defaults to 1.
+        exclude_dataset (list): list of the dataset names that will be excluded when creating the initial manifest.
+            Options: 'SP2010', 'C-ORAL-BRASIL I', 'NURC-Recife', 'TEDx Talks', 'ALIP'.
+
+    """
+    def __init__(
+        self,
+        raw_data_dir: str,
+        extract_archive_dir: str,
+        data_split: str,
+        resampled_audio_dir: str,
+        already_extracted: bool = False,
+        already_downloaded: bool = False,
+        target_samplerate: int = 16000,
+        target_nchannels: int = 1,
+        exclude_dataset: list = [],
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.raw_data_dir = Path(raw_data_dir)
+        self.extract_archive_dir = extract_archive_dir
+        self.data_split = data_split
+        self.already_downloaded = already_downloaded
+        self.already_extracted = already_extracted
+        self.exclude_dataset = exclude_dataset
+        self.resampled_audio_dir = resampled_audio_dir
+        self.target_samplerate = target_samplerate
+        self.target_nchannels = target_nchannels
+
+    def prepare(self):
+        """Downloading and extracting data (unless already done)."""
+        os.makedirs(self.raw_data_dir, exist_ok=True)
+        os.makedirs(self.resampled_audio_dir, exist_ok=True)
+        os.makedirs(self.extract_archive_dir, exist_ok=True)
+        if not self.already_downloaded:
+            try:
+                from huggingface_hub import snapshot_download
+                snapshot_download(repo_id="gabrielrstan/CORAA-v1.1", repo_type='dataset', local_dir=self.raw_data_dir)
+            except ImportError:
+                raise ImportError(
+                    "huggingface_hub is required to download the dataset. "
+                    "Please install it with `pip install huggingface_hub`"
+                )
+        if not self.already_extracted:
+            if self.data_split == 'train':
+                # the train split is shipped as a set of multi-part rar archives
+                first_rar_file = glob.glob(str(self.raw_data_dir) + "/train_dividido" + f"/*{self.data_split}*1.rar")
+                if first_rar_file and not isinstance(first_rar_file, str):
+                    first_rar_file = first_rar_file[0]
+
+                if rarfile.is_rarfile(first_rar_file):
+                    rar = rarfile.RarFile(first_rar_file)
+                    rar.extractall(path=self.extract_archive_dir)
+            else:
+                zip_files = glob.glob(str(self.raw_data_dir) + f"/*{self.data_split}.zip")
+                if not zip_files:
+                    raise RuntimeError(
+                        f"Did not find any file matching {self.raw_data_dir}/*{self.data_split}.zip. "
+                        "Make sure your download passed successfully."
+ ) + elif len(zip_files) > 1: + raise RuntimeError( + f"Expecting exactly one {self.data_split}.zip file in directory {self.raw_data_dir}" + ) + + extract_archive(zip_files[0], self.extract_archive_dir) + self.transcription_file = self.raw_data_dir / f"metadata_{self.data_split}_final.csv" + self.audio_path_prefix = self.extract_archive_dir + + def read_manifest(self): + self.df = pd.read_csv(self.transcription_file) + data_entries = self.df[~self.df['dataset'].isin(self.exclude_dataset)][['file_path','text']] + res = [tuple(row[1]) for row in data_entries.iterrows()] + return res + + def process_dataset_entry(self, data_entry) -> List[DataEntry]: + + file_path, text = data_entry + file_name = os.path.splitext(os.path.basename(file_path))[0] + transcript_text = text.strip() + + audio_path = os.path.join(self.audio_path_prefix, file_path) + output_wav_path = os.path.join(self.resampled_audio_dir, file_name + ".wav") + + if not os.path.exists(output_wav_path): + tfm = Transformer() + tfm.rate(samplerate=self.target_samplerate) + tfm.channels(n_channels=self.target_nchannels) + tfm.build(input_filepath=audio_path, output_filepath=output_wav_path) + + data = { + "audio_filepath": output_wav_path, + "duration": float(sox.file_info.duration(output_wav_path)), + "text": transcript_text, + } + + return [DataEntry(data=data)] diff --git a/build/lib/sdp/processors/datasets/coraal/__init__.py b/build/lib/sdp/processors/datasets/coraal/__init__.py new file mode 100644 index 00000000..7d2fff52 --- /dev/null +++ b/build/lib/sdp/processors/datasets/coraal/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .create_initial_manifest import CreateInitialManifestCORAAL +from .data_splits import TrainDevTestSplitCORAAL diff --git a/build/lib/sdp/processors/datasets/coraal/create_initial_manifest.py b/build/lib/sdp/processors/datasets/coraal/create_initial_manifest.py new file mode 100644 index 00000000..16aa166a --- /dev/null +++ b/build/lib/sdp/processors/datasets/coraal/create_initial_manifest.py @@ -0,0 +1,218 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
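+#
+# Minimal usage sketch (illustrative only; the paths are placeholders and SDP
+# processors are normally driven from a YAML pipeline config rather than
+# instantiated by hand):
+#
+#   from sdp.processors.datasets.coraal import CreateInitialManifestCORAAL
+#
+#   CreateInitialManifestCORAAL(
+#       raw_data_dir="/data/coraal/raw",
+#       resampled_audio_dir="/data/coraal/wavs",
+#       output_manifest_file="/data/coraal/manifest_initial.json",
+#   ).process()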
+# + +import glob +import os +import urllib.request +from pathlib import Path + +import pandas as pd +from sox import Transformer + +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry +from sdp.utils.common import download_file, extract_archive + + +def get_coraal_url_list(): + """Returns url list for CORAAL dataset. + + There are a few mistakes in the official url list that are fixed here. + Can be overridden by tests to select a subset of urls. + """ + dataset_url = "http://lingtools.uoregon.edu/coraal/coraal_download_list.txt" + urls = [] + for file_url in urllib.request.urlopen(dataset_url): + file_url = file_url.decode('utf-8').strip() + # fixing known errors in the urls + if file_url == 'http://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2018.10.06.txt': + file_url = 'http://lingtools.uoregon.edu/coraal/les/2021.07/LES_metadata_2021.07.txt' + if file_url == 'http://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2018.10.06.txt': + file_url = 'http://lingtools.uoregon.edu/coraal/vld/2021.07/VLD_metadata_2021.07.txt' + urls.append(file_url) + return urls + + +class CreateInitialManifestCORAAL(BaseParallelProcessor): + """Processor to create initial manifest for the Corpus of Regional African American Language (CORAAL) dataset. + + Dataset link: https://oraal.github.io/coraal + + Will download all files, extract tars and split wav files based on the + provided durations in the transcripts. + + Args: + raw_data_dir (str): where to put raw downloaded data. + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. + drop_pauses (bool): if True, will drop all transcriptions that contain + only silence (indicated by ``(pause X)`` in the transcript). + Defaults to True. + group_duration_threshold (float): can be used to group consecutive + utterances from the same speaker to a longer duration. Set to 0 + to disable this grouping (but note that many utterances are + transcribed with only a few seconds, so grouping is generally + advised). Defaults to 20. 
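+            For example, with the default threshold of 20, a run of
+            consecutive 5-second utterances from one speaker is merged into
+            chunks of roughly 20 seconds, and a speaker change always starts
+            a new chunk.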
+
+    Returns:
+        This processor generates an initial manifest file with the following fields::
+
+            {
+                "audio_filepath": <path to the audio file>,
+                "duration": <duration of the audio in seconds>,
+                "text": <transcription>,
+                "original_file": <name of the original file>,
+                "speaker": <speaker id>,
+                "is_interviewee": <whether the speaker is the interviewee>,
+                "gender": <speaker gender>,
+                "age": <speaker age>,
+                "education": <speaker education>,
+                "occupation": <speaker occupation>,
+            }
+    """
+
+    def __init__(
+        self,
+        raw_data_dir: str,
+        resampled_audio_dir: str,
+        target_samplerate: int = 16000,
+        target_nchannels: int = 1,
+        drop_pauses: bool = True,
+        group_duration_threshold: float = 20.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.raw_data_dir = Path(raw_data_dir)
+        self.resampled_audio_dir = resampled_audio_dir
+        self.target_samplerate = target_samplerate
+        self.target_nchannels = target_nchannels
+        self.drop_pauses = drop_pauses
+        self.group_duration_threshold = group_duration_threshold
+
+    def prepare(self):
+        os.makedirs(self.raw_data_dir, exist_ok=True)
+        os.makedirs(self.resampled_audio_dir, exist_ok=True)
+
+        # downloading all files
+        for file_url in get_coraal_url_list():
+            download_file(file_url, str(self.raw_data_dir))
+
+        os.makedirs(self.raw_data_dir / "audio", exist_ok=True)
+        os.makedirs(self.raw_data_dir / "transcripts", exist_ok=True)
+        # extracting all files
+        for data_file in glob.glob(f'{self.raw_data_dir}/*_audio_*.tar.gz'):
+            # need to set force_extract=True, since there is no folder inside, just a list of files
+            # and we extract data from multiple tars. Ideally, should change the way we check
+            # for extracted data (currently there is an assumption that all data in archive is in a single folder)
+            extract_archive(data_file, self.raw_data_dir / "audio", force_extract=True)
+        for data_file in glob.glob(f'{self.raw_data_dir}/*_textfiles_*.tar.gz'):
+            extract_archive(data_file, self.raw_data_dir / "transcripts", force_extract=True)
+
+    def read_manifest(self):
+        dfs = []
+        for data_file in glob.glob(f'{self.raw_data_dir}/transcripts/*.txt'):
+            df = pd.read_csv(data_file, delimiter='\t')
+            df['Basefile'] = os.path.basename(data_file)[:-4]  # dropping the .txt extension
+
+            if self.drop_pauses:
+                df = df[~df['Content'].str.contains(r'\(pause \d+(?:\.\d+)?\)')]
+
+            # grouping consecutive segments from the same speaker
+            if self.group_duration_threshold > 0:
+                df['Duration'] = df['EnTime'] - df['StTime']
+                # puts each sequence of same-speaker utterances in a "bin"
+                speaker_bins = (~df['Spkr'].eq(df['Spkr'].shift())).cumsum()
+                # within each bin, computes cumulative duration and then int-divides by the threshold
+                df['ThresholdMult'] = df.groupby(speaker_bins)['Duration'].transform(
+                    lambda x: pd.Series.cumsum(x) // self.group_duration_threshold
+                )
+                # finally, we take all positions where the int-division changes,
+                # which indicates that the cumulative sum exceeded the threshold, and combine those
+                # with speaker-change positions to get the final groups for utterance merging
+                final_bins = (
+                    (~df['Spkr'].eq(df['Spkr'].shift())) | (~df['ThresholdMult'].eq(df['ThresholdMult'].shift()))
+                ).cumsum()
+                df = df.groupby(final_bins).agg(
+                    {
+                        'StTime': 'min',
+                        'EnTime': 'max',
+                        'Content': ' '.join,
+                        # will be the same in the group
+                        'Spkr': lambda x: x.iloc[0],
+                        'Basefile': lambda x: x.iloc[0],
+                    }
+                )
+            # assigning label for interviewee vs interviewer (can be used to select a subset later)
+            df['is_interviewee'] = df.apply(lambda x: x['Spkr'] in x['Basefile'], axis=1)
+
+            # matching with metadata (age, gender, etc.)
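+            # (each speaker row in the *_metadata_*.txt files carries Gender/Age/
+            # Education/Occupation; the merge below attaches those values to every
+            # utterance of that speaker via its 'Spkr' id)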
+ metadata_dfs = [] + for data_file in glob.glob(f'{self.raw_data_dir}/*_metadata_*.txt'): + metadata_dfs.append(pd.read_csv(data_file, delimiter='\t')) + metadata_df = pd.concat(metadata_dfs) + # only selecting a subset of columns - can be changed if more are needed + # dropping duplicates since there are multiple rows per speaker because of + # bit-rate, tar name and other file-specific information + metadata_df = metadata_df[['CORAAL.Spkr', 'Gender', 'Age', 'Education', 'Occupation']].drop_duplicates() + df = df.merge(metadata_df, left_on='Spkr', right_on='CORAAL.Spkr', how='left') + df = df.drop('CORAAL.Spkr', axis=1) + + dfs.append(df) + + df = pd.concat(dfs) + # would be better to keep it as df, but .values is way faster than .iterrows + return df.values + + def process_dataset_entry(self, data_entry): + ( + start_time, + end_time, + content, + speaker, + basefile, + is_interviewee, + gender, + age, + education, + occupation, + ) = data_entry + + src_file = str(self.raw_data_dir / 'audio' / (basefile + '.wav')) + output_wav_path = os.path.join( + self.resampled_audio_dir, + f"{basefile}_{int(start_time * 1000)}_{int(end_time * 1000)}.wav", + ) + + if not os.path.exists(output_wav_path): + tfm = Transformer() + tfm.trim(start_time, end_time) + tfm.rate(samplerate=self.target_samplerate) + tfm.channels(n_channels=self.target_nchannels) + tfm.build(input_filepath=src_file, output_filepath=output_wav_path) + + data = { + "audio_filepath": output_wav_path, + "duration": end_time - start_time, + "text": content.strip(), + "original_file": basefile, + "speaker": speaker, + "is_interviewee": is_interviewee, + "gender": gender, + "age": age, + "education": education, + "occupation": occupation, + } + + return [DataEntry(data=data)] diff --git a/build/lib/sdp/processors/datasets/coraal/data_splits.py b/build/lib/sdp/processors/datasets/coraal/data_splits.py new file mode 100644 index 00000000..82e2819d --- /dev/null +++ b/build/lib/sdp/processors/datasets/coraal/data_splits.py @@ -0,0 +1,130 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry + + +class TrainDevTestSplitCORAAL(BaseParallelProcessor): + """Custom train-dev-test split for CORAAL dataset. + + Split is done speaker-wise, so the same speakers don't appear in different + splits. + + Args: + data_split (str): train, dev or test. + + Returns: + All the same fields as in the input manifest, but only a subset of + the data is retained. 
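+
+        For example, with ``data_split="dev"`` only utterances whose
+        ``original_file`` maps to one of the seven dev speakers below are
+        retained (the mapping drops the part suffix, e.g.
+        ``DCA_se1_ag1_m_01_1`` -> ``DCA_se1_ag1_m``).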
+ """ + + def __init__( + self, + data_split: str, + **kwargs, + ): + super().__init__(**kwargs) + if data_split not in ["train", "dev", "test"]: + raise ValueError("data_split has to be either train, dev or test") + self.data_split = data_split + self.split_map = {} + self.split_map["train"] = set( + [ + 'ATL_se0_ag1_m', + 'DCA_se1_ag1_f', + 'DCA_se1_ag2_f', + 'DCA_se1_ag2_m', + 'DCA_se1_ag3_f', + 'DCA_se1_ag3_m', + 'DCA_se1_ag4_m', + 'DCA_se2_ag1_f', + 'DCA_se2_ag1_m', + 'DCA_se2_ag2_m', + 'DCB_se1_ag1_m', + 'DCB_se1_ag2_f', + 'DCB_se1_ag2_m', + 'DCB_se1_ag3_f', + 'DCB_se1_ag3_m', + 'DCB_se1_ag4_f', + 'DCB_se1_ag4_m', + 'DCB_se2_ag1_f', + 'DCB_se2_ag1_m', + 'DCB_se2_ag2_f', + 'LES_se0_ag2_f', + 'LES_se0_ag2_m', + 'PRV_se0_ag1_f', + 'PRV_se0_ag2_f', + 'ROC_se0_ag1_m', + 'ROC_se0_ag2_f', + 'VLD_se0_ag2_f', + 'VLD_se0_ag2_m', + ] + ) + self.split_map["dev"] = set( + [ + 'ATL_se0_ag1_f', + 'DCA_se1_ag1_m', + 'DCB_se1_ag1_f', + 'LES_se0_ag3_f', + 'PRV_se0_ag1_m', + 'ROC_se0_ag1_f', + 'VLD_se0_ag3_f', + ] + ) + self.split_map["test"] = set( + [ + 'ATL_se0_ag2_f', + 'ATL_se0_ag2_m', + 'DCA_se2_ag3_m', + 'DCA_se2_ag4_f', + 'DCA_se2_ag4_m', + 'DCA_se3_ag1_f', + 'DCA_se3_ag1_m', + 'DCA_se3_ag2_f', + 'DCA_se3_ag2_m', + 'DCA_se3_ag3_f', + 'DCA_se3_ag3_m', + 'DCA_se3_ag4_m', + 'DCB_se2_ag2_m', + 'DCB_se2_ag3_f', + 'DCB_se2_ag3_m', + 'DCB_se2_ag4_f', + 'DCB_se2_ag4_m', + 'DCB_se3_ag1_f', + 'DCB_se3_ag1_m', + 'DCB_se3_ag2_f', + 'DCB_se3_ag3_f', + 'DCB_se3_ag3_m', + 'DCB_se3_ag4_f', + 'DCB_se3_ag4_m', + 'LES_se0_ag3_m', + 'LES_se0_ag4_f', + 'LES_se0_ag4_m', + 'PRV_se0_ag2_m', + 'PRV_se0_ag3_f', + 'PRV_se0_ag3_m', + 'ROC_se0_ag2_m', + 'ROC_se0_ag3_f', + 'ROC_se0_ag3_m', + 'VLD_se0_ag3_m', + 'VLD_se0_ag4_f', + 'VLD_se0_ag4_m', + ] + ) + + def process_dataset_entry(self, data_entry): + if data_entry["original_file"][:-5] in self.split_map[self.data_split]: + return [DataEntry(data=data_entry)] + return [] diff --git a/build/lib/sdp/processors/datasets/fleurs/__init__.py b/build/lib/sdp/processors/datasets/fleurs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/build/lib/sdp/processors/datasets/fleurs/create_initial_manifest.py b/build/lib/sdp/processors/datasets/fleurs/create_initial_manifest.py new file mode 100644 index 00000000..d571593a --- /dev/null +++ b/build/lib/sdp/processors/datasets/fleurs/create_initial_manifest.py @@ -0,0 +1,150 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
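+#
+# Minimal usage sketch (illustrative only; paths are placeholders and SDP
+# processors are normally run via a YAML pipeline config):
+#
+#   from sdp.processors.datasets.fleurs.create_initial_manifest import (
+#       CreateInitialManifestFleurs,
+#   )
+#
+#   CreateInitialManifestFleurs(
+#       lang="hy_am",
+#       split="dev",
+#       raw_data_dir="/data/fleurs/hy_am",
+#       output_manifest_file="/data/fleurs/hy_am/dev_manifest.json",
+#   ).process()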
+
+import fnmatch
+import glob
+import json
+import os
+import shutil
+import typing
+from urllib.parse import parse_qs, urlparse
+
+from sdp.processors.base_processor import BaseProcessor, DataEntry
+from sdp.utils.common import download_file, extract_archive
+
+
+def get_fleurs_url_list(lang: str, split: str) -> list[str]:
+    # examples:
+    # "https://huggingface.co/datasets/google/fleurs/resolve/main/data/hy_am/audio/dev.tar.gz",
+    # "https://huggingface.co/datasets/google/fleurs/resolve/main/data/hy_am/dev.tsv"
+
+    urls = []
+    base_url = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
+
+    base_lang_url = os.path.join(base_url, lang)
+    tsv_url = f"{base_lang_url}/{split}.tsv"
+    urls.append(tsv_url)
+
+    tar_gz_url = f"{base_lang_url}/audio/{split}.tar.gz"
+    urls.append(tar_gz_url)
+
+    return urls
+
+
+class CreateInitialManifestFleurs(BaseProcessor):
+    """
+    Processor to create initial manifest for the FLEURS dataset.
+
+    Dataset link: https://huggingface.co/datasets/google/fleurs
+
+    Will download all files, extract them, and create a manifest file with the
+    "audio_filepath" and "text" fields.
+
+    Args:
+        lang (str): Language to be processed, identified by a combination of ISO 639-1 and ISO 3166-1 alpha-2 codes.
+            Examples are:
+
+            - ``"hy_am"`` for Armenian
+            - ``"ko_kr"`` for Korean
+
+        split (str): Which dataset splits to process.
+            Options are:
+
+            - ``"test"``
+            - ``"train"``
+            - ``"dev"``
+
+        raw_data_dir (str): Path to the folder where the data archive should be downloaded and extracted.
+
+    Returns:
+        This processor generates an initial manifest file with the following fields::
+
+            {
+                "audio_filepath": <path to the audio file>,
+                "text": <transcription>,
+            }
+    """
+
+    def __init__(
+        self,
+        lang: str,
+        split: str,
+        raw_data_dir: str,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.lang = lang
+        self.split = split
+        self.raw_data_dir = raw_data_dir
+
+    def process_transcript(self, file_path: str) -> list[dict[str, typing.Any]]:
+        """
+        Parse transcript TSV file and put it inside manifest.
+        Assumes each row is tab-separated, with the audio file name in the
+        second column and the transcript in the third.
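+        For illustration, a row is expected to look roughly like
+        ``<id>\t<file name>.wav\t<raw transcription>\t<transcription>\t...``
+        (only the second and third fields are used).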
+        """
+
+        entries = []
+        root = os.path.dirname(file_path)
+
+        with open(file_path, encoding="utf-8") as fin:
+            for line in fin:
+                # split the line into tab-separated fields
+                parts = line.strip().split('\t')
+                if len(parts) < 3:  # skip lines that don't have both a file name and a transcript
+                    continue
+
+                file_name, transcript_text = parts[1], parts[2]
+                wav_file = os.path.join(root, file_name)
+
+                entry = {"audio_filepath": os.path.abspath(wav_file), "text": transcript_text}
+                entries.append(entry)
+
+        return entries
+
+    def process_data(self, data_folder: str, manifest_file: str) -> None:
+        entries = self.process_transcript(os.path.join(data_folder, self.split + "/" + self.split + ".tsv"))
+
+        with open(manifest_file, "w", encoding="utf-8") as fout:
+            for m in entries:
+                fout.write(json.dumps(m, ensure_ascii=False) + "\n")
+
+    def download_extract_files(self, dst_folder: str) -> None:
+        """Downloading and extracting files."""
+
+        os.makedirs(dst_folder, exist_ok=True)
+
+        # downloading all files
+        for file_url in get_fleurs_url_list(self.lang, self.split):
+            download_file(file_url, str(dst_folder))
+
+        extract_archive(f'{dst_folder}/{self.split}.tar.gz', str(dst_folder), force_extract=True)
+
+        # organizing files into their respective folders
+        target_folder = os.path.join(dst_folder, self.split)
+
+        file_name = f"{self.split}.tsv"
+
+        file_path = os.path.join(dst_folder, file_name)
+        dest_file_path = os.path.join(target_folder, file_name)
+
+        if not os.path.exists(dest_file_path):
+            shutil.move(file_path, dest_file_path)
+            print(f'Moved {file_path} to {dest_file_path}')
+        else:
+            os.remove(file_path)
+            print(f'File {file_name} already exists in {target_folder}, deleted from source.')
+
+    def process(self):
+        self.download_extract_files(self.raw_data_dir)
+        self.process_data(self.raw_data_dir, self.output_manifest_file)
diff --git a/build/lib/sdp/processors/datasets/ksc2/__init__.py b/build/lib/sdp/processors/datasets/ksc2/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/build/lib/sdp/processors/datasets/ksc2/create_initial_manifest.py b/build/lib/sdp/processors/datasets/ksc2/create_initial_manifest.py
new file mode 100644
index 00000000..3bde174b
--- /dev/null
+++ b/build/lib/sdp/processors/datasets/ksc2/create_initial_manifest.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
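+#
+# Minimal usage sketch (illustrative only; paths are placeholders and SDP
+# processors are normally run via a YAML pipeline config):
+#
+#   from sdp.processors.datasets.ksc2.create_initial_manifest import (
+#       CreateInitialManifestKSC2,
+#   )
+#
+#   CreateInitialManifestKSC2(
+#       raw_data_dir="/data/ksc2/raw",
+#       extract_archive_dir="/data/ksc2/extracted",
+#       resampled_audio_dir="/data/ksc2/wavs",
+#       data_split="train",
+#       output_manifest_file="/data/ksc2/train_manifest.json",
+#   ).process()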
+ +# To convert mp3 files to wav using sox, you must have installed sox with mp3 support +# For example sudo apt-get install libsox-fmt-mp3 +import csv +import glob +import os +from collections import defaultdict +from pathlib import Path +from typing import Dict, Tuple + +from sox import Transformer +from tqdm.contrib.concurrent import process_map + +from sdp.logging import logger +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry +from sdp.utils.common import download_file, extract_archive + + +class CreateInitialManifestKSC2(BaseParallelProcessor): + """Processor to create initial manifest for the Kazakh Speech Corpus (KSC) 2. + + The dataset should be requested via Google Forms, which can be found here https://issai.nu.edu.kz/kz-speech-corpus/. + + Extracts raw data for the specified language and creates an initial manifest + using the transcripts provided in the raw data. + + Args: + raw_data_dir (str): the path to the directory containing the raw data archive file. + extract_archive_dir (str): directory where the extracted data will be saved. + resampled_audio_dir (str): directory where the resampled audio will be saved. + data_split (str): "train", "dev" or "test". + target_samplerate (int): sample rate (Hz) to use for resampling. + Defaults to 16000. + target_nchannels (int): number of channels to create during resampling process. + Defaults to 1. + Returns: + This processor generates an initial manifest file with the following fields: + + { + "audio_filepath": , + "text": , + "source": , + } + """ + + def __init__( + self, + raw_data_dir: str, + extract_archive_dir: str, + resampled_audio_dir: str, + data_split: str, + target_samplerate: int = 16000, + target_nchannels: int = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.raw_data_dir = Path(raw_data_dir) + self.extract_archive_dir = extract_archive_dir + self.resampled_audio_dir = resampled_audio_dir + self.data_split = data_split + self.target_samplerate = target_samplerate + self.target_nchannels = target_nchannels + + def prepare(self): + """Extracting data (unless already done).""" + os.makedirs(self.raw_data_dir, exist_ok=True) + + tar_gz_files = glob.glob(str(self.raw_data_dir) + f"/*.tar.gz") + + if not tar_gz_files: + raise RuntimeError( + f"Did not find any file matching {self.raw_data_dir}/*.tar.gz. " + "For KSC2 dataset we cannot automatically download the data, so " + "make sure to get the data manually" + "and put it in the 'raw_data_dir' folder." 
+ ) + + elif len(tar_gz_files) > 1: + raise RuntimeError(f"Expecting exactly one *.tar.gz file in directory {self.raw_data_dir}") + + data_folder = extract_archive(tar_gz_files[0], self.extract_archive_dir) + + if self.data_split.capitalize() not in data_folder: + self.data_split_dir = Path(data_folder, self.data_split.capitalize()) + else: + self.data_split_dir = Path(data_folder) + + os.makedirs(self.resampled_audio_dir, exist_ok=True) + + def read_manifest(self): + if self.data_split_dir is None: + raise RuntimeError("self.process has to be called before processing the data.") + + dataset_entries = [] + + without_text = defaultdict(int) + + for audio_filepath in self.data_split_dir.rglob('*.flac'): + filename = audio_filepath.stem + source = audio_filepath.relative_to(self.data_split_dir).parents[0].as_posix() + + transcribed_filename = Path(audio_filepath.parent, filename).with_suffix('.txt') + + if transcribed_filename.exists(): + with open(transcribed_filename, "rt", encoding="utf8") as txtfile: + text = ' '.join(txtfile.readlines()) + elif transcribed_filename.with_suffix('.txt.txt').exists(): + transcribed_filename = transcribed_filename.with_suffix('.txt.txt') + with open(transcribed_filename, "rt", encoding="utf8") as txtfile: + text = ' '.join(txtfile.readlines()) + else: + without_text[audio_filepath.parent] += 1 + continue + + entry = {'audio_filepath': audio_filepath.as_posix(), 'text': text, 'source': source} + + dataset_entries.append(entry) + + logger.info(f"Without text entries -> {without_text}") + + return dataset_entries + + def process_dataset_entry(self, data_entry: Dict): + wav_source_dir = Path(self.resampled_audio_dir, data_entry['source']) + wav_source_dir.mkdir(exist_ok=True) + + output_wav_path = Path(wav_source_dir, Path(data_entry['audio_filepath']).stem).with_suffix(".wav") + + if not os.path.exists(output_wav_path): + tfm = Transformer() + tfm.rate(samplerate=self.target_samplerate) + tfm.channels(n_channels=self.target_nchannels) + tfm.build(input_filepath=data_entry['audio_filepath'], output_filepath=output_wav_path) + + data_entry['audio_filepath'] = output_wav_path.as_posix() + + return [DataEntry(data=data_entry)] diff --git a/build/lib/sdp/processors/datasets/lhotse.py b/build/lib/sdp/processors/datasets/lhotse.py new file mode 100644 index 00000000..01f54d44 --- /dev/null +++ b/build/lib/sdp/processors/datasets/lhotse.py @@ -0,0 +1,83 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json + +from sdp.processors.base_processor import BaseProcessor + + +class LhotseImport(BaseProcessor): + """Processor to create an initial manifest imported from a Lhotse CutSet. + The ``input_manifest_file`` is expected to point to a Lhotse CutSet manifest, + which usually has ``cuts`` in its name and a ``.jsonl`` or ``.jsonl.gz`` extension. 
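+    (for example, a Lhotse recipe might produce a file like ``cuts_dev.jsonl.gz``;
+    the exact naming depends on the recipe).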
+
+    Lhotse is a library for speech data processing and loading; see:
+
+    * https://github.com/lhotse-speech/lhotse
+    * https://lhotse.readthedocs.io
+
+    It can be installed using ``pip install lhotse``.
+
+    .. caution:: Currently we only support the importing of cut sets that represent
+        single-channel, single-audio-file-per-utterance datasets.
+
+    Returns:
+        This processor generates an initial manifest file with the following fields
+        (plus ``lhotse_cut_id`` and any metadata found on the cut's supervision)::
+
+            {
+                "audio_filepath": <path to the audio file>,
+                "duration": <duration of the audio in seconds>,
+                "text": <transcription>,
+            }
+    """
+
+    def process(self):
+        from lhotse import CutSet
+
+        cuts = CutSet.from_file(self.input_manifest_file)
+        with open(self.output_manifest_file, "w") as f:
+            for cut in cuts:
+                self.check_entry(cut)
+                data = {
+                    "audio_filepath": cut.recording.sources[0].source,
+                    "duration": cut.duration,
+                    "lhotse_cut_id": cut.id,
+                }
+                for meta in ("text", "speaker", "gender", "language"):
+                    if (item := getattr(cut.supervisions[0], meta)) is not None:
+                        data[meta] = item
+                if (custom := cut.supervisions[0].custom) is not None:
+                    data.update(custom)
+                print(json.dumps(data), file=f)
+
+    def check_entry(self, cut) -> None:
+        from lhotse import MonoCut
+
+        assert isinstance(
+            cut, MonoCut
+        ), f"Currently, only MonoCut import is supported. Received: {cut}"
+        assert (
+            cut.has_recording
+        ), f"Currently, we only support cuts with recordings. Received: {cut}"
+        assert (
+            cut.recording.num_channels == 1
+        ), f"Currently, we only support recordings with a single channel. Received: {cut}"
+        assert (
+            len(cut.recording.sources) == 1
+        ), f"Currently, we only support recordings with a single AudioSource. Received: {cut}"
+        assert (
+            cut.recording.sources[0].type == "file"
+        ), f"Currently, we only support AudioSources of type='file'. Received: {cut}"
+        assert (
+            len(cut.supervisions) == 1
+        ), f"Currently, we only support cuts with a single supervision. Received: {cut}"
diff --git a/build/lib/sdp/processors/datasets/librispeech/__init__.py b/build/lib/sdp/processors/datasets/librispeech/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/build/lib/sdp/processors/datasets/librispeech/create_initial_manifest.py b/build/lib/sdp/processors/datasets/librispeech/create_initial_manifest.py
new file mode 100644
index 00000000..83d42bde
--- /dev/null
+++ b/build/lib/sdp/processors/datasets/librispeech/create_initial_manifest.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
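+#
+# Minimal usage sketch (illustrative only; paths are placeholders and SDP
+# processors are normally run via a YAML pipeline config):
+#
+#   from sdp.processors.datasets.librispeech.create_initial_manifest import (
+#       CreateInitialManifestLibrispeech,
+#   )
+#
+#   CreateInitialManifestLibrispeech(
+#       split="dev-clean",
+#       raw_data_dir="/data/librispeech",
+#       output_manifest_file="/data/librispeech/dev_clean_manifest.json",
+#   ).process()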
+ +import fnmatch +import glob +import json +import os +import typing + +from sdp.processors.base_processor import BaseProcessor +from sdp.utils.common import download_file, extract_archive + + +def get_librispeech_url_list(split: str) -> str: + urls = { + "dev-clean": "https://openslr.org/resources/12/dev-clean.tar.gz", + "dev-other": "https://openslr.org/resources/12/dev-other.tar.gz", + "test-clean": "https://openslr.org/resources/12/test-clean.tar.gz", + "test-other": "https://openslr.org/resources/12/test-other.tar.gz", + "train-clean-100": "https://openslr.org/resources/12/train-clean-100.tar.gz", + "train-clean-360": "https://openslr.org/resources/12/train-clean-360.tar.gz", + "train-other-500": "https://openslr.org/resources/12/train-other-500.tar.gz", + "dev-clean-2": "https://www.openslr.org/resources/31/dev-clean-2.tar.gz", + "train-clean-5": "https://www.openslr.org/resources/31/train-clean-5.tar.gz", + } + + if split not in urls: + valid_splits = ", ".join(urls.keys()) + raise ValueError(f"Invalid dataset split '{split}'. Valid options are: {valid_splits}") + + return urls[split] + + +class CreateInitialManifestLibrispeech(BaseProcessor): + """Processor to create initial manifest for the Librispeech dataset. + + Dataset link: https://openslr.org/12 + Dataset link: https://openslr.org/31 + + Will download all files, extract tars, and create a manifest file with the + "audio_filepath" and "text" fields. + + Args: + split (str): Which datasets or their combinations should be processed. + Options are: + + - ``"dev-clean"`` + - ``"dev-other"`` + - ``"test-clean"`` + - ``"test-other"`` + - ``"train-clean-100"`` + - ``"train-clean-360"`` + - ``"train-other-500"`` + - ``"dev-clean-2"`` + - ``"train-clean-5"`` + + raw_data_dir (str): Path to the folder where the data archive should be downloaded and extracted. + + Returns: + This processor generates an initial manifest file with the following fields:: + + { + "audio_filepath": , + "text": , + } + """ + + def __init__( + self, + split: str, + raw_data_dir: str, + **kwargs, + ): + super().__init__(**kwargs) + self.split = split + self.raw_data_dir = raw_data_dir + + def process_transcript(self, file_path: str) -> list[dict[str, typing.Any]]: + """Parse transcript file and put it inside manifest + We assume that flac files are located in the same directory as transcript file. 
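+        Each line of a ``*.trans.txt`` file pairs an utterance id with its
+        transcript, e.g. ``84-121123-0000 GO DO YOU HEAR``.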
+ """ + + entries = [] + root = os.path.dirname(file_path) + + print(f"Processing transcript file: {file_path}") + with open(file_path, encoding="utf-8") as fin: + for line in fin: + id, text = line[: line.index(" ")], line[line.index(" ") + 1 :] + transcript_text = text.strip() + + flac_file = os.path.join(root, id + ".flac") + + entry = {} + entry["audio_filepath"] = os.path.abspath(flac_file) + entry["text"] = transcript_text + entries.append(entry) + return entries + + def process_data(self, data_folder: str, manifest_file: str) -> None: + split_folder = os.path.join(data_folder, "LibriSpeech", self.split) + files = [] + entries = [] + if not os.path.exists(split_folder): + raise FileNotFoundError(f"Directory for split '{self.split}' not found at {split_folder}") + + for root, _, filenames in os.walk(split_folder): + for filename in fnmatch.filter(filenames, "*.trans.txt"): + files.append(os.path.join(root, filename)) + + for file in files: + entries.extend(self.process_transcript(file)) + + with open(manifest_file, "w") as fout: + for entry in entries: + fout.write(json.dumps(entry) + "\n") + + def download_extract_files(self, dst_folder: str) -> None: + """downloading and extracting files""" + + os.makedirs(dst_folder, exist_ok=True) + + download_file(get_librispeech_url_list(self.split), str(dst_folder)) + data_file = f'{dst_folder}/{self.split}.tar.gz' + extract_archive(str(data_file), str(dst_folder), force_extract=True) + + def process(self): + self.download_extract_files(self.raw_data_dir) + self.process_data(self.raw_data_dir, self.output_manifest_file) diff --git a/build/lib/sdp/processors/datasets/masc/__init__.py b/build/lib/sdp/processors/datasets/masc/__init__.py new file mode 100644 index 00000000..82fd7b35 --- /dev/null +++ b/build/lib/sdp/processors/datasets/masc/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .create_initial_manifest import CreateInitialManifestMASC +from .aggregate_segments import AggregateSegments +from .apply_reg_exp_on_vtt_entries import RegExpVttEntries +from .get_caption_file_segments import GetCaptionFileSegments diff --git a/build/lib/sdp/processors/datasets/masc/aggregate_segments.py b/build/lib/sdp/processors/datasets/masc/aggregate_segments.py new file mode 100644 index 00000000..8db51046 --- /dev/null +++ b/build/lib/sdp/processors/datasets/masc/aggregate_segments.py @@ -0,0 +1,131 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import logging
+from pydub import AudioSegment
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+from sdp.processors.datasets.masc.utils import save_audio_segment
+
+class AggregateSegments(BaseParallelProcessor):
+    """
+    Aggregates short segments into segments with duration not longer than `max_duration`.
+    The algorithm iterates from left to right, merging consecutive segments into the
+    current segment until the total duration reaches `max_duration`.
+
+    Args:
+        output_audio_dir (str): Directory where aggregated audio segments will be stored, if
+            `save_aggregated_audio_segments` is True. If `save_aggregated_audio_segments` is False,
+            this path is only used to create the audio file paths in the manifest.
+        input_segments_key (str): The field name that contains the list of segments in the input manifest.
+            Defaults to "segments".
+        input_audio_filepath_key (str): The field name that contains paths to the audio files in the input manifest.
+            Defaults to "audio_filepath".
+        output_text_key (str): Field name where the aggregated segment text will be stored. Defaults to "text".
+        output_duration_key (str): Field name where aggregated segment durations will be stored. Defaults to "duration".
+        output_audio_filepath_key (str): Field name where aggregated segment audio file paths will be stored.
+            Defaults to "audio_filepath".
+        max_duration (float): Maximum duration of an aggregated segment. Defaults to 20.0s.
+        save_aggregated_audio_segments (bool): Flag indicating whether to crop audio files according to the
+            aggregated segments. Defaults to True.
+        verbose (bool): Set to True to enable more detailed logging. Defaults to False.
+    """
+    def __init__(
+        self,
+        output_audio_dir: str,
+        input_segments_key: str = "segments",
+        input_audio_filepath_key: str = "audio_filepath",
+        output_text_key: str = "text",
+        output_duration_key: str = "duration",
+        output_audio_filepath_key: str = "audio_filepath",
+        max_duration: float = 20.0,
+        save_aggregated_audio_segments: bool = True,
+        verbose: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.max_duration = max_duration
+        self.input_audio_filepath_key = input_audio_filepath_key
+        self.output_splitted_audio_filepath_key = output_audio_filepath_key
+        self.save_aggregated_audio_segments = save_aggregated_audio_segments
+        self.output_audio_dir = output_audio_dir
+        self.input_segments_key = input_segments_key
+        self.verbose = verbose
+        self.output_text_key = output_text_key
+        self.output_duration_key = output_duration_key
+
+    def prepare(self):
+        if self.save_aggregated_audio_segments and self.output_audio_dir:
+            os.makedirs(os.path.join(self.output_audio_dir), exist_ok=True)
+
+    def process_dataset_entry(self, data_entry: dict):
+        if self.input_segments_key not in data_entry:
+            if self.verbose:
+                logging.info(f"No segments in the sample {data_entry[self.input_audio_filepath_key]}.")
+            return []
+
+        segments = data_entry[self.input_segments_key]
+        if len(segments) == 0:
+            return []
+
+        audio = AudioSegment.from_wav(data_entry[self.input_audio_filepath_key])
+
+        audio_basename = os.path.basename(data_entry[self.input_audio_filepath_key]).split(".")[0]
+        agg_segments = []
+        aggregated_segment = {**segments[0]}
+        for segment in segments[1:]:
+            # skipping segments whose boundaries lie beyond the actual audio duration
+            if segment["end_time"] > audio.duration_seconds or segment["start_time"] > audio.duration_seconds:
+                continue
+
+            start_time = min(segment["start_time"], aggregated_segment["start_time"])
+            end_time = max(segment["end_time"], aggregated_segment["end_time"])
+            # if adding another segment makes the total duration reach max_duration,
+            # the current aggregated segment is finalized and a new one is started
+            if end_time - start_time >= self.max_duration:
+                agg_segments.append(aggregated_segment)
+                aggregated_segment = {**segment}
+            else:
+                # updating aggregated segment text with correct order of segments
+                if aggregated_segment["start_time"] < segment["start_time"]:
+                    aggregated_segment["text"] = f"{aggregated_segment['text']} {segment['text']}".strip()
+                else:
+                    aggregated_segment["text"] = f"{segment['text']} {aggregated_segment['text']}".strip()
+
+                aggregated_segment["start_time"] = start_time  # updating aggregated segment start time
+                aggregated_segment["end_time"] = end_time  # updating aggregated segment end time
+
+        # adding the last aggregated segment
+        if aggregated_segment not in agg_segments:
+            agg_segments.append(aggregated_segment)
+
+        valid_segments = []
+        for aggregated_segment in agg_segments:
+            aggregated_segment.update(data_entry)
+
+            start_time = aggregated_segment.pop("start_time")
+            end_time = aggregated_segment.pop("end_time")
+
+            aggregated_segment[self.output_duration_key] = end_time - start_time
+            aggregated_segment[self.output_splitted_audio_filepath_key] = os.path.join(
+                self.output_audio_dir, f"{audio_basename}_{start_time}_{end_time}.wav"
+            )
+
+            if self.save_aggregated_audio_segments:
+                try:
+                    save_audio_segment(
+                        audio=audio,
+                        start_time=start_time,
+                        end_time=end_time,
+                        output_audio_filepath=aggregated_segment[self.output_splitted_audio_filepath_key]
+                    )
+                    valid_segments.append(aggregated_segment)
+                except IndexError as e:
+                    if self.verbose:
+                        logging.warning(f"Invalid segment boundaries in {audio_basename} ({e}). Skipping...")
+            else:
+                # nothing to crop: keep the aggregated segment as-is in the manifest
+                valid_segments.append(aggregated_segment)
+
+        return [DataEntry(data=segment) for segment in valid_segments]
diff --git a/build/lib/sdp/processors/datasets/masc/apply_reg_exp_on_vtt_entries.py b/build/lib/sdp/processors/datasets/masc/apply_reg_exp_on_vtt_entries.py
new file mode 100644
index 00000000..541e98eb
--- /dev/null
+++ b/build/lib/sdp/processors/datasets/masc/apply_reg_exp_on_vtt_entries.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import webvtt  # pip install webvtt-py
+from typing import Dict
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+
+
+class RegExpVttEntries(BaseParallelProcessor):
+    """
+    Applies regular expressions on entries of a .vtt (WebVTT) file and stores the processed file
+    in the specified directory.
+
+    Args:
+        input_filepath_key (str): Key that stores the path to the input `.vtt` file.
+        output_filtered_vtt_dir (str): Directory where the processed `.vtt` files will be stored.
+        output_filepath_key (str): Key to store the output `.vtt` file path.
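+        regex_params (dict): Dictionary with the regular expression parameters
+            passed to ``re.sub``: ``pattern``, ``repl``, and optionally
+            ``count`` (defaults to 0, i.e. replace all occurrences).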
+
+    Returns:
+        Manifest with an additional field::
+
+            {
+                "filtered_vtt_filepath": <path to the processed .vtt file>
+            }
+
+        (the field name is configurable via ``output_filepath_key``)
+    """
+
+    def __init__(
+        self,
+        regex_params: Dict,
+        input_filepath_key: str = "vtt_filepath",
+        output_filtered_vtt_dir: str = "filtered_vtt_filepath",
+        output_filepath_key: str = "filtered_vtt_filepath",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.input_filepath_key = input_filepath_key
+        self.output_filepath_key = output_filepath_key
+        self.output_filtered_vtt_dir = output_filtered_vtt_dir
+        self.regex_params = regex_params
+
+    def prepare(self):
+        os.makedirs(self.output_filtered_vtt_dir, exist_ok=True)
+
+    def process_dataset_entry(self, data_entry):
+        try:
+            vtt = webvtt.read(data_entry[self.input_filepath_key])
+
+            for caption in vtt:
+                caption.text = re.sub(
+                    pattern=self.regex_params["pattern"],
+                    repl=self.regex_params["repl"],
+                    string=caption.text,
+                    count=self.regex_params.get("count", 0),
+                )
+
+            basename = os.path.basename(data_entry[self.input_filepath_key])
+            filtered_vtt_filepath = os.path.join(self.output_filtered_vtt_dir, basename)
+            data_entry[self.output_filepath_key] = filtered_vtt_filepath
+
+            vtt.save(filtered_vtt_filepath)
+            return [DataEntry(data=data_entry)]
+        except Exception:
+            # corrupted or unreadable .vtt files are dropped from the manifest
+            return [DataEntry(data=None)]
diff --git a/build/lib/sdp/processors/datasets/masc/create_initial_manifest.py b/build/lib/sdp/processors/datasets/masc/create_initial_manifest.py
new file mode 100644
index 00000000..9563f723
--- /dev/null
+++ b/build/lib/sdp/processors/datasets/masc/create_initial_manifest.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import logging
+from pathlib import Path
+import pandas as pd
+from sox import Transformer
+
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+from sdp.utils.common import extract_archive
+
+class CreateInitialManifestMASC(BaseParallelProcessor):
+    """
+    Processor for creating an initial manifest for the Massive Arabic Speech Corpus (MASC).
+
+    Dataset link: https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus.
+    Prior to calling the processor, download the tarred dataset and store it under `raw_data_dir/masc.tar.gz`.
+
+    Creates the manifest from the samples listed in `dataset_dir/subsets/{data_split}.csv`. All meta information is kept.
+
+    Args:
+        raw_data_dir (str): The root directory of the dataset.
+        extract_archive_dir (str): Directory where the extracted data will be saved.
+        resampled_audios_dir (str): Directory where the resampled audio will be saved.
+        data_split (str): Dataset split type.
+        already_extracted (bool): If True, we will not try to extract the raw data. Defaults to False.
+        target_samplerate (int): Sample rate (Hz) to use for resampling. Defaults to 16000.
+        target_nchannels (int): Number of channels to create during resampling process. Defaults to 1.
+        output_manifest_sample_id_key (str): The field name to store sample ID. Defaults to "sample_id".
+        output_manifest_vtt_filapath_key (str): The field name to store the vtt file path. Defaults to "vtt_filepath".
+        output_manifest_audio_filapath_key (str): The field name to store the audio file path. Defaults to "audio_filepath".
+        verbose (bool): Set to True for more detailed logging.
+        **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.
+
+    Returns:
+        This processor generates an initial manifest file with the following fields::
+
+            {
+                "sample_id": <sample id>,
+                "audio_filepath": <path to the audio file>,
+                "vtt_filepath": <path to the vtt file>,
+                "category":