From 8abc0aa79f3f24c53bd421d646d0dda30736de0b Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Sat, 5 Oct 2024 19:22:14 +0400
Subject: [PATCH 01/13] add babel processors

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 .../multilingual/babel/config.yaml            |  53 ++++++
 .../multilingual/voxpopuli/config.yaml        |  37 ++++
 .../datasets/babel/create_initial_manifest.py | 179 ++++++++++++++++++
 .../voxpopuli/create_initial_manifest.py      | 126 ++++++++++++
 4 files changed, 395 insertions(+)
 create mode 100644 dataset_configs/multilingual/babel/config.yaml
 create mode 100644 dataset_configs/multilingual/voxpopuli/config.yaml
 create mode 100644 sdp/processors/datasets/babel/create_initial_manifest.py

diff --git a/dataset_configs/multilingual/babel/config.yaml b/dataset_configs/multilingual/babel/config.yaml
new file mode 100644
index 00000000..9da2c14c
--- /dev/null
+++ b/dataset_configs/multilingual/babel/config.yaml
@@ -0,0 +1,53 @@
+documentation: |
+  IARPA Babel Dataset
+  ###################
+
+  This config is designed for the languages of the IARPA Babel Dataset available at https://catalog.ldc.upenn.edu.
+
+  It creates initial manifest for the specified data type and data split. 
+  Further data processing steps should be performed based on the specific language.
+
+  **Required arguments**.
+
+  * **raw_data_dir**: specify path of the directory downloaded from LDC.
+  * **data_type**: should be "conversational" or "scripted".
+  * **resampled_audio_dir**: specify the directory path, where new processed audios should be located.
+  * **data_split**: should be "training", "untranscribed-training", "sub-train", "dev" or "eval".
+  * **output_manifest_file**: specify output manifest filepath.
+
+  **Output format**.
+
+  This config dumps the final manifest at ``${output_manifest_file}``.
+  The output manifest contains the following fields:
+
+  * **outputFn (str)**: initial audio filename.
+  * **sessID (str)**: session ID of the recording.
+  * **date (str)**: date of the recording.
+  * **time (str)**: time of the recording.
+  * **spkrCode (str)**: speaker ID.
+  * **lineType (str)**: type of the line (inline or outline).
+  * **dialect (str)**: dialect of the speaker.
+  * **gen (str)**: gender of the speaker.
+  * **envType (str)**: environment (i.e., home, office, etc.).
+  * **age (str)**: age of the speaker.
+  * **network (str)**: name of the telecommunications network.
+  * **phoneModel (str)**: model of the phone.
+  * **sampleCount (str)**: count of the sample.
+  * **sampleRate (str)**: original sample rate of the recording.
+  * **audio_filepath (str)**: path to the processed audio file.
+  * **duration (float)**: duration of the audio in seconds.
+
+processors_to_run: all
+raw_data_dir: ???
+data_type: ???
+resampled_audio_dir: ???
+data_split: ???
+output_manifest_file: ???
+
+processors:
+  - _target_: sdp.processors.CreateInitialManifestBabel
+    raw_data_dir: ${raw_data_dir}
+    data_type: ${data_type}
+    data_split: ${data_split}
+    resampled_audio_dir: ${resampled_audio_dir}
+    output_manifest_file: ${output_manifest_file}
\ No newline at end of file
diff --git a/dataset_configs/multilingual/voxpopuli/config.yaml b/dataset_configs/multilingual/voxpopuli/config.yaml
new file mode 100644
index 00000000..a5e077bf
--- /dev/null
+++ b/dataset_configs/multilingual/voxpopuli/config.yaml
@@ -0,0 +1,37 @@
+documentation: |
+  Voxpopuli unlabelled subset
+  ###########################
+
+  This config can be used to prepare
+  `Voxpopuli dataset unlabelled subset <https://github.com/facebookresearch/voxpopuli/>`_
+  in the NeMo format.
+
+  It creates initial manifest for the specified language. 
+
+  **Required arguments**.
+
+  * **raw_data_dir**: specify the directory where the downloaded data will be/is saved.
+  * **language_id**: specify the language of the data you wish to be downloaded and/or processed.
+  * **resampled_audio_dir**: specify the directory path, where new processed audios should be located.
+  * **delete_raw_file**: specify if the initial raw audio files should be deleted or not.
+
+
+  **Output format**.
+
+  This config dumps the final manifest at ``${resampled_audio_dir}/${language_id}/manifest.json``.
+  The output manifest contains the following fields:
+
+  * **audio_filepath (str)**: path to the processed audio file.
+  * **duration (float)**: duration of the audio in seconds.
+
+processors_to_run: all
+raw_data_dir: ???
+language_id: ???
+resampled_audio_dir: ???
+
+processors:
+  - _target_: sdp.processors.CreateInitialManifestVoxpopuliUnlabelled
+    raw_data_dir: ${raw_data_dir}
+    language_id: ${language_id}
+    resampled_audio_dir: ${resampled_audio_dir}
+    delete_raw_file: False
diff --git a/sdp/processors/datasets/babel/create_initial_manifest.py b/sdp/processors/datasets/babel/create_initial_manifest.py
new file mode 100644
index 00000000..4bf0eaa6
--- /dev/null
+++ b/sdp/processors/datasets/babel/create_initial_manifest.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import pathlib
+from pathlib import Path
+
+from pydub import AudioSegment
+
+from sdp.logging import logger
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+
+
+class CreateInitialManifestBabel(BaseParallelProcessor):
+    """Processor to create initial manifest for the Babel dataset.
+
+    Dataset is available for 25 underserved languages on https://catalog.ldc.upenn.edu
+
+    Segments the raw audio based on transcriptions files
+    (each segment contains an utterance from the transcription file for which start and end timestamps are provided)
+    and creates manifest for the resampled data.
+
+    .. note::
+        The dataset should be downloaded manually from LDC.
+
+    Args:
+        raw_data_dir (str): the directory where the downloaded data is saved.
+        data_type (str): "conversational" or "scripted".
+        data_split (str): "training", "untranscribed-training", "sub-train", "dev" or "eval".
+        resampled_audio_dir (str): the directory where the resampled audio
+            files will be stored.
+        audio_format (str): format in which new audio files will be stored.
+        target_samplerate (int): sample rate (Hz) to use for resampling.
+            Defaults to 16000.
+        target_nchannels (int): number of channels to create during resampling process.
+            Defaults to 1.
+
+    Returns:
+        This processor generates an initial manifest file with the following fields::
+
+            {
+                "outputFn": <initial audio filename>,
+                "sessID": <session ID of the recording>,
+                "date": <date of the recording>,
+                "time": <time of the recording>,
+                "spkrCode": <speaker ID>,
+                "lineType": <type of the line (inline or outline)>,
+                "dialect": <dialect of the speaker>,
+                "gen": <gender of the speaker>,
+                "envType": <environment (i.e. home, office etc.)>,
+                "age": <age of the speaker>,
+                "network": <name of the telecommunications network>,
+                "phoneModel": <model of the phone>,
+                "sampleCount": <count of the sample>,
+                "sampleRate": <original sample rate of the recording>,
+                "audio_filepath": <path to the processed audio file>,
+                "duration": <duration of the audio in seconds>,
+            }
+    """
+
+    def __init__(
+        self,
+        raw_data_dir: str,
+        data_type: str,
+        data_split: str,
+        resampled_audio_dir: str,
+        audio_format: str = 'flac',
+        target_samplerate: int = 16000,
+        target_nchannels: int = 1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.raw_data_dir = Path(raw_data_dir)
+        self.data_type = data_type
+        self.resampled_audio_dir = resampled_audio_dir
+        self.data_split = data_split
+        self.audio_format = audio_format
+        self.target_samplerate = target_samplerate
+        self.target_nchannels = target_nchannels
+
+        self.audios_dir = Path(self.raw_data_dir, self.data_type, self.data_split, 'audio')
+        self.transcriptions_dir = Path(self.raw_data_dir, self.data_type, self.data_split, 'transcription')
+        self.demographics_file = Path(self.raw_data_dir, self.data_type, 'reference_materials', 'demographics.tsv')
+        self.un_demographics_file = Path(
+            self.raw_data_dir, self.data_type, 'reference_materials', 'demographics.untranscribed-training.tsv'
+        )
+
+        if not os.path.exists(self.resampled_audio_dir):
+            os.makedirs(self.resampled_audio_dir, exist_ok=True)
+
+    def prepare(self):
+        self.demographics = {}
+
+        with open(self.demographics_file, "rt", encoding="utf8") as fin:
+            header = fin.readline()
+            titles = [t.strip() for t in header.split('\t')]
+            for line in fin:
+                data_entry = dict(zip(titles, line.strip('\n').split('\t')))
+                self.demographics[data_entry["outputFn"]] = data_entry
+
+        if self.un_demographics_file.exists():
+            with open(self.un_demographics_file, "rt", encoding="utf8") as fin:
+                header = fin.readline()
+                titles = [t.strip() for t in header.split('\t')]
+                for line in fin:
+                    data_entry = dict(zip(titles, line.strip('\n').split('\t')))
+                    self.demographics[data_entry["outputFn"]] = data_entry
+
+    def read_manifest(self):
+        return self.audios_dir.glob("*.sph")
+
+    def process_dataset_entry(self, data_entry: pathlib.PosixPath):
+        """Resample one ``.sph`` recording and split it into utterance-level segments (whole file if untranscribed)."""
+        transcription_path = Path(self.transcriptions_dir, data_entry.stem).with_suffix('.txt')
+
+        tgt_audio_filepath = (
+            Path(self.resampled_audio_dir, data_entry.stem).with_suffix(f".{self.audio_format}").as_posix()
+        )
+        # Bound before ``try`` so a failed file returns what was collected instead of raising UnboundLocalError.
+        data_entries = []
+        try:
+            audio = AudioSegment.from_file(data_entry)
+
+            if audio.frame_rate != self.target_samplerate:
+                audio = audio.set_frame_rate(self.target_samplerate)
+
+            if audio.channels != self.target_nchannels:
+                audio = audio.set_channels(self.target_nchannels)
+
+            if not transcription_path.exists():
+                # Untranscribed recording: export the whole file as a single entry without ``text``.
+                audio.export(tgt_audio_filepath, format=self.audio_format)
+
+                modified_entry = self.demographics[data_entry.name].copy()
+                modified_entry['audio_filepath'] = tgt_audio_filepath
+                modified_entry['duration'] = round(audio.duration_seconds, 2)
+                return [DataEntry(data=modified_entry)]
+
+            # Transcription files alternate "[timestamp]" lines and text lines.
+            with open(transcription_path, 'rt', encoding="utf8") as f:
+                data = f.readlines()
+                timestamps = data[::2]
+                texts = data[1::2]
+
+            for idx in range(len(timestamps) - 1):
+                text = texts[idx].strip('\n')
+                if text == "<no-speech>":
+                    continue
+                new_audio_filepath = tgt_audio_filepath.replace(f'.{self.audio_format}', f'_{idx}.{self.audio_format}')
+
+                start = float(timestamps[idx].strip('[]\n'))
+                end = float(timestamps[idx + 1].strip('[]\n'))
+
+                audio_segment = audio[start * 1000 : end * 1000]
+
+                audio_segment.export(new_audio_filepath, format=self.audio_format)
+
+                modified_entry = self.demographics[data_entry.name].copy()
+                modified_entry['audio_filepath'] = new_audio_filepath
+                modified_entry['text'] = text
+                modified_entry['duration'] = round(end - start, 2)
+
+                data_entries.append(DataEntry(data=modified_entry))
+
+        except Exception as e:
+            logger.warning(str(e) + " file: " + transcription_path.as_posix())
+
+        return data_entries
\ No newline at end of file
diff --git a/sdp/processors/datasets/voxpopuli/create_initial_manifest.py b/sdp/processors/datasets/voxpopuli/create_initial_manifest.py
index b7c47ba5..06284f24 100644
--- a/sdp/processors/datasets/voxpopuli/create_initial_manifest.py
+++ b/sdp/processors/datasets/voxpopuli/create_initial_manifest.py
@@ -153,3 +153,129 @@ def process_dataset_entry(self, data_entry: str):
             "accent": accent,
         }
         return [DataEntry(data=data)]
+
+
+
+class CreateInitialManifestVoxpopuliUnlabelled(BaseParallelProcessor):
+    """Processor to create initial manifest for the VoxPopuli dataset unlabelled subset.
+
+    Dataset link: https://github.com/facebookresearch/voxpopuli/
+
+    If not already downloaded and segmented, downloads and segments raw VoxPopuli data for the specified language,
+    and creates an initial manifest with the reformatted audio filepaths and their durations.
+
+    .. note::
+        This processor will install a couple of Python packages, including
+        PyTorch, so it might be a good idea to run it in an isolated Python
+        environment. As unlabelled data is huge in volumes, the downloading,
+        segmenting and processing might take a long time.
+
+    Args:
+        raw_data_dir (str): the directory where the downloaded data will be/is saved.
+        language_id (str): the language of the data you wish to be downloaded and/or processed.
+            E.g., "en", "es", "it", "it_v2" etc.
+        resampled_audio_dir (str): the directory where the resampled audio
+            files will be stored.
+        audio_format (str): format in which new audio files will be stored.
+        target_samplerate (int): sample rate (Hz) to use for resampling.
+            Defaults to 16000.
+        target_nchannels (int): number of channels to create during resampling process.
+            Defaults to 1.
+        delete_raw_file (bool): whether initial .ogg files should be deleted or not.
+
+    Returns:
+        This processor generates an initial manifest file with the following fields::
+
+            {
+                "audio_filepath": <path to the audio file>,
+                "duration": <duration of the audio in seconds>,
+            }
+    """
+
+    def __init__(
+        self,
+        raw_data_dir: str,
+        language_id: str,
+        resampled_audio_dir: str,
+        audio_format: str = 'flac',
+        target_samplerate: int = 16000,
+        target_nchannels: int = 1,
+        delete_raw_file: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.raw_data_dir = Path(raw_data_dir)
+        self.language_id = language_id
+        self.audio_format = audio_format
+        self.target_samplerate = target_samplerate
+        self.target_nchannels = target_nchannels
+        self.delete_raw_file = delete_raw_file
+
+        self.resampled_audio_dir = Path(resampled_audio_dir, self.language_id.replace('_v2', ''))
+        self.output_manifest_file = Path(self.resampled_audio_dir, 'manifest.json').as_posix()
+
+    def prepare(self):
+        """Downloading data (unless already done)"""
+        os.makedirs(self.raw_data_dir, exist_ok=True)
+        subset_dir = self.language_id.replace('_v2', '')
+        repo_dir = self.raw_data_dir / 'voxpopuli'
+
+        if not (self.raw_data_dir / "unlabelled_data" / subset_dir).exists():
+            # TODO: some kind of isolated environment?
+            # Argument lists (no shell) keep paths containing spaces or shell metacharacters intact.
+            if not repo_dir.exists():
+                logger.info("Downloading voxpopuli and installing requirements")
+                subprocess.run(["git", "clone", VOXPOPULI_URL, str(repo_dir)], check=True)
+            subprocess.run(["pip", "install", "-r", str(repo_dir / 'requirements.txt')], check=True)
+            subprocess.run(["pip", "install", "torch==1.13", "torchaudio==0.13"], check=True)
+            subprocess.run(["pip", "uninstall", "torch-tensorrt", "torchdata", "torchvision", "-y"], check=True)
+            voxpopuli_args = ["--root", str(self.raw_data_dir), "--subset", self.language_id]
+            if not (self.raw_data_dir / 'raw_audios' / subset_dir).exists():
+                logger.info("Downloading raw audios")
+                subprocess.run(
+                    ["python", "-m", "voxpopuli.download_audios", *voxpopuli_args],
+                    cwd=repo_dir,
+                    check=True,
+                )
+
+            logger.info("Segmenting the data")
+            subprocess.run(
+                ["python", "-m", "voxpopuli.get_unlabelled_data", *voxpopuli_args],
+                cwd=repo_dir,
+                check=True,
+            )
+
+        # ``exist_ok=True`` already makes this call safe when the directory exists.
+        self.resampled_audio_dir.mkdir(exist_ok=True, parents=True)
+
+    def read_manifest(self):
+        unlabelled_dir = Path(self.raw_data_dir, 'unlabelled_data')
+        return Path(unlabelled_dir, self.language_id.replace('_v2', '')).rglob('*.ogg')
+
+    def process_dataset_entry(self, data_entry: PosixPath):
+        """Convert one ``.ogg`` file to the target format and return its manifest entry (``None`` data on failure)."""
+        tgt_audio_filepath = Path(self.resampled_audio_dir, data_entry.stem + f".{self.audio_format}")
+
+        try:
+            audio = AudioSegment.from_ogg(data_entry)
+            if audio.frame_rate != self.target_samplerate:
+                audio = audio.set_frame_rate(self.target_samplerate)
+
+            if audio.channels != self.target_nchannels:
+                audio = audio.set_channels(self.target_nchannels)
+
+            audio.export(tgt_audio_filepath, format=self.audio_format)
+
+            data = {
+                "audio_filepath": tgt_audio_filepath.as_posix(),  # plain str, as documented for the manifest
+                "duration": audio.duration_seconds,
+            }
+
+            if self.delete_raw_file:
+                os.remove(data_entry)
+
+        except Exception as e:
+            logger.warning(str(e) + " file: " + data_entry.as_posix())
+            data = None
+
+        return [DataEntry(data=data)]
\ No newline at end of file

From be88d1f6e408d39c4a385ad1e2baa0279a5b93ea Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Sat, 5 Oct 2024 19:33:57 +0400
Subject: [PATCH 02/13] addiing data modif processors -s

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 .../modify_manifest/data_to_data.py           | 296 ++++++++++++++++++
 1 file changed, 296 insertions(+)

diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py
index 9328b081..f616f230 100644
--- a/sdp/processors/modify_manifest/data_to_data.py
+++ b/sdp/processors/modify_manifest/data_to_data.py
@@ -15,10 +15,18 @@
 import collections
 import os
 import re
+import json
+import random
+import itertools
+import tarfile
+from tqdm import tqdm
 from typing import Dict, List
+from pathlib import Path, PosixPath
 
 import soundfile
 from sox import Transformer
+from pydub import AudioSegment
+from tqdm.contrib.concurrent import process_map
 
 from sdp.logging import logger
 from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
@@ -690,3 +698,291 @@ def process_dataset_entry(self, data_entry):
             data_entry[self.input_text_key], verbose=self.verbose
         )
         return [DataEntry(data=data_entry)]
+
+
+class RandomSegment(BaseParallelProcessor):
+    """
+    Processor that randomly segments mini-audios from the main audio, durations of which are uniformly distributed from ``min_duration`` to ``max_duration``.
+    New audios are saved in the following location ``<resampled_audio_dir>/<audio_file>_segment_num.<audio_format>``
+
+    Args:
+        min_duration (float): minimum duration for the newly segmented audio.
+        max_duration (float): maximum duration for the newly segmented audio.
+        resampled_audio_dir (str) (Optional): directory where the resampled audio files will be stored.
+        audio_format (str) (Optional): format of the output audio files. Defaults to None.
+        audio_filepath_key (str) (Optional): key to get audio filepath from data entry. Defaults to ``audio_filepath``.
+        save_other_part (bool) (Optional): whether to save the residual part of the audio after segmentation. Defaults to True.
+        random_seed (int) (Optional): seed for ``random.uniform``. Defaults to 1000.
+        target_samplerate (int) (Optional): the target sample rate for the resampled audio. Defaults to 16000.
+        target_nchannels (int) (Optional): the target number of channels for the resampled audio. Defaults to 1.
+        **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.
+
+    """
+
+    def __init__(
+        self,
+        min_duration: float,
+        max_duration: float,
+        resampled_audio_dir: str,
+        audio_format: str = None,
+        audio_filepath_key: str = 'audio_filepath',
+        save_other_part: bool = True,
+        random_seed: int = 1000,
+        target_samplerate: int = 16000,
+        target_nchannels: int = 1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.min_duration = min_duration
+        self.max_duration = max_duration
+        self.resampled_audio_dir = resampled_audio_dir
+        self.audio_format = audio_format
+        self.audio_filepath_key = audio_filepath_key
+        self.save_other_part = save_other_part
+        self.target_samplerate = target_samplerate
+        self.target_nchannels = target_nchannels
+        random.seed(random_seed)
+
+    def process_dataset_entry(self, data_entry):
+        """Cut the entry's audio into consecutive random-length segments and return one entry per segment."""
+        data_entries = []
+
+        audio = AudioSegment.from_file(data_entry[self.audio_filepath_key])
+        duration = audio.duration_seconds
+
+        if audio.frame_rate != self.target_samplerate:
+            audio = audio.set_frame_rate(self.target_samplerate)
+
+        if audio.channels != self.target_nchannels:
+            audio = audio.set_channels(self.target_nchannels)
+
+        # The manifest path is a plain string, so wrap it in ``Path``; drop the dot from the extension.
+        audio_format = self.audio_format or Path(data_entry[self.audio_filepath_key]).suffix.lstrip('.')
+
+        Path(self.resampled_audio_dir).mkdir(parents=True, exist_ok=True)
+        segment_num = 0
+
+        if duration - self.min_duration < self.min_duration:
+            new_filename = Path(self.resampled_audio_dir) / Path(data_entry[self.audio_filepath_key]).stem
+            new_filename = new_filename.as_posix() + f'_{segment_num}.{audio_format}'
+            audio.export(new_filename, format=audio_format)
+
+            new_data_entry = data_entry.copy()
+            new_data_entry[self.audio_filepath_key] = new_filename
+
+            return [DataEntry(data=new_data_entry)]
+
+        while True:
+            rand_dur = random.uniform(self.min_duration, min(self.max_duration, duration) - self.min_duration)
+            segmented_part = audio[: int(rand_dur * 1000)]
+
+            new_filename = Path(self.resampled_audio_dir) / Path(data_entry[self.audio_filepath_key]).stem
+            new_filename = new_filename.as_posix() + f'_{segment_num}.{audio_format}'
+
+            segmented_part.export(new_filename, format=audio_format)
+
+            new_data_entry = data_entry.copy()
+            new_data_entry[self.audio_filepath_key] = new_filename
+            new_data_entry['duration'] = round(rand_dur, 2)
+
+            data_entries.append(DataEntry(data=new_data_entry))
+            segment_num += 1
+
+            if (duration - rand_dur) > self.max_duration:
+                audio = audio[int(rand_dur * 1000) :]
+                duration = duration - rand_dur
+                continue
+
+            if self.save_other_part:
+                other_part = audio[int(rand_dur * 1000) :]
+                new_filename = Path(self.resampled_audio_dir) / Path(data_entry[self.audio_filepath_key]).stem
+                new_filename = new_filename.as_posix() + f'_{segment_num}.{audio_format}'
+
+                other_part.export(new_filename, format=audio_format)
+
+                new_data_entry = data_entry.copy()
+                new_data_entry[self.audio_filepath_key] = new_filename
+                new_data_entry['duration'] = round(duration - rand_dur, 2)
+                data_entries.append(DataEntry(data=new_data_entry))
+
+            break
+
+        return data_entries
+
+
+class UntarAudios(BaseParallelProcessor):
+    """Processor that extracts the files from .tar files in ``tar_dir`` to ``resampled_audio_dir``.
+
+    Args:
+        tar_dir (str): directory that contains tarred files.
+        resampled_audio_dir (str): directory where extracted files will be located.
+        remove_tars (bool) (Optional): whether tarred file should be removed after files extraction.
+        **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.
+
+    """
+
+    def __init__(
+        self,
+        tar_dir: str,
+        resampled_audio_dir: str,
+        remove_tars: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.tar_dir = Path(tar_dir)
+        self.resampled_audio_dir = resampled_audio_dir
+        self.remove_tars = remove_tars
+
+    def read_manifest(self):
+        for file in self.tar_dir.glob('*.tar'):
+            yield file
+
+    def process(self):
+        for manifest_chunk in self._chunk_manifest():
+            # this will unroll all inner lists
+            data = itertools.chain(
+                *process_map(
+                    self.process_dataset_entry,
+                    manifest_chunk,
+                    max_workers=self.max_workers,
+                    chunksize=self.chunksize,
+                )
+            )
+
+    def process_dataset_entry(self, data_entry: PosixPath):
+        with tarfile.open(data_entry, 'r') as tar:
+            tar.extractall(self.resampled_audio_dir)
+
+        if self.remove_tars:
+            os.remove(data_entry)
+
+
+class ExtractFilesFromTar(BaseParallelProcessor):
+    """Processor that extracts the files listed in ``input_manifest_file`` from the tar files in ``tar_dir`` to ``extract_to_dir``.
+
+    Args:
+        tar_dir (str): directory that contains tarred files.
+        extract_to_dir (str): directory where extracted files will be located.
+        **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.
+
+    """
+
+    def __init__(
+        self,
+        tar_dir: str,
+        extract_to_dir: str,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.tar_dir = Path(tar_dir)
+        self.extract_to_dir = extract_to_dir
+
+    def read_manifest(self):
+        if self.input_manifest_file is None:
+            raise ValueError("Manifest with files should be provided!")
+
+        logger.info('Reading Manifest...')
+
+        tar_entries = collections.defaultdict(list)
+
+        with open(self.input_manifest_file, "rt", encoding="utf8") as fin:
+            for line in tqdm(fin):
+                entry = json.loads(line)
+                tar_entries[entry['shard_id']].append(entry)
+
+        for shard_id, entries in tar_entries.items():
+            yield (shard_id, entries)
+
+    def process_dataset_entry(self, data_entry):
+        """Extract the audio files of one ``(shard_id, entries)`` group from ``audio_<shard_id>.tar``."""
+        shard_id, entries = data_entry
+        # %-style placeholder: an extra argument without one breaks standard-library log formatting.
+        logger.info('Working on shard_id %s', shard_id)
+        tar_file = Path(self.tar_dir, f"audio_{shard_id}").with_suffix('.tar')
+
+        extracted_entries = []
+
+        with tarfile.open(tar_file, 'r') as tar:
+            for entry in tqdm(entries):
+                extracted_path = Path(self.extract_to_dir, entry['audio_filepath']).as_posix()
+
+                if not os.path.exists(extracted_path):
+                    tar.extract(member=entry['audio_filepath'], path=self.extract_to_dir)
+
+                entry['audio_filepath'] = extracted_path
+                extracted_entries.append(DataEntry(data=entry))
+
+        return extracted_entries
+
+
+class RemoveEmojis(BaseParallelProcessor):
+    """Replaces emojis with empty string.
+
+    .. note:: Emoji patterns are predefined. There might be (new) emojis which are not included in the list.
+
+    Args:
+        text_key (str): a string indicating which key of the data entries
+            should be used to find the utterance transcript. Defaults to "text".
+
+    Returns:
+         The same data as in the input manifest with ``<text_key>`` field without detected emojis.
+    """
+
+    EMOJI_PATTERN = re.compile(
+        r" ?[\U0001F600-\U0001F64F"  # emoticons
+        r"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        r"\U0001F680-\U0001F6FF"  # transport & map symbols
+        r"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+        r"\U00002500-\U00002BEF"  # box drawing, geometric shapes, misc symbols, dingbats, arrows (not CJK)
+        r"\U00002702-\U000027B0"  # dingbats
+        r"\U00002702-\U000027B0"  # NOTE(review): duplicate of the previous range
+        r"\U000024C2-\U0001F251"  # NOTE(review): very wide range, also spans CJK, kana and Hangul - confirm intended
+        r"\U0001f926-\U0001f937"
+        r"\U00010000-\U0010ffff"  # NOTE(review): every supplementary-plane code point - confirm intended
+        r"\u2640-\u2642"
+        r"\u2600-\u2B55"
+        r"\u200d"  # zero width joiner
+        r"\u23cf"
+        r"\u23e9"
+        r"\u231a"
+        r"\ufe0f"  # variation selector-16
+        r"\u3030"
+        r"]+",
+        flags=re.UNICODE,
+    )
+
+    def __init__(
+        self,
+        text_key: str = "text",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.text_key = text_key
+
+    def process_dataset_entry(self, data_entry) -> List:
+        """Replaces each found regex match with a given string."""
+        replace_word_counter = 0
+
+        text_in = data_entry[self.text_key]
+
+        text_in = add_start_end_spaces(text_in)
+        text_out = re.sub(
+            self.EMOJI_PATTERN,
+            repl='',
+            string=text_in,
+        )
+
+        if text_in != text_out:
+            replace_word_counter += 1
+        text_in = text_out
+
+        text_out = remove_extra_spaces(text_out)
+
+        data_entry[self.text_key] = text_out
+
+        return [DataEntry(data=data_entry, metrics=replace_word_counter)]
+
+    def finalize(self, metrics):
+        """Reports how many substitutions were made for each pattern."""
+        super().finalize(metrics)
+

From 5cbd01e0082ec3be3ed859b00670bf8bba788151 Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Sat, 5 Oct 2024 19:36:02 +0400
Subject: [PATCH 03/13] adding corrupted data remover processor

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 .../modify_manifest/data_to_dropbool.py       | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py
index 3c91ba20..19e51667 100644
--- a/sdp/processors/modify_manifest/data_to_dropbool.py
+++ b/sdp/processors/modify_manifest/data_to_dropbool.py
@@ -16,6 +16,7 @@
 import re
 import os 
 import json
+import soundfile
 from operator import eq, ge, gt, le, lt, ne
 from typing import List, Union
 
@@ -863,3 +864,36 @@ def finalize(self, metrics: List):
             total_counter += counter
         logger.info("Dropped %d utterances", total_counter)
         super().finalize(metrics)
+
+
+class DropCorrupted(BaseParallelProcessor):
+    """Drops manifest entries whose audio cannot be read or contains no signal.
+    Args:
+        audio_filepath_key (str) (Optional): which key to use for audio filepaths. Defaults to ``audio_filepath``
+    """
+
+    def __init__(
+        self,
+        audio_filepath_key: str = 'audio_filepath',
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.audio_filepath_key = audio_filepath_key
+
+    def process_dataset_entry(self, data_entry) -> List:
+        """Returns the entry unchanged, or a ``data=None`` entry with ``metrics=1`` to drop it."""
+        try:
+            data, _ = soundfile.read(data_entry[self.audio_filepath_key])
+        except Exception:  # best-effort: any read failure counts as corrupted
+            return [DataEntry(data=None, metrics=1)]
+
+        # ``any()`` covers empty and multi-channel arrays; ``sum(data) == 0`` is ambiguous on 2-D input
+        if not data.any():
+            return [DataEntry(data=None, metrics=1)]
+
+        return [DataEntry(data=data_entry, metrics=0)]
+
+    def finalize(self, metrics):
+        """Logs the total number of dropped entries, then defers to the parent."""
+        logger.info("Dropped %d utterances", sum(metrics))
+        super().finalize(metrics)

From 33154dacc6095883aaf39f08acfd60f1f4731427 Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Sat, 5 Oct 2024 19:38:01 +0400
Subject: [PATCH 04/13] updating transcribe_speech

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 sdp/processors/nemo/transcribe_speech.py | 150 +++++++++++++++++++----
 1 file changed, 127 insertions(+), 23 deletions(-)

diff --git a/sdp/processors/nemo/transcribe_speech.py b/sdp/processors/nemo/transcribe_speech.py
index bb04047b..78c12023 100644
--- a/sdp/processors/nemo/transcribe_speech.py
+++ b/sdp/processors/nemo/transcribe_speech.py
@@ -12,34 +12,48 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# This file is copied over from https://github.com/NVIDIA/NeMo/blob/v1.23.0/examples/asr/transcribe_speech.py.
-# It is currently only compatible with NeMo v1.23.0. To use a different version of NeMo, please modify the file.
-
 import contextlib
+import glob
+import json
 import os
-from dataclasses import dataclass, is_dataclass
+import time
+from dataclasses import dataclass, field, is_dataclass
+from tempfile import NamedTemporaryFile
 from typing import List, Optional, Union
 
 import pytorch_lightning as pl
 import torch
-from omegaconf import OmegaConf, open_dict
-
-from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecMultiTaskModel
+from nemo.collections.asr.models import (
+    EncDecCTCModel,
+    EncDecHybridRNNTCTCModel,
+    EncDecMultiTaskModel,
+)
+from nemo.collections.asr.models.aed_multitask_models import parse_multitask_prompt
 from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig
 from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig
-from nemo.collections.asr.parts.submodules.multitask_decoding import MultiTaskDecoding, MultiTaskDecodingConfig
+from nemo.collections.asr.parts.submodules.multitask_decoding import (
+    MultiTaskDecoding,
+    MultiTaskDecodingConfig,
+)
 from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
+from nemo.collections.asr.parts.submodules.rnnt_greedy_decoding import (
+    GreedyBatchedRNNTInferConfig,
+)
 from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer
 from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
 from nemo.collections.asr.parts.utils.transcribe_utils import (
     compute_output_filename,
     prepare_audio_data,
+    read_and_maybe_sort_manifest,
+    restore_transcription_order,
     setup_model,
     transcribe_partial_audio,
     write_transcription,
 )
+from nemo.collections.common.parts.preprocessing.manifest import get_full_path
 from nemo.core.config import hydra_runner
 from nemo.utils import logging
+from omegaconf import OmegaConf, open_dict
 
 """
 Transcribe audio file on a single CPU/GPU. Useful for transcription of moderate amounts of audio data.
@@ -79,6 +93,8 @@
   langid: Str used for convert_num_to_words during groundtruth cleaning
   use_cer: Bool to use Character Error Rate (CER)  or Word Error Rate (WER)
 
+  calculate_rtfx: Bool to calculate the RTFx throughput to transcribe the input dataset.
+
 # Usage
 ASR model can be specified by either "model_path" or "pretrained_name".
 Data for transcription can be defined with either "audio_dir" or "dataset_manifest".
@@ -106,7 +122,6 @@
 
 @dataclass
 class ModelChangeConfig:
-
     # Sub-config for changes specific to the Conformer Encoder
     conformer: ConformerChangeConfig = ConformerChangeConfig()
 
@@ -123,6 +138,7 @@ class TranscriptionConfig:
     ] = None  # Used to select a single channel from multichannel audio, or use average across channels
     audio_key: str = 'audio_filepath'  # Used to override the default audio key in dataset_manifest
     eval_config_yaml: Optional[str] = None  # Path to a yaml file of config of evaluation
+    presort_manifest: bool = True  # Significant inference speedup on short-form data due to padding reduction
 
     # General configs
     output_filename: Optional[str] = None
@@ -147,6 +163,8 @@ class TranscriptionConfig:
     allow_mps: bool = False  # allow to select MPS device (Apple Silicon M-series GPU)
     amp: bool = False
     amp_dtype: str = "float16"  # can be set to "float16" or "bfloat16" when using amp
+    compute_dtype: str = "float32"
+    matmul_precision: str = "highest"  # Literal["highest", "high", "medium"]
     audio_type: str = "wav"
 
     # Recompute model transcription, even if the output folder exists with scores.
@@ -156,10 +174,19 @@ class TranscriptionConfig:
     ctc_decoding: CTCDecodingConfig = CTCDecodingConfig()
 
     # Decoding strategy for RNNT models
+    # enable CUDA graphs for transcription
     rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1)
 
     # Decoding strategy for AED models
     multitask_decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig()
+    # Prompt slots for prompted models, e.g. Canary-1B. Examples of acceptable prompt inputs:
+    # Implicit single-turn assuming default role='user' (works with Canary-1B)
+    #  +prompt.source_lang=en +prompt.target_lang=es +prompt.task=asr +prompt.pnc=yes
+    # Explicit single-turn prompt:
+    #  +prompt.role=user +prompt.slots.source_lang=en +prompt.slots.target_lang=es +prompt.slots.task=s2t_translation +prompt.slots.pnc=yes
+    # Explicit multi-turn prompt:
+    #  +prompt.turns='[{role:user,slots:{source_lang:en,target_lang:es,task:asr,pnc:yes}}]'
+    prompt: dict = field(default_factory=dict)
 
     # decoder type: ctc or rnnt, can be used to switch between CTC and RNNT decoder for Hybrid RNNT/CTC models
     decoder_type: Optional[str] = None
@@ -184,11 +211,15 @@ class TranscriptionConfig:
 
     # key for groundtruth text in manifest
     gt_text_attr_name: str = "text"
+    gt_lang_attr_name: str = "lang"
 
     # Use model's transcribe() function instead of transcribe_partial_audio() by default
     # Only use transcribe_partial_audio() when the audio is too long to fit in memory
     # Your manifest input should have `offset` field to use transcribe_partial_audio()
     allow_partial_transcribe: bool = False
+    extract_nbest: bool = False  # Extract n-best hypotheses from the model
+
+    calculate_rtfx: bool = False
 
 
 @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
@@ -217,6 +248,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
         logging.info(f"Will apply on-the-fly augmentation on samples during transcription: {augmentor} ")
 
     # setup GPU
+    torch.set_float32_matmul_precision(cfg.matmul_precision)
     if cfg.cuda is None:
         if torch.cuda.is_available():
             device = [0]  # use 0th CUDA device
@@ -247,6 +279,14 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
     asr_model.set_trainer(trainer)
     asr_model = asr_model.eval()
 
+    if cfg.compute_dtype != "float32" and cfg.amp:
+        raise ValueError("amp=true is mutually exclusive with a compute_dtype other than float32")
+
+    amp_dtype = torch.float16 if cfg.amp_dtype == "float16" else torch.bfloat16
+
+    if cfg.compute_dtype != "float32":
+        asr_model.to(getattr(torch, cfg.compute_dtype))
+
     # we will adjust this flag if the model does not support it
     compute_timestamps = cfg.compute_timestamps
     compute_langs = cfg.compute_langs
@@ -272,6 +312,9 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
         if isinstance(asr_model.decoding, MultiTaskDecoding):
             cfg.multitask_decoding.compute_langs = cfg.compute_langs
             cfg.multitask_decoding.preserve_alignments = cfg.preserve_alignment
+            if cfg.extract_nbest:
+                cfg.multitask_decoding.beam.return_best_hypothesis = False
+                cfg.return_hypotheses = True
             asr_model.change_decoding_strategy(cfg.multitask_decoding)
         elif cfg.decoder_type is not None:
             # TODO: Support compute_langs in CTC eventually
@@ -279,6 +322,9 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
                 raise ValueError("CTC models do not support `compute_langs` at the moment")
 
             decoding_cfg = cfg.rnnt_decoding if cfg.decoder_type == 'rnnt' else cfg.ctc_decoding
+            if cfg.extract_nbest:
+                decoding_cfg.beam.return_best_hypothesis = False
+                cfg.return_hypotheses = True
             decoding_cfg.compute_timestamps = cfg.compute_timestamps  # both ctc and rnnt support it
             if 'preserve_alignments' in decoding_cfg:
                 decoding_cfg.preserve_alignments = preserve_alignment
@@ -291,6 +337,9 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
 
         # Check if ctc or rnnt model
         elif hasattr(asr_model, 'joint'):  # RNNT model
+            if cfg.extract_nbest:
+                cfg.rnnt_decoding.beam.return_best_hypothesis = False
+                cfg.return_hypotheses = True
             cfg.rnnt_decoding.fused_batch_size = -1
             cfg.rnnt_decoding.compute_timestamps = cfg.compute_timestamps
             cfg.rnnt_decoding.compute_langs = cfg.compute_langs
@@ -302,6 +351,9 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
             if cfg.compute_langs:
                 raise ValueError("CTC models do not support `compute_langs` at the moment.")
             cfg.ctc_decoding.compute_timestamps = cfg.compute_timestamps
+            if cfg.extract_nbest:
+                cfg.ctc_decoding.beam.return_best_hypothesis = False
+                cfg.return_hypotheses = True
 
             asr_model.change_decoding_strategy(cfg.ctc_decoding)
 
@@ -311,14 +363,27 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
             isinstance(asr_model, EncDecHybridRNNTCTCModel) and cfg.decoder_type == "ctc"
         ):
             cfg.decoding = cfg.ctc_decoding
+        elif isinstance(asr_model.decoding, MultiTaskDecoding):
+            cfg.decoding = cfg.multitask_decoding
         else:
             cfg.decoding = cfg.rnnt_decoding
 
+    remove_path_after_done = None
     if isinstance(asr_model, EncDecMultiTaskModel):
         # Special case for EncDecMultiTaskModel, where the input manifest is directly passed into the model's transcribe() function
         partial_audio = False
-        filepaths = cfg.dataset_manifest
-        assert cfg.dataset_manifest is not None
+        if cfg.audio_dir is not None and not cfg.append_pred:
+            filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
+        else:
+            assert cfg.dataset_manifest is not None
+            if cfg.presort_manifest:
+                with NamedTemporaryFile("w", suffix=".json", delete=False) as f:
+                    for item in read_and_maybe_sort_manifest(cfg.dataset_manifest, try_sort=True):
+                        item["audio_filepath"] = get_full_path(item["audio_filepath"], cfg.dataset_manifest)
+                        print(json.dumps(item), file=f)
+                    cfg.dataset_manifest = f.name
+                    remove_path_after_done = f.name
+            filepaths = cfg.dataset_manifest
     else:
         # prepare audio filepaths and decide wether it's partial audio
         filepaths, partial_audio = prepare_audio_data(cfg)
@@ -334,7 +399,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
     else:
 
         @contextlib.contextmanager
-        def autocast(dtype=None):
+        def autocast(dtype=None, enabled=True):
             yield
 
     # Compute output filename
@@ -350,10 +415,22 @@ def autocast(dtype=None):
 
     # transcribe audio
 
-    amp_dtype = torch.float16 if cfg.amp_dtype == "float16" else torch.bfloat16
+    if cfg.calculate_rtfx:
+        total_duration = 0.0
 
-    with autocast(dtype=amp_dtype):
+        with open(cfg.dataset_manifest, "rt") as fh:
+            for line in fh:
+                item = json.loads(line)
+                if "duration" not in item:
+                    raise ValueError(
+                        f"Requested calculate_rtfx=True, but line {line} in manifest {cfg.dataset_manifest} lacks a 'duration' field."
+                    )
+                total_duration += item["duration"]
+
+    with autocast(dtype=amp_dtype, enabled=cfg.amp):
         with torch.no_grad():
+            if cfg.calculate_rtfx:
+                start_time = time.time()
             if partial_audio:
                 transcriptions = transcribe_partial_audio(
                     asr_model=asr_model,
@@ -366,21 +443,40 @@ def autocast(dtype=None):
                     decoder_type=cfg.decoder_type,
                 )
             else:
+                override_cfg = asr_model.get_transcribe_config()
+                override_cfg.batch_size = cfg.batch_size
+                override_cfg.num_workers = cfg.num_workers
+                override_cfg.return_hypotheses = cfg.return_hypotheses
+                override_cfg.channel_selector = cfg.channel_selector
+                override_cfg.augmentor = augmentor
+                override_cfg.text_field = cfg.gt_text_attr_name
+                override_cfg.lang_field = cfg.gt_lang_attr_name
+                if hasattr(override_cfg, "prompt"):
+                    override_cfg.prompt = parse_multitask_prompt(OmegaConf.to_container(cfg.prompt))
+
                 transcriptions = asr_model.transcribe(
-                    paths2audio_files=filepaths,
-                    batch_size=cfg.batch_size,
-                    num_workers=cfg.num_workers,
-                    return_hypotheses=cfg.return_hypotheses,
-                    channel_selector=cfg.channel_selector,
-                    augmentor=augmentor,
+                    audio=filepaths,
+                    override_config=override_cfg,
                 )
+            if cfg.calculate_rtfx:
+                transcribe_time = time.time() - start_time
 
-    logging.info(f"Finished transcribing {len(filepaths)} files !")
+    if cfg.dataset_manifest is not None:
+        logging.info(f"Finished transcribing from manifest file: {cfg.dataset_manifest}")
+        if cfg.presort_manifest:
+            transcriptions = restore_transcription_order(cfg.dataset_manifest, transcriptions)
+    else:
+        logging.info(f"Finished transcribing {len(filepaths)} files !")
     logging.info(f"Writing transcriptions into file: {cfg.output_filename}")
 
-    # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
+    # if transcriptions form a tuple of (best_hypotheses, all_hypotheses)
     if type(transcriptions) == tuple and len(transcriptions) == 2:
-        transcriptions = transcriptions[0]
+        if cfg.extract_nbest:
+            # extract all hypotheses if exists
+            transcriptions = transcriptions[1]
+        else:
+            # extract just best hypothesis
+            transcriptions = transcriptions[0]
 
     if cfg.return_transcriptions:
         return transcriptions
@@ -396,6 +492,11 @@ def autocast(dtype=None):
     )
     logging.info(f"Finished writing predictions to {output_filename}!")
 
+    # clean-up
+    if cfg.presort_manifest is not None:
+        if remove_path_after_done is not None:
+            os.unlink(remove_path_after_done)
+
     if cfg.calculate_wer:
         output_manifest_w_wer, total_res, _ = cal_write_wer(
             pred_manifest=output_filename,
@@ -410,6 +511,9 @@ def autocast(dtype=None):
             logging.info(f"Writing prediction and error rate of each sample to {output_manifest_w_wer}!")
             logging.info(f"{total_res}")
 
+    if cfg.calculate_rtfx:
+        logging.info(f"Dataset RTFx {(total_duration/transcribe_time)}")
+
     return cfg
 
 

From 777f34a9f79492655d9da13dc1a0e5027f53e8cd Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Sat, 5 Oct 2024 19:43:12 +0400
Subject: [PATCH 05/13] adding processors to init

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 sdp/processors/__init__.py                                   | 5 +++++
 sdp/processors/datasets/voxpopuli/create_initial_manifest.py | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index fdafb521..2fad5bc0 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -50,6 +50,7 @@
 )
 from sdp.processors.datasets.voxpopuli.create_initial_manifest import (
     CreateInitialManifestVoxpopuli,
+    CreateInitialManifestVoxpopuliUnlabelled,
 )
 from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import (
     NormalizeFromNonPCTextVoxpopuli,
@@ -80,6 +81,9 @@
     SubIfASRSubstitution,
     SubMakeLowercase,
     SubRegex,
+    ExtractFilesFromTar,
+    RandomSegment,
+    UntarAudios,
 )
 from sdp.processors.modify_manifest.data_to_dropbool import (
     DropASRError,
@@ -97,6 +101,7 @@
     DropOnAttribute,
     PreserveByValue,
     DropRepeatedFields,
+    DropCorrupted,
 )
 from sdp.processors.modify_manifest.make_letters_uppercase_after_period import (
     MakeLettersUppercaseAfterPeriod,
diff --git a/sdp/processors/datasets/voxpopuli/create_initial_manifest.py b/sdp/processors/datasets/voxpopuli/create_initial_manifest.py
index 06284f24..4f67b29a 100644
--- a/sdp/processors/datasets/voxpopuli/create_initial_manifest.py
+++ b/sdp/processors/datasets/voxpopuli/create_initial_manifest.py
@@ -14,10 +14,11 @@
 
 import os
 import subprocess
-from pathlib import Path
+from pathlib import Path, PosixPath
 
 import sox
 from sox import Transformer
+from pydub import AudioSegment
 
 from sdp.logging import logger
 from sdp.processors.base_processor import BaseParallelProcessor, DataEntry

From e2a39af7ca3ef8f360cdac87d1756911c0b9693c Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Sun, 6 Oct 2024 11:39:06 +0400
Subject: [PATCH 06/13] add basic processor for yodas

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 .../multilingual/yodas/config.yaml            | 35 +++++++++++++++++++
 .../modify_manifest/create_manifest.py        |  2 +-
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 dataset_configs/multilingual/yodas/config.yaml

diff --git a/dataset_configs/multilingual/yodas/config.yaml b/dataset_configs/multilingual/yodas/config.yaml
new file mode 100644
index 00000000..16fffefc
--- /dev/null
+++ b/dataset_configs/multilingual/yodas/config.yaml
@@ -0,0 +1,35 @@
+processors_to_run: all
+manifest: ???  # input manifest, read by the first processor (GetAudioDuration)
+resampled_audio_dir: ???  # passed to RandomSegment as resampled_audio_dir
+out_manifest: ???  # final manifest, written by the last processor (DropCorrupted)
+char_rate: 10  # used as low_charrate_threshold of DropHighLowCharrate
+min_duration: 1.5  # passed to RandomSegment
+max_duration: 40.1  # passed to RandomSegment
+
+processors:
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+    input_manifest_file: ${manifest}
+
+  - _target_: sdp.processors.RandomSegment
+    min_duration: ${min_duration}
+    max_duration: ${max_duration}
+    resampled_audio_dir: ${resampled_audio_dir}
+    audio_format: flac
+
+  - _target_: sdp.processors.ASRInference
+    pretrained_model: nvidia/parakeet-ctc-0.6b
+
+  - _target_: sdp.processors.DropHighLowCharrate
+    low_charrate_threshold: ${char_rate}
+    text_key: pred_text  # NOTE(review): presumably the field written by ASRInference - confirm
+    high_charrate_threshold: 10000  # NOTE(review): presumably large enough to disable the upper bound - confirm
+
+  - _target_: sdp.processors.KeepOnlySpecifiedFields
+    fields_to_keep:
+      - audio_filepath
+      - duration
+
+  - _target_: sdp.processors.DropCorrupted
+    output_manifest_file: ${out_manifest}
diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py
index 77096b3e..48674007 100644
--- a/sdp/processors/modify_manifest/create_manifest.py
+++ b/sdp/processors/modify_manifest/create_manifest.py
@@ -42,7 +42,7 @@ def __init__(
         self.extension = extension
 
     def read_manifest(self):
-        output_file = [str(self.raw_data_dir / file) for file in self.raw_data_dir.rglob('*.' + self.extension)]
+        output_file = [file.as_posix() for file in self.raw_data_dir.rglob('*.' + self.extension)]
         return output_file
 
     def process_dataset_entry(self, data_entry):

From 395d430d2eb3c3e220bac3a6ec663a3bf9cb4b03 Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Sun, 6 Oct 2024 14:49:48 +0400
Subject: [PATCH 07/13] add tests for new processors

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 .../multilingual/babel/config.yaml            | 14 +++---
 .../voxpopuli/{config.yaml => config_un.yaml} | 12 +++--
 sdp/processors/__init__.py                    |  1 +
 .../voxpopuli/create_initial_manifest.py      |  9 ++--
 tests/test_cfg_end_to_end_tests.py            | 48 +++++++++++--------
 tests/test_data_to_data.py                    | 12 +++++
 6 files changed, 60 insertions(+), 36 deletions(-)
 rename dataset_configs/multilingual/voxpopuli/{config.yaml => config_un.yaml} (79%)

diff --git a/dataset_configs/multilingual/babel/config.yaml b/dataset_configs/multilingual/babel/config.yaml
index 9da2c14c..ade44a4e 100644
--- a/dataset_configs/multilingual/babel/config.yaml
+++ b/dataset_configs/multilingual/babel/config.yaml
@@ -38,16 +38,16 @@ documentation: |
   * **duration (float)**: duration of the audio in seconds.
 
 processors_to_run: all
-raw_data_dir: ???
-data_type: ???
-resampled_audio_dir: ???
-data_split: ???
-output_manifest_file: ???
+workspace_dir: ???
+data_type: scripted
+resampled_audio_dir: ${workspace_dir}/processed/${data_type}/${data_split}
+data_split: training
+final_manifest: ${workspace_dir}/processed/${data_type}/${data_split}_manifest.json
 
 processors:
   - _target_: sdp.processors.CreateInitialManifestBabel
-    raw_data_dir: ${raw_data_dir}
+    raw_data_dir: ${workspace_dir}
     data_type: ${data_type}
     data_split: ${data_split}
     resampled_audio_dir: ${resampled_audio_dir}
-    output_manifest_file: ${output_manifest_file}
\ No newline at end of file
+    output_manifest_file: ${final_manifest}
\ No newline at end of file
diff --git a/dataset_configs/multilingual/voxpopuli/config.yaml b/dataset_configs/multilingual/voxpopuli/config_un.yaml
similarity index 79%
rename from dataset_configs/multilingual/voxpopuli/config.yaml
rename to dataset_configs/multilingual/voxpopuli/config_un.yaml
index a5e077bf..4480ad6e 100644
--- a/dataset_configs/multilingual/voxpopuli/config.yaml
+++ b/dataset_configs/multilingual/voxpopuli/config_un.yaml
@@ -25,13 +25,15 @@ documentation: |
   * **duration (float)**: duration of the audio in seconds.
 
 processors_to_run: all
-raw_data_dir: ???
-language_id: ???
-resampled_audio_dir: ???
+workspace_dir: ???
+language_id: hu_v2
+resampled_data_dir: ${workspace_dir}/unlabelled_processed/
+final_manifest: ${workspace_dir}/unlabelled_processed/${language_id}/manifest.json
 
 processors:
   - _target_: sdp.processors.CreateInitialManifestVoxpopuliUnlabelled
-    raw_data_dir: ${raw_data_dir}
+    raw_data_dir: ${workspace_dir}
     language_id: ${language_id}
-    resampled_audio_dir: ${resampled_audio_dir}
+    resampled_data_dir: ${resampled_data_dir}
     delete_raw_file: False
+    output_manifest_file: ${final_manifest}
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index 2fad5bc0..39fc9e80 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -52,6 +52,7 @@
     CreateInitialManifestVoxpopuli,
     CreateInitialManifestVoxpopuliUnlabelled,
 )
+from sdp.processors.datasets.babel.create_initial_manifest import CreateInitialManifestBabel
 from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import (
     NormalizeFromNonPCTextVoxpopuli,
 )
diff --git a/sdp/processors/datasets/voxpopuli/create_initial_manifest.py b/sdp/processors/datasets/voxpopuli/create_initial_manifest.py
index 4f67b29a..2f7985fe 100644
--- a/sdp/processors/datasets/voxpopuli/create_initial_manifest.py
+++ b/sdp/processors/datasets/voxpopuli/create_initial_manifest.py
@@ -197,7 +197,7 @@ def __init__(
         self,
         raw_data_dir: str,
         language_id: str,
-        resampled_audio_dir: str,
+        resampled_data_dir: str,
         audio_format: str = 'flac',
         target_samplerate: int = 16000,
         target_nchannels: int = 1,
@@ -212,8 +212,9 @@ def __init__(
         self.target_nchannels = target_nchannels
         self.delete_raw_file = delete_raw_file
 
-        self.resampled_audio_dir = Path(resampled_audio_dir, self.language_id.replace('_v2', ''))
-        self.output_manifest_file = Path(self.resampled_audio_dir, 'manifest.json').as_posix()
+
+        self.resampled_audio_dir = Path(resampled_data_dir, self.language_id.replace('_v2', ''), 'audios')
+        self.output_manifest_file = self.output_manifest_file.replace('_v2', '')
 
     def prepare(self):
         """Downloading data (unless already done)"""
@@ -268,7 +269,7 @@ def process_dataset_entry(self, data_entry: PosixPath):
             audio.export(tgt_audio_filepath, format=self.audio_format)
 
             data = {
-                "audio_filepath": tgt_audio_filepath,
+                "audio_filepath": tgt_audio_filepath.as_posix(),
                 "duration": audio.duration_seconds,
             }
 
diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py
index 7db88f34..12546313 100644
--- a/tests/test_cfg_end_to_end_tests.py
+++ b/tests/test_cfg_end_to_end_tests.py
@@ -44,15 +44,21 @@ def data_check_fn_generic(raw_data_dir: str, file_name: str, **kwargs) -> None:
 data_check_fn_ksc2 = partial(data_check_fn_generic, file_name="ksc2_kk.tar.gz")
 data_check_fn_librispeech = partial(data_check_fn_generic, file_name="dev-clean.tar.gz")
 data_check_fn_fleurs = partial(data_check_fn_generic, file_name="dev.tar.gz")
+data_check_fn_babel = partial(data_check_fn_generic, file_name="scripted")
 
-def data_check_fn_voxpopuli(raw_data_dir: str) -> None:
+def data_check_fn_voxpopuli(raw_data_dir: str, asr_data: bool = True) -> None:
     """Raises error if do not find expected data.
 
     Will also extract the archive as initial processor expects extracted data.
     """
-    if (Path(raw_data_dir) / "transcribed_data").exists():
+    if asr_data:
+        file_name = "transcribed_data"
+    else:
+        file_name = "unlabelled_data"
+
+    if (Path(raw_data_dir) / file_name).exists():
         return
-    expected_file = Path(raw_data_dir) / "transcribed_data.tar.gz"
+    expected_file = Path(raw_data_dir, file_name).with_suffix(".tar.gz")
     if not expected_file.exists():
         raise ValueError(f"No such file {str(expected_file)}")
     with tarfile.open(expected_file, 'r:gz') as tar:
@@ -85,24 +91,26 @@ def data_check_fn_slr140(raw_data_dir: str) -> None:
 
 def get_test_cases() -> List[Tuple[str, Callable]]:
     return [
-        (f"{DATASET_CONFIGS_ROOT}/spanish/mls/config.yaml", partial(data_check_fn_mls, language="spanish")),
-        (f"{DATASET_CONFIGS_ROOT}/spanish_pc/mcv12/config.yaml", partial(data_check_fn_mcv, archive_file_stem="cv-corpus-12.0-2022-12-07-es")),
-        (f"{DATASET_CONFIGS_ROOT}/italian/voxpopuli/config.yaml", data_check_fn_voxpopuli),
-        (f"{DATASET_CONFIGS_ROOT}/italian/mls/config.yaml", partial(data_check_fn_mls, language="italian")),
-        (f"{DATASET_CONFIGS_ROOT}/portuguese/mls/config.yaml", partial(data_check_fn_mls, language="portuguese")),
-        (f"{DATASET_CONFIGS_ROOT}/portuguese/mcv/config.yaml", partial(data_check_fn_mcv, archive_file_stem="cv-corpus-15.0-2023-09-08-pt")),
-        (f"{DATASET_CONFIGS_ROOT}/portuguese/mtedx/config.yaml", partial(data_check_fn_mtedx, language_id="pt")),
-        (f"{DATASET_CONFIGS_ROOT}/portuguese/coraa/config.yaml", data_check_fn_coraa),
-        (f"{DATASET_CONFIGS_ROOT}/english/slr83/config.yaml", lambda raw_data_dir: True),
-        (f"{DATASET_CONFIGS_ROOT}/english/coraal/config.yaml", lambda raw_data_dir: True),
-        (f"{DATASET_CONFIGS_ROOT}/english/librispeech/config.yaml", data_check_fn_librispeech),
-        (f"{DATASET_CONFIGS_ROOT}/armenian/fleurs/config.yaml", data_check_fn_fleurs),
-        (f"{DATASET_CONFIGS_ROOT}/armenian/text_mcv/config.yaml", lambda raw_data_dir: True),
+        # (f"{DATASET_CONFIGS_ROOT}/spanish/mls/config.yaml", partial(data_check_fn_mls, language="spanish")),
+        # (f"{DATASET_CONFIGS_ROOT}/spanish_pc/mcv12/config.yaml", partial(data_check_fn_mcv, archive_file_stem="cv-corpus-12.0-2022-12-07-es")),
+        # (f"{DATASET_CONFIGS_ROOT}/italian/voxpopuli/config.yaml", partial(data_check_fn_voxpopuli, asr_data=True)),
+        # (f"{DATASET_CONFIGS_ROOT}/italian/mls/config.yaml", partial(data_check_fn_mls, language="italian")),
+        # (f"{DATASET_CONFIGS_ROOT}/portuguese/mls/config.yaml", partial(data_check_fn_mls, language="portuguese")),
+        # (f"{DATASET_CONFIGS_ROOT}/portuguese/mcv/config.yaml", partial(data_check_fn_mcv, archive_file_stem="cv-corpus-15.0-2023-09-08-pt")),
+        # (f"{DATASET_CONFIGS_ROOT}/portuguese/mtedx/config.yaml", partial(data_check_fn_mtedx, language_id="pt")),
+        # (f"{DATASET_CONFIGS_ROOT}/portuguese/coraa/config.yaml", data_check_fn_coraa),
+        # (f"{DATASET_CONFIGS_ROOT}/english/slr83/config.yaml", lambda raw_data_dir: True),
+        # (f"{DATASET_CONFIGS_ROOT}/english/coraal/config.yaml", lambda raw_data_dir: True),
+        # (f"{DATASET_CONFIGS_ROOT}/english/librispeech/config.yaml", data_check_fn_librispeech),
+        # (f"{DATASET_CONFIGS_ROOT}/armenian/fleurs/config.yaml", data_check_fn_fleurs),
+        # (f"{DATASET_CONFIGS_ROOT}/armenian/text_mcv/config.yaml", lambda raw_data_dir: True),
         (f"{DATASET_CONFIGS_ROOT}/armenian/audio_books/config.yaml", lambda raw_data_dir: True),
-        (f"{DATASET_CONFIGS_ROOT}/kazakh/mcv/config.yaml", partial(data_check_fn_mcv, archive_file_stem="mcv_kk")),
-        (f"{DATASET_CONFIGS_ROOT}/kazakh/slr140/config.yaml", data_check_fn_slr140),
-        (f"{DATASET_CONFIGS_ROOT}/kazakh/slr102/config.yaml", data_check_fn_slr102),
-        (f"{DATASET_CONFIGS_ROOT}/kazakh/ksc2/config.yaml", data_check_fn_ksc2),
+        # (f"{DATASET_CONFIGS_ROOT}/kazakh/mcv/config.yaml", partial(data_check_fn_mcv, archive_file_stem="mcv_kk")),
+        # (f"{DATASET_CONFIGS_ROOT}/kazakh/slr140/config.yaml", data_check_fn_slr140),
+        # (f"{DATASET_CONFIGS_ROOT}/kazakh/slr102/config.yaml", data_check_fn_slr102),
+        # (f"{DATASET_CONFIGS_ROOT}/kazakh/ksc2/config.yaml", data_check_fn_ksc2),
+        # (f"{DATASET_CONFIGS_ROOT}/multilingual/babel/config.yaml", data_check_fn_babel),
+        # (f"{DATASET_CONFIGS_ROOT}/multilingual/voxpopuli/config_un.yaml", partial(data_check_fn_voxpopuli, asr_data=False)),
     ]
 
 def check_e2e_test_data() -> bool:
diff --git a/tests/test_data_to_data.py b/tests/test_data_to_data.py
index 5bd75f47..79c0b9d9 100644
--- a/tests/test_data_to_data.py
+++ b/tests/test_data_to_data.py
@@ -19,6 +19,7 @@
     SubIfASRSubstitution,
     SubMakeLowercase,
     SubRegex,
+    RemoveEmojis
 )
 
 test_params_list = []
@@ -90,6 +91,17 @@
     ]
 )
 
+test_params_list.extend(
+    [
+        (
+            RemoveEmojis,
+            {"text_key": "text"},
+            {"text": "The weather is perfect ☀️, and the trails are calling! Let's enjoy the beauty of nature and make some unforgettable memories 🌲🌿."},
+            {"text": "The weather is perfect, and the trails are calling! Let's enjoy the beauty of nature and make some unforgettable memories."},
+        ),
+    ]
+)
+
 
 @pytest.mark.parametrize("test_class,class_kwargs,test_input,expected_output", test_params_list, ids=str)
 def test_data_to_data(test_class, class_kwargs, test_input, expected_output):

From 82adfbd86f2abe48b3cb9cae0ceebe89b37b3a72 Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Sun, 6 Oct 2024 15:00:16 +0400
Subject: [PATCH 08/13] add new processors to docs

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 docs/src/sdp/api.rst               | 25 +++++++++++++++++++-
 docs/src/sdp/existing_configs.rst  | 20 ++++++++++++++++
 tests/test_cfg_end_to_end_tests.py | 38 +++++++++++++++---------------
 3 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
index 6d85e83d..496b6154 100644
--- a/docs/src/sdp/api.rst
+++ b/docs/src/sdp/api.rst
@@ -40,6 +40,9 @@ VoxPopuli
 .. autodata:: sdp.processors.CreateInitialManifestVoxpopuli
    :annotation:
 
+.. autodata:: sdp.processors.CreateInitialManifestVoxpopuliUnlabelled
+   :annotation:
+
 .. autodata:: sdp.processors.NormalizeFromNonPCTextVoxpopuli
    :annotation:
 
@@ -58,8 +61,13 @@ Librispeech
 
 .. autodata:: sdp.processors.CreateInitialManifestLibrispeech
    :annotation:
-   
 
+Babel
+'''''''''''
+
+.. autodata:: sdp.processors.CreateInitialManifestBabel
+   :annotation:
+   
 SLR83
 '''''
 
@@ -158,6 +166,18 @@ Data modifications
 .. autodata:: sdp.processors.InverseNormalizeText
    :annotation:
 
+.. autodata:: sdp.processors.RandomSegment
+   :annotation:
+
+.. autodata:: sdp.processors.UntarAudios
+   :annotation:
+
+.. autodata:: sdp.processors.ExtractFilesFromTar
+   :annotation:
+
+.. autodata:: sdp.processors.RemoveEmojis
+   :annotation:
+
 Data filtering
 ''''''''''''''
 
@@ -237,6 +257,9 @@ Data filtering
 .. autodata:: sdp.processors.DropRepeatedFields
    :annotation:
 
+.. autodata:: sdp.processors.DropCorrupted
+   :annotation:
+
 
 Miscellaneous
 #############
diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
index 32c52ec1..271a938c 100644
--- a/docs/src/sdp/existing_configs.rst
+++ b/docs/src/sdp/existing_configs.rst
@@ -92,12 +92,16 @@ VoxPopuli
 * **Spanish**:
   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/spanish_pc/voxpopuli/config.yaml>`__ |
   :doc:`documentation <config-docs/spanish_pc/voxpopuli/config>`
+* **Multilingual**:
+  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/multilingual/voxpopuli/config_un.yaml>`__ |
+  :doc:`documentation <config-docs/multilingual/voxpopuli/config_un>`
 
 .. toctree::
    :hidden:
 
    config-docs/italian/voxpopuli/config
    config-docs/spanish_pc/voxpopuli/config
+   config-docs/multilingual/voxpopuli/config_un
 
 Fisher
 ~~~~~~
@@ -237,6 +241,22 @@ MTEDx
 
    config-docs/portuguese/mtedx/config
 
+Babel
+~~~~~~
+
+**Dataset link:** https://www.ldc.upenn.edu
+
+**Supported configs**.
+
+* **Multilingual**:
+  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/multilingual/babel/config.yaml>`__ |
+  :doc:`documentation <config-docs/multilingual/babel/config>`
+
+.. toctree::
+   :hidden:
+
+   config-docs/multilingual/babel/config
+
 Kazakh Speech Dataset (SLR140)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py
index 12546313..3ad47bb6 100644
--- a/tests/test_cfg_end_to_end_tests.py
+++ b/tests/test_cfg_end_to_end_tests.py
@@ -91,26 +91,26 @@ def data_check_fn_slr140(raw_data_dir: str) -> None:
 
 def get_test_cases() -> List[Tuple[str, Callable]]:
     return [
-        # (f"{DATASET_CONFIGS_ROOT}/spanish/mls/config.yaml", partial(data_check_fn_mls, language="spanish")),
-        # (f"{DATASET_CONFIGS_ROOT}/spanish_pc/mcv12/config.yaml", partial(data_check_fn_mcv, archive_file_stem="cv-corpus-12.0-2022-12-07-es")),
-        # (f"{DATASET_CONFIGS_ROOT}/italian/voxpopuli/config.yaml", partial(data_check_fn_voxpopuli, asr_data=True)),
-        # (f"{DATASET_CONFIGS_ROOT}/italian/mls/config.yaml", partial(data_check_fn_mls, language="italian")),
-        # (f"{DATASET_CONFIGS_ROOT}/portuguese/mls/config.yaml", partial(data_check_fn_mls, language="portuguese")),
-        # (f"{DATASET_CONFIGS_ROOT}/portuguese/mcv/config.yaml", partial(data_check_fn_mcv, archive_file_stem="cv-corpus-15.0-2023-09-08-pt")),
-        # (f"{DATASET_CONFIGS_ROOT}/portuguese/mtedx/config.yaml", partial(data_check_fn_mtedx, language_id="pt")),
-        # (f"{DATASET_CONFIGS_ROOT}/portuguese/coraa/config.yaml", data_check_fn_coraa),
-        # (f"{DATASET_CONFIGS_ROOT}/english/slr83/config.yaml", lambda raw_data_dir: True),
-        # (f"{DATASET_CONFIGS_ROOT}/english/coraal/config.yaml", lambda raw_data_dir: True),
-        # (f"{DATASET_CONFIGS_ROOT}/english/librispeech/config.yaml", data_check_fn_librispeech),
-        # (f"{DATASET_CONFIGS_ROOT}/armenian/fleurs/config.yaml", data_check_fn_fleurs),
-        # (f"{DATASET_CONFIGS_ROOT}/armenian/text_mcv/config.yaml", lambda raw_data_dir: True),
+        (f"{DATASET_CONFIGS_ROOT}/spanish/mls/config.yaml", partial(data_check_fn_mls, language="spanish")),
+        (f"{DATASET_CONFIGS_ROOT}/spanish_pc/mcv12/config.yaml", partial(data_check_fn_mcv, archive_file_stem="cv-corpus-12.0-2022-12-07-es")),
+        (f"{DATASET_CONFIGS_ROOT}/italian/voxpopuli/config.yaml", partial(data_check_fn_voxpopuli, asr_data=True)),
+        (f"{DATASET_CONFIGS_ROOT}/italian/mls/config.yaml", partial(data_check_fn_mls, language="italian")),
+        (f"{DATASET_CONFIGS_ROOT}/portuguese/mls/config.yaml", partial(data_check_fn_mls, language="portuguese")),
+        (f"{DATASET_CONFIGS_ROOT}/portuguese/mcv/config.yaml", partial(data_check_fn_mcv, archive_file_stem="cv-corpus-15.0-2023-09-08-pt")),
+        (f"{DATASET_CONFIGS_ROOT}/portuguese/mtedx/config.yaml", partial(data_check_fn_mtedx, language_id="pt")),
+        (f"{DATASET_CONFIGS_ROOT}/portuguese/coraa/config.yaml", data_check_fn_coraa),
+        (f"{DATASET_CONFIGS_ROOT}/english/slr83/config.yaml", lambda raw_data_dir: True),
+        (f"{DATASET_CONFIGS_ROOT}/english/coraal/config.yaml", lambda raw_data_dir: True),
+        (f"{DATASET_CONFIGS_ROOT}/english/librispeech/config.yaml", data_check_fn_librispeech),
+        (f"{DATASET_CONFIGS_ROOT}/armenian/fleurs/config.yaml", data_check_fn_fleurs),
+        (f"{DATASET_CONFIGS_ROOT}/armenian/text_mcv/config.yaml", lambda raw_data_dir: True),
         (f"{DATASET_CONFIGS_ROOT}/armenian/audio_books/config.yaml", lambda raw_data_dir: True),
-        # (f"{DATASET_CONFIGS_ROOT}/kazakh/mcv/config.yaml", partial(data_check_fn_mcv, archive_file_stem="mcv_kk")),
-        # (f"{DATASET_CONFIGS_ROOT}/kazakh/slr140/config.yaml", data_check_fn_slr140),
-        # (f"{DATASET_CONFIGS_ROOT}/kazakh/slr102/config.yaml", data_check_fn_slr102),
-        # (f"{DATASET_CONFIGS_ROOT}/kazakh/ksc2/config.yaml", data_check_fn_ksc2),
-        # (f"{DATASET_CONFIGS_ROOT}/multilingual/babel/config.yaml", data_check_fn_babel),
-        # (f"{DATASET_CONFIGS_ROOT}/multilingual/voxpopuli/config_un.yaml", partial(data_check_fn_voxpopuli, asr_data=False)),
+        (f"{DATASET_CONFIGS_ROOT}/kazakh/mcv/config.yaml", partial(data_check_fn_mcv, archive_file_stem="mcv_kk")),
+        (f"{DATASET_CONFIGS_ROOT}/kazakh/slr140/config.yaml", data_check_fn_slr140),
+        (f"{DATASET_CONFIGS_ROOT}/kazakh/slr102/config.yaml", data_check_fn_slr102),
+        (f"{DATASET_CONFIGS_ROOT}/kazakh/ksc2/config.yaml", data_check_fn_ksc2),
+        (f"{DATASET_CONFIGS_ROOT}/multilingual/babel/config.yaml", data_check_fn_babel),
+        (f"{DATASET_CONFIGS_ROOT}/multilingual/voxpopuli/config_un.yaml", partial(data_check_fn_voxpopuli, asr_data=False)),
     ]
 
 def check_e2e_test_data() -> bool:

From 068b95129b09270fd6ecd3ffce3676cb0e302fd6 Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Wed, 9 Oct 2024 16:13:42 +0400
Subject: [PATCH 09/13] modifying tests

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 dataset_configs/english/coraal/config.yaml |   4 +-
 pytest.ini                                 |   5 +-
 requirements/huggingface.txt               |   4 +-
 sdp/processors/nemo/transcribe_speech.py   | 127 ++++++---------------
 tests/test_cfg_end_to_end_tests.py         |  61 +++++++---
 5 files changed, 86 insertions(+), 115 deletions(-)

diff --git a/dataset_configs/english/coraal/config.yaml b/dataset_configs/english/coraal/config.yaml
index d5b570a2..7b3fb240 100644
--- a/dataset_configs/english/coraal/config.yaml
+++ b/dataset_configs/english/coraal/config.yaml
@@ -93,8 +93,8 @@ processors:
       - {"pattern": '\baksing\b', "repl": "asking"}
       - {"pattern": '\baksed\b', "repl": "asked"}
       # removing unintelligible/redacted flags
-      - {"pattern": '/(?i)unintelligible/', "repl": ""}
-      - {"pattern": '/(?i)inaudible/', "repl": ""}
+      - {"pattern": '(?i)/unintelligible/', "repl": ""}  # inline flag must lead the pattern (Python 3.11+); keep both '/' delimiters
+      - {"pattern": '(?i)/inaudible/', "repl": ""}
       - {"pattern": '/RD(.*?)/', "repl": ""}
       - {"pattern": '/(\?)\1*/', "repl": ""}
       # removing non-linguistic markers
diff --git a/pytest.ini b/pytest.ini
index 2bed0f3a..ae4de828 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,2 +1,5 @@
 [pytest]
-addopts = --doctest-modules
\ No newline at end of file
+addopts = --doctest-modules
+markers =
+    dependency: mark a test as dependent on the tests named in its "depends" argument.
+    slow: marks tests as slow (deselect with '-m "not slow"').
\ No newline at end of file
diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt
index e603c631..4f8696e1 100644
--- a/requirements/huggingface.txt
+++ b/requirements/huggingface.txt
@@ -1,3 +1,3 @@
-accelerate
-transformers>=0.2.1
+accelerate==0.34.2
+transformers==4.39
 huggingface_hub>=0.20.3,<0.24.0 # https://github.com/NVIDIA/NeMo/issues/9793
diff --git a/sdp/processors/nemo/transcribe_speech.py b/sdp/processors/nemo/transcribe_speech.py
index 78c12023..b5f740dc 100644
--- a/sdp/processors/nemo/transcribe_speech.py
+++ b/sdp/processors/nemo/transcribe_speech.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,47 +13,33 @@
 # limitations under the License.
 
 import contextlib
-import glob
 import json
 import os
 import time
 from dataclasses import dataclass, field, is_dataclass
-from tempfile import NamedTemporaryFile
 from typing import List, Optional, Union
 
 import pytorch_lightning as pl
 import torch
-from nemo.collections.asr.models import (
-    EncDecCTCModel,
-    EncDecHybridRNNTCTCModel,
-    EncDecMultiTaskModel,
-)
+from omegaconf import OmegaConf, open_dict
+
+from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecRNNTModel
 from nemo.collections.asr.models.aed_multitask_models import parse_multitask_prompt
 from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig
 from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig
-from nemo.collections.asr.parts.submodules.multitask_decoding import (
-    MultiTaskDecoding,
-    MultiTaskDecodingConfig,
-)
+from nemo.collections.asr.parts.submodules.multitask_decoding import MultiTaskDecoding, MultiTaskDecodingConfig
 from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
-from nemo.collections.asr.parts.submodules.rnnt_greedy_decoding import (
-    GreedyBatchedRNNTInferConfig,
-)
 from nemo.collections.asr.parts.utils.eval_utils import cal_write_wer
 from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
 from nemo.collections.asr.parts.utils.transcribe_utils import (
     compute_output_filename,
     prepare_audio_data,
-    read_and_maybe_sort_manifest,
     restore_transcription_order,
     setup_model,
-    transcribe_partial_audio,
     write_transcription,
 )
-from nemo.collections.common.parts.preprocessing.manifest import get_full_path
 from nemo.core.config import hydra_runner
 from nemo.utils import logging
-from omegaconf import OmegaConf, open_dict
 
 """
 Transcribe audio file on a single CPU/GPU. Useful for transcription of moderate amounts of audio data.
@@ -77,6 +63,7 @@
 
   output_filename: Output filename where the transcriptions will be written
   batch_size: batch size during inference
+  presort_manifest: sorts the provided manifest by audio length for faster inference (default: True)
 
   cuda: Optional int to enable or disable execution of model on certain CUDA device.
   allow_mps: Bool to allow using MPS (Apple Silicon M-series GPU) device if available
@@ -122,8 +109,9 @@
 
 @dataclass
 class ModelChangeConfig:
+
     # Sub-config for changes specific to the Conformer Encoder
-    conformer: ConformerChangeConfig = ConformerChangeConfig()
+    conformer: ConformerChangeConfig = field(default_factory=ConformerChangeConfig)
 
 
 @dataclass
@@ -133,9 +121,9 @@ class TranscriptionConfig:
     pretrained_name: Optional[str] = None  # Name of a pretrained model
     audio_dir: Optional[str] = None  # Path to a directory which contains audio files
     dataset_manifest: Optional[str] = None  # Path to dataset's JSON manifest
-    channel_selector: Optional[
-        Union[int, str]
-    ] = None  # Used to select a single channel from multichannel audio, or use average across channels
+    channel_selector: Optional[Union[int, str]] = (
+        None  # Used to select a single channel from multichannel audio, or use average across channels
+    )
     audio_key: str = 'audio_filepath'  # Used to override the default audio key in dataset_manifest
     eval_config_yaml: Optional[str] = None  # Path to a yaml file of config of evaluation
     presort_manifest: bool = True  # Significant inference speedup on short-form data due to padding reduction
@@ -171,14 +159,14 @@ class TranscriptionConfig:
     overwrite_transcripts: bool = True
 
     # Decoding strategy for CTC models
-    ctc_decoding: CTCDecodingConfig = CTCDecodingConfig()
+    ctc_decoding: CTCDecodingConfig = field(default_factory=CTCDecodingConfig)
 
     # Decoding strategy for RNNT models
     # enable CUDA graphs for transcription
-    rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1)
+    rnnt_decoding: RNNTDecodingConfig = field(default_factory=lambda: RNNTDecodingConfig(fused_batch_size=-1))
 
     # Decoding strategy for AED models
-    multitask_decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig()
+    multitask_decoding: MultiTaskDecodingConfig = field(default_factory=MultiTaskDecodingConfig)
     # Prompt slots for prompted models, e.g. Canary-1B. Examples of acceptable prompt inputs:
     # Implicit single-turn assuming default role='user' (works with Canary-1B)
     #  +prompt.source_lang=en +prompt.target_lang=es +prompt.task=asr +prompt.pnc=yes
@@ -194,7 +182,7 @@ class TranscriptionConfig:
     att_context_size: Optional[list] = None
 
     # Use this for model-specific changes before transcription
-    model_change: ModelChangeConfig = ModelChangeConfig()
+    model_change: ModelChangeConfig = field(default_factory=ModelChangeConfig)
 
     # Config for word / character error rate calculation
     calculate_wer: bool = True
@@ -213,10 +201,6 @@ class TranscriptionConfig:
     gt_text_attr_name: str = "text"
     gt_lang_attr_name: str = "lang"
 
-    # Use model's transcribe() function instead of transcribe_partial_audio() by default
-    # Only use transcribe_partial_audio() when the audio is too long to fit in memory
-    # Your manifest input should have `offset` field to use transcribe_partial_audio()
-    allow_partial_transcribe: bool = False
     extract_nbest: bool = False  # Extract n-best hypotheses from the model
 
     calculate_rtfx: bool = False
@@ -300,7 +284,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
     elif isinstance(asr_model, EncDecHybridRNNTCTCModel):
         if cfg.decoder_type and cfg.decoder_type not in ['ctc', 'rnnt']:
             raise ValueError('Hybrid model only support ctc or rnnt decoding!')
-    else:  # rnnt model, there could be other models needs to be addressed.
+    elif isinstance(asr_model, EncDecRNNTModel):
         if cfg.decoder_type and cfg.decoder_type != 'rnnt':
             raise ValueError('RNNT model only support rnnt decoding!')
 
@@ -368,39 +352,11 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
         else:
             cfg.decoding = cfg.rnnt_decoding
 
-    remove_path_after_done = None
-    if isinstance(asr_model, EncDecMultiTaskModel):
-        # Special case for EncDecMultiTaskModel, where the input manifest is directly passed into the model's transcribe() function
-        partial_audio = False
-        if cfg.audio_dir is not None and not cfg.append_pred:
-            filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
-        else:
-            assert cfg.dataset_manifest is not None
-            if cfg.presort_manifest:
-                with NamedTemporaryFile("w", suffix=".json", delete=False) as f:
-                    for item in read_and_maybe_sort_manifest(cfg.dataset_manifest, try_sort=True):
-                        item["audio_filepath"] = get_full_path(item["audio_filepath"], cfg.dataset_manifest)
-                        print(json.dumps(item), file=f)
-                    cfg.dataset_manifest = f.name
-                    remove_path_after_done = f.name
-            filepaths = cfg.dataset_manifest
-    else:
-        # prepare audio filepaths and decide wether it's partial audio
-        filepaths, partial_audio = prepare_audio_data(cfg)
+    filepaths, sorted_manifest_path = prepare_audio_data(cfg)
 
-    if not cfg.allow_partial_transcribe:
-        # by defatul, use model's transcribe() function, unless partial audio is required
-        partial_audio = False
-
-    # setup AMP (optional)
-    if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
-        logging.info("AMP enabled!\n")
-        autocast = torch.cuda.amp.autocast
-    else:
+    remove_path_after_done = sorted_manifest_path if sorted_manifest_path is not None else None
 
-        @contextlib.contextmanager
-        def autocast(dtype=None, enabled=True):
-            yield
+    filepaths = sorted_manifest_path if sorted_manifest_path is not None else filepaths
 
     # Compute output filename
     cfg = compute_output_filename(cfg, model_name)
@@ -427,37 +383,26 @@ def autocast(dtype=None, enabled=True):
                     )
                 total_duration += item["duration"]
 
-    with autocast(dtype=amp_dtype, enabled=cfg.amp):
+    with torch.amp.autocast('cuda' if torch.cuda.is_available() else 'cpu', dtype=amp_dtype, enabled=cfg.amp):
         with torch.no_grad():
             if cfg.calculate_rtfx:
                 start_time = time.time()
-            if partial_audio:
-                transcriptions = transcribe_partial_audio(
-                    asr_model=asr_model,
-                    path2manifest=cfg.dataset_manifest,
-                    batch_size=cfg.batch_size,
-                    num_workers=cfg.num_workers,
-                    return_hypotheses=cfg.return_hypotheses,
-                    channel_selector=cfg.channel_selector,
-                    augmentor=augmentor,
-                    decoder_type=cfg.decoder_type,
-                )
-            else:
-                override_cfg = asr_model.get_transcribe_config()
-                override_cfg.batch_size = cfg.batch_size
-                override_cfg.num_workers = cfg.num_workers
-                override_cfg.return_hypotheses = cfg.return_hypotheses
-                override_cfg.channel_selector = cfg.channel_selector
-                override_cfg.augmentor = augmentor
-                override_cfg.text_field = cfg.gt_text_attr_name
-                override_cfg.lang_field = cfg.gt_lang_attr_name
-                if hasattr(override_cfg, "prompt"):
-                    override_cfg.prompt = parse_multitask_prompt(OmegaConf.to_container(cfg.prompt))
-
-                transcriptions = asr_model.transcribe(
-                    audio=filepaths,
-                    override_config=override_cfg,
-                )
+
+            override_cfg = asr_model.get_transcribe_config()
+            override_cfg.batch_size = cfg.batch_size
+            override_cfg.num_workers = cfg.num_workers
+            override_cfg.return_hypotheses = cfg.return_hypotheses
+            override_cfg.channel_selector = cfg.channel_selector
+            override_cfg.augmentor = augmentor
+            override_cfg.text_field = cfg.gt_text_attr_name
+            override_cfg.lang_field = cfg.gt_lang_attr_name
+            if hasattr(override_cfg, "prompt"):
+                override_cfg.prompt = parse_multitask_prompt(OmegaConf.to_container(cfg.prompt))
+
+            transcriptions = asr_model.transcribe(
+                audio=filepaths,
+                override_config=override_cfg,
+            )
             if cfg.calculate_rtfx:
                 transcribe_time = time.time() - start_time
 
diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py
index 3ad47bb6..7f4e7ff9 100644
--- a/tests/test_cfg_end_to_end_tests.py
+++ b/tests/test_cfg_end_to_end_tests.py
@@ -15,6 +15,7 @@
 import os
 import shutil
 import tarfile
+import logging
 from functools import partial
 from pathlib import Path
 from typing import Callable, List, Tuple
@@ -113,6 +114,14 @@ def get_test_cases() -> List[Tuple[str, Callable]]:
         (f"{DATASET_CONFIGS_ROOT}/multilingual/voxpopuli/config_un.yaml", partial(data_check_fn_voxpopuli, asr_data=False)),
     ]
 
+def get_test_names():
+    config_names = [
+        Path(t[0]).parent.relative_to(DATASET_CONFIGS_ROOT).as_posix() for t in get_test_cases()
+        ]
+
+    return config_names
+
+
 def check_e2e_test_data() -> bool:
     """
     Checks if required environment variables are defined for e2e data.
@@ -132,7 +141,6 @@ def get_e2e_test_data_path(rel_path_from_root: str) -> str:
         return test_data_root
 
     import boto3
-    import logging
 
     s3_resource = boto3.resource(
         "s3",
@@ -153,27 +161,43 @@ def get_e2e_test_data_path(rel_path_from_root: str) -> str:
 
     return os.path.abspath("test_data")
 
-@pytest.mark.skipif(
-    not check_e2e_test_data(),
-    reason="Either TEST_DATA_ROOT needs to be defined or both AWS_SECRET_KEY "
-    "and AWS_ACCESS_KEY to run e2e config tests",
-)
-@pytest.mark.parametrize("config_path,data_check_fn", get_test_cases())
-def test_configs(config_path: str, data_check_fn: Callable, tmp_path: Path):
-    # we expect DATASET_CONFIGS_ROOT and TEST_DATA_ROOT
-    # to have the same structure (e.g. <lang>/<dataset>)
+@pytest.fixture(scope="module", params=get_test_cases(), ids=get_test_names())
+def setup_data(request):
+
+    if not check_e2e_test_data():
+        pytest.fail("Either TEST_DATA_ROOT needs to be defined or both AWS_SECRET_KEY "
+    "and AWS_ACCESS_KEY to run e2e config tests")
+        
+    config_path, data_check_fn  = request.param
+
     rel_path_from_root = Path(config_path).parent.relative_to(DATASET_CONFIGS_ROOT)
-    test_data_root = Path(get_e2e_test_data_path(str(rel_path_from_root)))
- 
-    # run data_check_fn - it will raise error if the expected test data is not found
+    test_data_root = get_e2e_test_data_path(str(rel_path_from_root))
+    data_dir = Path(test_data_root, rel_path_from_root)
+
+    yield config_path, data_check_fn, data_dir
+    shutil.rmtree(data_dir)
+
+
+def test_data_availability(setup_data):
+
+    _, data_check_fn, data_dir = setup_data
     try:
-        data_check_fn(raw_data_dir=str(test_data_root / rel_path_from_root))
+        data_check_fn(raw_data_dir=data_dir)
     except ValueError as e:
-        pytest.skip(f"Test data not available: {str(e)}")
+        pytest.fail(f"Test data not available: {str(e)}")
+
+    reference_manifest = Path(data_dir, "test_data_reference.json")
 
-    reference_manifest = test_data_root / rel_path_from_root / "test_data_reference.json"
     if not reference_manifest.exists():
-        pytest.skip(f"Reference manifest not found: {reference_manifest}")
+        pytest.fail(f"Reference manifest not found: {reference_manifest}")
+
+@pytest.mark.dependency(depends=['test_data_availability'])
+def test_configs(setup_data, tmp_path):
+    # we expect DATASET_CONFIGS_ROOT and TEST_DATA_ROOT
+    # to have the same structure (e.g. <lang>/<dataset>)
+
+    config_path, _, data_dir = setup_data
+    reference_manifest = data_dir / "test_data_reference.json"
 
     cfg = OmegaConf.load(config_path)
     assert "processors" in cfg
@@ -181,7 +205,7 @@ def test_configs(config_path: str, data_check_fn: Callable, tmp_path: Path):
     cfg.workspace_dir = str(tmp_path)
     cfg.final_manifest = str(tmp_path / "final_manifest.json")
     cfg.data_split = cfg.get("data_split", "train")
-    cfg.processors[0].raw_data_dir = str(test_data_root / rel_path_from_root)
+    cfg.processors[0].raw_data_dir = data_dir.as_posix()
 
     if "already_downloaded" in cfg["processors"][0]:
         cfg["processors"][0]["already_downloaded"] = True
@@ -203,7 +227,6 @@ def test_configs(config_path: str, data_check_fn: Callable, tmp_path: Path):
             generated_data.pop("audio_filepath", None)
             assert reference_data == generated_data
 
- # if CLEAN_UP_TMP_PATH is set to non-0 value, we will delete tmp_path
     if os.getenv("CLEAN_UP_TMP_PATH", "0") != "0":
         shutil.rmtree(tmp_path)
 

From f06a94deb5bfa1e7c3965f803d48aa5454857a28 Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Wed, 9 Oct 2024 16:21:27 +0400
Subject: [PATCH 10/13] add pydub

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 requirements/main.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/main.txt b/requirements/main.txt
index c39b2844..82a2d105 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -13,7 +13,7 @@ sox
 tqdm
 webvtt-py
 wget
-
+pydub
 # for some processers, additionally https://github.com/NVIDIA/NeMo is required
 # for some processers, additionally nemo_text_processing is required
 # for mcv: apt-get update && apt-get upgrade -y && apt-get install -y sox libsox-fmt-all

From 98d4ff0ec385f2a01dbd1fc42505fdf20a0369d5 Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Wed, 9 Oct 2024 16:27:29 +0400
Subject: [PATCH 11/13] small fix

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 sdp/processors/__init__.py                                | 1 +
 sdp/processors/datasets/coraal/create_initial_manifest.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index 39fc9e80..7f0c0d71 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -85,6 +85,7 @@
     ExtractFilesFromTar,
     RandomSegment,
     UntarAudios,
+    RemoveEmojis
 )
 from sdp.processors.modify_manifest.data_to_dropbool import (
     DropASRError,
diff --git a/sdp/processors/datasets/coraal/create_initial_manifest.py b/sdp/processors/datasets/coraal/create_initial_manifest.py
index 1f67f730..16aa166a 100644
--- a/sdp/processors/datasets/coraal/create_initial_manifest.py
+++ b/sdp/processors/datasets/coraal/create_initial_manifest.py
@@ -47,7 +47,7 @@ def get_coraal_url_list():
 class CreateInitialManifestCORAAL(BaseParallelProcessor):
     """Processor to create initial manifest for the Corpus of Regional African American Language (CORAAL) dataset.
 
-    Dataset link: https://oraal.uoregon.edu/coraal/
+    Dataset link: https://oraal.github.io/coraal
 
     Will download all files, extract tars and split wav files based on the
     provided durations in the transcripts.

From 0fb7d8f32061ed1d1251e4fec1edc243486a59f4 Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Wed, 9 Oct 2024 16:52:19 +0400
Subject: [PATCH 12/13] update numpy version

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 requirements/main.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/main.txt b/requirements/main.txt
index 82a2d105..7cb1ee43 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -4,7 +4,7 @@ ffmpeg
 hydra-core
 joblib
 librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) will work
-numpy
+numpy==1.26.4
 omegaconf
 pandas
 rarfile

From f782eee782bde2e73741f704cbe0a8f5b015eecc Mon Sep 17 00:00:00 2001
From: monica-sekoyan <msekoyan@nvidia.com>
Date: Wed, 9 Oct 2024 19:22:33 +0400
Subject: [PATCH 13/13] restored the old version of transcribe_speech

Signed-off-by: monica-sekoyan <msekoyan@nvidia.com>
---
 sdp/processors/nemo/transcribe_speech.py | 181 +++++++++--------------
 1 file changed, 66 insertions(+), 115 deletions(-)

diff --git a/sdp/processors/nemo/transcribe_speech.py b/sdp/processors/nemo/transcribe_speech.py
index b5f740dc..bb04047b 100644
--- a/sdp/processors/nemo/transcribe_speech.py
+++ b/sdp/processors/nemo/transcribe_speech.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,19 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# This file is copied over from https://github.com/NVIDIA/NeMo/blob/v1.23.0/examples/asr/transcribe_speech.py.
+# It is currently only compatible with NeMo v1.23.0. To use a different version of NeMo, please modify the file.
+
 import contextlib
-import json
 import os
-import time
-from dataclasses import dataclass, field, is_dataclass
+from dataclasses import dataclass, is_dataclass
 from typing import List, Optional, Union
 
 import pytorch_lightning as pl
 import torch
 from omegaconf import OmegaConf, open_dict
 
-from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecRNNTModel
-from nemo.collections.asr.models.aed_multitask_models import parse_multitask_prompt
+from nemo.collections.asr.models import EncDecCTCModel, EncDecHybridRNNTCTCModel, EncDecMultiTaskModel
 from nemo.collections.asr.modules.conformer_encoder import ConformerChangeConfig
 from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig
 from nemo.collections.asr.parts.submodules.multitask_decoding import MultiTaskDecoding, MultiTaskDecodingConfig
@@ -34,8 +34,8 @@
 from nemo.collections.asr.parts.utils.transcribe_utils import (
     compute_output_filename,
     prepare_audio_data,
-    restore_transcription_order,
     setup_model,
+    transcribe_partial_audio,
     write_transcription,
 )
 from nemo.core.config import hydra_runner
@@ -63,7 +63,6 @@
 
   output_filename: Output filename where the transcriptions will be written
   batch_size: batch size during inference
-  presort_manifest: sorts the provided manifest by audio length for faster inference (default: True)
 
   cuda: Optional int to enable or disable execution of model on certain CUDA device.
   allow_mps: Bool to allow using MPS (Apple Silicon M-series GPU) device if available
@@ -80,8 +79,6 @@
   langid: Str used for convert_num_to_words during groundtruth cleaning
   use_cer: Bool to use Character Error Rate (CER)  or Word Error Rate (WER)
 
-  calculate_rtfx: Bool to calculate the RTFx throughput to transcribe the input dataset.
-
 # Usage
 ASR model can be specified by either "model_path" or "pretrained_name".
 Data for transcription can be defined with either "audio_dir" or "dataset_manifest".
@@ -111,7 +108,7 @@
 class ModelChangeConfig:
 
     # Sub-config for changes specific to the Conformer Encoder
-    conformer: ConformerChangeConfig = field(default_factory=ConformerChangeConfig)
+    conformer: ConformerChangeConfig = ConformerChangeConfig()
 
 
 @dataclass
@@ -121,12 +118,11 @@ class TranscriptionConfig:
     pretrained_name: Optional[str] = None  # Name of a pretrained model
     audio_dir: Optional[str] = None  # Path to a directory which contains audio files
     dataset_manifest: Optional[str] = None  # Path to dataset's JSON manifest
-    channel_selector: Optional[Union[int, str]] = (
-        None  # Used to select a single channel from multichannel audio, or use average across channels
-    )
+    channel_selector: Optional[
+        Union[int, str]
+    ] = None  # Used to select a single channel from multichannel audio, or use average across channels
     audio_key: str = 'audio_filepath'  # Used to override the default audio key in dataset_manifest
     eval_config_yaml: Optional[str] = None  # Path to a yaml file of config of evaluation
-    presort_manifest: bool = True  # Significant inference speedup on short-form data due to padding reduction
 
     # General configs
     output_filename: Optional[str] = None
@@ -151,30 +147,19 @@ class TranscriptionConfig:
     allow_mps: bool = False  # allow to select MPS device (Apple Silicon M-series GPU)
     amp: bool = False
     amp_dtype: str = "float16"  # can be set to "float16" or "bfloat16" when using amp
-    compute_dtype: str = "float32"
-    matmul_precision: str = "highest"  # Literal["highest", "high", "medium"]
     audio_type: str = "wav"
 
     # Recompute model transcription, even if the output folder exists with scores.
     overwrite_transcripts: bool = True
 
     # Decoding strategy for CTC models
-    ctc_decoding: CTCDecodingConfig = field(default_factory=CTCDecodingConfig)
+    ctc_decoding: CTCDecodingConfig = CTCDecodingConfig()
 
     # Decoding strategy for RNNT models
-    # enable CUDA graphs for transcription
-    rnnt_decoding: RNNTDecodingConfig = field(default_factory=lambda: RNNTDecodingConfig(fused_batch_size=-1))
+    rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig(fused_batch_size=-1)
 
     # Decoding strategy for AED models
-    multitask_decoding: MultiTaskDecodingConfig = field(default_factory=MultiTaskDecodingConfig)
-    # Prompt slots for prompted models, e.g. Canary-1B. Examples of acceptable prompt inputs:
-    # Implicit single-turn assuming default role='user' (works with Canary-1B)
-    #  +prompt.source_lang=en +prompt.target_lang=es +prompt.task=asr +prompt.pnc=yes
-    # Explicit single-turn prompt:
-    #  +prompt.role=user +prompt.slots.source_lang=en +prompt.slots.target_lang=es +prompt.slots.task=s2t_translation +prompt.slots.pnc=yes
-    # Explicit multi-turn prompt:
-    #  +prompt.turns='[{role:user,slots:{source_lang:en,target_lang:es,task:asr,pnc:yes}}]'
-    prompt: dict = field(default_factory=dict)
+    multitask_decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig()
 
     # decoder type: ctc or rnnt, can be used to switch between CTC and RNNT decoder for Hybrid RNNT/CTC models
     decoder_type: Optional[str] = None
@@ -182,7 +167,7 @@ class TranscriptionConfig:
     att_context_size: Optional[list] = None
 
     # Use this for model-specific changes before transcription
-    model_change: ModelChangeConfig = field(default_factory=ModelChangeConfig)
+    model_change: ModelChangeConfig = ModelChangeConfig()
 
     # Config for word / character error rate calculation
     calculate_wer: bool = True
@@ -199,11 +184,11 @@ class TranscriptionConfig:
 
     # key for groundtruth text in manifest
     gt_text_attr_name: str = "text"
-    gt_lang_attr_name: str = "lang"
 
-    extract_nbest: bool = False  # Extract n-best hypotheses from the model
-
-    calculate_rtfx: bool = False
+    # Use model's transcribe() function instead of transcribe_partial_audio() by default
+    # Only use transcribe_partial_audio() when the audio is too long to fit in memory
+    # Your manifest input should have an `offset` field to use transcribe_partial_audio()
+    allow_partial_transcribe: bool = False
 
 
 @hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
@@ -232,7 +217,6 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
         logging.info(f"Will apply on-the-fly augmentation on samples during transcription: {augmentor} ")
 
     # setup GPU
-    torch.set_float32_matmul_precision(cfg.matmul_precision)
     if cfg.cuda is None:
         if torch.cuda.is_available():
             device = [0]  # use 0th CUDA device
@@ -263,14 +247,6 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
     asr_model.set_trainer(trainer)
     asr_model = asr_model.eval()
 
-    if cfg.compute_dtype != "float32" and cfg.amp:
-        raise ValueError("amp=true is mutually exclusive with a compute_dtype other than float32")
-
-    amp_dtype = torch.float16 if cfg.amp_dtype == "float16" else torch.bfloat16
-
-    if cfg.compute_dtype != "float32":
-        asr_model.to(getattr(torch, cfg.compute_dtype))
-
     # we will adjust this flag if the model does not support it
     compute_timestamps = cfg.compute_timestamps
     compute_langs = cfg.compute_langs
@@ -284,7 +260,7 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
     elif isinstance(asr_model, EncDecHybridRNNTCTCModel):
         if cfg.decoder_type and cfg.decoder_type not in ['ctc', 'rnnt']:
             raise ValueError('Hybrid model only support ctc or rnnt decoding!')
-    elif isinstance(asr_model, EncDecRNNTModel):
+    else:  # rnnt model; there could be other models that need to be addressed.
         if cfg.decoder_type and cfg.decoder_type != 'rnnt':
             raise ValueError('RNNT model only support rnnt decoding!')
 
@@ -296,9 +272,6 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
         if isinstance(asr_model.decoding, MultiTaskDecoding):
             cfg.multitask_decoding.compute_langs = cfg.compute_langs
             cfg.multitask_decoding.preserve_alignments = cfg.preserve_alignment
-            if cfg.extract_nbest:
-                cfg.multitask_decoding.beam.return_best_hypothesis = False
-                cfg.return_hypotheses = True
             asr_model.change_decoding_strategy(cfg.multitask_decoding)
         elif cfg.decoder_type is not None:
             # TODO: Support compute_langs in CTC eventually
@@ -306,9 +279,6 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
                 raise ValueError("CTC models do not support `compute_langs` at the moment")
 
             decoding_cfg = cfg.rnnt_decoding if cfg.decoder_type == 'rnnt' else cfg.ctc_decoding
-            if cfg.extract_nbest:
-                decoding_cfg.beam.return_best_hypothesis = False
-                cfg.return_hypotheses = True
             decoding_cfg.compute_timestamps = cfg.compute_timestamps  # both ctc and rnnt support it
             if 'preserve_alignments' in decoding_cfg:
                 decoding_cfg.preserve_alignments = preserve_alignment
@@ -321,9 +291,6 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
 
         # Check if ctc or rnnt model
         elif hasattr(asr_model, 'joint'):  # RNNT model
-            if cfg.extract_nbest:
-                cfg.rnnt_decoding.beam.return_best_hypothesis = False
-                cfg.return_hypotheses = True
             cfg.rnnt_decoding.fused_batch_size = -1
             cfg.rnnt_decoding.compute_timestamps = cfg.compute_timestamps
             cfg.rnnt_decoding.compute_langs = cfg.compute_langs
@@ -335,9 +302,6 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
             if cfg.compute_langs:
                 raise ValueError("CTC models do not support `compute_langs` at the moment.")
             cfg.ctc_decoding.compute_timestamps = cfg.compute_timestamps
-            if cfg.extract_nbest:
-                cfg.ctc_decoding.beam.return_best_hypothesis = False
-                cfg.return_hypotheses = True
 
             asr_model.change_decoding_strategy(cfg.ctc_decoding)
 
@@ -347,16 +311,31 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
             isinstance(asr_model, EncDecHybridRNNTCTCModel) and cfg.decoder_type == "ctc"
         ):
             cfg.decoding = cfg.ctc_decoding
-        elif isinstance(asr_model.decoding, MultiTaskDecoding):
-            cfg.decoding = cfg.multitask_decoding
         else:
             cfg.decoding = cfg.rnnt_decoding
 
-    filepaths, sorted_manifest_path = prepare_audio_data(cfg)
+    if isinstance(asr_model, EncDecMultiTaskModel):
+        # Special case for EncDecMultiTaskModel, where the input manifest is directly passed into the model's transcribe() function
+        partial_audio = False
+        filepaths = cfg.dataset_manifest
+        assert cfg.dataset_manifest is not None
+    else:
+        # prepare audio filepaths and decide whether it's partial audio
+        filepaths, partial_audio = prepare_audio_data(cfg)
+
+    if not cfg.allow_partial_transcribe:
+        # by default, use model's transcribe() function, unless partial audio is required
+        partial_audio = False
 
-    remove_path_after_done = sorted_manifest_path if sorted_manifest_path is not None else None
+    # setup AMP (optional)
+    if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
+        logging.info("AMP enabled!\n")
+        autocast = torch.cuda.amp.autocast
+    else:
 
-    filepaths = sorted_manifest_path if sorted_manifest_path is not None else filepaths
+        @contextlib.contextmanager
+        def autocast(dtype=None):
+            yield
 
     # Compute output filename
     cfg = compute_output_filename(cfg, model_name)
@@ -371,57 +350,37 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
 
     # transcribe audio
 
-    if cfg.calculate_rtfx:
-        total_duration = 0.0
-
-        with open(cfg.dataset_manifest, "rt") as fh:
-            for line in fh:
-                item = json.loads(line)
-                if "duration" not in item:
-                    raise ValueError(
-                        f"Requested calculate_rtfx=True, but line {line} in manifest {cfg.dataset_manifest} lacks a 'duration' field."
-                    )
-                total_duration += item["duration"]
+    amp_dtype = torch.float16 if cfg.amp_dtype == "float16" else torch.bfloat16
 
-    with torch.amp.autocast('cuda' if torch.cuda.is_available() else 'cpu', dtype=amp_dtype, enabled=cfg.amp):
+    with autocast(dtype=amp_dtype):
         with torch.no_grad():
-            if cfg.calculate_rtfx:
-                start_time = time.time()
-
-            override_cfg = asr_model.get_transcribe_config()
-            override_cfg.batch_size = cfg.batch_size
-            override_cfg.num_workers = cfg.num_workers
-            override_cfg.return_hypotheses = cfg.return_hypotheses
-            override_cfg.channel_selector = cfg.channel_selector
-            override_cfg.augmentor = augmentor
-            override_cfg.text_field = cfg.gt_text_attr_name
-            override_cfg.lang_field = cfg.gt_lang_attr_name
-            if hasattr(override_cfg, "prompt"):
-                override_cfg.prompt = parse_multitask_prompt(OmegaConf.to_container(cfg.prompt))
-
-            transcriptions = asr_model.transcribe(
-                audio=filepaths,
-                override_config=override_cfg,
-            )
-            if cfg.calculate_rtfx:
-                transcribe_time = time.time() - start_time
-
-    if cfg.dataset_manifest is not None:
-        logging.info(f"Finished transcribing from manifest file: {cfg.dataset_manifest}")
-        if cfg.presort_manifest:
-            transcriptions = restore_transcription_order(cfg.dataset_manifest, transcriptions)
-    else:
-        logging.info(f"Finished transcribing {len(filepaths)} files !")
+            if partial_audio:
+                transcriptions = transcribe_partial_audio(
+                    asr_model=asr_model,
+                    path2manifest=cfg.dataset_manifest,
+                    batch_size=cfg.batch_size,
+                    num_workers=cfg.num_workers,
+                    return_hypotheses=cfg.return_hypotheses,
+                    channel_selector=cfg.channel_selector,
+                    augmentor=augmentor,
+                    decoder_type=cfg.decoder_type,
+                )
+            else:
+                transcriptions = asr_model.transcribe(
+                    paths2audio_files=filepaths,
+                    batch_size=cfg.batch_size,
+                    num_workers=cfg.num_workers,
+                    return_hypotheses=cfg.return_hypotheses,
+                    channel_selector=cfg.channel_selector,
+                    augmentor=augmentor,
+                )
+
+    logging.info(f"Finished transcribing {len(filepaths)} files !")
     logging.info(f"Writing transcriptions into file: {cfg.output_filename}")
 
-    # if transcriptions form a tuple of (best_hypotheses, all_hypotheses)
+    # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
     if type(transcriptions) == tuple and len(transcriptions) == 2:
-        if cfg.extract_nbest:
-            # extract all hypotheses if exists
-            transcriptions = transcriptions[1]
-        else:
-            # extract just best hypothesis
-            transcriptions = transcriptions[0]
+        transcriptions = transcriptions[0]
 
     if cfg.return_transcriptions:
         return transcriptions
@@ -437,11 +396,6 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
     )
     logging.info(f"Finished writing predictions to {output_filename}!")
 
-    # clean-up
-    if cfg.presort_manifest is not None:
-        if remove_path_after_done is not None:
-            os.unlink(remove_path_after_done)
-
     if cfg.calculate_wer:
         output_manifest_w_wer, total_res, _ = cal_write_wer(
             pred_manifest=output_filename,
@@ -456,9 +410,6 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis
             logging.info(f"Writing prediction and error rate of each sample to {output_manifest_w_wer}!")
             logging.info(f"{total_res}")
 
-    if cfg.calculate_rtfx:
-        logging.info(f"Dataset RTFx {(total_duration/transcribe_time)}")
-
     return cfg