NVIDIA · monica-sekoyan · Oct 5, 2024 · Oct 5, 2024 · Oct 5, 2024 · Oct 5, 2024
diff --git a/dataset_configs/english/coraal/config.yaml b/dataset_configs/english/coraal/config.yaml
@@ -93,8 +93,8 @@ processors:
       - {"pattern": '\baksing\b', "repl": "asking"}
       - {"pattern": '\baksed\b', "repl": "asked"}
       # removing unintelligible/redacted flags
-      - {"pattern": '/(?i)unintelligible/', "repl": ""}
-      - {"pattern": '/(?i)inaudible/', "repl": ""}
+      - {"pattern": '(?i)/unintelligible/', "repl": ""}  # inline flag must lead the pattern; both slashes are part of the marker
+      - {"pattern": '(?i)/inaudible/', "repl": ""}
       - {"pattern": '/RD(.*?)/', "repl": ""}
       - {"pattern": '/(\?)\1*/', "repl": ""}
       # removing non-linguistic markers

diff --git a/dataset_configs/multilingual/babel/config.yaml b/dataset_configs/multilingual/babel/config.yaml
@@ -0,0 +1,53 @@
+documentation: |
+  IARPA Babel Dataset
+  ###################
+
+  This config is designed for the languages of the IARPA Babel Dataset available at https://catalog.ldc.upenn.edu.
+
+  It creates an initial manifest for the specified data type and data split.
+  Further data processing steps should be performed based on the specific language.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify path of the directory downloaded from LDC (used as ``raw_data_dir``).
+  * **data_type**: should be "conversational" or "scripted".
+  * **resampled_audio_dir**: specify the directory path, where new processed audios should be located.
+  * **data_split**: should be "training", "untranscribed-training", "sub-train", "dev" or "eval".
+  * **final_manifest**: specify output manifest filepath.
+
+  **Output format**.
+
+  This config dumps the final manifest at ``${final_manifest}``.
+  The output manifest contains the following fields:
+
+  * **outputFn (str)**: initial audio filename.
+  * **sessID (str)**: session ID of the recording.
+  * **date (str)**: date of the recording.
+  * **time (str)**: time of the recording.
+  * **spkrCode (str)**: speaker ID.
+  * **lineType (str)**: type of the line (inline or outline).
+  * **dialect (str)**: dialect of the speaker.
+  * **gen (str)**: gender of the speaker.
+  * **envType (str)**: environment (i.e., home, office, etc.).
+  * **age (str)**: age of the speaker.
+  * **network (str)**: name of the telecommunications network.
+  * **phoneModel (str)**: model of the phone.
+  * **sampleCount (str)**: count of the sample.
+  * **sampleRate (str)**: original sample rate of the recording.
+  * **audio_filepath (str)**: path to the processed audio file.
+  * **duration (float)**: duration of the audio in seconds.
+
+processors_to_run: all
+workspace_dir: ???  # required (no default); also passed to the processor as raw_data_dir
+data_type: scripted  # default; "conversational" is the other documented option
+resampled_audio_dir: ${workspace_dir}/processed/${data_type}/${data_split}
+data_split: training  # default; see the documentation above for the other splits
+final_manifest: ${workspace_dir}/processed/${data_type}/${data_split}_manifest.json
+
+processors:
+  - _target_: sdp.processors.CreateInitialManifestBabel
+    raw_data_dir: ${workspace_dir}
+    data_type: ${data_type}
+    data_split: ${data_split}
+    resampled_audio_dir: ${resampled_audio_dir}
+    output_manifest_file: ${final_manifest}
diff --git a/dataset_configs/multilingual/voxpopuli/config_un.yaml b/dataset_configs/multilingual/voxpopuli/config_un.yaml
@@ -0,0 +1,39 @@
+documentation: |
+  Voxpopuli unlabelled subset
+  ###########################
+
+  This config can be used to prepare the
+  `unlabelled subset of the Voxpopuli dataset <https://github.com/facebookresearch/voxpopuli/>`_
+  in the NeMo format.
+
+  It creates an initial manifest for the specified language.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the directory where the downloaded data will be/is saved (used as ``raw_data_dir``).
+  * **language_id**: specify the language of the data you wish to be downloaded and/or processed.
+  * **resampled_data_dir**: specify the directory path, where new processed audios should be located.
+  * **delete_raw_file**: specify if the initial raw audio files should be deleted or not.
+
+
+  **Output format**.
+
+  This config dumps the final manifest at ``${workspace_dir}/unlabelled_processed/${language_id}/manifest.json``.
+  The output manifest contains the following fields:
+
+  * **audio_filepath (str)**: path to the processed audio file.
+  * **duration (float)**: duration of the audio in seconds.
+
+processors_to_run: all
+workspace_dir: ???  # required (no default); also passed to the processor as raw_data_dir
+language_id: hu_v2  # default language subset
+resampled_data_dir: ${workspace_dir}/unlabelled_processed/
+final_manifest: ${workspace_dir}/unlabelled_processed/${language_id}/manifest.json
+
+processors:
+  - _target_: sdp.processors.CreateInitialManifestVoxpopuliUnlabelled
+    raw_data_dir: ${workspace_dir}
+    language_id: ${language_id}
+    resampled_data_dir: ${resampled_data_dir}
+    delete_raw_file: False  # set here directly; not exposed as a top-level key
+    output_manifest_file: ${final_manifest}
diff --git a/dataset_configs/multilingual/yodas/config.yaml b/dataset_configs/multilingual/yodas/config.yaml
@@ -0,0 +1,35 @@
+processors_to_run: all
+manifest: ???  # required (no default); read as input_manifest_file by the first processor
+resampled_audio_dir: ???  # required (no default); handed to RandomSegment
+out_manifest: ???  # required (no default); written as output_manifest_file by the last processor
+char_rate: 10  # used as low_charrate_threshold in DropHighLowCharrate
+min_duration: 1.5  # forwarded to RandomSegment (presumably seconds - confirm)
+max_duration: 40.1  # forwarded to RandomSegment (presumably seconds - confirm)
+
+processors:
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+    input_manifest_file: ${manifest}
+
+  - _target_: sdp.processors.RandomSegment
+    min_duration: ${min_duration}
+    max_duration: ${max_duration}
+    resampled_audio_dir: ${resampled_audio_dir}
+    audio_format: flac
+
+  - _target_: sdp.processors.ASRInference
+    pretrained_model: nvidia/parakeet-ctc-0.6b
+
+  - _target_: sdp.processors.DropHighLowCharrate
+    low_charrate_threshold: ${char_rate}
+    text_key: pred_text  # NOTE(review): presumably the field produced by ASRInference - confirm
+    high_charrate_threshold: 10000
+
+  - _target_: sdp.processors.KeepOnlySpecifiedFields
+    fields_to_keep:
+      - audio_filepath
+      - duration
+
+  - _target_: sdp.processors.DropCorrupted
+    output_manifest_file: ${out_manifest}
diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst
@@ -40,6 +40,9 @@ VoxPopuli
 .. autodata:: sdp.processors.CreateInitialManifestVoxpopuli
    :annotation:
 
+.. autodata:: sdp.processors.CreateInitialManifestVoxpopuliUnlabelled
+   :annotation:
+
 .. autodata:: sdp.processors.NormalizeFromNonPCTextVoxpopuli
    :annotation:
 
@@ -58,8 +61,13 @@ Librispeech
 
 .. autodata:: sdp.processors.CreateInitialManifestLibrispeech
    :annotation:
-
 
+Babel
+'''''''''''
+
+.. autodata:: sdp.processors.CreateInitialManifestBabel
+   :annotation:
+
 SLR83
 '''''
 
@@ -158,6 +166,18 @@ Data modifications
 .. autodata:: sdp.processors.InverseNormalizeText
    :annotation:
 
+.. autodata:: sdp.processors.RandomSegment
+   :annotation:
+
+.. autodata:: sdp.processors.UntarAudios
+   :annotation:
+
+.. autodata:: sdp.processors.ExtractFilesFromTar
+   :annotation:
+
+.. autodata:: sdp.processors.RemoveEmojis
+   :annotation:
+
 Data filtering
 ''''''''''''''
 
@@ -237,6 +257,9 @@ Data filtering
 .. autodata:: sdp.processors.DropRepeatedFields
    :annotation:
 
+.. autodata:: sdp.processors.DropCorrupted
+   :annotation:
+
 
 Miscellaneous
 #############

diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
@@ -92,12 +92,16 @@ VoxPopuli
 * **Spanish**:
   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/spanish_pc/voxpopuli/config.yaml>`__ |
   :doc:`documentation <config-docs/spanish_pc/voxpopuli/config>`
+* **Multilingual**:
+  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/multilingual/voxpopuli/config_un.yaml>`__ |
+  :doc:`documentation <config-docs/multilingual/voxpopuli/config_un>`
 
 .. toctree::
    :hidden:
 
    config-docs/italian/voxpopuli/config
    config-docs/spanish_pc/voxpopuli/config
+   config-docs/multilingual/voxpopuli/config_un
 
 Fisher
 ~~~~~~
@@ -237,6 +241,22 @@ MTEDx
 
    config-docs/portuguese/mtedx/config
 
+Babel
+~~~~~~
+
+**Dataset link:** https://www.ldc.upenn.edu
+
+**Supported configs**.
+
+* **Multilingual**:
+  `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/multilingual/babel/config.yaml>`__ |
+  :doc:`documentation <config-docs/multilingual/babel/config>`
+
+.. toctree::
+   :hidden:
+
+   config-docs/multilingual/babel/config
+
 Kazakh Speech Dataset (SLR140)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/pytest.ini b/pytest.ini
@@ -1,2 +1,5 @@
 [pytest]
-addopts = --doctest-modules
+addopts = --doctest-modules
+markers =
+    dependency: marks a test as dependent on another named test.
+    slow: marks tests as slow (deselect with '-m "not slow"').
diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt
@@ -1,3 +1,3 @@
-accelerate
-transformers>=0.2.1
+accelerate==0.34.2
+transformers==4.39
 huggingface_hub>=0.20.3,<0.24.0 # https://github.com/NVIDIA/NeMo/issues/9793
diff --git a/requirements/main.txt b/requirements/main.txt
@@ -4,7 +4,7 @@ ffmpeg
 hydra-core
 joblib
 librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) will work
-numpy
+numpy==1.26.4
 omegaconf
 pandas
 rarfile
@@ -13,7 +13,7 @@ sox
 tqdm
 webvtt-py
 wget
-
+pydub
 # for some processers, additionally https://github.com/NVIDIA/NeMo is required
 # for some processers, additionally nemo_text_processing is required
 # for mcv: apt-get update && apt-get upgrade -y && apt-get install -y sox libsox-fmt-all
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
@@ -50,7 +50,9 @@
 )
 from sdp.processors.datasets.voxpopuli.create_initial_manifest import (
     CreateInitialManifestVoxpopuli,
+    CreateInitialManifestVoxpopuliUnlabelled,
 )
+from sdp.processors.datasets.babel.create_initial_manifest import CreateInitialManifestBabel
 from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import (
     NormalizeFromNonPCTextVoxpopuli,
 )
@@ -80,6 +82,10 @@
     SubIfASRSubstitution,
     SubMakeLowercase,
     SubRegex,
+    ExtractFilesFromTar,
+    RandomSegment,
+    UntarAudios,
+    RemoveEmojis
 )
 from sdp.processors.modify_manifest.data_to_dropbool import (
     DropASRError,
@@ -97,6 +103,7 @@
     DropOnAttribute,
     PreserveByValue,
     DropRepeatedFields,
+    DropCorrupted,
 )
 from sdp.processors.modify_manifest.make_letters_uppercase_after_period import (
     MakeLettersUppercaseAfterPeriod,