Merge branch 'main' of github.com:NVIDIA/NeMo-speech-data-processor i…

…nto karpnv/cc
NVIDIA · Oct 14, 2024 · 4441a2a · 4441a2a
2 parents c71f558 + aec62fa
commit 4441a2a
Show file tree

Hide file tree

Showing 7 changed files with 327 additions and 63 deletions.
diff --git a/dataset_configs/english/librispeech/all.yaml b/dataset_configs/english/librispeech/all.yaml
@@ -0,0 +1,171 @@
+documentation: |
+  Librispeech (all)
+  #################
+  
+  This config can be used to prepare
+  `Librispeech <https://www.openslr.org/12/>`_
+  dataset in the NeMo format.
+
+  It produces manifests for all splits of Librispeech.
+
+  This config performs the following data processing.
+
+  1. Downloads Librispeech data
+  2. Converts flac files to wav files
+  3. Calculates the length of wav files
+  4. Makes capitalization lowercase
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files will be stored.
+
+  Note that you can customize any part of this config either directly or from command-line.
+ 
+  **Output format**.
+
+  This config generates output manifest files for all splits of the data:
+
+  * ``${workspace_dir}/dev-clean.json`` - dev-clean subset.
+  * ``${workspace_dir}/dev-other.json`` - dev-other subset.
+  * ``${workspace_dir}/test-clean.json`` - test-clean subset.
+  * ``${workspace_dir}/test-other.json`` - test-other subset.
+  * ``${workspace_dir}/train-clean-100.json`` - train-clean-100 subset.
+  * ``${workspace_dir}/train-clean-360.json`` - train-clean-360 subset.
+  * ``${workspace_dir}/train-other-500.json`` - train-other-500 subset.
+
+
+  Output manifest contains the following fields:
+
+  * **audio_filepath (str)**: relative path to the audio files.
+  * **text (str)**: transcription (lower-case without punctuation).
+  * **duration (float)**: audio duration in seconds.
+
+processors_to_run: all
+workspace_dir: ???
+
+processors:
+  # creating manifest for dev-clean set
+  - _target_: sdp.processors.CreateInitialManifestLibrispeech
+    split: dev-clean
+    raw_data_dir: ${workspace_dir}/raw_data
+
+  - _target_: sdp.processors.SoxConvert
+    converted_audio_dir: ${workspace_dir}/audio
+    input_audio_file_key: "audio_filepath"
+    output_audio_file_key: "audio_filepath"
+    output_format: "wav"
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+
+  - _target_: sdp.processors.SubMakeLowercase
+    output_manifest_file: ${workspace_dir}/dev-clean.json
+
+  # creating manifest for dev-other set
+  - _target_: sdp.processors.CreateInitialManifestLibrispeech
+    split: dev-other
+    raw_data_dir: ${workspace_dir}/raw_data
+
+  - _target_: sdp.processors.SoxConvert
+    converted_audio_dir: ${workspace_dir}/audio
+    input_audio_file_key: "audio_filepath"
+    output_audio_file_key: "audio_filepath"
+    output_format: "wav"
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+
+  - _target_: sdp.processors.SubMakeLowercase
+    output_manifest_file: ${workspace_dir}/dev-other.json
+
+  # creating manifest for test-clean set
+  - _target_: sdp.processors.CreateInitialManifestLibrispeech
+    split: test-clean
+    raw_data_dir: ${workspace_dir}/raw_data
+
+  - _target_: sdp.processors.SoxConvert
+    converted_audio_dir: ${workspace_dir}/audio
+    input_audio_file_key: "audio_filepath"
+    output_audio_file_key: "audio_filepath"
+    output_format: "wav"
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+
+  - _target_: sdp.processors.SubMakeLowercase
+    output_manifest_file: ${workspace_dir}/test-clean.json
+
+  # creating manifest for test-other set
+  - _target_: sdp.processors.CreateInitialManifestLibrispeech
+    split: test-other
+    raw_data_dir: ${workspace_dir}/raw_data
+
+  - _target_: sdp.processors.SoxConvert
+    converted_audio_dir: ${workspace_dir}/audio
+    input_audio_file_key: "audio_filepath"
+    output_audio_file_key: "audio_filepath"
+    output_format: "wav"
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+
+  - _target_: sdp.processors.SubMakeLowercase
+    output_manifest_file: ${workspace_dir}/test-other.json
+
+  # creating manifest for train-clean-100 set
+  - _target_: sdp.processors.CreateInitialManifestLibrispeech
+    split: train-clean-100
+    raw_data_dir: ${workspace_dir}/raw_data
+
+  - _target_: sdp.processors.SoxConvert
+    converted_audio_dir: ${workspace_dir}/audio
+    input_audio_file_key: "audio_filepath"
+    output_audio_file_key: "audio_filepath"
+    output_format: "wav"
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+
+  - _target_: sdp.processors.SubMakeLowercase
+    output_manifest_file: ${workspace_dir}/train-clean-100.json
+
+  # creating manifest for train-clean-360 set
+  - _target_: sdp.processors.CreateInitialManifestLibrispeech
+    split: train-clean-360
+    raw_data_dir: ${workspace_dir}/raw_data
+
+  - _target_: sdp.processors.SoxConvert
+    converted_audio_dir: ${workspace_dir}/audio
+    input_audio_file_key: "audio_filepath"
+    output_audio_file_key: "audio_filepath"
+    output_format: "wav"
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+
+  - _target_: sdp.processors.SubMakeLowercase
+    output_manifest_file: ${workspace_dir}/train-clean-360.json
+
+  # creating manifest for train-other-500 set
+  - _target_: sdp.processors.CreateInitialManifestLibrispeech
+    split: train-other-500
+    raw_data_dir: ${workspace_dir}/raw_data
+
+  - _target_: sdp.processors.SoxConvert
+    converted_audio_dir: ${workspace_dir}/audio
+    input_audio_file_key: "audio_filepath"
+    output_audio_file_key: "audio_filepath"
+    output_format: "wav"
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+
+  - _target_: sdp.processors.SubMakeLowercase
+    output_manifest_file: ${workspace_dir}/train-other-500.json
diff --git a/dataset_configs/english/librispeech/config.yaml b/dataset_configs/english/librispeech/config.yaml
@@ -6,17 +6,18 @@ documentation: |
   `Librispeech <https://openslr.org/12>`_
   dataset in the NeMo format.
 
-  It produces manifests for the dev-clean split (for other splits, please configure, change, or combine). 
+  It produces manifests for the dev-clean split (for other splits, please configure). 
   The options are:
 
-  - ``["dev-clean"]``,
-  - ``["dev-other"]``,
-  - ``["test-clean"]``,
-  - ``["test-other"]``,
-  - ``["train-clean-100"]``,
-  - ``["train-clean-360"]``,
-  - ``["train-other-500"]``,
-  - ``["all"]`` (for all datasets available)
+  - ``"dev-clean"``
+  - ``"dev-other"``
+  - ``"test-clean"``
+  - ``"test-other"``
+  - ``"train-clean-100"``
+  - ``"train-clean-360"``
+  - ``"train-other-500"``
+  - ``"dev-clean-2"``
+  - ``"train-clean-5"``
 
   This config performs the following data processing.
 
@@ -45,13 +46,13 @@ documentation: |
 
 processors_to_run: all
 workspace_dir: ???
-data_split: ["dev-clean"]
+data_split: "dev-clean"
 final_manifest: ${workspace_dir}/manifest.json
 
 processors:
   # creating manifest for dev-clean set
   - _target_: sdp.processors.CreateInitialManifestLibrispeech
-    splits: ${data_split}
+    split: ${data_split}
     raw_data_dir: ${workspace_dir}/raw_data
 
   - _target_: sdp.processors.SoxConvert

diff --git a/dataset_configs/english/librispeech/mini.yaml b/dataset_configs/english/librispeech/mini.yaml
@@ -0,0 +1,75 @@
+documentation: |
+  Librispeech (mini)
+  ##################
+  
+  This config can be used to prepare
+  `Librispeech mini <https://www.openslr.org/31/>`_
+  dataset in the NeMo format.
+
+  It produces manifests for the mini split of Librispeech.
+
+  This config performs the following data processing.
+
+  1. Downloads Librispeech data
+  2. Converts flac files to wav files
+  3. Calculates the length of wav files
+  4. Makes capitalization lowercase
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files will be stored.
+
+  Note that you can customize any part of this config either directly or from command-line.
+ 
+  **Output format**.
+
+  This config generates 2 output manifest files:
+
+  * ``${workspace_dir}/dev-clean-2.json`` - mini dev-clean subset of the data.
+  * ``${workspace_dir}/train-clean-5.json`` - mini train-clean subset of the data.
+
+  Output manifest contains the following fields:
+
+  * **audio_filepath (str)**: relative path to the audio files.
+  * **text (str)**: transcription (lower-case without punctuation).
+  * **duration (float)**: audio duration in seconds.
+
+processors_to_run: all
+workspace_dir: ???
+
+processors:
+  # creating manifest for mini dev-clean set
+  - _target_: sdp.processors.CreateInitialManifestLibrispeech
+    split: dev-clean-2
+    raw_data_dir: ${workspace_dir}/raw_data
+
+  - _target_: sdp.processors.SoxConvert
+    converted_audio_dir: ${workspace_dir}/audio
+    input_audio_file_key: "audio_filepath"
+    output_audio_file_key: "audio_filepath"
+    output_format: "wav"
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+
+  - _target_: sdp.processors.SubMakeLowercase
+    output_manifest_file: ${workspace_dir}/dev-clean-2.json
+
+  # creating manifest for mini train-clean set
+  - _target_: sdp.processors.CreateInitialManifestLibrispeech
+    split: train-clean-5
+    raw_data_dir: ${workspace_dir}/raw_data
+
+  - _target_: sdp.processors.SoxConvert
+    converted_audio_dir: ${workspace_dir}/audio
+    input_audio_file_key: "audio_filepath"
+    output_audio_file_key: "audio_filepath"
+    output_format: "wav"
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+
+  - _target_: sdp.processors.SubMakeLowercase
+    output_manifest_file: ${workspace_dir}/train-clean-5.json
diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
@@ -181,18 +181,32 @@ Few-shot Learning Evaluation of Universal Representations of Speech (FLEURS)
 
    config-docs/armenian/fleurs/config
 
-English LibriSpeech (ELS)
-~~~~~~~~~~~~~~~~~~~~~~~~~
+LibriSpeech
+~~~~~~~~~~~
 
-**Dataset link:** https://openslr.org/12
+**Dataset links:** https://openslr.org/12 (regular), https://openslr.org/31 (mini Librispeech)
+
+
+**Supported configs**.
+
+* **config (for processing one specific subset at a time)**:
+   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/librispeech/config.yaml>`__ |
+   :doc:`documentation <config-docs/english/librispeech/config>`
+* **mini**:
+   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/librispeech/mini.yaml>`__ |
+   :doc:`documentation <config-docs/english/librispeech/mini>`
+* **all (for obtaining all subsets in one go)**:
+   `config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/librispeech/all.yaml>`__ |
+   :doc:`documentation <config-docs/english/librispeech/all>`
 
-`config <https://github.com/NVIDIA/NeMo-speech-data-processor/blob/main/dataset_configs/english/librispeech/config.yaml>`__ |
-:doc:`documentation <config-docs/english/librispeech/config>`
 
 .. toctree::
    :hidden:
-   
+
    config-docs/english/librispeech/config
+   config-docs/english/librispeech/mini
+   config-docs/english/librispeech/all
+
 
 Coraa Brazilian Portuguese dataset
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/requirements/main.txt b/requirements/main.txt
@@ -1,18 +1,19 @@
 diff_match_patch
 editdistance
+ffmpeg
 hydra-core
 joblib
 librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) will work
 numpy
 omegaconf
 pandas
+rarfile
 regex
 sox
 tqdm
-wget
-ffmpeg
-rarfile
 webvtt-py
+wget
+
 # for some processors, additionally https://github.com/NVIDIA/NeMo is required
-# for some processers, additionally nemo_text_processingis required
-pytorch_lightning
+# for some processors, additionally nemo_text_processing is required
+# for mcv: apt-get update && apt-get upgrade -y && apt-get install -y sox libsox-fmt-all