From 27423f3cf2f5f7981e00caa6176aa6fefcbf4c50 Mon Sep 17 00:00:00 2001
From: Rima Shahbazyan
Date: Tue, 24 Sep 2024 12:48:11 +0400
Subject: [PATCH] Uzbek processors added

Signed-off-by: Rima
Signed-off-by: Rima Shahbazyan

reverting changes from a merge

Signed-off-by: Rima Shahbazyan

docs added

Signed-off-by: Rima Shahbazyan

minor change to Uzbek configuration documentation

Signed-off-by: Rima Shahbazyan

minor change

Signed-off-by: Rima Shahbazyan

Comments added to configs and fleurs testcase added

Signed-off-by: Rima Shahbazyan

Fleurs test added

Signed-off-by: Rima Shahbazyan
---
 .github/workflows/tests.yml                  |   1 +
 dataset_configs/uzbek/fleurs/config.yaml     | 146 +++++++++++++++++
 dataset_configs/uzbek/mcv/config.yaml        | 151 ++++++++++++++++++
 dataset_configs/uzbek/uzbekvoice/config.yaml | 147 +++++++++++++++++
 docs/src/sdp/api.rst                         |   5 +
 docs/src/sdp/existing_configs.rst            |  29 +++-
 requirements/main.txt                        |   1 +
 sdp/processors/__init__.py                   |   3 +
 .../uzbekvoice/create_initial_manifest.py    | 124 ++++++++++++++
 tests/test_cfg_end_to_end_tests.py           |  22 ++-
 10 files changed, 626 insertions(+), 3 deletions(-)
 create mode 100644 dataset_configs/uzbek/fleurs/config.yaml
 create mode 100644 dataset_configs/uzbek/mcv/config.yaml
 create mode 100644 dataset_configs/uzbek/uzbekvoice/config.yaml
 create mode 100644 sdp/processors/datasets/uzbekvoice/create_initial_manifest.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 070d5141..aed66873 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -75,6 +75,7 @@ jobs:
           pip install Cython wheel # need to pre-install to avoid error in nemo installation
           pip install "nemo_toolkit[asr,nlp]"
           python -m pip cache purge
+
       - name: Run all tests
         env:
           AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
diff --git a/dataset_configs/uzbek/fleurs/config.yaml b/dataset_configs/uzbek/fleurs/config.yaml
new file mode 100644
index 00000000..bd1be503
--- /dev/null
+++ b/dataset_configs/uzbek/fleurs/config.yaml
@@ -0,0 +1,146 @@
+documentation: |
+  FLEURS
+  ######
+
+  This config can be used to prepare the
+  `FLEURS `_
+  dataset in the NeMo format.
+  It produces a manifest for the Uzbek language.
+  This config performs the following data processing steps:
+
+  1. Downloads the FLEURS data.
+  2. Calculates the duration of the wav files.
+  3. Adjusts the text by removing punctuation marks and replacing some inconsistent characters.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files will be stored.
+  * **data_split**: should be "train", "dev" or "test".
+
+  Note that you can customize any part of this config either directly or from the command line.
+
+  **Output format**
+
+  This config generates output manifest files:
+
+  * ``${final_manifest}`` - the ``${data_split}`` subset of the data.
+
+  The output manifest contains the following keys:
+
+  * **audio_filepath (str)**: relative path to the audio files.
+  * **text (str)**: transcription (lower-case, without punctuation).
+  * **duration (float)**: audio duration in seconds.
+
+processors_to_run: '0:'
+workspace_dir: ???
+data_split: dev
+save_dir: ${workspace_dir}
+final_manifest: ${workspace_dir}/${data_split}_manifest.json
+
+processors:
+  # create the initial manifest for the chosen FLEURS split (dev by default)
+  - _target_: sdp.processors.CreateInitialManifestFleurs
+    lang: "uz_uz"
+    split: ${data_split}
+    raw_data_dir: ${workspace_dir}/raw_data
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
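+
+  # The SubRegex processor below unifies the various apostrophe glyphs found in
+  # Uzbek text. Illustrative examples (sample words, not taken from the data):
+  #   "g'oya"  -> "gʻoya"   (apostrophe-like mark after o/g becomes ʻ)
+  #   "o`zbek" -> "oʻzbek"
+  #   "ta'lim" -> "ta’lim"  (apostrophe-like marks elsewhere become ’)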
+  - _target_: sdp.processors.SubRegex
+    text_key: text
+
+    regex_params_list:
+      - {"pattern": ":", "repl": ''}
+
+      # replace all the inconsistent apostrophe characters for oʻ and gʻ with ʻ
+      - {"pattern": "(?<=o|g|O|G)‘", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)’", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)`", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)'", "repl": "ʻ"}
+      - {"pattern": '(?<=o|g|O|G)ʼ', "repl": "ʻ"}
+
+      # replace all the inconsistent apostrophe characters besides oʻ and gʻ with ’
+      - {"pattern": "‘", "repl": "’"}
+      - {"pattern": "`", "repl": "’"}
+      - {"pattern": "'", "repl": "’"}
+      - {"pattern": 'ʼ', "repl": "’"}
+      - {"pattern": '(?
diff --git a/dataset_configs/uzbek/mcv/config.yaml b/dataset_configs/uzbek/mcv/config.yaml
new file mode 100644
--- /dev/null
+++ b/dataset_configs/uzbek/mcv/config.yaml
@@ -0,0 +1,151 @@
+documentation: |
+  MCV
+  ###
+
+  This config can be used to prepare the
+  `Mozilla Common Voice `_ dataset
+  17.0 release, but should work for any subsequent releases as well.
+
+  It performs the following data processing steps:
+
+  1. Extracts and converts all data to the specified manifest format.
+  2. Gets audio durations and then keeps only instances with a duration greater than 0.
+  3. Adjusts the text by removing punctuation marks and replacing some inconsistent characters.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files will be stored.
+    You need to manually place the downloaded .tar files inside the
+    ``workspace_dir`` folder.
+  * **data_split**: should be "train", "dev" or "test".
+
+  Note that you can customize any part of this config either directly or from the command line.
+  Here are some common customizations to consider:
+
+  * **remove_pc**: set to True if P&C (punctuation and capitalization) is not needed. Defaults to False.
+  * **remove_hyphen**: set to True if hyphens are not needed. Defaults to True.
+
+  **Output format**.
+
+  This config dumps the final manifest at ``${workspace_dir}/${data_split}_manifest.json``.
+  The output manifest contains the following fields:
+
+  * **audio_filepath (str)**: relative path to the audio files.
+  * **text (str)**: transcription, including punctuation ".,?" and capitalization.
+  * **duration (float)**: audio duration in seconds.
+
+processors_to_run: '0:'
+workspace_dir: ???
+data_split: ???
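+# NOTE: CreateInitialManifestMCV below reads the manually downloaded Common
+# Voice archive from ${workspace_dir} (its raw_data_dir), so place the .tar
+# file there before running this config; the end-to-end test, for example,
+# expects an archive named mcv_uz.tar.gz.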
+final_manifest: ${workspace_dir}/${data_split}_manifest.json
+save_dir: ${workspace_dir}
+remove_pc: False
+
+processors:
+  - _target_: sdp.processors.CreateInitialManifestMCV
+    language_id: uz
+    extract_archive_dir: ${workspace_dir}/raw_data
+    resampled_audio_dir: ${workspace_dir}/${data_split}/audio/
+    data_split: ${data_split}
+    raw_data_dir: ${workspace_dir}
+    output_manifest_file: ${save_dir}/${data_split}_manifest_1.json
+
+  - _target_: sdp.processors.SubRegex
+    text_key: text
+    output_manifest_file: ${save_dir}/${data_split}_manifest_2.json
+
+    regex_params_list:
+      - {"pattern": ":", "repl": ''}
+
+      # replace all the inconsistent apostrophe characters for oʻ and gʻ with ʻ
+      - {"pattern": "(?<=o|g|O|G)‘", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)’", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)`", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)'", "repl": "ʻ"}
+      - {"pattern": '(?<=o|g|O|G)ʼ', "repl": "ʻ"}
+
+      # replace all the inconsistent apostrophe characters besides oʻ and gʻ with ’
+      - {"pattern": "‘", "repl": "’"}
+      - {"pattern": "`", "repl": "’"}
+      - {"pattern": "'", "repl": "’"}
+      - {"pattern": 'ʼ', "repl": "’"}
+      - {"pattern": '(?
diff --git a/dataset_configs/uzbek/uzbekvoice/config.yaml b/dataset_configs/uzbek/uzbekvoice/config.yaml
new file mode 100644
--- /dev/null
+++ b/dataset_configs/uzbek/uzbekvoice/config.yaml
@@ -0,0 +1,147 @@
+documentation: |
+  UzbekVoice
+  ##########
+
+  This config can be used to prepare the
+  `UzbekVoice `_
+  dataset in the NeMo format.
+  It produces a manifest for the Uzbek language.
+  This config performs the following data processing steps:
+
+  1. Downloads the UzbekVoice data.
+  2. Calculates the duration of the audio files.
+  3. Adjusts the text by removing punctuation marks and replacing some inconsistent characters.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files will be stored.
+  * **data_split**: label used only to name the intermediate manifests (the dataset itself is not split).
+
+  Note that you can customize any part of this config either directly or from the command line.
+
+  **Output format**
+
+  This config generates output manifest files:
+
+  * ``${final_manifest}`` - the manifest for the full dataset.
+
+  The output manifest contains the following keys:
+
+  * **audio_filepath (str)**: absolute path to the audio files.
+  * **text (str)**: transcription (lower-case, without punctuation).
+  * **duration (float)**: audio duration in seconds.
+
+processors_to_run: '0:'
+workspace_dir: ???
+final_manifest: ${workspace_dir}/manifest.json
+data_split: ???
+save_dir: ${workspace_dir}
+
+processors:
+  # create the initial manifest for the UzbekVoice data
+  - _target_: sdp.processors.CreateInitialManifestUzbekvoice
+    raw_data_dir: ${workspace_dir}/raw_data
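+
+  # NOTE: the processor above fetches the data from Google Drive with gdown.
+  # For the large archives Google Drive sometimes refuses repeated automated
+  # downloads, in which case clips.zip and uzbekvoice-dataset.zip have to be
+  # downloaded manually into ${workspace_dir}/raw_data (see the processor
+  # code for details).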
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+    output_manifest_file: ${save_dir}/${data_split}_manifest_1.json
+
+  - _target_: sdp.processors.SubRegex
+    text_key: text
+    output_manifest_file: ${save_dir}/${data_split}_manifest_2.json
+
+    regex_params_list:
+      - {"pattern": ":", "repl": ''}
+
+      # replace all the inconsistent apostrophe characters for oʻ and gʻ with ʻ
+      - {"pattern": "(?<=o|g|O|G)‘", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)’", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)`", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)'", "repl": "ʻ"}
+      - {"pattern": '(?<=o|g|O|G)ʼ', "repl": "ʻ"}
+
+      # replace all the inconsistent apostrophe characters besides oʻ and gʻ with ’
+      - {"pattern": "‘", "repl": "’"}
+      - {"pattern": "`", "repl": "’"}
+      - {"pattern": "'", "repl": "’"}
+      - {"pattern": 'ʼ', "repl": "’"}
+      - {"pattern": '(?
diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
--- a/docs/src/sdp/existing_configs.rst
+++ b/docs/src/sdp/existing_configs.rst
   `config `__ |
   :doc:`documentation `
-
+* **Uzbek**:
+  `config `__ |
+  :doc:`documentation `
+
 .. toctree::
    :hidden:
 
@@ -46,6 +49,7 @@ download the data archive and specify its location with the ``raw_data_dir`` par
    config-docs/portuguese/mcv/config
    config-docs/kazakh/mcv/config
    config-docs/georgian/mcv/config
+   config-docs/uzbek/mcv/config
 
 Multilingual LibriSpeech (MLS)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -173,13 +177,19 @@ Few-shot Learning Evaluation of Universal Representations of Speech (FLEURS)
 
 **Dataset link:** https://huggingface.co/datasets/google/fleurs
 
+
+* **Armenian**: `config `__ | :doc:`documentation `
+* **Uzbek**:
+  `config `__ |
+  :doc:`documentation `
+
 .. toctree::
    :hidden:
 
    config-docs/armenian/fleurs/config
+   config-docs/uzbek/fleurs/config
 
 LibriSpeech
 ~~~~~~~~~~~
@@ -276,4 +286,19 @@ Kazakh Speech Corpus 2 (KSC2)
 .. toctree::
    :hidden:
 
-   config-docs/kazakh/ksc2/config
\ No newline at end of file
+   config-docs/kazakh/ksc2/config
+
+UzbekVoice
+~~~~~~~~~~
+
+**Dataset link:** https://corpus.uzbekvoice.ai/en-US
+
+**Required manual steps:** You need to download the dataset from the Google Drive link provided on the website.
+
+`config `__ |
+:doc:`documentation `
+
+.. toctree::
+   :hidden:
+
+   config-docs/uzbek/uzbekvoice/config
\ No newline at end of file
diff --git a/requirements/main.txt b/requirements/main.txt
index 7617e793..5283c3c7 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -11,6 +11,7 @@ rarfile
 regex
 sox
 tqdm
+gdown
 webvtt-py
 wget
 
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index fdafb521..23079d84 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -24,6 +24,9 @@ from sdp.processors.datasets.fleurs.create_initial_manifest import (
     CreateInitialManifestFleurs,
 )
+from sdp.processors.datasets.uzbekvoice.create_initial_manifest import (
+    CreateInitialManifestUzbekvoice,
+)
 from sdp.processors.datasets.ksc2.create_initial_manifest import (
     CreateInitialManifestKSC2,
 )
diff --git a/sdp/processors/datasets/uzbekvoice/create_initial_manifest.py b/sdp/processors/datasets/uzbekvoice/create_initial_manifest.py
new file mode 100644
index 00000000..e41216d7
--- /dev/null
+++ b/sdp/processors/datasets/uzbekvoice/create_initial_manifest.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import json
+import os
+import typing
+
+import gdown
+
+from sdp.logging import logger
+from sdp.processors.base_processor import BaseProcessor
+from sdp.utils.common import extract_archive
+
+
+class CreateInitialManifestUzbekvoice(BaseProcessor):
+    """Processor to create the initial manifest for the UzbekVoice dataset.
+
+    Dataset link: https://uzbekvoice.ai/en-US
+
+    Downloads all files, extracts them, and creates a manifest file with the
+    "audio_filepath", "text" and "duration" fields.
+
+    Args:
+        raw_data_dir (str): Path to the folder where the data archive should be downloaded and extracted.
+
+    Returns:
+        This processor generates an initial manifest file with the following fields::
+
+            {
+                "audio_filepath": <path to the audio file>,
+                "text": <transcription>,
+                "duration": <audio duration in seconds>,
+            }
+    """
+
+    URL = "https://drive.google.com/drive/folders/18N5i7GD0LmUnNQok6BP3EC8PYov7pZDW"
+
+    def __init__(
+        self,
+        raw_data_dir: str,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.raw_data_dir = raw_data_dir
+
+    def download_extract_files(self, dst_folder: str) -> None:
+        """Downloads and extracts the dataset archives into ``dst_folder``."""
+
+        os.makedirs(dst_folder, exist_ok=True)
+
+        # Download all files. For big files Google Drive does not allow more than
+        # one download attempt, so if gdown raises an error the files have to be
+        # downloaded manually.
+        # Skip the download if clips.zip and uzbekvoice-dataset.zip are already in dst_folder.
+        if os.path.exists(os.path.join(dst_folder, 'clips.zip')) and os.path.exists(
+            os.path.join(dst_folder, 'uzbekvoice-dataset.zip')
+        ):
+            logger.info("Files already exist in the folder. Skipping download.")
+        else:
+            logger.info("Downloading files from %s...", self.URL)
+            try:
+                gdown.download_folder(self.URL, output=dst_folder)
+            except Exception as e:
+                logger.warning(
+                    "Error occurred while downloading files from Google Drive. "
+                    "Please download them manually from %s. Error: %s",
+                    self.URL,
+                    e,
+                )
+        for file in glob.glob(os.path.join(dst_folder, '*.zip')):
+            extract_archive(file, str(dst_folder), force_extract=True)
+            logger.info("Extracted %s", file)
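+
+    # voice_dataset.json holds a list of utterance records. The fields used by
+    # process_transcript below look roughly like this (illustrative values):
+    #   {
+    #       "client_id": "<speaker id>",
+    #       "original_sentence_id": "<utterance id>",
+    #       "original_sentence": "<transcription>",
+    #       "clip_duration": 3.45,
+    #   }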
+    def process_transcript(self, file_path: str) -> list[dict[str, typing.Any]]:
+        """
+        Parse the transcript JSON file and collect its entries for the manifest.
+        """
+
+        entries = []
+        root = os.path.join(self.raw_data_dir, 'clips')
+        number_of_entries = 0
+        total_duration = 0
+        # parse the json file and collect audio file path, transcript and duration in entries
+        with open(file_path, encoding="utf-8") as fin:
+            data = json.load(fin)
+            for entry in data:
+                audio_file = os.path.join(root, entry["client_id"], entry["original_sentence_id"] + '.mp3')
+                transcript = entry["original_sentence"]
+                utter_length = entry["clip_duration"]
+                number_of_entries += 1
+                total_duration += utter_length
+                entries.append(
+                    {
+                        "audio_filepath": os.path.abspath(audio_file),
+                        "text": transcript,
+                        "duration": utter_length,
+                    }
+                )
+
+        logger.info("Total number of entries after processing: %d", number_of_entries)
+        logger.info("Total audio duration (hours) after processing: %.2f", total_duration / 3600)
+
+        return entries
+
+    def process_data(self, data_folder: str, manifest_file: str) -> None:
+        entries = self.process_transcript(os.path.join(data_folder, "uzbekvoice-dataset", "voice_dataset.json"))
+
+        with open(manifest_file, "w", encoding="utf-8") as fout:
+            for m in entries:
+                fout.write(json.dumps(m, ensure_ascii=False) + "\n")
+
+    def process(self):
+        self.download_extract_files(self.raw_data_dir)
+        self.process_data(self.raw_data_dir, self.output_manifest_file)
diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py
index 94e770d9..edb95326 100644
--- a/tests/test_cfg_end_to_end_tests.py
+++ b/tests/test_cfg_end_to_end_tests.py
@@ -76,7 +76,15 @@ def data_check_fn_slr140(raw_data_dir: str) -> None:
     if not expected_file.exists():
         raise ValueError(f"No such file {str(expected_file)}")
 
-    extract_tar_with_strip_components(expected_file, tgt_dir, strip_components=1)
+    extract_tar_with_strip_components(expected_file, tgt_dir, strip_components=1)
+
+def data_check_fn_uzbekvoice(raw_data_dir: str) -> None:
+    expected_files = [Path(raw_data_dir) / "clips.zip", Path(raw_data_dir) / "uzbekvoice-dataset.zip"]
+    # every archive must be present, so fail on the first missing one
+    for expected_file in expected_files:
+        if not expected_file.exists():
+            raise ValueError(f"No such file {str(expected_file)} at {str(raw_data_dir)}")
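+
+# NOTE: CreateInitialManifestUzbekvoice only skips its Google Drive download
+# when both clips.zip and uzbekvoice-dataset.zip are already present, which is
+# why data_check_fn_uzbekvoice requires both files.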
 
 # using Mock so coraal_processor will only try to use the files listed.
 # To reduce the amount of storage required by the test data, the S3 bucket contains
@@ -166,6 +174,18 @@ def get_test_cases() -> List[Tuple[str, Callable]]:
             config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/ksc2/config.yaml",
             data_check_fn=partial(data_check_fn_generic, file_name="ksc2_kk.tar.gz")
         ),
+        TestCase(
+            config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/mcv/config.yaml",
+            data_check_fn=partial(data_check_fn_mcv, archive_file_stem="mcv_uz")
+        ),
+        TestCase(
+            config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/uzbekvoice/config.yaml",
+            data_check_fn=data_check_fn_uzbekvoice
+        ),
+        TestCase(
+            config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/fleurs/config.yaml",
+            data_check_fn=data_check_fn_fleurs
+        )
     ]
 
 def get_test_names():
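
Usage sketch (assuming SDP's standard Hydra entry point `main.py` at the repo
root; the paths below are hypothetical and should be adapted to your setup):

    # prepare the UzbekVoice data end to end with the new config
    python main.py \
        --config-path=dataset_configs/uzbek/uzbekvoice \
        --config-name=config.yaml \
        workspace_dir=/data/uzbekvoice \
        data_split=train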