From 27423f3cf2f5f7981e00caa6176aa6fefcbf4c50 Mon Sep 17 00:00:00 2001
From: Rima Shahbazyan
Date: Tue, 24 Sep 2024 12:48:11 +0400
Subject: [PATCH] Uzbek processors added

Signed-off-by: Rima
Signed-off-by: Rima Shahbazyan

reverting changes from a merge

Signed-off-by: Rima Shahbazyan

docs added

Signed-off-by: Rima Shahbazyan

minor change to Uzbek configuration documentation

Signed-off-by: Rima Shahbazyan

minor change

Signed-off-by: Rima Shahbazyan

Comments added to configs and fleurs testcase added

Signed-off-by: Rima Shahbazyan

Fleurs test added

Signed-off-by: Rima Shahbazyan
---
 .github/workflows/tests.yml                  |   1 +
 dataset_configs/uzbek/fleurs/config.yaml     | 146 +++++++++++++++++
 dataset_configs/uzbek/mcv/config.yaml        | 151 ++++++++++++++++++
 dataset_configs/uzbek/uzbekvoice/config.yaml | 147 +++++++++++++++++
 docs/src/sdp/api.rst                         |   5 +
 docs/src/sdp/existing_configs.rst            |  29 +++-
 requirements/main.txt                        |   1 +
 sdp/processors/__init__.py                   |   3 +
 .../uzbekvoice/create_initial_manifest.py    | 124 ++++++++++++++
 tests/test_cfg_end_to_end_tests.py           |  22 ++-
 10 files changed, 626 insertions(+), 3 deletions(-)
 create mode 100644 dataset_configs/uzbek/fleurs/config.yaml
 create mode 100644 dataset_configs/uzbek/mcv/config.yaml
 create mode 100644 dataset_configs/uzbek/uzbekvoice/config.yaml
 create mode 100644 sdp/processors/datasets/uzbekvoice/create_initial_manifest.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 070d5141..aed66873 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -75,6 +75,7 @@ jobs:
           pip install Cython wheel # need to pre-install to avoid error in nemo installation
           pip install "nemo_toolkit[asr,nlp]"
           python -m pip cache purge
+
       - name: Run all tests
         env:
           AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
diff --git a/dataset_configs/uzbek/fleurs/config.yaml b/dataset_configs/uzbek/fleurs/config.yaml
new file mode 100644
index 00000000..bd1be503
--- /dev/null
+++ b/dataset_configs/uzbek/fleurs/config.yaml
@@ -0,0 +1,146 @@
+documentation: |
+  FLEURS
+  ######
+
+  This config can be used to prepare the
+  `FLEURS `_
+  dataset in the NeMo format.
+  It produces a manifest for the Uzbek language.
+  This config performs the following data processing steps:
+
+  1. Downloads the FLEURS data.
+  2. Calculates the duration of the wav files.
+  3. Adjusts the text by removing punctuation marks and replacing some inconsistent characters.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files will be stored.
+  * **data_split**: should be "train", "dev" or "test".
+
+  Note that you can customize any part of this config either directly or from the command line.
+
+  **Output format**
+
+  This config generates output manifest files:
+
+  * ``${final_manifest}`` - the ``${data_split}`` subset of the data.
+
+  The output manifest contains the following keys:
+
+  * **audio_filepath (str)**: relative path to the audio files.
+  * **text (str)**: transcription (lower-case, without punctuation).
+  * **duration (float)**: audio duration in seconds.
+
+processors_to_run: '0:'
+workspace_dir: ???
+data_split: dev
+save_dir: ${workspace_dir}
+final_manifest: ${workspace_dir}/${data_split}_manifest.json
+
+processors:
+  # create the initial manifest for the chosen FLEURS split (dev by default)
+  - _target_: sdp.processors.CreateInitialManifestFleurs
+    lang: "uz_uz"
+    split: ${data_split}
+    raw_data_dir: ${workspace_dir}/raw_data
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
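+
+  # The SubRegex processor below unifies the various apostrophe glyphs found in
+  # Uzbek text. Illustrative examples (sample words, not taken from the data):
+  #   "g'oya"  -> "gʻoya"   (apostrophe-like mark after o/g becomes ʻ)
+  #   "o`zbek" -> "oʻzbek"
+  #   "ta'lim" -> "ta’lim"  (apostrophe-like marks elsewhere become ’)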
+  - _target_: sdp.processors.SubRegex
+    text_key: text
+
+    regex_params_list:
+      - {"pattern": ":", "repl": ''}
+
+      # replace all the inconsistent apostrophe characters for oʻ and gʻ with ʻ
+      - {"pattern": "(?<=o|g|O|G)‘", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)’", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)`", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)'", "repl": "ʻ"}
+      - {"pattern": '(?<=o|g|O|G)ʼ', "repl": "ʻ"}
+
+      # replace all the inconsistent apostrophe characters besides oʻ and gʻ with ’
+      - {"pattern": "‘", "repl": "’"}
+      - {"pattern": "`", "repl": "’"}
+      - {"pattern": "'", "repl": "’"}
+      - {"pattern": 'ʼ', "repl": "’"}
+      - {"pattern": '(?
diff --git a/dataset_configs/uzbek/mcv/config.yaml b/dataset_configs/uzbek/mcv/config.yaml
new file mode 100644
--- /dev/null
+++ b/dataset_configs/uzbek/mcv/config.yaml
@@ -0,0 +1,151 @@
+documentation: |
+  MCV
+  ###
+
+  This config can be used to prepare the
+  `Mozilla Common Voice `_ dataset
+  17.0 release, but should work for any subsequent releases as well.
+
+  It performs the following data processing steps:
+
+  1. Extracts and converts all data to the specified manifest format.
+  2. Gets audio durations and then keeps only instances with a duration greater than 0.
+  3. Adjusts the text by removing punctuation marks and replacing some inconsistent characters.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files will be stored.
+    You need to manually place the downloaded .tar files inside the
+    ``workspace_dir`` folder.
+  * **data_split**: should be "train", "dev" or "test".
+
+  Note that you can customize any part of this config either directly or from the command line.
+  Here are some common customizations to consider:
+
+  * **remove_pc**: set to True if P&C (punctuation and capitalization) is not needed. Defaults to False.
+  * **remove_hyphen**: set to True if hyphens are not needed. Defaults to True.
+
+  **Output format**.
+
+  This config dumps the final manifest at ``${workspace_dir}/${data_split}_manifest.json``.
+  The output manifest contains the following fields:
+
+  * **audio_filepath (str)**: relative path to the audio files.
+  * **text (str)**: transcription, including punctuation ".,?" and capitalization.
+  * **duration (float)**: audio duration in seconds.
+
+processors_to_run: '0:'
+workspace_dir: ???
+data_split: ???
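+# NOTE: CreateInitialManifestMCV below reads the manually downloaded Common
+# Voice archive from ${workspace_dir} (its raw_data_dir), so place the .tar
+# file there before running this config; the end-to-end test, for example,
+# expects an archive named mcv_uz.tar.gz.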
+final_manifest: ${workspace_dir}/${data_split}_manifest.json
+save_dir: ${workspace_dir}
+remove_pc: False
+
+processors:
+  - _target_: sdp.processors.CreateInitialManifestMCV
+    language_id: uz
+    extract_archive_dir: ${workspace_dir}/raw_data
+    resampled_audio_dir: ${workspace_dir}/${data_split}/audio/
+    data_split: ${data_split}
+    raw_data_dir: ${workspace_dir}
+    output_manifest_file: ${save_dir}/${data_split}_manifest_1.json
+
+  - _target_: sdp.processors.SubRegex
+    text_key: text
+    output_manifest_file: ${save_dir}/${data_split}_manifest_2.json
+
+    regex_params_list:
+      - {"pattern": ":", "repl": ''}
+
+      # replace all the inconsistent apostrophe characters for oʻ and gʻ with ʻ
+      - {"pattern": "(?<=o|g|O|G)‘", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)’", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)`", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)'", "repl": "ʻ"}
+      - {"pattern": '(?<=o|g|O|G)ʼ', "repl": "ʻ"}
+
+      # replace all the inconsistent apostrophe characters besides oʻ and gʻ with ’
+      - {"pattern": "‘", "repl": "’"}
+      - {"pattern": "`", "repl": "’"}
+      - {"pattern": "'", "repl": "’"}
+      - {"pattern": 'ʼ', "repl": "’"}
+      - {"pattern": '(?
diff --git a/dataset_configs/uzbek/uzbekvoice/config.yaml b/dataset_configs/uzbek/uzbekvoice/config.yaml
new file mode 100644
--- /dev/null
+++ b/dataset_configs/uzbek/uzbekvoice/config.yaml
@@ -0,0 +1,147 @@
+documentation: |
+  UzbekVoice
+  ##########
+
+  This config can be used to prepare the
+  `UzbekVoice `_
+  dataset in the NeMo format.
+  It produces a manifest for the Uzbek language.
+  This config performs the following data processing steps:
+
+  1. Downloads the UzbekVoice data.
+  2. Calculates the duration of the audio files.
+  3. Adjusts the text by removing punctuation marks and replacing some inconsistent characters.
+
+  **Required arguments**.
+
+  * **workspace_dir**: specify the workspace folder where all audio files will be stored.
+  * **data_split**: label used only to name the intermediate manifests (the dataset itself is not split).
+
+  Note that you can customize any part of this config either directly or from the command line.
+
+  **Output format**
+
+  This config generates output manifest files:
+
+  * ``${final_manifest}`` - the manifest for the full dataset.
+
+  The output manifest contains the following keys:
+
+  * **audio_filepath (str)**: absolute path to the audio files.
+  * **text (str)**: transcription (lower-case, without punctuation).
+  * **duration (float)**: audio duration in seconds.
+
+processors_to_run: '0:'
+workspace_dir: ???
+final_manifest: ${workspace_dir}/manifest.json
+data_split: ???
+save_dir: ${workspace_dir}
+
+processors:
+  # create the initial manifest for the UzbekVoice data
+  - _target_: sdp.processors.CreateInitialManifestUzbekvoice
+    raw_data_dir: ${workspace_dir}/raw_data
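+
+  # NOTE: the processor above fetches the data from Google Drive with gdown.
+  # For the large archives Google Drive sometimes refuses repeated automated
+  # downloads, in which case clips.zip and uzbekvoice-dataset.zip have to be
+  # downloaded manually into ${workspace_dir}/raw_data (see the processor
+  # code for details).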
+
+  - _target_: sdp.processors.GetAudioDuration
+    audio_filepath_key: audio_filepath
+    duration_key: duration
+    output_manifest_file: ${save_dir}/${data_split}_manifest_1.json
+
+  - _target_: sdp.processors.SubRegex
+    text_key: text
+    output_manifest_file: ${save_dir}/${data_split}_manifest_2.json
+
+    regex_params_list:
+      - {"pattern": ":", "repl": ''}
+
+      # replace all the inconsistent apostrophe characters for oʻ and gʻ with ʻ
+      - {"pattern": "(?<=o|g|O|G)‘", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)’", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)`", "repl": "ʻ"}
+      - {"pattern": "(?<=o|g|O|G)'", "repl": "ʻ"}
+      - {"pattern": '(?<=o|g|O|G)ʼ', "repl": "ʻ"}
+
+      # replace all the inconsistent apostrophe characters besides oʻ and gʻ with ’
+      - {"pattern": "‘", "repl": "’"}
+      - {"pattern": "`", "repl": "’"}
+      - {"pattern": "'", "repl": "’"}
+      - {"pattern": 'ʼ', "repl": "’"}
+      - {"pattern": '(?
diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst
--- a/docs/src/sdp/existing_configs.rst
+++ b/docs/src/sdp/existing_configs.rst
   `config `__ |
   :doc:`documentation `
-
+* **Uzbek**:
+  `config `__ |
+  :doc:`documentation `
+
 .. toctree::
    :hidden:
 
@@ -46,6 +49,7 @@ download the data archive and specify its location with the ``raw_data_dir`` par
    config-docs/portuguese/mcv/config
    config-docs/kazakh/mcv/config
    config-docs/georgian/mcv/config
+   config-docs/uzbek/mcv/config
 
 Multilingual LibriSpeech (MLS)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -173,13 +177,19 @@ Few-shot Learning Evaluation of Universal Representations of Speech (FLEURS)
 
 **Dataset link:** https://huggingface.co/datasets/google/fleurs
 
+
+* **Armenian**: `config `__ | :doc:`documentation `
+* **Uzbek**:
+  `config `__ |
+  :doc:`documentation `
+
 .. toctree::
    :hidden:
 
    config-docs/armenian/fleurs/config
+   config-docs/uzbek/fleurs/config
 
 LibriSpeech
 ~~~~~~~~~~~
@@ -276,4 +286,19 @@ Kazakh Speech Corpus 2 (KSC2)
 .. toctree::
    :hidden:
 
-   config-docs/kazakh/ksc2/config
\ No newline at end of file
+   config-docs/kazakh/ksc2/config
+
+UzbekVoice
+~~~~~~~~~~
+
+**Dataset link:** https://corpus.uzbekvoice.ai/en-US
+
+**Required manual steps:** You need to download the dataset from the Google Drive link provided on the website.
+
+`config `__ |
+:doc:`documentation `
+
+.. toctree::
+   :hidden:
+
+   config-docs/uzbek/uzbekvoice/config
\ No newline at end of file
diff --git a/requirements/main.txt b/requirements/main.txt
index 7617e793..5283c3c7 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -11,6 +11,7 @@ rarfile
 regex
 sox
 tqdm
+gdown
 webvtt-py
 wget
 
diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py
index fdafb521..23079d84 100644
--- a/sdp/processors/__init__.py
+++ b/sdp/processors/__init__.py
@@ -24,6 +24,9 @@ from sdp.processors.datasets.fleurs.create_initial_manifest import (
     CreateInitialManifestFleurs,
 )
+from sdp.processors.datasets.uzbekvoice.create_initial_manifest import (
+    CreateInitialManifestUzbekvoice,
+)
 from sdp.processors.datasets.ksc2.create_initial_manifest import (
     CreateInitialManifestKSC2,
 )
diff --git a/sdp/processors/datasets/uzbekvoice/create_initial_manifest.py b/sdp/processors/datasets/uzbekvoice/create_initial_manifest.py
new file mode 100644
index 00000000..e41216d7
--- /dev/null
+++ b/sdp/processors/datasets/uzbekvoice/create_initial_manifest.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import json
+import os
+import typing
+
+import gdown
+
+from sdp.logging import logger
+from sdp.processors.base_processor import BaseProcessor
+from sdp.utils.common import extract_archive
+
+
+class CreateInitialManifestUzbekvoice(BaseProcessor):
+    """Processor to create the initial manifest for the UzbekVoice dataset.
+
+    Dataset link: https://uzbekvoice.ai/en-US
+
+    Downloads all files, extracts them, and creates a manifest file with the
+    "audio_filepath", "text" and "duration" fields.
+
+    Args:
+        raw_data_dir (str): Path to the folder where the data archive should be downloaded and extracted.
+
+    Returns:
+        This processor generates an initial manifest file with the following fields::
+
+            {
+                "audio_filepath": <path to the audio file>,
+                "text": <transcription>,
+                "duration": <audio duration in seconds>,
+            }
+    """
+
+    URL = "https://drive.google.com/drive/folders/18N5i7GD0LmUnNQok6BP3EC8PYov7pZDW"
+
+    def __init__(
+        self,
+        raw_data_dir: str,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.raw_data_dir = raw_data_dir
+
+    def download_extract_files(self, dst_folder: str) -> None:
+        """Downloads and extracts the dataset archives into ``dst_folder``."""
+
+        os.makedirs(dst_folder, exist_ok=True)
+
+        # Download all files. For big files Google Drive does not allow more than
+        # one download attempt, so if gdown raises an error the files have to be
+        # downloaded manually.
+        # Skip the download if clips.zip and uzbekvoice-dataset.zip are already in dst_folder.
+        if os.path.exists(os.path.join(dst_folder, 'clips.zip')) and os.path.exists(
+            os.path.join(dst_folder, 'uzbekvoice-dataset.zip')
+        ):
+            logger.info("Files already exist in the folder. Skipping download.")
+        else:
+            logger.info("Downloading files from %s...", self.URL)
+            try:
+                gdown.download_folder(self.URL, output=dst_folder)
+            except Exception as e:
+                logger.warning(
+                    "Error occurred while downloading files from Google Drive. "
+                    "Please download them manually from %s. Error: %s",
+                    self.URL,
+                    e,
+                )
+        for file in glob.glob(os.path.join(dst_folder, '*.zip')):
+            extract_archive(file, str(dst_folder), force_extract=True)
+            logger.info("Extracted %s", file)
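+
+    # voice_dataset.json holds a list of utterance records. The fields used by
+    # process_transcript below look roughly like this (illustrative values):
+    #   {
+    #       "client_id": "<speaker id>",
+    #       "original_sentence_id": "<utterance id>",
+    #       "original_sentence": "<transcription>",
+    #       "clip_duration": 3.45,
+    #   }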
+    def process_transcript(self, file_path: str) -> list[dict[str, typing.Any]]:
+        """
+        Parse the transcript JSON file and collect its entries for the manifest.
+        """
+
+        entries = []
+        root = os.path.join(self.raw_data_dir, 'clips')
+        number_of_entries = 0
+        total_duration = 0
+        # parse the json file and collect audio file path, transcript and duration in entries
+        with open(file_path, encoding="utf-8") as fin:
+            data = json.load(fin)
+            for entry in data:
+                audio_file = os.path.join(root, entry["client_id"], entry["original_sentence_id"] + '.mp3')
+                transcript = entry["original_sentence"]
+                utter_length = entry["clip_duration"]
+                number_of_entries += 1
+                total_duration += utter_length
+                entries.append(
+                    {
+                        "audio_filepath": os.path.abspath(audio_file),
+                        "text": transcript,
+                        "duration": utter_length,
+                    }
+                )
+
+        logger.info("Total number of entries after processing: %d", number_of_entries)
+        logger.info("Total audio duration (hours) after processing: %.2f", total_duration / 3600)
+
+        return entries
+
+    def process_data(self, data_folder: str, manifest_file: str) -> None:
+        entries = self.process_transcript(os.path.join(data_folder, "uzbekvoice-dataset", "voice_dataset.json"))
+
+        with open(manifest_file, "w", encoding="utf-8") as fout:
+            for m in entries:
+                fout.write(json.dumps(m, ensure_ascii=False) + "\n")
+
+    def process(self):
+        self.download_extract_files(self.raw_data_dir)
+        self.process_data(self.raw_data_dir, self.output_manifest_file)
diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py
index 94e770d9..edb95326 100644
--- a/tests/test_cfg_end_to_end_tests.py
+++ b/tests/test_cfg_end_to_end_tests.py
@@ -76,7 +76,15 @@ def data_check_fn_slr140(raw_data_dir: str) -> None:
     if not expected_file.exists():
         raise ValueError(f"No such file {str(expected_file)}")
 
-    extract_tar_with_strip_components(expected_file, tgt_dir, strip_components=1)
+    extract_tar_with_strip_components(expected_file, tgt_dir, strip_components=1)
+
+def data_check_fn_uzbekvoice(raw_data_dir: str) -> None:
+    expected_files = [Path(raw_data_dir) / "clips.zip", Path(raw_data_dir) / "uzbekvoice-dataset.zip"]
+    # every archive must be present, so fail on the first missing one
+    for expected_file in expected_files:
+        if not expected_file.exists():
+            raise ValueError(f"No such file {str(expected_file)} at {str(raw_data_dir)}")
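+
+# NOTE: CreateInitialManifestUzbekvoice only skips its Google Drive download
+# when both clips.zip and uzbekvoice-dataset.zip are already present, which is
+# why data_check_fn_uzbekvoice requires both files.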
 
 # using Mock so coraal_processor will only try to use the files listed.
 # To reduce the amount of storage required by the test data, the S3 bucket contains
@@ -166,6 +174,18 @@ def get_test_cases() -> List[Tuple[str, Callable]]:
             config_path=f"{DATASET_CONFIGS_ROOT}/kazakh/ksc2/config.yaml",
             data_check_fn=partial(data_check_fn_generic, file_name="ksc2_kk.tar.gz")
         ),
+        TestCase(
+            config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/mcv/config.yaml",
+            data_check_fn=partial(data_check_fn_mcv, archive_file_stem="mcv_uz")
+        ),
+        TestCase(
+            config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/uzbekvoice/config.yaml",
+            data_check_fn=data_check_fn_uzbekvoice
+        ),
+        TestCase(
+            config_path=f"{DATASET_CONFIGS_ROOT}/uzbek/fleurs/config.yaml",
+            data_check_fn=data_check_fn_fleurs
+        )
     ]
 
 def get_test_names():
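
Usage sketch (assuming SDP's standard Hydra entry point `main.py` at the repo
root; the paths below are hypothetical and should be adapted to your setup):

    # prepare the UzbekVoice data end to end with the new config
    python main.py \
        --config-path=dataset_configs/uzbek/uzbekvoice \
        --config-name=config.yaml \
        workspace_dir=/data/uzbekvoice \
        data_split=train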