Skip to content

Commit

Permalink
black
Browse files Browse the repository at this point in the history
Signed-off-by: Nikolay Karpov <[email protected]>
  • Loading branch information
karpnv committed Nov 24, 2024
2 parents 4441a2a + d6ba7fc commit 1b8c189
Show file tree
Hide file tree
Showing 15 changed files with 806 additions and 68 deletions.
1 change: 1 addition & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ jobs:
pip install Cython wheel # need to pre-install to avoid error in nemo installation
pip install "nemo_toolkit[asr,nlp]"
python -m pip cache purge
- name: Run all tests
env:
AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }}
Expand Down
4 changes: 2 additions & 2 deletions dataset_configs/english/coraal/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ processors:
- {"pattern": '\baksing\b', "repl": "asking"}
- {"pattern": '\baksed\b', "repl": "asked"}
# removing unintelligible/redacted flags
- {"pattern": '/(?i)unintelligible/', "repl": ""}
- {"pattern": '/(?i)inaudible/', "repl": ""}
- {"pattern": '(?i)unintelligible/', "repl": ""}
- {"pattern": '(?i)inaudible/', "repl": ""}
- {"pattern": '/RD(.*?)/', "repl": ""}
- {"pattern": '/(\?)\1*/', "repl": ""}
# removing non-linguistic markers
Expand Down
146 changes: 146 additions & 0 deletions dataset_configs/uzbek/fleurs/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
documentation: |
FLEURS
######
This config can be used to prepare
`FLEURS <https://huggingface.co/datasets/google/fleurs>`_
dataset in the NeMo format.
It produces a manifest for the Uzbek language.
This config performs the following data processing.
1. Downloads FLEURS data
2. Calculates the length of wav files
3. Adjusts the text by removing punctuation marks and replacing some inconsistent characters.
**Required arguments**.
* **workspace_dir**: specify the workspace folder where all audio files will be stored.
* **data_split**: should be "train", "dev" or "test".
Note that you can customize any part of this config either directly or from command-line.
**Output format**
This config generates output manifest files:
* ``${final_manifest}`` (by default ``${workspace_dir}/${data_split}_manifest.json``) - the selected subset of the data (``dev`` by default).
Output manifest contains the following keys:
* **audio_filepath (str)**: relative path to the audio files.
* **text (str)**: transcription, including punctuation ".,-?" and capitalization.
* **duration (float)**: audio duration in seconds.
# Slice-style selector of the processors to execute; "0:" starts at the first one.
processors_to_run: "0:"

# Required argument: ``???`` has no default, the user must provide a value.
workspace_dir: ???

# Split to prepare: "train", "dev" or "test".
data_split: dev

# Both paths below are derived from the workspace folder.
final_manifest: ${workspace_dir}/${data_split}_manifest.json
save_dir: ${workspace_dir}

processors:
# Build the initial manifest for the requested split of the Uzbek FLEURS data.
- _target_: sdp.processors.CreateInitialManifestFleurs
  raw_data_dir: ${workspace_dir}/raw_data
  split: ${data_split}
  lang: uz_uz

# Compute the length of every audio file and store it under ``duration``.
- _target_: sdp.processors.GetAudioDuration
  duration_key: duration
  audio_filepath_key: audio_filepath

# Normalise apostrophe-like characters in the ``text`` field.
# NOTE(review): the oʻ/gʻ rules are listed before the generic rules, which rewrite
# the same characters; this presumably relies on SubRegex applying
# regex_params_list in order - confirm before reordering.
- _target_: sdp.processors.SubRegex
text_key: text

regex_params_list:
# remove colons
- {"pattern": ":", "repl": ''}

# replace all the inconsistent apostrophe characters used in oʻ and gʻ with ʻ
- {"pattern": "(?<=o|g|O|G)‘", "repl": "ʻ"}
- {"pattern": "(?<=o|g|O|G)’", "repl": "ʻ"}
- {"pattern": "(?<=o|g|O|G)`", "repl": "ʻ"}
- {"pattern": "(?<=o|g|O|G)'", "repl": "ʻ"}
- {"pattern": '(?<=o|g|O|G)ʼ', "repl": "ʻ"}

# replace every remaining apostrophe-like character (not part of oʻ or gʻ) with ’
- {"pattern": "‘", "repl": "’"}
- {"pattern": "`", "repl": "’"}
- {"pattern": "'", "repl": "’"}
- {"pattern": 'ʼ', "repl": "’"}
- {"pattern": '(?<!o|g|O|G)ʻ', "repl": "’"}

# the first case covers both rule groups (gʻ and a plain ’), the second covers Oʻ
test_cases:
- { input: { text: "Bir sig’ir ka'tta qashshoqlikni yopadi." }, output: { text: "Bir sigʻir ka’tta qashshoqlikni yopadi." }}
- { input: { text: "O‘shanda yapon universiteti ta’lim grantini yutib olgandim." }, output: { text: "Oʻshanda yapon universiteti ta’lim grantini yutib olgandim." }}


# Normalise punctuation and map non-standard accented letters to the
# Uzbek Latin spellings (oʻ, gʻ, sh, ch, ng) in the ``text`` field.
- _target_: sdp.processors.SubRegex
text_key: text

regex_params_list:
# remove colons (the previous SubRegex step lists the same rule)
- {"pattern": ":", "repl": ""}

# exclamation marks become full stops; carriage returns are removed
- {"pattern": "!", "repl": "."}
- {"pattern": "\r", "repl": ""}

# unify dash-like characters to a plain hyphen
- {"pattern": '―', "repl": "-"}
- {"pattern": '—', "repl": "-"}
- {"pattern": '⁻', "repl": "-"}
- {"pattern": '‑', "repl": "-"}
- {"pattern": '–', "repl": "-"}

# remove double-quote variants; map ";" to "," and ellipses to a single "."
- {"pattern": '"', "repl": ""}
- {"pattern": '“', "repl": ""}
- {"pattern": '”', "repl": ""}
- {"pattern": '„', "repl": ""}
- {"pattern": '‟', "repl": ""}
- {"pattern": ';', "repl": ","}
- {"pattern": '…', "repl": "."}
- {"pattern": '\.\.\.', "repl": "."}

# O with diacritics (Ŏ ŏ Ó ó Ō ō Õ õ) is rewritten as Oʻ / oʻ
- {"pattern": "Ŏ", "repl": "Oʻ"}
- {"pattern": "ŏ", "repl": "oʻ"}
- {"pattern": "Ó", "repl": "Oʻ"}
- {"pattern": "ó", "repl": "oʻ"}
- {"pattern": "Ō", "repl": "Oʻ"}
- {"pattern": "ō", "repl": "oʻ"}
- {"pattern": "Õ", "repl": "Oʻ"}
- {"pattern": "õ", "repl": "oʻ"}

# G with diacritics (Ğ ğ Ǵ ǵ Ḡ ḡ Ğ ğ) is rewritten as Gʻ / gʻ
- {"pattern": "Ğ", "repl": "Gʻ"}
- {"pattern": "ğ", "repl": "gʻ"}
- {"pattern": "Ǵ", "repl": "Gʻ"}
- {"pattern": "ǵ", "repl": "gʻ"}
- {"pattern": "Ḡ", "repl": "Gʻ"}
- {"pattern": "ḡ", "repl": "gʻ"}
# NOTE(review): the next two rules look identical to the first Ğ/ğ pair above;
# they may be a differently encoded (e.g. decomposed) form - confirm before removing.
- {"pattern": "Ğ", "repl": "Gʻ"}
- {"pattern": "ğ", "repl": "gʻ"}

# Ş ş Ç ç Ñ ñ are rewritten as the digraphs Sh sh Ch ch Ng ng
- {"pattern": "Ş", "repl": "Sh"}
- {"pattern": "ş", "repl": "sh"}
- {"pattern": "Ç", "repl": "Ch"}
- {"pattern": "ç", "repl": "ch"}
- {"pattern": "Ñ", "repl": "Ng"}
- {"pattern": "ñ", "repl": "ng"}

# covers the ḡ -> gʻ and ";" -> "," rules
test_cases:
- { input: { text: "Bir siḡir katta; qashshoqlikni yopadi." }, output: { text: "Bir sigʻir katta, qashshoqlikni yopadi." }}


# Keep only utterances whose text starts with an upper-case Latin letter.
# NOTE(review): the pattern expects a leading space, so the processor presumably
# pads the text with spaces before matching - confirm before changing the regex.
- _target_: sdp.processors.DropIfNoneOfRegexMatch
  regex_patterns:
    - "^( [A-Z])(.)+"
  test_cases:
    # lower-case first letter: entry is dropped
    - input:
        text: "one One"
      output: null
    # upper-case first letter: entry is kept unchanged
    - input:
        text: "One one"
      output:
        text: "One one"


# Drop every utterance that contains a character outside ``alphabet``;
# this last step writes the final manifest.
- _target_: sdp.processors.DropNonAlphabet
  output_manifest_file: ${final_manifest}
  alphabet: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZʻ’.,-? "
  test_cases:
    # the bullet character is not in the alphabet
    - input: {text: "• Bir sigir katta qashshoqlikni yopadi."}
      output: null
    # digits are not in the alphabet
    - input: {text: "Bir sigir 2 katta qashshoqlikni yopadi"}
      output: null
    # only allowed characters: entry is kept unchanged
    - input: {text: "Bu vaqtga qadar u oʻzi yoqtirmagan kasbda faoliyat yuritgan."}
      output: {text: "Bu vaqtga qadar u oʻzi yoqtirmagan kasbda faoliyat yuritgan."}

151 changes: 151 additions & 0 deletions dataset_configs/uzbek/mcv/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
documentation: |
MCV Uzbek
###########
This config is designed for the
`Mozilla Common Voice (MCV) <https://commonvoice.mozilla.org/>`_ dataset
17.0 release, but should work for any subsequent releases as well.
It performs the following data processing.
1. Extracts and converts all data to the specified manifest format.
2. Gets audio durations and then keeps only instances with the duration greater than 0.
3. Adjusts the text by removing punctuation marks and replacing some inconsistent characters.
**Required arguments**.
* **workspace_dir**: specify the workspace folder where all audio files will be stored.
You need to manually place the downloaded .tar files inside the
``<workspace dir>`` folder.
* **data_split**: should be "train", "dev" or "test".
Note that you can customize any part of this config either directly or from command-line.
Here are some common customizations to consider:
* **remove_pc**: set to True if P&C is not needed. Defaults to False.
* **remove_hyphen**: set to True if hyphens are not needed. Defaults to True.
**Output format**.
This config dumps the final manifest at ``${workspace_dir}/${data_split}_manifest.json``.
The output manifest contains the following fields:
* **audio_filepath (str)**: relative path to the audio files.
* **text (str)**: transcription, including punctuation ".,?" and capitalization.
* **duration (float)**: audio duration in seconds.
# Slice-style selector of the processors to execute; "0:" starts at the first one.
processors_to_run: "0:"

# Required arguments: ``???`` has no default, the user must provide a value.
workspace_dir: ???
data_split: ???

# Intermediate manifests go to save_dir; the last processor writes final_manifest.
save_dir: ${workspace_dir}
final_manifest: ${workspace_dir}/${data_split}_manifest.json

# NOTE(review): not referenced by any processor visible in this config - confirm
# whether it is still needed.
remove_pc: false

processors:
# Build the initial manifest for the requested split of the Uzbek MCV data.
- _target_: sdp.processors.CreateInitialManifestMCV
  language_id: uz
  data_split: ${data_split}
  # the downloaded archives are expected directly in the workspace folder
  raw_data_dir: ${workspace_dir}
  extract_archive_dir: ${workspace_dir}/raw_data
  resampled_audio_dir: ${workspace_dir}/${data_split}/audio/
  output_manifest_file: ${save_dir}/${data_split}_manifest_1.json

# Normalise apostrophe-like characters in the ``text`` field.
# NOTE(review): the oʻ/gʻ rules are listed before the generic rules, which rewrite
# the same characters; this presumably relies on SubRegex applying
# regex_params_list in order - confirm before reordering.
- _target_: sdp.processors.SubRegex
text_key: text
output_manifest_file: ${save_dir}/${data_split}_manifest_2.json

regex_params_list:
# remove colons
- {"pattern": ":", "repl": ''}

# replace all the inconsistent apostrophe characters used in oʻ and gʻ with ʻ
- {"pattern": "(?<=o|g|O|G)‘", "repl": "ʻ"}
- {"pattern": "(?<=o|g|O|G)’", "repl": "ʻ"}
- {"pattern": "(?<=o|g|O|G)`", "repl": "ʻ"}
- {"pattern": "(?<=o|g|O|G)'", "repl": "ʻ"}
- {"pattern": '(?<=o|g|O|G)ʼ', "repl": "ʻ"}

# replace every remaining apostrophe-like character (not part of oʻ or gʻ) with ’
- {"pattern": "‘", "repl": "’"}
- {"pattern": "`", "repl": "’"}
- {"pattern": "'", "repl": "’"}
- {"pattern": 'ʼ', "repl": "’"}
- {"pattern": '(?<!o|g|O|G)ʻ', "repl": "’"}

# the first case covers both rule groups (gʻ and a plain ’), the second covers Oʻ
test_cases:
- { input: { text: "Bir sig’ir ka'tta qashshoqlikni yopadi." }, output: { text: "Bir sigʻir ka’tta qashshoqlikni yopadi." }}
- { input: { text: "O‘shanda yapon universiteti ta’lim grantini yutib olgandim." }, output: { text: "Oʻshanda yapon universiteti ta’lim grantini yutib olgandim." }}


# Normalise punctuation and map non-standard accented letters to the
# Uzbek Latin spellings (oʻ, gʻ, sh, ch, ng) in the ``text`` field.
- _target_: sdp.processors.SubRegex
text_key: text
output_manifest_file: ${save_dir}/${data_split}_manifest_3.json

regex_params_list:
# remove colons (the previous SubRegex step lists the same rule)
- {"pattern": ":", "repl": ""}

# exclamation marks become full stops; carriage returns are removed
- {"pattern": "!", "repl": "."}
- {"pattern": "\r", "repl": ""}

# unify dash-like characters to a plain hyphen
- {"pattern": '―', "repl": "-"}
- {"pattern": '—', "repl": "-"}
- {"pattern": '⁻', "repl": "-"}
- {"pattern": '‑', "repl": "-"}
- {"pattern": '–', "repl": "-"}

# remove double-quote variants; map ";" to "," and ellipses to a single "."
- {"pattern": '"', "repl": ""}
- {"pattern": '“', "repl": ""}
- {"pattern": '”', "repl": ""}
- {"pattern": '„', "repl": ""}
- {"pattern": '‟', "repl": ""}
- {"pattern": ';', "repl": ","}
- {"pattern": '…', "repl": "."}
- {"pattern": '\.\.\.', "repl": "."}

# O with diacritics (Ŏ ŏ Ó ó Ō ō Õ õ) is rewritten as Oʻ / oʻ
- {"pattern": "Ŏ", "repl": "Oʻ"}
- {"pattern": "ŏ", "repl": "oʻ"}
- {"pattern": "Ó", "repl": "Oʻ"}
- {"pattern": "ó", "repl": "oʻ"}
- {"pattern": "Ō", "repl": "Oʻ"}
- {"pattern": "ō", "repl": "oʻ"}
- {"pattern": "Õ", "repl": "Oʻ"}
- {"pattern": "õ", "repl": "oʻ"}

# G with diacritics (Ğ ğ Ǵ ǵ Ḡ ḡ Ğ ğ) is rewritten as Gʻ / gʻ
- {"pattern": "Ğ", "repl": "Gʻ"}
- {"pattern": "ğ", "repl": "gʻ"}
- {"pattern": "Ǵ", "repl": "Gʻ"}
- {"pattern": "ǵ", "repl": "gʻ"}
- {"pattern": "Ḡ", "repl": "Gʻ"}
- {"pattern": "ḡ", "repl": "gʻ"}
# NOTE(review): the next two rules look identical to the first Ğ/ğ pair above;
# they may be a differently encoded (e.g. decomposed) form - confirm before removing.
- {"pattern": "Ğ", "repl": "Gʻ"}
- {"pattern": "ğ", "repl": "gʻ"}

# Ş ş Ç ç Ñ ñ are rewritten as the digraphs Sh sh Ch ch Ng ng
- {"pattern": "Ş", "repl": "Sh"}
- {"pattern": "ş", "repl": "sh"}
- {"pattern": "Ç", "repl": "Ch"}
- {"pattern": "ç", "repl": "ch"}
- {"pattern": "Ñ", "repl": "Ng"}
- {"pattern": "ñ", "repl": "ng"}

# covers the ḡ -> gʻ and ";" -> "," rules
test_cases:
- { input: { text: "Bir siḡir katta; qashshoqlikni yopadi." }, output: { text: "Bir sigʻir katta, qashshoqlikni yopadi." }}


# Keep only utterances whose text starts with an upper-case Latin letter.
# NOTE(review): the pattern expects a leading space, so the processor presumably
# pads the text with spaces before matching - confirm before changing the regex.
- _target_: sdp.processors.DropIfNoneOfRegexMatch
  regex_patterns:
    - "^( [A-Z])(.)+"
  test_cases:
    # lower-case first letter: entry is dropped
    - input:
        text: "one One"
      output: null
    # upper-case first letter: entry is kept unchanged
    - input:
        text: "One one"
      output:
        text: "One one"


# Drop every utterance that contains a character outside ``alphabet``;
# this last step writes the final manifest.
- _target_: sdp.processors.DropNonAlphabet
  output_manifest_file: ${final_manifest}
  alphabet: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZʻ’.,-? "
  test_cases:
    # the bullet character is not in the alphabet
    - input: {text: "• Bir sigir katta qashshoqlikni yopadi."}
      output: null
    # digits are not in the alphabet
    - input: {text: "Bir sigir 2 katta qashshoqlikni yopadi"}
      output: null
    # only allowed characters: entry is kept unchanged
    - input: {text: "Bu vaqtga qadar u oʻzi yoqtirmagan kasbda faoliyat yuritgan."}
      output: {text: "Bu vaqtga qadar u oʻzi yoqtirmagan kasbda faoliyat yuritgan."}
Loading

0 comments on commit 1b8c189

Please sign in to comment.