Toloka pipelines for start, validate and download #64

Open: wants to merge 8 commits into base: main
67 changes: 67 additions & 0 deletions dataset_configs/armenian/toloka/pipeline_get_final_res.yaml
@@ -0,0 +1,67 @@
documentation: |
Getting final results from Toloka
#################################

This configuration represents the final stage of processing Armenian language datasets for the Toloka platform.
It processes all accepted results from the Toloka pool and prepares the data for training by refining and resampling audio files and ensuring text formatting consistency.

**Stage Overview**:
This stage includes the following steps:
1. Downloading all the ACCEPTED results from the Toloka platform.
2. Filtering out damaged audio files.
3. Resampling audio files to ensure compatibility with ASR models (16 kHz, mono channel).
4. Ensuring all utterances end with a proper Armenian end symbol, appending `:` when missing.
5. Dropping all unnecessary fields, keeping only `text` and `audio_filepath` for training.
6. Calculating the audio duration for each utterance.

**Required Arguments**:
- `workspace_dir`: Specify the directory for storing intermediate and final output files.

**Output Files**:
- `${workspace_dir}/manifest-1.json`: Manifest of all accepted results.
- `${workspace_dir}/manifest0.json`: Manifest after filtering out damaged audio files.
- `${workspace_dir}/manifest1.json`: Manifest with resampled audio files.
- `${workspace_dir}/manifest3.json`: Manifest with text formatting corrections.
- `${workspace_dir}/manifest4.json`: Manifest with only the necessary fields (`text`, `audio_filepath`).
- `${workspace_dir}/results.json`: Final manifest with audio durations.

processors_to_run: all
workspace_dir: ???

processors:
- _target_: sdp.processors.GetTolokaResults
input_pool_file: ${workspace_dir}/taskpool.json
input_data_file: ${workspace_dir}/data_file.json
status: ACCEPTED
output_dir: ${workspace_dir}/results
output_manifest_file: ${workspace_dir}/manifest-1.json
platform: "PRODUCTION"

- _target_: sdp.processors.ASRFileCheck
audio_filepath_key: audio_filepath
corrupted_audio_dir: ${workspace_dir}/curr
output_manifest_file: ${workspace_dir}/manifest0.json

- _target_: sdp.processors.AudioResampler
output_manifest_file: ${workspace_dir}/manifest1.json
resampled_audio_dir: ${workspace_dir}/16k
audio_filepath_key: "audio_filepath"
target_samplerate: 16000
target_nchannels: 1

- _target_: sdp.processors.MakeSentence
text_key: "text"
end_symbol: ":"
make_uppercase: True
output_manifest_file: ${workspace_dir}/manifest3.json

- _target_: sdp.processors.KeepOnlySpecifiedFields
output_manifest_file: ${workspace_dir}/manifest4.json
fields_to_keep: ["text", "audio_filepath"]

- _target_: sdp.processors.GetAudioDuration
audio_filepath_key: audio_filepath
duration_key: duration
output_manifest_file: ${workspace_dir}/results.json
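The text-formatting step in this config (`MakeSentence` with `end_symbol: ":"` and `make_uppercase: True`) can be sketched in a few lines of Python. This is a hypothetical re-implementation based only on the config keys, not the actual SDP source; the set of accepted end symbols mirrors the split pattern used elsewhere in this PR and is an assumption:

```python
# Hedged sketch of the MakeSentence step configured above: uppercase the
# first letter and append the Armenian full stop ":" when a sentence-final
# symbol is missing. END_SYMBOLS is an assumption, not SDP code.

END_SYMBOLS = {":", ".", "…"}

def make_sentence(text: str, end_symbol: str = ":", make_uppercase: bool = True) -> str:
    text = text.strip()
    if not text:
        return text
    if make_uppercase:
        # str.upper() handles Armenian letters, e.g. "բ" -> "Բ"
        text = text[0].upper() + text[1:]
    if text[-1] not in END_SYMBOLS:
        text += end_symbol
    return text
```

For example, `make_sentence("բարև ձեզ")` would yield `"Բարև ձեզ:"`, while an utterance already ending in `:`, `.`, or `…` passes through unchanged.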


188 changes: 188 additions & 0 deletions dataset_configs/armenian/toloka/pipeline_start.yaml
@@ -0,0 +1,188 @@
documentation: |
Start Toloka for Armenian
#########################

This configuration represents the first of three stages for processing Armenian language datasets for the Toloka platform.
It sets up the foundation for creating structured tasks by initializing a new Toloka project, preparing pools, and processing textual data to generate a clean and organized corpus.

**Stage Overview**:
This stage focuses on preparing and refining the dataset through the following steps:
1. Creating a new Toloka project.
2. Creating a new pool for the project.
3. Generating an initial dataset manifest from the file paths of the `.docx` corpus.
4. Extracting text lines from `.docx` files.
5. Processing Armenian punctuation and converting it to English equivalents.
6. Extracting text within brackets to form an additional corpus.
7. Separating sentences in utterances in the additional corpus.
8. Separating sentences in utterances in the main corpus.
9. Merging the main and additional corpora into a combined dataset.
10. Counting the number of words in each sentence.
11. Filtering out long sentences.
12. Filtering out short sentences.
13. Removing duplicate utterances.
14. Submitting the cleaned and processed data to the Toloka pool.

**Required Arguments**:
- `workspace_dir`: Specify the directory for storing intermediate and final output files.

**Output Files**:
- `${workspace_dir}/data_file.json`: Manifest with metadata of the Toloka project.
- `${workspace_dir}/taskpool.json`: Manifest with metadata of the Toloka pool.
- `${workspace_dir}/tasks_clear.json`: Final manifest of the clean text corpus.

processors_to_run: all
workspace_dir: ???

processors:
- _target_: sdp.processors.CreateTolokaProject
platform: "PRODUCTION"
project_name: "Հայերեն ձայնագրություններ"
project_description: "Սեղմել ձայնագրելու կոճակը և կարդալ տեքստը բարձրաձայն։"
project_instructions: |
<style scoped="">
h2,h3 { color: #1F3A93; }
h4 { color: #333333; }
ul,ol { padding-left: 10px; margin-left: 20px; }
</style>
<h3>Հանձնարարության մասին</h3>
<div>
<div>Ձայնագրությունները կօգնեն սովորեցնել հայերեն ավտոմատ խոսք հասկացող մոդել։</div>
<div>Ամեն մի հանձնարարությունը պարունակում է բառակապակցություն կամ նախադասություն։</div>
<div>Ձեր նպատակն է բարձրաձայն կարդալ և ձայնագրել տեքստը։</div>
<div>Ամեն մի նախադասության համար պետք է նոր ձայնագրություն անեք, եթե նույնիսկ դա արդեն հանդիպել է։</div>
</div>
<h3>Ներկայացում</h3>
<div>
<b>iOS Համակարգ (iPhone, iPad)</b>
<ol>
<li>Սեղմեք ձայնագրելու կոճակը։</li>
<li>Սկսեք խոսել միայն ձայնագրությունը միանալուց հետո, այս դեպքում խոսքի սկիզբը չի կորի։</li>
<li>Խոսքը ավարտելուց հետո սեղմեք կանգնեցնելու կոճակը։</li>
<li>Պահպանեք ձայնագրությունը, եթե ամեն ինչ ճիշտ է։ Եթե սխալ եք թույլ տվել, սեղմեք «Նորից սկսել»։</li>
<li>Ձայնագրության հաջող upload-ից հետո իր համարը կհայտնվի հանձնարարության մեջ։ Շարունակեք դեպի հաջորդ հանձնարարությունը։</li>
</ol>
<b>Android Համակարգ</b>
<ol>
<li>Սեղմեք ձայնագրելու կոճակը։</li>
<li>Սեղմեք ձայնագրությունը սկսելու կոճակը։ Սկսեք խոսել միայն ձայնագրությունը միանալուց հետո, այս դեպքում խոսքի սկիզբը չի կորի։</li>
<li>Խոսքը ավարտելուց հետո սեղմեք կանգնեցնելու կոճակը։</li>
<li>Պահպանեք ձայնագրությունը, եթե ամեն ինչ ճիշտ է։ Եթե սխալ եք թույլ տվել, սեղմեք «Նորից սկսել»։</li>
<li>Ձայնագրության հաջող upload-ից հետո իր համարը կհայտնվի հանձնարարության մեջ։ Շարունակեք դեպի հաջորդ հանձնարարությունը։</li>
</ol>
</div>
<h3>Հանձնարարության ավարտման ստուգում</h3>
<div>
<div>Ձայնագրությունների ստուգումը իրականացվում է 7 օրվա ընթացքում։</div>
<h4>Մերժման հնարավոր պատճառներն են՝</h4>
<ul>
<li>Ձայնագրությունները չեն պարունակում հանձնարարության մեջ ներկայացված նախադասությունը կամ բառակապակցությունը։</li>
<li>Նախադասությունները ձայնագրված չեն ամբողջությամբ։</li>
<li>Ձայնագրությունները շատ ցածր են և/կամ առկա է չափից դուրս ուժեղ աղմուկ։</li>
<li>Նույն ձայնագրությունը ներկայացված է մի քանի հանձնարարություններին։</li>
</ul>
</div>
output_manifest_file: ${workspace_dir}/data_file.json
save_api_key_to_config: false


- _target_: sdp.processors.CreateTolokaPool
input_manifest_file: ${workspace_dir}/data_file.json
output_manifest_file: ${workspace_dir}/taskpool.json

- _target_: sdp.processors.CreateInitialManifestByExt
raw_data_dir: ${workspace_dir}/arm_docs
extension: docx
output_file_key: source_filepath
output_manifest_file: ${workspace_dir}/docfiles.json

- _target_: sdp.processors.ReadDocxLines
source_filepath: source_filepath
text_key: text
output_manifest_file: ${workspace_dir}/lines.json

- _target_: sdp.processors.SubRegex
input_manifest_file: ${workspace_dir}/lines.json
output_manifest_file: ${workspace_dir}/lines1.json
text_key: text
regex_params_list:
- {"pattern": '։', "repl": ':'}
- {"pattern": '․', "repl": "."}
- {"pattern": '—', "repl": "-"}
- {"pattern": '–', "repl": "-"}
- {"pattern": '―', "repl": "-"}
- {"pattern": '\.\.\.', "repl": "…"}
- {"pattern": "\\s+", "repl": " "}

- _target_: sdp.processors.ExtractFromBrackets
text_key: text
brackets: ["«»", "()", "[]"]
input_manifest_file: ${workspace_dir}/lines1.json
output_manifest_file: ${workspace_dir}/inbrackets.json

- _target_: sdp.processors.SplitLineBySentence
text_key: text
end_pattern: ':|\.|…'
input_manifest_file: ${workspace_dir}/inbrackets.json
output_manifest_file: ${workspace_dir}/inbrackets_splited.json

- _target_: sdp.processors.SplitLineBySentence
text_key: text
end_pattern: ':|\.|…'
input_manifest_file: ${workspace_dir}/lines1.json
output_manifest_file: ${workspace_dir}/sentences0.json

- _target_: sdp.processors.CreateCombinedManifests
manifest_list: ["${workspace_dir}/inbrackets_splited.json", "${workspace_dir}/sentences0.json"]
output_manifest_file: ${workspace_dir}/sentences_combined.json

- _target_: sdp.processors.DropIfRegexMatch
input_manifest_file: ${workspace_dir}/sentences_combined.json
output_manifest_file: ${workspace_dir}/sentences1.json
regex_patterns:
- '[0-9]'
- '\('
- '\)'
- '\['
- '\]'
- '\*'
- '"'
- '[А-Яа-я]'
- '[A-Za-z]'
- '\+'
- '='
- '¬'
- '&'
- '«'
- '»'

- _target_: sdp.processors.CountNumWords
input_manifest_file: ${workspace_dir}/sentences1.json
output_manifest_file: ${workspace_dir}/sentences2.json
alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև"
text_key: text
num_words_key: num_words

- _target_: sdp.processors.PreserveByValue
input_manifest_file: ${workspace_dir}/sentences2.json
output_manifest_file: ${workspace_dir}/sentences3.json
input_value_key: num_words
target_value: 20
operator: le

- _target_: sdp.processors.PreserveByValue
input_manifest_file: ${workspace_dir}/sentences3.json
output_manifest_file: ${workspace_dir}/sentences4.json
input_value_key: num_words
target_value: 3
operator: ge

- _target_: sdp.processors.DropDuplicates
drop_key: "text"
input_manifest_file: ${workspace_dir}/sentences4.json
output_manifest_file: ${workspace_dir}/tasks_clear.json

- _target_: sdp.processors.CreateTolokaTaskSet
input_manifest_file: ${workspace_dir}/tasks_clear.json
input_pool_file: ${workspace_dir}/taskpool.json
input_data_file: ${workspace_dir}/data_file.json
limit: 1
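The sentence-splitting and length-filtering steps above (`SplitLineBySentence` with `end_pattern: ':|\.|…'`, then the two `PreserveByValue` processors keeping 3 to 20 words) can be approximated as follows. The function names are hypothetical; only the pattern and the word-count bounds come from the config, and the word counter is simplified relative to `CountNumWords`, which restricts counting to the Armenian alphabet:

```python
import re

def split_line_by_sentence(line: str, end_pattern: str = r":|\.|…") -> list[str]:
    """Split a line on sentence-final punctuation, keeping the delimiter."""
    parts = re.split(f"({end_pattern})", line)
    sentences = []
    for i in range(0, len(parts) - 1, 2):
        s = (parts[i] + parts[i + 1]).strip()
        if s:
            sentences.append(s)
    tail = parts[-1].strip()
    if tail:
        sentences.append(tail)
    return sentences

def keep_by_word_count(sentences: list[str], lo: int = 3, hi: int = 20) -> list[str]:
    """Mimic the two PreserveByValue steps (num_words >= 3 and <= 20)."""
    return [s for s in sentences if lo <= len(s.split()) <= hi]
```

Note that the capturing group in `re.split` is what keeps each end symbol attached to its sentence, matching the intent of feeding complete sentences to Toloka tasks.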
91 changes: 91 additions & 0 deletions dataset_configs/armenian/toloka/pipeline_validate_answers.yaml
@@ -0,0 +1,91 @@
documentation: |
Validation of responses for Armenian
####################################

This configuration represents the second stage of processing Armenian language datasets for the Toloka platform.
It focuses on validating and refining the results of completed tasks, leveraging speech-to-text models and quality metrics to ensure high-quality data for subsequent processing.

**Stage Overview**:
This stage includes the following steps:
1. Downloading results of completed tasks from Toloka.
2. Validating the audio files and filtering out corrupted files.
3. Transcribing Armenian audio to text using a HuggingFace model.
4. Cleaning ground truth text by:
- Dropping all non-Armenian alphabetical characters.
- Replacing the two-letter sequence "եւ" with the single ligature "և".
- Converting text to lowercase.
5. Cleaning model-predicted text using the same steps as the ground truth text.
6. Calculating Word Error Rate (WER) between the predicted text and the ground truth text.
7. Filtering out responses with high WER and accepting those with low WER.
8. Rejecting responses from previously banned Tolokers.

**Required Arguments**:
- `workspace_dir`: Specify the directory for storing intermediate and final output files.

**Output Files**:
- `${workspace_dir}/result_manifest.json`: Manifest of results downloaded from Toloka.
- `${workspace_dir}/result_manifest_no_curr.json`: Manifest after removing corrupted files.
- `${workspace_dir}/result_manifest_pred.json`: Manifest with model-predicted transcriptions.
- `${workspace_dir}/result_manifest_pred_clean.json`: Manifest with cleaned predicted transcriptions.
- `${workspace_dir}/result_manifest_pred_review.json`: Final manifest after quality checks, ready for review and acceptance.

processors_to_run: all
workspace_dir: ???

processors:
- _target_: sdp.processors.GetTolokaResults
input_pool_file: ${workspace_dir}/taskpool.json
input_data_file: ${workspace_dir}/data_file.json
status: SUBMITTED
output_dir: ${workspace_dir}/results
output_manifest_file: ${workspace_dir}/result_manifest.json

- _target_: sdp.processors.ASRFileCheck
audio_filepath_key: audio_filepath
corrupted_audio_dir: ${workspace_dir}/curr
output_manifest_file: ${workspace_dir}/result_manifest_no_curr.json

- _target_: sdp.processors.ASRTransformers # pip install accelerate transformers
input_manifest_file: ${workspace_dir}/result_manifest_no_curr.json
output_manifest_file: ${workspace_dir}/result_manifest_pred.json
pretrained_model: "facebook/seamless-m4t-v2-large"
batch_size: 32
generate_language: armenian
generate_task: transcribe
output_text_key: pred_text

- _target_: sdp.processors.SubRegex
input_manifest_file: ${workspace_dir}/result_manifest_pred.json
text_key: text
regex_params_list:
- {"pattern": '[^ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵաբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև ]', "repl": ""}
- {"pattern": 'եւ', "repl": "և"}

- _target_: sdp.processors.SubMakeLowercase
text_key: text

- _target_: sdp.processors.SubRegex
text_key: pred_text
regex_params_list:
- {"pattern": '[^ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵաբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև ]', "repl": ""}
- {"pattern": 'եւ', "repl": "և"}

- _target_: sdp.processors.SubMakeLowercase
text_key: pred_text

- _target_: sdp.processors.GetWER
pred_text_key: pred_text
output_manifest_file: ${workspace_dir}/result_manifest_pred_clean.json

- _target_: sdp.processors.AcceptIfWERLess
input_pool_file: ${workspace_dir}/taskpool.json
input_data_file: ${workspace_dir}/data_file.json
input_manifest_file: ${workspace_dir}/result_manifest_pred_clean.json
output_manifest_file: ${workspace_dir}/result_manifest_pred_review.json
threshold: 75

- _target_: sdp.processors.RejectIfBanned
input_pool_file: ${workspace_dir}/taskpool.json
input_data_file: ${workspace_dir}/data_file.json
input_manifest_file: ${workspace_dir}/result_manifest_pred_clean.json
output_manifest_file: ${workspace_dir}/result_manifest_pred_review.json
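The acceptance criterion in this config (`GetWER` followed by `AcceptIfWERLess` with `threshold: 75`) rests on a word-level edit distance between the cleaned ground-truth and predicted texts. A minimal sketch, assuming WER is expressed as a percentage of the reference length; the `accept` helper and the manifest keys are taken from this config, but the implementation itself is illustrative, not the SDP processor:

```python
def word_error_rate(ref: str, hyp: str) -> float:
    """Word-level Levenshtein distance as a percentage of reference length."""
    r, h = ref.split(), hyp.split()
    d = list(range(len(h) + 1))  # single-row DP table
    for i in range(1, len(r) + 1):
        prev, d[0] = d[0], i
        for j in range(1, len(h) + 1):
            cur = d[j]
            d[j] = min(d[j] + 1,                          # deletion
                       d[j - 1] + 1,                      # insertion
                       prev + (r[i - 1] != h[j - 1]))     # substitution / match
            prev = cur
    return 100.0 * d[len(h)] / max(len(r), 1)

def accept(entry: dict, threshold: float = 75.0) -> bool:
    """Accept a Toloka response when the transcript is close enough to the task text."""
    return word_error_rate(entry["text"], entry["pred_text"]) < threshold
```

With `threshold: 75`, a response is accepted only if fewer than three quarters of the reference words would need editing, a deliberately loose bar given that the seamless-m4t transcriptions are themselves imperfect.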
5 changes: 5 additions & 0 deletions docs/src/sdp/api.rst
@@ -239,9 +239,14 @@ Data filtering
.. autodata:: sdp.processors.DropHighLowDuration
:annotation:

.. autodata:: sdp.processors.DropRepeatedFields
:annotation:

.. autodata:: sdp.processors.DropDuplicates
:annotation:

Miscellaneous
#############
10 changes: 10 additions & 0 deletions requirements/main.txt
@@ -14,7 +14,17 @@ tqdm
gdown
webvtt-py
wget
python-docx
rarfile
regex
sentencepiece
sox
toloka-kit
transformers

# for some processors, additionally https://github.com/NVIDIA/NeMo is required
# for some processors, additionally nemo_text_processing is required
# for mcv: apt-get update && apt-get upgrade -y && apt-get install -y sox libsox-fmt-all
# for seamless: pip install git+https://github.com/huggingface/transformers.git sentencepiece