From 8abc0aa79f3f24c53bd421d646d0dda30736de0b Mon Sep 17 00:00:00 2001 From: monica-sekoyan Date: Sat, 5 Oct 2024 19:22:14 +0400 Subject: [PATCH 01/13] add babel processors Signed-off-by: monica-sekoyan --- .../multilingual/babel/config.yaml | 53 ++++++ .../multilingual/voxpopuli/config.yaml | 37 ++++ .../datasets/babel/create_initial_manifest.py | 179 ++++++++++++++++++ .../voxpopuli/create_initial_manifest.py | 126 ++++++++++++ 4 files changed, 395 insertions(+) create mode 100644 dataset_configs/multilingual/babel/config.yaml create mode 100644 dataset_configs/multilingual/voxpopuli/config.yaml create mode 100644 sdp/processors/datasets/babel/create_initial_manifest.py diff --git a/dataset_configs/multilingual/babel/config.yaml b/dataset_configs/multilingual/babel/config.yaml new file mode 100644 index 00000000..9da2c14c --- /dev/null +++ b/dataset_configs/multilingual/babel/config.yaml @@ -0,0 +1,53 @@ +documentation: | + IARPA Babel Dataset + ################### + + This config is designed for the languages of the IARPA Babel Dataset available at https://catalog.ldc.upenn.edu. + + It creates initial manifest for the specified data type and data split. + Further data processing steps should be performed based on the specific language. + + **Required arguments**. + + * **raw_data_dir**: specify path of the directory downloaded from LDC. + * **data_type**: should be "conversational" or "scripted". + * **resampled_audio_dir**: specify the directory path, where new processed audios should be located. + * **data_split**: should be "training", "untranscribed-training", "sub-train", "dev" or "eval". + * **output_manifest_file**: specify output manifest filepath. + + **Output format**. + + This config dumps the final manifest at ``${output_manifest_file}``. + The output manifest contains the following fields: + + * **outputFn (str)**: initial audio filename. + * **sessID (str)**: session ID of the recording. + * **date (str)**: date of the recording. 
+ * **time (str)**: time of the recording. + * **spkrCode (str)**: speaker ID. + * **lineType (str)**: type of the line (inline or outline). + * **dialect (str)**: dialect of the speaker. + * **gen (str)**: gender of the speaker. + * **envType (str)**: environment (e.g., home, office, etc.). + * **age (str)**: age of the speaker. + * **network (str)**: name of the telecommunications network. + * **phoneModel (str)**: model of the phone. + * **sampleCount (str)**: count of the sample. + * **sampleRate (str)**: original sample rate of the recording. + * **audio_filepath (str)**: path to the processed audio file. + * **duration (float)**: duration of the audio in seconds. + +processors_to_run: all +raw_data_dir: ??? +data_type: ??? +resampled_audio_dir: ??? +data_split: ??? +output_manifest_file: ??? + +processors: + - _target_: sdp.processors.CreateInitialManifestBabel + raw_data_dir: ${raw_data_dir} + data_type: ${data_type} + data_split: ${data_split} + resampled_audio_dir: ${resampled_audio_dir} + output_manifest_file: ${output_manifest_file} \ No newline at end of file diff --git a/dataset_configs/multilingual/voxpopuli/config.yaml b/dataset_configs/multilingual/voxpopuli/config.yaml new file mode 100644 index 00000000..a5e077bf --- /dev/null +++ b/dataset_configs/multilingual/voxpopuli/config.yaml @@ -0,0 +1,37 @@ +documentation: | + Voxpopuli unlabelled subset + ########################### + + This config can be used to prepare + `Voxpopuli dataset unlabelled subset <https://github.com/facebookresearch/voxpopuli>`_ + dataset in the NeMo format. + + It creates initial manifest for the specified language. + + **Required arguments**. + + * **raw_data_dir**: specify the directory where the downloaded data will be/is saved. + * **language_id**: specify the language of the data you wish to be downloaded and/or processed. + * **resampled_audio_dir**: specify the directory path, where new processed audios should be located. + * **delete_raw_file**: specify if the initial raw audio files should be deleted or not. 
+ + + **Output format**. + + This config dumps the final manifest at ``${resampled_audio_dir}/${language_id}/manifest.json``. + The output manifest contains the following fields: + + * **audio_filepath (str)**: path to the processed audio file. + * **duration (float)**: duration of the audio in seconds. + +processors_to_run: all +raw_data_dir: ??? +language_id: ??? +resampled_audio_dir: ??? + +processors: + - _target_: sdp.processors.CreateInitialManifestVoxpopuliUnlabelled + raw_data_dir: ${raw_data_dir} + language_id: ${language_id} + resampled_audio_dir: ${resampled_audio_dir} + delete_raw_file: False diff --git a/sdp/processors/datasets/babel/create_initial_manifest.py b/sdp/processors/datasets/babel/create_initial_manifest.py new file mode 100644 index 00000000..4bf0eaa6 --- /dev/null +++ b/sdp/processors/datasets/babel/create_initial_manifest.py @@ -0,0 +1,179 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pathlib +from pathlib import Path + +from pydub import AudioSegment + +from sdp.logging import logger +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry + + +class CreateInitialManifestBabel(BaseParallelProcessor): + """Processor to create initial manifest for the Babel dataset. 
+ + Dataset is available for 25 underserved languages on https://catalog.ldc.upenn.edu + + Segments the raw audio based on transcription files + (each segment contains an utterance from the transcription file for which start and end timestamps are provided) + and creates manifest for the resampled data. + + .. note:: + The dataset should be downloaded manually from LDC. + + Args: + raw_data_dir (str): the directory where the downloaded data is saved. + data_type (str): "conversational" or "scripted". + data_split (str): "training", "untranscribed-training", "sub-train", "dev" or "eval". + resampled_audio_dir (str): the directory where the resampled audio + files will be stored. + audio_format (str): format in which new audio files will be stored. + target_samplerate (int): sample rate (Hz) to use for resampling. + Defaults to 16000. + target_nchannels (int): number of channels to create during resampling process. + Defaults to 1. + + Returns: + This processor generates an initial manifest file with the following fields:: + + { + "outputFn": , + "sessID": , + "date": , + "time":