From d0bf53b01d61872633c7ec70c2d3e7fc6fe6fbae Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Fri, 1 Nov 2024 23:46:09 -0400 Subject: [PATCH 01/13] update --- ultravox/data/datasets.py | 6 +- ultravox/data/datasets_boolq.py | 18 + ultravox/data/datasets_commonvoice.py | 338 ++++++++++++ ultravox/data/datasets_covost2.py | 481 ++++++++++++++++++ ultravox/data/datasets_gigaspeech.py | 32 ++ ultravox/data/datasets_librispeech.py | 63 +++ .../data/datasets_multilingual_librispeech.py | 59 +++ ultravox/data/datasets_peoplespeech.py | 33 ++ ultravox/data/datasets_voxpopuli.py | 18 + ultravox/data/datasets_wenetspeech.py | 29 ++ ultravox/data/registry.py | 19 +- ultravox/data/types.py | 22 +- ultravox/training/configs/release_config.yaml | 2 +- 13 files changed, 1105 insertions(+), 15 deletions(-) create mode 100644 ultravox/data/datasets_boolq.py create mode 100644 ultravox/data/datasets_commonvoice.py create mode 100644 ultravox/data/datasets_covost2.py create mode 100644 ultravox/data/datasets_gigaspeech.py create mode 100644 ultravox/data/datasets_librispeech.py create mode 100644 ultravox/data/datasets_multilingual_librispeech.py create mode 100644 ultravox/data/datasets_peoplespeech.py create mode 100644 ultravox/data/datasets_voxpopuli.py create mode 100644 ultravox/data/datasets_wenetspeech.py diff --git a/ultravox/data/datasets.py b/ultravox/data/datasets.py index 4ed3413b..90c0e865 100644 --- a/ultravox/data/datasets.py +++ b/ultravox/data/datasets.py @@ -118,7 +118,7 @@ def _init_dataset(self, dataset: data.Dataset, num_samples: int) -> None: def __len__(self): return self._length - + def _load_hf_dataset( self, path: str, @@ -187,10 +187,10 @@ def __iter__(self): raise ValueError(f"Audio length is 0 for sample {sample}") if ( self._args.max_audio_duration_secs is not None - and sample.audio.shape[-1] / types.SAMPLE_RATE + and sample.audio.shape[-1] / data_sample.SAMPLE_RATE > self._args.max_audio_duration_secs ): - duration = sample.audio.shape[-1] / types.SAMPLE_RATE + duration = sample.audio.shape[-1] / data_sample.SAMPLE_RATE warnings.warn( f"Audio length ({duration}s) exceeds max audio duration ({self._args.max_audio_duration_secs}s), skipping sample." ) diff --git a/ultravox/data/datasets_boolq.py b/ultravox/data/datasets_boolq.py new file mode 100644 index 00000000..37c2bfbd --- /dev/null +++ b/ultravox/data/datasets_boolq.py @@ -0,0 +1,18 @@ + +from ultravox.data import types + +BOOLQ_CONFIG = types.DatasetConfig( + name="boolq", + path="fixie-ai/boolq-audio", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=10000), + types.DatasetSplitConfig(name="validation", num_samples=1000), + ], + user_template=f"{{{{passage}}}}\n\n{types.AUDIO_PLACEHOLDER}", + assistant_template="{{'True' if answer else 'False'}}", + transcript_template="{{question}}", +) + +configs = [ + BOOLQ_CONFIG, +] diff --git a/ultravox/data/datasets_commonvoice.py b/ultravox/data/datasets_commonvoice.py new file mode 100644 index 00000000..3273aa5a --- /dev/null +++ b/ultravox/data/datasets_commonvoice.py @@ -0,0 +1,338 @@ + +from ultravox.data import types + +CV_BASE_CONFIG = types.DatasetConfig( + name="commonvoice", + path="fixie-ai/common_voice_17_0", + transcript_template="{{sentence}}", + assistant_template="{{sentence}}", +) + +CV_EN_CONFIG = types.DatasetConfig( + name="commonvoice-en", + base="commonvoice", + subset="en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_101_170), + types.DatasetSplitConfig(name="validation", num_samples=16_393), + ], + transcript_template="{{text_proc.format_asr_text(sentence)}}", + assistant_template="{{text_proc.format_asr_text(sentence)}}", +) + +CV_AR_CONFIG = types.DatasetConfig( + name="commonvoice-ar", + base="commonvoice", + subset="ar", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=28_369), + types.DatasetSplitConfig(name="validation", num_samples=10_470), + ], +) + +CV_DE_CONFIG = types.DatasetConfig( + name="commonvoice-de", + base="commonvoice", + subset="de", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=589_100), + types.DatasetSplitConfig(name="validation", num_samples=16_183), + ], +) + +CV_ES_CONFIG = types.DatasetConfig( + name="commonvoice-es", + base="commonvoice", + subset="es", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=336_846), + types.DatasetSplitConfig(name="validation", num_samples=15_857), + ], +) + +CV_FR_CONFIG = types.DatasetConfig( + name="commonvoice-fr", + base="commonvoice", + subset="fr", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=558_054), + types.DatasetSplitConfig(name="validation", num_samples=16_159), + ], +) + +CV_IT_CONFIG = types.DatasetConfig( + name="commonvoice-it", + base="commonvoice", + subset="it", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=169_771), + types.DatasetSplitConfig(name="validation", num_samples=15_149), + ], +) + +CV_JA_CONFIG = types.DatasetConfig( + name="commonvoice-ja", + base="commonvoice", + subset="ja", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=10_039), + types.DatasetSplitConfig(name="validation", num_samples=6_261), + ], +) + +CV_PT_CONFIG = types.DatasetConfig( + name="commonvoice-pt", + base="commonvoice", + subset="pt", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=21_968), + types.DatasetSplitConfig(name="validation", num_samples=9_464), + ], +) + +CV_RU_CONFIG = types.DatasetConfig( + name="commonvoice-ru", + base="commonvoice", + subset="ru", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=26_377), + types.DatasetSplitConfig(name="validation", num_samples=10_203), + ], +) + +CV_HI_CONFIG = types.DatasetConfig( + name="commonvoice-hi", + base="commonvoice", + subset="hi", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=9_378), + types.DatasetSplitConfig(name="validation", num_samples=4_856), + ], +) + +CV_TR_CONFIG = types.DatasetConfig( + name="commonvoice-tr", + base="commonvoice", + subset="tr", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=35_147), + types.DatasetSplitConfig(name="validation", num_samples=11_258), + ], +) + +CV_SV_CONFIG = types.DatasetConfig( + name="commonvoice-sv", + base="commonvoice", + subset="sv-SE", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=7_744), + types.DatasetSplitConfig(name="validation", num_samples=5_210), + ], +) + +CV_UK_CONFIG = types.DatasetConfig( + name="commonvoice-uk", + base="commonvoice", + subset="uk", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=25_137), + types.DatasetSplitConfig(name="validation", num_samples=10_007), + ], +) + +CV_EN_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-en-transcription", + base="commonvoice-en", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_AR_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-ar-transcription", + base="commonvoice-ar", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_DE_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-de-transcription", + base="commonvoice-de", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_ES_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-es-transcription", + base="commonvoice-es", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_FR_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-fr-transcription", + base="commonvoice-fr", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_IT_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-it-transcription", + base="commonvoice-it", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_JA_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-ja-transcription", + base="commonvoice-ja", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_PT_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-pt-transcription", + base="commonvoice-pt", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_RU_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-ru-transcription", + base="commonvoice-ru", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +CV_HI_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-hi-transcription", + base="commonvoice-hi", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +CV_TR_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-tr-transcription", + base="commonvoice-tr", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +CV_SV_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-sv-transcription", + base="commonvoice-sv", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +CV_UK_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-uk-transcription", + base="commonvoice-uk", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +CV_EN_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-en-continuation", + base="commonvoice-en", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_AR_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-ar-continuation", + base="commonvoice-ar", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_DE_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-de-continuation", + base="commonvoice-de", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_ES_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-es-continuation", + base="commonvoice-es", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_FR_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-fr-continuation", + base="commonvoice-fr", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_IT_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-it-continuation", + base="commonvoice-it", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_JA_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-ja-continuation", + base="commonvoice-ja", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_PT_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-pt-continuation", + base="commonvoice-pt", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_RU_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-ru-continuation", + base="commonvoice-ru", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + + +CV_HI_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-hi-continuation", + base="commonvoice-hi", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +CV_TR_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-tr-continuation", + base="commonvoice-tr", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +CV_SV_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-sv-continuation", + base="commonvoice-sv", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +CV_UK_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-uk-continuation", + base="commonvoice-uk", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +configs = [ + CV_BASE_CONFIG, + CV_EN_CONFIG, + CV_AR_CONFIG, + CV_DE_CONFIG, + CV_ES_CONFIG, + CV_FR_CONFIG, + CV_IT_CONFIG, + CV_JA_CONFIG, + CV_PT_CONFIG, + CV_RU_CONFIG, + CV_HI_CONFIG, + CV_TR_CONFIG, + CV_SV_CONFIG, + CV_UK_CONFIG, + CV_EN_TRANS_CONFIG, + CV_AR_TRANS_CONFIG, + CV_DE_TRANS_CONFIG, + CV_ES_TRANS_CONFIG, + CV_FR_TRANS_CONFIG, + CV_IT_TRANS_CONFIG, + CV_JA_TRANS_CONFIG, + CV_PT_TRANS_CONFIG, + CV_RU_TRANS_CONFIG, + CV_HI_TRANS_CONFIG, + CV_TR_TRANS_CONFIG, + CV_SV_TRANS_CONFIG, + CV_UK_TRANS_CONFIG, + CV_EN_CONT_CONFIG, + CV_AR_CONT_CONFIG, + CV_DE_CONT_CONFIG, + CV_ES_CONT_CONFIG, + CV_FR_CONT_CONFIG, + CV_IT_CONT_CONFIG, + CV_JA_CONT_CONFIG, + CV_PT_CONT_CONFIG, + CV_RU_CONT_CONFIG, + CV_HI_CONT_CONFIG, + CV_TR_CONT_CONFIG, + CV_SV_CONT_CONFIG, + CV_UK_CONT_CONFIG, +] diff --git a/ultravox/data/datasets_covost2.py b/ultravox/data/datasets_covost2.py new file mode 100644 index 00000000..4d948e34 --- /dev/null +++ b/ultravox/data/datasets_covost2.py @@ -0,0 +1,481 @@ +from ultravox.data import types + +CVST_BASE_CONFIG = types.DatasetConfig( + name="covost2", + path="fixie-ai/covost2", + user_template=types.TRANSLATION_USER_TEMPLATE, + transcript_template="{{sentence}}", + assistant_template="{{translation}}", +) + +CVST_AR_EN_CONFIG = types.DatasetConfig( + name="covost2-ar-en", + base="covost2", + subset="ar_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=2_283), + types.DatasetSplitConfig(name="validation", num_samples=1_758), + types.DatasetSplitConfig(name="test", num_samples=1_695), + ], + user_template_args={"target": "English"}, +) + +CVST_CA_EN_CONFIG = types.DatasetConfig( + name="covost2-ca-en", + base="covost2", + subset="ca_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=95_854), + types.DatasetSplitConfig(name="validation", num_samples=12_730), + types.DatasetSplitConfig(name="test", num_samples=12_730), + ], + user_template_args={"target": "English"}, +) + +CVST_CY_EN_CONFIG = types.DatasetConfig( + name="covost2-cy-en", + base="covost2", + subset="cy_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_241), + types.DatasetSplitConfig(name="validation", num_samples=690), + types.DatasetSplitConfig(name="test", num_samples=690), + ], + user_template_args={"target": "English"}, +) + +CVST_DE_EN_CONFIG = types.DatasetConfig( + name="covost2-de-en", + base="covost2", + subset="de_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=127_834), + types.DatasetSplitConfig(name="validation", num_samples=13_511), + types.DatasetSplitConfig(name="test", num_samples=13_511), + ], + user_template_args={"target": "English"}, +) + +CVST_EN_AR_CONFIG = types.DatasetConfig( + name="covost2-en-ar", + base="covost2", + subset="en_ar", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Arabic"}, +) + +CVST_EN_CA_CONFIG = types.DatasetConfig( + name="covost2-en-ca", + base="covost2", + subset="en_ca", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Catalan"}, +) + +CVST_EN_CY_CONFIG = types.DatasetConfig( + name="covost2-en-cy", + base="covost2", + subset="en_cy", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Welsh"}, +) + +CVST_EN_DE_CONFIG = types.DatasetConfig( + name="covost2-en-de", + base="covost2", + subset="en_de", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "German"}, +) + +CVST_EN_ET_CONFIG = types.DatasetConfig( + name="covost2-en-et", + base="covost2", + subset="en_et", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Estonian"}, +) + +CVST_EN_FA_CONFIG = types.DatasetConfig( + name="covost2-en-fa", + base="covost2", + subset="en_fa", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Persian"}, +) + +CVST_EN_ID_CONFIG = types.DatasetConfig( + name="covost2-en-id", + base="covost2", + subset="en_id", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Indonesian"}, +) + +CVST_EN_JA_CONFIG = types.DatasetConfig( + name="covost2-en-ja", + base="covost2", + subset="en_ja", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Japanese"}, +) + +CVST_EN_LV_CONFIG = types.DatasetConfig( + name="covost2-en-lv", + base="covost2", + subset="en_lv", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Latvian"}, +) + +CVST_EN_MN_CONFIG = types.DatasetConfig( + name="covost2-en-mn", + base="covost2", + subset="en_mn", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Mongolian"}, +) + +CVST_EN_SL_CONFIG = types.DatasetConfig( + name="covost2-en-sl", + base="covost2", + subset="en_sl", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Slovenian"}, +) + +CVST_EN_SV_CONFIG = types.DatasetConfig( + name="covost2-en-sv", + base="covost2", + subset="en_sv-SE", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Swedish"}, +) + +CVST_EN_TA_CONFIG = types.DatasetConfig( + name="covost2-en-ta", + base="covost2", + subset="en_ta", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Tamil"}, +) + +CVST_EN_TR_CONFIG = types.DatasetConfig( + name="covost2-en-tr", + base="covost2", + subset="en_tr", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Turkish"}, +) + +CVST_EN_ZH_CONFIG = types.DatasetConfig( + name="covost2-en-zh", + base="covost2", + subset="en_zh-CN", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Chinese"}, +) + +CVST_ES_EN_CONFIG = types.DatasetConfig( + name="covost2-es-en", + base="covost2", + subset="es_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=79_015), + types.DatasetSplitConfig(name="validation", num_samples=13_221), + types.DatasetSplitConfig(name="test", num_samples=13_221), + ], + user_template_args={"target": "English"}, +) + +CVST_ET_EN_CONFIG = types.DatasetConfig( + name="covost2-et-en", + base="covost2", + subset="et_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_782), + types.DatasetSplitConfig(name="validation", num_samples=1_576), + types.DatasetSplitConfig(name="test", num_samples=1_571), + ], + user_template_args={"target": "English"}, +) + +CVST_FA_EN_CONFIG = types.DatasetConfig( + name="covost2-fa-en", + base="covost2", + subset="fa_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=53_949), + types.DatasetSplitConfig(name="validation", num_samples=3_445), + types.DatasetSplitConfig(name="test", num_samples=3_445), + ], + user_template_args={"target": "English"}, +) + +CVST_FR_EN_CONFIG = types.DatasetConfig( + name="covost2-fr-en", + base="covost2", + subset="fr_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=207_374), + types.DatasetSplitConfig(name="validation", num_samples=14_760), + types.DatasetSplitConfig(name="test", num_samples=14_760), + ], + user_template_args={"target": "English"}, +) + +CVST_ID_EN_CONFIG = types.DatasetConfig( + name="covost2-id-en", + base="covost2", + subset="id_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_243), + types.DatasetSplitConfig(name="validation", num_samples=792), + types.DatasetSplitConfig(name="test", num_samples=844), + ], + user_template_args={"target": "English"}, +) + +CVST_IT_EN_CONFIG = types.DatasetConfig( + name="covost2-it-en", + base="covost2", + subset="it_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=31_698), + types.DatasetSplitConfig(name="validation", num_samples=8_940), + types.DatasetSplitConfig(name="test", num_samples=8_951), + ], + user_template_args={"target": "English"}, +) + +CVST_JA_EN_CONFIG = types.DatasetConfig( + name="covost2-ja-en", + base="covost2", + subset="ja_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_119), + types.DatasetSplitConfig(name="validation", num_samples=635), + types.DatasetSplitConfig(name="test", num_samples=684), + ], + user_template_args={"target": "English"}, +) + +CVST_LV_EN_CONFIG = types.DatasetConfig( + name="covost2-lv-en", + base="covost2", + subset="lv_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=2_337), + types.DatasetSplitConfig(name="validation", num_samples=1_125), + types.DatasetSplitConfig(name="test", num_samples=1_629), + ], + user_template_args={"target": "English"}, +) + +CVST_MN_EN_CONFIG = types.DatasetConfig( + name="covost2-mn-en", + base="covost2", + subset="mn_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=2_067), + types.DatasetSplitConfig(name="validation", num_samples=1_761), + types.DatasetSplitConfig(name="test", num_samples=1_759), + ], + user_template_args={"target": "English"}, +) + +CVST_NL_EN_CONFIG = types.DatasetConfig( + name="covost2-nl-en", + base="covost2", + subset="nl_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=7_108), + types.DatasetSplitConfig(name="validation", num_samples=1_699), + types.DatasetSplitConfig(name="test", num_samples=1_699), + ], + user_template_args={"target": "English"}, +) + +CVST_PT_EN_CONFIG = types.DatasetConfig( + name="covost2-pt-en", + base="covost2", + subset="pt_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=9_158), + types.DatasetSplitConfig(name="validation", num_samples=3_318), + types.DatasetSplitConfig(name="test", num_samples=4_023), + ], + user_template_args={"target": "English"}, +) + +CVST_RU_EN_CONFIG = types.DatasetConfig( + name="covost2-ru-en", + base="covost2", + subset="ru_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=12_112), + types.DatasetSplitConfig(name="validation", num_samples=6_110), + types.DatasetSplitConfig(name="test", num_samples=6_300), + ], + user_template_args={"target": "English"}, +) + +CVST_SL_EN_CONFIG = types.DatasetConfig( + name="covost2-sl-en", + base="covost2", + subset="sl_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_843), + types.DatasetSplitConfig(name="validation", num_samples=509), + types.DatasetSplitConfig(name="test", num_samples=360), + ], + user_template_args={"target": "English"}, +) + +CVST_SV_EN_CONFIG = types.DatasetConfig( + name="covost2-sv-en", + base="covost2", + subset="sv-SE_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=2_160), + types.DatasetSplitConfig(name="validation", num_samples=1_349), + types.DatasetSplitConfig(name="test", num_samples=1_595), + ], + user_template_args={"target": "English"}, +) + +CVST_TA_EN_CONFIG = types.DatasetConfig( + name="covost2-ta-en", + base="covost2", + subset="ta_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_358), + types.DatasetSplitConfig(name="validation", num_samples=384), + types.DatasetSplitConfig(name="test", num_samples=786), + ], + user_template_args={"target": "English"}, +) + +CVST_TR_EN_CONFIG = types.DatasetConfig( + name="covost2-tr-en", + base="covost2", + subset="tr_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=3_966), + types.DatasetSplitConfig(name="validation", num_samples=1_624), + types.DatasetSplitConfig(name="test", num_samples=1_629), + ], + user_template_args={"target": "English"}, +) + +CVST_ZH_EN_CONFIG = types.DatasetConfig( + name="covost2-zh-en", + base="covost2", + subset="zh-CN_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=7_085), + types.DatasetSplitConfig(name="validation", num_samples=4_843), + types.DatasetSplitConfig(name="test", num_samples=4_898), + ], + user_template_args={"target": "English"}, +) + +configs = [ + CVST_BASE_CONFIG, + CVST_AR_EN_CONFIG, + CVST_CA_EN_CONFIG, + CVST_CY_EN_CONFIG, + CVST_DE_EN_CONFIG, + CVST_EN_AR_CONFIG, + CVST_EN_CA_CONFIG, + CVST_EN_CY_CONFIG, + CVST_EN_DE_CONFIG, + CVST_EN_ET_CONFIG, + CVST_EN_FA_CONFIG, + CVST_EN_ID_CONFIG, + CVST_EN_JA_CONFIG, + CVST_EN_LV_CONFIG, + CVST_EN_MN_CONFIG, + CVST_EN_SL_CONFIG, + CVST_EN_SV_CONFIG, + CVST_EN_TA_CONFIG, + CVST_EN_TR_CONFIG, + CVST_EN_ZH_CONFIG, + CVST_ES_EN_CONFIG, + CVST_ET_EN_CONFIG, + CVST_FA_EN_CONFIG, + CVST_FR_EN_CONFIG, + CVST_ID_EN_CONFIG, + CVST_IT_EN_CONFIG, + CVST_JA_EN_CONFIG, + CVST_LV_EN_CONFIG, + CVST_MN_EN_CONFIG, + CVST_NL_EN_CONFIG, + CVST_PT_EN_CONFIG, + CVST_RU_EN_CONFIG, + CVST_SL_EN_CONFIG, + CVST_SV_EN_CONFIG, + CVST_TA_EN_CONFIG, + CVST_TR_EN_CONFIG, + CVST_ZH_EN_CONFIG +] diff --git a/ultravox/data/datasets_gigaspeech.py b/ultravox/data/datasets_gigaspeech.py new file mode 100644 index 00000000..cf07b33c --- /dev/null +++ b/ultravox/data/datasets_gigaspeech.py @@ -0,0 +1,32 @@ + +from ultravox.data import types + +GS_XL_CONFIG = types.DatasetConfig( + name="gigaspeech-xl", + path="fixie-ai/gigaspeech", + subset="xl-empty-audio-removed", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=8_266_422), + ], + transcript_template="{{text_proc.format_asr_text(text)}}", + assistant_template="{{text_proc.format_asr_text(text)}}", +) + +GS_XL_TRANS_CONFIG = types.DatasetConfig( + name="gigaspeech-xl-transcription", + base="gigaspeech-xl", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +GS_XL_CONT_CONFIG = types.DatasetConfig( + name="gigaspeech-xl-continuation", + base="gigaspeech-xl", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +configs = [ + GS_XL_CONFIG, + GS_XL_TRANS_CONFIG, + GS_XL_CONT_CONFIG +] diff --git a/ultravox/data/datasets_librispeech.py b/ultravox/data/datasets_librispeech.py new file mode 100644 index 00000000..878d79c3 --- /dev/null +++ b/ultravox/data/datasets_librispeech.py @@ -0,0 +1,63 @@ + +from ultravox.data import types + +LS_BASE_CONFIG = types.DatasetConfig( + name="librispeech", + path="fixie-ai/librispeech_asr", + transcript_template="{{text_proc.format_asr_text(text)}}", + assistant_template="{{text_proc.format_asr_text(text)}}", +) + +LS_CLEAN_CONFIG = types.DatasetConfig( + name="librispeech-clean", + base="librispeech", + subset="clean", + splits=[ + types.DatasetSplitConfig(name="train.100", num_samples=28_539), + types.DatasetSplitConfig(name="train.360", num_samples=104_014), + ], +) + +LS_OTHER_CONFIG = types.DatasetConfig( + name="librispeech-other", + base="librispeech", + subset="other", + splits=[ + types.DatasetSplitConfig(name="train.500", num_samples=148_688), + ], +) + +LS_CLEAN_TRANS_CONFIG = types.DatasetConfig( + name="librispeech-clean-transcription", + base="librispeech-clean", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +LS_OTHER_TRANS_CONFIG = types.DatasetConfig( + name="librispeech-other-transcription", + base="librispeech-other", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +LS_CLEAN_CONT_CONFIG = types.DatasetConfig( + name="librispeech-clean-continuation", + base="librispeech-clean", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +LS_OTHER_CONT_CONFIG = types.DatasetConfig( + name="librispeech-other-continuation", + base="librispeech-other", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +configs = [ + LS_BASE_CONFIG, + LS_CLEAN_CONFIG, + LS_OTHER_CONFIG, + LS_CLEAN_TRANS_CONFIG, + LS_OTHER_TRANS_CONFIG, + LS_CLEAN_CONT_CONFIG, + LS_OTHER_CONT_CONFIG, +] diff --git a/ultravox/data/datasets_multilingual_librispeech.py b/ultravox/data/datasets_multilingual_librispeech.py new file mode 100644 index 00000000..40cc758c --- /dev/null +++ b/ultravox/data/datasets_multilingual_librispeech.py @@ -0,0 +1,59 @@ + +from ultravox.data import types + +ML_BASE_CONFIG = types.DatasetConfig( + name="multilingual_librispeech", + path="fixie-ai/multilingual_librispeech", + transcript_template="{{transcript}}", + assistant_template="{{transcript}}", +) + +ML_NL_CONFIG = types.DatasetConfig( + name="multilingual_librispeech-nl", + base="multilingual_librispeech", + subset="dutch", + splits=[types.DatasetSplitConfig(name="train", num_samples=37_533)], +) + +ML_PT_CONFIG = types.DatasetConfig( + name="multilingual_librispeech-pt", + base="multilingual_librispeech", + subset="portuguese", + splits=[types.DatasetSplitConfig(name="train", num_samples=37_533)], +) + +ML_NL_TRANS_CONFIG = types.DatasetConfig( + name="multilingual_librispeech-nl-transcription", + base="multilingual_librispeech-nl", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +ML_PT_TRANS_CONFIG = types.DatasetConfig( + name="multilingual_librispeech-pt-transcription", + base="multilingual_librispeech-pt", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +ML_NL_CONT_CONFIG = types.DatasetConfig( + name="multilingual_librispeech-nl-continuation", + base="multilingual_librispeech-nl", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +ML_PT_CONT_CONFIG = types.DatasetConfig( + name="multilingual_librispeech-pt-continuation", + base="multilingual_librispeech-pt", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +configs = [ + ML_BASE_CONFIG, + ML_NL_CONFIG, + ML_PT_CONFIG, + ML_NL_TRANS_CONFIG, + ML_PT_TRANS_CONFIG, + ML_NL_CONT_CONFIG, + ML_PT_CONT_CONFIG, +] diff --git a/ultravox/data/datasets_peoplespeech.py b/ultravox/data/datasets_peoplespeech.py new file mode 100644 index 00000000..aaa16ff2 --- /dev/null +++ b/ultravox/data/datasets_peoplespeech.py @@ -0,0 +1,33 @@ + +from ultravox.data import types + +PS_BASE_CONFIG = types.DatasetConfig( + name="peoplespeech", + path="fixie-ai/peoples_speech", + subset="clean", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_501_271), + types.DatasetSplitConfig(name="test", num_samples=34_898, split_type=types.DatasetSplit.VALIDATION), + ], + assistant_template="{{text_proc.format_asr_text(text)}}", + transcript_template="{{text_proc.format_asr_text(text)}}", +) + +PS_TRANS_CONFIG = types.DatasetConfig( + name="peoplespeech-clean-transcription", + base="peoplespeech", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +PS_CONT_CONFIG = types.DatasetConfig( + name="peoplespeech-clean-continuation", + base="peoplespeech", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +configs = [ + PS_BASE_CONFIG, + PS_TRANS_CONFIG, + PS_CONT_CONFIG, +] diff --git a/ultravox/data/datasets_voxpopuli.py b/ultravox/data/datasets_voxpopuli.py new file mode 100644 index 00000000..18741c5b --- /dev/null +++ b/ultravox/data/datasets_voxpopuli.py @@ -0,0 +1,18 @@ + +from ultravox.data import types + +VP_EN_CONFIG = types.DatasetConfig( + name="voxpopuli-en", + path="facebook/voxpopuli", + subset="en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_000_000), + types.DatasetSplitConfig(name="validation", num_samples=10_000), + ], + assistant_template="{{raw_text}}", + transcript_template="{{raw_text}}", +) + +configs = [ + VP_EN_CONFIG, +] diff --git a/ultravox/data/datasets_wenetspeech.py b/ultravox/data/datasets_wenetspeech.py new file mode 100644 index 00000000..ccf3c3fa --- /dev/null +++ b/ultravox/data/datasets_wenetspeech.py @@ -0,0 +1,29 @@ + +from ultravox.data import types + +WS_BASE_CONFIG = types.DatasetConfig( + name="wenetspeech", + path="fixie-ai/wenetspeech", + subset="L_fixed", + splits=[types.DatasetSplitConfig(name="train", num_samples=14_621_415)], + transcript_template="{{text}}", +) + +WS_TRANS_CONFIG = types.DatasetConfig( + name="wenetspeech-transcription", + base="wenetspeech", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +WS_CONT_CONFIG = types.DatasetConfig( + name="wenetspeech-continuation", + base="wenetspeech", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +configs = [ + WS_BASE_CONFIG, + WS_TRANS_CONFIG, + WS_CONT_CONFIG, +] diff --git a/ultravox/data/registry.py b/ultravox/data/registry.py index 9f1b5e29..91ab8a5b 100644 --- a/ultravox/data/registry.py +++ b/ultravox/data/registry.py @@ -3,11 +3,12 @@ from ultravox.data import datasets from ultravox.data import types +from ultravox.data import datasets_boolq, datasets_commonvoice, datasets_covost2, \ + datasets_gigaspeech, datasets_librispeech, datasets_multilingual_librispeech, \ + datasets_peoplespeech, datasets_voxpopuli,datasets_wenetspeech -CONTINUATION_USER_TEMPLATE = f"Continue the following text using less than 50 words:\n\n{types.AUDIO_PLACEHOLDER}" -CONTINUATION_ASSISTANT_TEMPLATE = "{{continuation}}" -TRANSCRIPTION_USER_TEMPLATE = f"Transcribe\n{types.AUDIO_PLACEHOLDER}" +<<<<<<< HEAD BOOLQ_CONFIG = types.DatasetConfig( name="boolq", path="fixie-ai/boolq-audio", @@ -377,6 +378,8 @@ PS_CLEAN_CONT_CONFIG, VP_EN_CONFIG, ] +======= +>>>>>>> upstream/zhuang.2024-10-09-v0_4_1.stacking-4b DATASET_MAP: Dict[str, types.DatasetConfig] = {} @@ -427,4 +430,12 @@ def create_dataset( return datasets.GenericDataset(args, merged_config) -register_datasets(INTERNAL_DATASETS) +register_datasets(datasets_boolq.configs) +register_datasets(datasets_commonvoice.configs) +register_datasets(datasets_covost2.configs) +register_datasets(datasets_gigaspeech.configs) +register_datasets(datasets_librispeech.configs) +register_datasets(datasets_multilingual_librispeech.configs) +register_datasets(datasets_peoplespeech.configs) +register_datasets(datasets_voxpopuli.configs) +register_datasets(datasets_wenetspeech.configs) diff --git a/ultravox/data/types.py b/ultravox/data/types.py index b499c1dd..0e6468ad 100644 --- a/ultravox/data/types.py +++ b/ultravox/data/types.py @@ -6,10 +6,15 @@ AUDIO_PLACEHOLDER = "<|audio|>" +TRANSLATION_USER_TEMPLATE = f"Please translate the text to {{{{target}}}}. Your response should only include the {{{{target}}}} translation, without any additional words:\n\n{AUDIO_PLACEHOLDER}" +CONTINUATION_USER_TEMPLATE = f"Continue the following text using less than 50 words:\n\n{AUDIO_PLACEHOLDER}" +CONTINUATION_ASSISTANT_TEMPLATE = "{{continuation}}" +TRANSCRIPTION_USER_TEMPLATE = f"Transcribe\n{AUDIO_PLACEHOLDER}" class DatasetSplit(str, enum.Enum): TRAIN = "train" VALIDATION = "validation" + TEST = "test" @dataclasses.dataclass @@ -40,15 +45,18 @@ class DatasetSplitConfig(helpers.Serializable): """Name of the split.""" num_samples: int """Number of samples in the split""" - split_type: DatasetSplit = DatasetSplit.TRAIN - """Type of split, i.e., train or validation.""" + split_type: DatasetSplit = None + """Type of split, i.e., train, test, or validation.""" def __post_init__(self): - """Automatically set is_validation if it's a validation split.""" - if self.name == "test": - self.split_type = DatasetSplit.TEST - elif self.name == "validation": - self.split_type = DatasetSplit.VALIDATION + """Automatically set split type based on split name""" + if self.split_type is None: + if self.name == "test": + self.split_type = DatasetSplit.TEST + elif self.name == "validation": + self.split_type = DatasetSplit.VALIDATION + else: + self.split_type = DatasetSplit.TRAIN @dataclasses.dataclass diff --git a/ultravox/training/configs/release_config.yaml b/ultravox/training/configs/release_config.yaml index 6ebdd987..329f9c89 100644 --- a/ultravox/training/configs/release_config.yaml +++ b/ultravox/training/configs/release_config.yaml @@ -57,7 +57,7 @@ train_sets: # Temporarily remove heysquad_human from val_sets as it causes the training to fail. val_sets: - - name: peoplespeech + - name: peoplespeech-clean-transcription batch_size: 24 max_steps: 14400 # x8x24 = 2,764,800 From b6226f751b06b024d7f052a03a65fcffedc2dbe1 Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Fri, 1 Nov 2024 23:53:16 -0400 Subject: [PATCH 02/13] update --- ultravox/data/registry.py | 372 -------------------------------------- 1 file changed, 372 deletions(-) diff --git a/ultravox/data/registry.py b/ultravox/data/registry.py index 91ab8a5b..c1a3804f 100644 --- a/ultravox/data/registry.py +++ b/ultravox/data/registry.py @@ -8,378 +8,6 @@ datasets_peoplespeech, datasets_voxpopuli,datasets_wenetspeech -<<<<<<< HEAD -BOOLQ_CONFIG = types.DatasetConfig( - name="boolq", - path="fixie-ai/boolq-audio", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=10000), - types.DatasetSplitConfig(name="validation", num_samples=1000), - ], - user_template="{{passage}}\n\n{AUDIO_PLACEHOLDER}", - assistant_template="{{'True' if answer else 'False'}}", - transcript_template="{{question}}", -) - -CV_BASE_CONFIG = types.DatasetConfig( - name="commonvoice", - path="fixie-ai/common_voice_17_0", - assistant_template="{{sentence}}", - transcript_template="{{sentence}}", -) - -CV_EN_CONFIG = types.DatasetConfig( - name="commonvoice-en", - base="commonvoice", - subset="en", - splits=[types.DatasetSplitConfig(name="train", num_samples=1_101_170)], -) - -CV_AR_CONFIG = types.DatasetConfig( - name="commonvoice-ar", - base="commonvoice", - subset="ar", - splits=[types.DatasetSplitConfig(name="train", num_samples=28_369)], -) - -CV_DE_CONFIG = types.DatasetConfig( - name="commonvoice-de", - base="commonvoice", - subset="de", - splits=[types.DatasetSplitConfig(name="train", num_samples=589_100)], -) - -CV_ES_CONFIG = types.DatasetConfig( - name="commonvoice-es", - base="commonvoice", - subset="es", - splits=[types.DatasetSplitConfig(name="train", num_samples=336_846)], -) - -CV_FR_CONFIG = types.DatasetConfig( - name="commonvoice-fr", - base="commonvoice", - subset="fr", - splits=[types.DatasetSplitConfig(name="train", num_samples=558_054)], -) - -CV_IT_CONFIG = types.DatasetConfig( - name="commonvoice-it", - base="commonvoice", - subset="it", - splits=[types.DatasetSplitConfig(name="train", num_samples=169_771)], -) - -CV_JA_CONFIG = types.DatasetConfig( - name="commonvoice-ja", - base="commonvoice", - subset="ja", - splits=[types.DatasetSplitConfig(name="train", num_samples=10_039)], -) - -CV_PT_CONFIG = types.DatasetConfig( - name="commonvoice-pt", - base="commonvoice", - subset="pt", - splits=[types.DatasetSplitConfig(name="train", num_samples=21_968)], -) - -CV_RU_CONFIG = types.DatasetConfig( - name="commonvoice-ru", - base="commonvoice", - subset="ru", - splits=[types.DatasetSplitConfig(name="train", num_samples=26_377)], -) - -CV_HI_CONFIG = types.DatasetConfig( - name="commonvoice-hi", - base="commonvoice", - subset="hi", - splits=[types.DatasetSplitConfig(name="train", num_samples=4_690)], -) - -CV_SV_SE_CONFIG = types.DatasetConfig( - name="commonvoice-sv-se", - base="commonvoice", - subset="sv-SE", - splits=[types.DatasetSplitConfig(name="train", num_samples=7_740)], -) - -CV_TR_CONFIG = types.DatasetConfig( - name="commonvoice-tr", - base="commonvoice", - subset="tr", - splits=[types.DatasetSplitConfig(name="train", num_samples=35_100)], -) - -CV_UK_CONFIG = types.DatasetConfig( - name="commonvoice-uk", - base="commonvoice", - subset="uk", - splits=[types.DatasetSplitConfig(name="train", num_samples=25_100)], -) - -GS_XL_CONFIG = types.DatasetConfig( - name="gigaspeech", - path="speechcolab/gigaspeech", - subset="xl", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=1_000_000), - types.DatasetSplitConfig(name="validation", num_samples=10_000), - ], - assistant_template="{{text_proc.format_asr_text(text)}}", - transcript_template="{{text_proc.format_asr_text(text)}}", -) - -LS_BASE_CONFIG = types.DatasetConfig( - name="librispeech", - path="fixie-ai/librispeech_asr", - assistant_template="{{text_proc.format_asr_text(text)}}", - transcript_template="{{text_proc.format_asr_text(text)}}", -) - -LS_CLEAN_CONFIG = types.DatasetConfig( - name="librispeech-clean", - base="librispeech", - subset="clean", - splits=[ - types.DatasetSplitConfig(name="train.100", num_samples=28_539), - types.DatasetSplitConfig(name="train.360", num_samples=104_014), - ], -) - -LS_OTHER_CONFIG = types.DatasetConfig( - name="librispeech-other", - base="librispeech", - subset="other", - splits=[ - types.DatasetSplitConfig(name="train.500", num_samples=148_688), - ], -) - -PS_CLEAN_CONFIG = types.DatasetConfig( - name="peoplespeech", - path="fixie-ai/peoples_speech", - subset="clean", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=1_000_000), - types.DatasetSplitConfig(name="validation", num_samples=10_000), - ], -) - -# SODA_CONFIG = types.DatasetConfig( -# name="soda", -# path="fixie-ai/soda-audio", -# splits=[ -# types.DatasetSplitConfig(name="train", num_samples=1_000_000), -# types.DatasetSplitConfig(name="validation", num_samples=10_000), -# ], -# # Need way to specify message history. -# audio_field="audio_second_last_turn", -# assistant_template="{{alt_last_turn}}", -# transcript_template="{{turns[-2]}}", -# ) - -VP_BASE_CONFIG = types.DatasetConfig( - name="voxpopuli", - path="facebook/voxpopuli", - assistant_template="{{raw_text}}", - transcript_template="{{raw_text}}", -) - -VP_EN_CONFIG = types.DatasetConfig( - name="voxpopuli-en", - base="voxpopuli", - subset="en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=1_000_000), - types.DatasetSplitConfig(name="validation", num_samples=10_000), - ], -) - -CV_EN_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-en-transcription", - base="commonvoice-en", - user_template=TRANSCRIPTION_USER_TEMPLATE, -) -CV_AR_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-ar-transcription", - base="commonvoice-ar", - user_template=TRANSCRIPTION_USER_TEMPLATE, -) -CV_DE_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-de-transcription", - base="commonvoice-de", - user_template=TRANSCRIPTION_USER_TEMPLATE, -) -CV_ES_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-es-transcription", - base="commonvoice-es", - user_template=TRANSCRIPTION_USER_TEMPLATE, -) -CV_FR_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-fr-transcription", - base="commonvoice-fr", - user_template=TRANSCRIPTION_USER_TEMPLATE, -) -CV_IT_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-it-transcription", - base="commonvoice-it", - user_template=TRANSCRIPTION_USER_TEMPLATE, -) -CV_JA_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-ja-transcription", - base="commonvoice-ja", - user_template=TRANSCRIPTION_USER_TEMPLATE, -) -CV_PT_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-pt-transcription", - base="commonvoice-pt", - user_template=TRANSCRIPTION_USER_TEMPLATE, -) -CV_RU_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-ru-transcription", - base="commonvoice-ru", - user_template=TRANSCRIPTION_USER_TEMPLATE, -) - -LS_CLEAN_TRANS_CONFIG = types.DatasetConfig( - name="librispeech-clean-transcription", - base="librispeech-clean", - user_template=TRANSCRIPTION_USER_TEMPLATE, -) -LS_OTHER_TRANS_CONFIG = types.DatasetConfig( - name="librispeech-other-transcription", - base="librispeech-other", - user_template=TRANSCRIPTION_USER_TEMPLATE, -) - -PS_CLEAN_TRANS_CONFIG = types.DatasetConfig( - name="peoplespeech-clean-transcription", - base="peoplespeech", - user_template=TRANSCRIPTION_USER_TEMPLATE, -) - -CV_EN_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-en-continuation", - base="commonvoice-en", - user_template=CONTINUATION_USER_TEMPLATE, - assistant_template=CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_AR_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-ar-continuation", - base="commonvoice-ar", - user_template=CONTINUATION_USER_TEMPLATE, - assistant_template=CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_DE_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-de-continuation", - base="commonvoice-de", - user_template=CONTINUATION_USER_TEMPLATE, - assistant_template=CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_ES_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-es-continuation", - base="commonvoice-es", - user_template=CONTINUATION_USER_TEMPLATE, - assistant_template=CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_FR_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-fr-continuation", - base="commonvoice-fr", - user_template=CONTINUATION_USER_TEMPLATE, - assistant_template=CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_IT_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-it-continuation", - base="commonvoice-it", - user_template=CONTINUATION_USER_TEMPLATE, - assistant_template=CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_JA_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-ja-continuation", - base="commonvoice-ja", - user_template=CONTINUATION_USER_TEMPLATE, - assistant_template=CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_PT_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-pt-continuation", - base="commonvoice-pt", - user_template=CONTINUATION_USER_TEMPLATE, - assistant_template=CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_RU_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-ru-continuation", - base="commonvoice-ru", - user_template=CONTINUATION_USER_TEMPLATE, - assistant_template=CONTINUATION_ASSISTANT_TEMPLATE, -) - -LS_CLEAN_CONT_CONFIG = types.DatasetConfig( - name="librispeech-clean-continuation", - base="librispeech-clean", - user_template=CONTINUATION_USER_TEMPLATE, - assistant_template=CONTINUATION_ASSISTANT_TEMPLATE, -) -LS_OTHER_CONT_CONFIG = types.DatasetConfig( - name="librispeech-other-continuation", - base="librispeech-other", - user_template=CONTINUATION_USER_TEMPLATE, - assistant_template=CONTINUATION_ASSISTANT_TEMPLATE, -) - -PS_CLEAN_CONT_CONFIG = types.DatasetConfig( - name="peoplespeech-clean-continuation", - base="peoplespeech", - user_template=CONTINUATION_USER_TEMPLATE, - assistant_template=CONTINUATION_ASSISTANT_TEMPLATE, -) - -INTERNAL_DATASETS = [ - BOOLQ_CONFIG, - CV_BASE_CONFIG, - CV_EN_CONFIG, - CV_AR_CONFIG, - CV_DE_CONFIG, - CV_ES_CONFIG, - CV_FR_CONFIG, - CV_IT_CONFIG, - CV_JA_CONFIG, - CV_PT_CONFIG, - CV_RU_CONFIG, - CV_EN_TRANS_CONFIG, - CV_AR_TRANS_CONFIG, - CV_DE_TRANS_CONFIG, - CV_ES_TRANS_CONFIG, - CV_FR_TRANS_CONFIG, - CV_IT_TRANS_CONFIG, - CV_JA_TRANS_CONFIG, - CV_PT_TRANS_CONFIG, - CV_RU_TRANS_CONFIG, - CV_EN_CONT_CONFIG, - CV_AR_CONT_CONFIG, - CV_DE_CONT_CONFIG, - CV_ES_CONT_CONFIG, - CV_FR_CONT_CONFIG, - CV_IT_CONT_CONFIG, - CV_JA_CONT_CONFIG, - CV_PT_CONT_CONFIG, - CV_RU_CONT_CONFIG, - GS_XL_CONFIG, - LS_BASE_CONFIG, - LS_CLEAN_CONFIG, - LS_OTHER_CONFIG, - LS_CLEAN_TRANS_CONFIG, - LS_OTHER_TRANS_CONFIG, - LS_CLEAN_CONT_CONFIG, - LS_OTHER_CONT_CONFIG, - PS_CLEAN_CONFIG, - PS_CLEAN_TRANS_CONFIG, - PS_CLEAN_CONT_CONFIG, - VP_EN_CONFIG, -] -======= ->>>>>>> upstream/zhuang.2024-10-09-v0_4_1.stacking-4b DATASET_MAP: Dict[str, types.DatasetConfig] = {} From 420fbea0363f050cf706187835d0ef806241b0ea Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Sat, 2 Nov 2024 00:00:19 -0400 Subject: [PATCH 03/13] update --- ultravox/data/{ => configs}/datasets_boolq.py | 1 - .../data/{ => configs}/datasets_commonvoice.py | 1 - ultravox/data/{ => configs}/datasets_covost2.py | 4 ++-- .../data/{ => configs}/datasets_gigaspeech.py | 7 +------ .../data/{ => configs}/datasets_librispeech.py | 1 - .../datasets_multilingual_librispeech.py | 1 - .../data/{ => configs}/datasets_peoplespeech.py | 5 +++-- ultravox/data/{ => configs}/datasets_test.py | 0 .../data/{ => configs}/datasets_voxpopuli.py | 1 - .../data/{ => configs}/datasets_wenetspeech.py | 1 - ultravox/data/datasets.py | 2 +- ultravox/data/registry.py | 16 ++++++++++------ ultravox/data/types.py | 9 ++++++--- 13 files changed, 23 insertions(+), 26 deletions(-) rename ultravox/data/{ => configs}/datasets_boolq.py (99%) rename ultravox/data/{ => configs}/datasets_commonvoice.py (99%) rename ultravox/data/{ => configs}/datasets_covost2.py (99%) rename ultravox/data/{ => configs}/datasets_gigaspeech.py (90%) rename ultravox/data/{ => configs}/datasets_librispeech.py (99%) rename ultravox/data/{ => configs}/datasets_multilingual_librispeech.py (99%) rename ultravox/data/{ => configs}/datasets_peoplespeech.py (86%) rename ultravox/data/{ => configs}/datasets_test.py (100%) rename ultravox/data/{ => configs}/datasets_voxpopuli.py (99%) rename ultravox/data/{ => configs}/datasets_wenetspeech.py (99%) diff --git a/ultravox/data/datasets_boolq.py b/ultravox/data/configs/datasets_boolq.py similarity index 99% rename from ultravox/data/datasets_boolq.py rename to ultravox/data/configs/datasets_boolq.py index 37c2bfbd..b99ca104 100644 --- a/ultravox/data/datasets_boolq.py +++ b/ultravox/data/configs/datasets_boolq.py @@ -1,4 +1,3 @@ - from ultravox.data import types BOOLQ_CONFIG = types.DatasetConfig( diff --git a/ultravox/data/datasets_commonvoice.py b/ultravox/data/configs/datasets_commonvoice.py similarity index 99% rename from ultravox/data/datasets_commonvoice.py rename to ultravox/data/configs/datasets_commonvoice.py index 3273aa5a..ef823d14 100644 --- a/ultravox/data/datasets_commonvoice.py +++ b/ultravox/data/configs/datasets_commonvoice.py @@ -1,4 +1,3 @@ - from ultravox.data import types CV_BASE_CONFIG = types.DatasetConfig( diff --git a/ultravox/data/datasets_covost2.py b/ultravox/data/configs/datasets_covost2.py similarity index 99% rename from ultravox/data/datasets_covost2.py rename to ultravox/data/configs/datasets_covost2.py index 4d948e34..42091199 100644 --- a/ultravox/data/datasets_covost2.py +++ b/ultravox/data/configs/datasets_covost2.py @@ -443,7 +443,7 @@ configs = [ CVST_BASE_CONFIG, CVST_AR_EN_CONFIG, - CVST_CA_EN_CONFIG, + CVST_CA_EN_CONFIG, CVST_CY_EN_CONFIG, CVST_DE_EN_CONFIG, CVST_EN_AR_CONFIG, @@ -477,5 +477,5 @@ CVST_SV_EN_CONFIG, CVST_TA_EN_CONFIG, CVST_TR_EN_CONFIG, - CVST_ZH_EN_CONFIG + CVST_ZH_EN_CONFIG, ] diff --git a/ultravox/data/datasets_gigaspeech.py b/ultravox/data/configs/datasets_gigaspeech.py similarity index 90% rename from ultravox/data/datasets_gigaspeech.py rename to ultravox/data/configs/datasets_gigaspeech.py index cf07b33c..17b889a4 100644 --- a/ultravox/data/datasets_gigaspeech.py +++ b/ultravox/data/configs/datasets_gigaspeech.py @@ -1,4 +1,3 @@ - from ultravox.data import types GS_XL_CONFIG = types.DatasetConfig( @@ -25,8 +24,4 @@ assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, ) -configs = [ - GS_XL_CONFIG, - GS_XL_TRANS_CONFIG, - GS_XL_CONT_CONFIG -] +configs = [GS_XL_CONFIG, GS_XL_TRANS_CONFIG, GS_XL_CONT_CONFIG] diff --git a/ultravox/data/datasets_librispeech.py b/ultravox/data/configs/datasets_librispeech.py similarity index 99% rename from ultravox/data/datasets_librispeech.py rename to ultravox/data/configs/datasets_librispeech.py index 878d79c3..2f186336 100644 --- a/ultravox/data/datasets_librispeech.py +++ b/ultravox/data/configs/datasets_librispeech.py @@ -1,4 +1,3 @@ - from ultravox.data import types LS_BASE_CONFIG = types.DatasetConfig( diff --git a/ultravox/data/datasets_multilingual_librispeech.py b/ultravox/data/configs/datasets_multilingual_librispeech.py similarity index 99% rename from ultravox/data/datasets_multilingual_librispeech.py rename to ultravox/data/configs/datasets_multilingual_librispeech.py index 40cc758c..4209f27c 100644 --- a/ultravox/data/datasets_multilingual_librispeech.py +++ b/ultravox/data/configs/datasets_multilingual_librispeech.py @@ -1,4 +1,3 @@ - from ultravox.data import types ML_BASE_CONFIG = types.DatasetConfig( diff --git a/ultravox/data/datasets_peoplespeech.py b/ultravox/data/configs/datasets_peoplespeech.py similarity index 86% rename from ultravox/data/datasets_peoplespeech.py rename to ultravox/data/configs/datasets_peoplespeech.py index aaa16ff2..36f8e8f8 100644 --- a/ultravox/data/datasets_peoplespeech.py +++ b/ultravox/data/configs/datasets_peoplespeech.py @@ -1,4 +1,3 @@ - from ultravox.data import types PS_BASE_CONFIG = types.DatasetConfig( @@ -7,7 +6,9 @@ subset="clean", splits=[ types.DatasetSplitConfig(name="train", num_samples=1_501_271), - types.DatasetSplitConfig(name="test", num_samples=34_898, split_type=types.DatasetSplit.VALIDATION), + types.DatasetSplitConfig( + name="test", num_samples=34_898, split_type=types.DatasetSplit.VALIDATION + ), ], assistant_template="{{text_proc.format_asr_text(text)}}", transcript_template="{{text_proc.format_asr_text(text)}}", diff --git a/ultravox/data/datasets_test.py b/ultravox/data/configs/datasets_test.py similarity index 100% rename from ultravox/data/datasets_test.py rename to ultravox/data/configs/datasets_test.py diff --git a/ultravox/data/datasets_voxpopuli.py b/ultravox/data/configs/datasets_voxpopuli.py similarity index 99% rename from ultravox/data/datasets_voxpopuli.py rename to ultravox/data/configs/datasets_voxpopuli.py index 18741c5b..dd62b862 100644 --- a/ultravox/data/datasets_voxpopuli.py +++ b/ultravox/data/configs/datasets_voxpopuli.py @@ -1,4 +1,3 @@ - from ultravox.data import types VP_EN_CONFIG = types.DatasetConfig( diff --git a/ultravox/data/datasets_wenetspeech.py b/ultravox/data/configs/datasets_wenetspeech.py similarity index 99% rename from ultravox/data/datasets_wenetspeech.py rename to ultravox/data/configs/datasets_wenetspeech.py index ccf3c3fa..a487a3cc 100644 --- a/ultravox/data/datasets_wenetspeech.py +++ b/ultravox/data/configs/datasets_wenetspeech.py @@ -1,4 +1,3 @@ - from ultravox.data import types WS_BASE_CONFIG = types.DatasetConfig( diff --git a/ultravox/data/datasets.py b/ultravox/data/datasets.py index 90c0e865..5badf0c5 100644 --- a/ultravox/data/datasets.py +++ b/ultravox/data/datasets.py @@ -118,7 +118,7 @@ def _init_dataset(self, dataset: data.Dataset, num_samples: int) -> None: def __len__(self): return self._length - + def _load_hf_dataset( self, path: str, diff --git a/ultravox/data/registry.py b/ultravox/data/registry.py index c1a3804f..21cf2b2b 100644 --- a/ultravox/data/registry.py +++ b/ultravox/data/registry.py @@ -1,12 +1,16 @@ import dataclasses from typing import Dict, List, Optional -from ultravox.data import datasets -from ultravox.data import types -from ultravox.data import datasets_boolq, datasets_commonvoice, datasets_covost2, \ - datasets_gigaspeech, datasets_librispeech, datasets_multilingual_librispeech, \ - datasets_peoplespeech, datasets_voxpopuli,datasets_wenetspeech - +from ultravox.data import datasets, types +from ultravox.data.configs import datasets_boolq +from ultravox.data.configs import datasets_commonvoice +from ultravox.data.configs import datasets_covost2 +from ultravox.data.configs import datasets_gigaspeech +from ultravox.data.configs import datasets_librispeech +from ultravox.data.configs import datasets_multilingual_librispeech +from ultravox.data.configs import datasets_peoplespeech +from ultravox.data.configs import datasets_voxpopuli +from ultravox.data.configs import datasets_wenetspeech DATASET_MAP: Dict[str, types.DatasetConfig] = {} diff --git a/ultravox/data/types.py b/ultravox/data/types.py index 0e6468ad..58eccc1d 100644 --- a/ultravox/data/types.py +++ b/ultravox/data/types.py @@ -6,11 +6,14 @@ AUDIO_PLACEHOLDER = "<|audio|>" -TRANSLATION_USER_TEMPLATE = f"Please translate the text to {{{{target}}}}. Your response should only include the {{{{target}}}} translation, without any additional words:\n\n{AUDIO_PLACEHOLDER}" -CONTINUATION_USER_TEMPLATE = f"Continue the following text using less than 50 words:\n\n{AUDIO_PLACEHOLDER}" +TRANSLATION_USER_TEMPLATE = f"Please translate the text to {{{{target}}}}. Your response should only include the {{{{target}}}} translation, without any additional words:\n\n{AUDIO_PLACEHOLDER}" +CONTINUATION_USER_TEMPLATE = ( + f"Continue the following text using less than 50 words:\n\n{AUDIO_PLACEHOLDER}" +) CONTINUATION_ASSISTANT_TEMPLATE = "{{continuation}}" TRANSCRIPTION_USER_TEMPLATE = f"Transcribe\n{AUDIO_PLACEHOLDER}" + class DatasetSplit(str, enum.Enum): TRAIN = "train" VALIDATION = "validation" @@ -45,7 +48,7 @@ class DatasetSplitConfig(helpers.Serializable): """Name of the split.""" num_samples: int """Number of samples in the split""" - split_type: DatasetSplit = None + split_type: Optional[DatasetSplit] = None """Type of split, i.e., train, test, or validation.""" def __post_init__(self): From 60f6757ac13408f2ff36e5c2ad87e3a83d6ae375 Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Sat, 2 Nov 2024 00:08:09 -0400 Subject: [PATCH 04/13] update --- ultravox/data/registry.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ultravox/data/registry.py b/ultravox/data/registry.py index 21cf2b2b..fa6b3275 100644 --- a/ultravox/data/registry.py +++ b/ultravox/data/registry.py @@ -1,7 +1,8 @@ import dataclasses from typing import Dict, List, Optional -from ultravox.data import datasets, types +from ultravox.data import datasets +from ultravox.data import types from ultravox.data.configs import datasets_boolq from ultravox.data.configs import datasets_commonvoice from ultravox.data.configs import datasets_covost2 From f548cfb47adbebfdee4accf139852fcbdeef131c Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Sat, 2 Nov 2024 00:10:57 -0400 Subject: [PATCH 05/13] update --- ultravox/data/configs/datasets_boolq.py | 17 - ultravox/data/configs/datasets_commonvoice.py | 337 ------------ ultravox/data/configs/datasets_covost2.py | 481 ------------------ ultravox/data/configs/datasets_gigaspeech.py | 27 - ultravox/data/configs/datasets_librispeech.py | 62 --- .../datasets_multilingual_librispeech.py | 58 --- .../data/configs/datasets_peoplespeech.py | 34 -- ultravox/data/configs/datasets_test.py | 377 -------------- ultravox/data/configs/datasets_voxpopuli.py | 17 - ultravox/data/configs/datasets_wenetspeech.py | 28 - ultravox/data/registry.py | 36 +- 11 files changed, 18 insertions(+), 1456 deletions(-) delete mode 100644 ultravox/data/configs/datasets_boolq.py delete mode 100644 ultravox/data/configs/datasets_commonvoice.py delete mode 100644 ultravox/data/configs/datasets_covost2.py delete mode 100644 ultravox/data/configs/datasets_gigaspeech.py delete mode 100644 ultravox/data/configs/datasets_librispeech.py delete mode 100644 ultravox/data/configs/datasets_multilingual_librispeech.py delete mode 100644 ultravox/data/configs/datasets_peoplespeech.py delete mode 100644 ultravox/data/configs/datasets_test.py delete mode 100644 ultravox/data/configs/datasets_voxpopuli.py delete mode 100644 ultravox/data/configs/datasets_wenetspeech.py diff --git a/ultravox/data/configs/datasets_boolq.py b/ultravox/data/configs/datasets_boolq.py deleted file mode 100644 index b99ca104..00000000 --- a/ultravox/data/configs/datasets_boolq.py +++ /dev/null @@ -1,17 +0,0 @@ -from ultravox.data import types - -BOOLQ_CONFIG = types.DatasetConfig( - name="boolq", - path="fixie-ai/boolq-audio", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=10000), - types.DatasetSplitConfig(name="validation", num_samples=1000), - ], - user_template=f"{{{{passage}}}}\n\n{types.AUDIO_PLACEHOLDER}", - assistant_template="{{'True' if answer else 'False'}}", - transcript_template="{{question}}", -) - -configs = [ - BOOLQ_CONFIG, -] diff --git a/ultravox/data/configs/datasets_commonvoice.py b/ultravox/data/configs/datasets_commonvoice.py deleted file mode 100644 index ef823d14..00000000 --- a/ultravox/data/configs/datasets_commonvoice.py +++ /dev/null @@ -1,337 +0,0 @@ -from ultravox.data import types - -CV_BASE_CONFIG = types.DatasetConfig( - name="commonvoice", - path="fixie-ai/common_voice_17_0", - transcript_template="{{sentence}}", - assistant_template="{{sentence}}", -) - -CV_EN_CONFIG = types.DatasetConfig( - name="commonvoice-en", - base="commonvoice", - subset="en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=1_101_170), - types.DatasetSplitConfig(name="validation", num_samples=16_393), - ], - transcript_template="{{text_proc.format_asr_text(sentence)}}", - assistant_template="{{text_proc.format_asr_text(sentence)}}", -) - -CV_AR_CONFIG = types.DatasetConfig( - name="commonvoice-ar", - base="commonvoice", - subset="ar", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=28_369), - types.DatasetSplitConfig(name="validation", num_samples=10_470), - ], -) - -CV_DE_CONFIG = types.DatasetConfig( - name="commonvoice-de", - base="commonvoice", - subset="de", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=589_100), - types.DatasetSplitConfig(name="validation", num_samples=16_183), - ], -) - -CV_ES_CONFIG = types.DatasetConfig( - name="commonvoice-es", - base="commonvoice", - subset="es", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=336_846), - types.DatasetSplitConfig(name="validation", num_samples=15_857), - ], -) - -CV_FR_CONFIG = types.DatasetConfig( - name="commonvoice-fr", - base="commonvoice", - subset="fr", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=558_054), - types.DatasetSplitConfig(name="validation", num_samples=16_159), - ], -) - -CV_IT_CONFIG = types.DatasetConfig( - name="commonvoice-it", - base="commonvoice", - subset="it", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=169_771), - types.DatasetSplitConfig(name="validation", num_samples=15_149), - ], -) - -CV_JA_CONFIG = types.DatasetConfig( - name="commonvoice-ja", - base="commonvoice", - subset="ja", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=10_039), - types.DatasetSplitConfig(name="validation", num_samples=6_261), - ], -) - -CV_PT_CONFIG = types.DatasetConfig( - name="commonvoice-pt", - base="commonvoice", - subset="pt", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=21_968), - types.DatasetSplitConfig(name="validation", num_samples=9_464), - ], -) - -CV_RU_CONFIG = types.DatasetConfig( - name="commonvoice-ru", - base="commonvoice", - subset="ru", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=26_377), - types.DatasetSplitConfig(name="validation", num_samples=10_203), - ], -) - -CV_HI_CONFIG = types.DatasetConfig( - name="commonvoice-hi", - base="commonvoice", - subset="hi", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=9_378), - types.DatasetSplitConfig(name="validation", num_samples=4_856), - ], -) - -CV_TR_CONFIG = types.DatasetConfig( - name="commonvoice-tr", - base="commonvoice", - subset="tr", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=35_147), - types.DatasetSplitConfig(name="validation", num_samples=11_258), - ], -) - -CV_SV_CONFIG = types.DatasetConfig( - name="commonvoice-sv", - base="commonvoice", - subset="sv-SE", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=7_744), - types.DatasetSplitConfig(name="validation", num_samples=5_210), - ], -) - -CV_UK_CONFIG = types.DatasetConfig( - name="commonvoice-uk", - base="commonvoice", - subset="uk", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=25_137), - types.DatasetSplitConfig(name="validation", num_samples=10_007), - ], -) - -CV_EN_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-en-transcription", - base="commonvoice-en", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) -CV_AR_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-ar-transcription", - base="commonvoice-ar", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) -CV_DE_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-de-transcription", - base="commonvoice-de", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) -CV_ES_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-es-transcription", - base="commonvoice-es", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) -CV_FR_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-fr-transcription", - base="commonvoice-fr", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) -CV_IT_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-it-transcription", - base="commonvoice-it", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) -CV_JA_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-ja-transcription", - base="commonvoice-ja", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) -CV_PT_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-pt-transcription", - base="commonvoice-pt", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) -CV_RU_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-ru-transcription", - base="commonvoice-ru", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) - -CV_HI_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-hi-transcription", - base="commonvoice-hi", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) - -CV_TR_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-tr-transcription", - base="commonvoice-tr", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) - -CV_SV_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-sv-transcription", - base="commonvoice-sv", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) - -CV_UK_TRANS_CONFIG = types.DatasetConfig( - name="commonvoice-uk-transcription", - base="commonvoice-uk", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) - -CV_EN_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-en-continuation", - base="commonvoice-en", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_AR_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-ar-continuation", - base="commonvoice-ar", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_DE_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-de-continuation", - base="commonvoice-de", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_ES_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-es-continuation", - base="commonvoice-es", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_FR_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-fr-continuation", - base="commonvoice-fr", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_IT_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-it-continuation", - base="commonvoice-it", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_JA_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-ja-continuation", - base="commonvoice-ja", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_PT_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-pt-continuation", - base="commonvoice-pt", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) -CV_RU_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-ru-continuation", - base="commonvoice-ru", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) - - -CV_HI_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-hi-continuation", - base="commonvoice-hi", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) - -CV_TR_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-tr-continuation", - base="commonvoice-tr", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) - -CV_SV_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-sv-continuation", - base="commonvoice-sv", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) - -CV_UK_CONT_CONFIG = types.DatasetConfig( - name="commonvoice-uk-continuation", - base="commonvoice-uk", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) - -configs = [ - CV_BASE_CONFIG, - CV_EN_CONFIG, - CV_AR_CONFIG, - CV_DE_CONFIG, - CV_ES_CONFIG, - CV_FR_CONFIG, - CV_IT_CONFIG, - CV_JA_CONFIG, - CV_PT_CONFIG, - CV_RU_CONFIG, - CV_HI_CONFIG, - CV_TR_CONFIG, - CV_SV_CONFIG, - CV_UK_CONFIG, - CV_EN_TRANS_CONFIG, - CV_AR_TRANS_CONFIG, - CV_DE_TRANS_CONFIG, - CV_ES_TRANS_CONFIG, - CV_FR_TRANS_CONFIG, - CV_IT_TRANS_CONFIG, - CV_JA_TRANS_CONFIG, - CV_PT_TRANS_CONFIG, - CV_RU_TRANS_CONFIG, - CV_HI_TRANS_CONFIG, - CV_TR_TRANS_CONFIG, - CV_SV_TRANS_CONFIG, - CV_UK_TRANS_CONFIG, - CV_EN_CONT_CONFIG, - CV_AR_CONT_CONFIG, - CV_DE_CONT_CONFIG, - CV_ES_CONT_CONFIG, - CV_FR_CONT_CONFIG, - CV_IT_CONT_CONFIG, - CV_JA_CONT_CONFIG, - CV_PT_CONT_CONFIG, - CV_RU_CONT_CONFIG, - CV_HI_CONT_CONFIG, - CV_TR_CONT_CONFIG, - CV_SV_CONT_CONFIG, - CV_UK_CONT_CONFIG, -] diff --git a/ultravox/data/configs/datasets_covost2.py b/ultravox/data/configs/datasets_covost2.py deleted file mode 100644 index 42091199..00000000 --- a/ultravox/data/configs/datasets_covost2.py +++ /dev/null @@ -1,481 +0,0 @@ -from ultravox.data import types - -CVST_BASE_CONFIG = types.DatasetConfig( - name="covost2", - path="fixie-ai/covost2", - user_template=types.TRANSLATION_USER_TEMPLATE, - transcript_template="{{sentence}}", - assistant_template="{{translation}}", -) - -CVST_AR_EN_CONFIG = types.DatasetConfig( - name="covost2-ar-en", - base="covost2", - subset="ar_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=2_283), - types.DatasetSplitConfig(name="validation", num_samples=1_758), - types.DatasetSplitConfig(name="test", num_samples=1_695), - ], - user_template_args={"target": "English"}, -) - -CVST_CA_EN_CONFIG = types.DatasetConfig( - name="covost2-ca-en", - base="covost2", - subset="ca_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=95_854), - types.DatasetSplitConfig(name="validation", num_samples=12_730), - types.DatasetSplitConfig(name="test", num_samples=12_730), - ], - user_template_args={"target": "English"}, -) - -CVST_CY_EN_CONFIG = types.DatasetConfig( - name="covost2-cy-en", - base="covost2", - subset="cy_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=1_241), - types.DatasetSplitConfig(name="validation", num_samples=690), - types.DatasetSplitConfig(name="test", num_samples=690), - ], - user_template_args={"target": "English"}, -) - -CVST_DE_EN_CONFIG = types.DatasetConfig( - name="covost2-de-en", - base="covost2", - subset="de_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=127_834), - types.DatasetSplitConfig(name="validation", num_samples=13_511), - types.DatasetSplitConfig(name="test", num_samples=13_511), - ], - user_template_args={"target": "English"}, -) - -CVST_EN_AR_CONFIG = types.DatasetConfig( - name="covost2-en-ar", - base="covost2", - subset="en_ar", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Arabic"}, -) - -CVST_EN_CA_CONFIG = types.DatasetConfig( - name="covost2-en-ca", - base="covost2", - subset="en_ca", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Catalan"}, -) - -CVST_EN_CY_CONFIG = types.DatasetConfig( - name="covost2-en-cy", - base="covost2", - subset="en_cy", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Welsh"}, -) - -CVST_EN_DE_CONFIG = types.DatasetConfig( - name="covost2-en-de", - base="covost2", - subset="en_de", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "German"}, -) - -CVST_EN_ET_CONFIG = types.DatasetConfig( - name="covost2-en-et", - base="covost2", - subset="en_et", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Estonian"}, -) - -CVST_EN_FA_CONFIG = types.DatasetConfig( - name="covost2-en-fa", - base="covost2", - subset="en_fa", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Persian"}, -) - -CVST_EN_ID_CONFIG = types.DatasetConfig( - name="covost2-en-id", - base="covost2", - subset="en_id", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Indonesian"}, -) - -CVST_EN_JA_CONFIG = types.DatasetConfig( - name="covost2-en-ja", - base="covost2", - subset="en_ja", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Japanese"}, -) - -CVST_EN_LV_CONFIG = types.DatasetConfig( - name="covost2-en-lv", - base="covost2", - subset="en_lv", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Latvian"}, -) - -CVST_EN_MN_CONFIG = types.DatasetConfig( - name="covost2-en-mn", - base="covost2", - subset="en_mn", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Mongolian"}, -) - -CVST_EN_SL_CONFIG = types.DatasetConfig( - name="covost2-en-sl", - base="covost2", - subset="en_sl", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Slovenian"}, -) - -CVST_EN_SV_CONFIG = types.DatasetConfig( - name="covost2-en-sv", - base="covost2", - subset="en_sv-SE", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Swedish"}, -) - -CVST_EN_TA_CONFIG = types.DatasetConfig( - name="covost2-en-ta", - base="covost2", - subset="en_ta", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Tamil"}, -) - -CVST_EN_TR_CONFIG = types.DatasetConfig( - name="covost2-en-tr", - base="covost2", - subset="en_tr", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Turkish"}, -) - -CVST_EN_ZH_CONFIG = types.DatasetConfig( - name="covost2-en-zh", - base="covost2", - subset="en_zh-CN", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=289_430), - types.DatasetSplitConfig(name="validation", num_samples=15_531), - types.DatasetSplitConfig(name="test", num_samples=15_531), - ], - user_template_args={"target": "Chinese"}, -) - -CVST_ES_EN_CONFIG = types.DatasetConfig( - name="covost2-es-en", - base="covost2", - subset="es_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=79_015), - types.DatasetSplitConfig(name="validation", num_samples=13_221), - types.DatasetSplitConfig(name="test", num_samples=13_221), - ], - user_template_args={"target": "English"}, -) - -CVST_ET_EN_CONFIG = types.DatasetConfig( - name="covost2-et-en", - base="covost2", - subset="et_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=1_782), - types.DatasetSplitConfig(name="validation", num_samples=1_576), - types.DatasetSplitConfig(name="test", num_samples=1_571), - ], - user_template_args={"target": "English"}, -) - -CVST_FA_EN_CONFIG = types.DatasetConfig( - name="covost2-fa-en", - base="covost2", - subset="fa_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=53_949), - types.DatasetSplitConfig(name="validation", num_samples=3_445), - types.DatasetSplitConfig(name="test", num_samples=3_445), - ], - user_template_args={"target": "English"}, -) - -CVST_FR_EN_CONFIG = types.DatasetConfig( - name="covost2-fr-en", - base="covost2", - subset="fr_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=207_374), - types.DatasetSplitConfig(name="validation", num_samples=14_760), - types.DatasetSplitConfig(name="test", num_samples=14_760), - ], - user_template_args={"target": "English"}, -) - -CVST_ID_EN_CONFIG = types.DatasetConfig( - name="covost2-id-en", - base="covost2", - subset="id_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=1_243), - types.DatasetSplitConfig(name="validation", num_samples=792), - types.DatasetSplitConfig(name="test", num_samples=844), - ], - user_template_args={"target": "English"}, -) - -CVST_IT_EN_CONFIG = types.DatasetConfig( - name="covost2-it-en", - base="covost2", - subset="it_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=31_698), - types.DatasetSplitConfig(name="validation", num_samples=8_940), - types.DatasetSplitConfig(name="test", num_samples=8_951), - ], - user_template_args={"target": "English"}, -) - -CVST_JA_EN_CONFIG = types.DatasetConfig( - name="covost2-ja-en", - base="covost2", - subset="ja_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=1_119), - types.DatasetSplitConfig(name="validation", num_samples=635), - types.DatasetSplitConfig(name="test", num_samples=684), - ], - user_template_args={"target": "English"}, -) - -CVST_LV_EN_CONFIG = types.DatasetConfig( - name="covost2-lv-en", - base="covost2", - subset="lv_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=2_337), - types.DatasetSplitConfig(name="validation", num_samples=1_125), - types.DatasetSplitConfig(name="test", num_samples=1_629), - ], - user_template_args={"target": "English"}, -) - -CVST_MN_EN_CONFIG = types.DatasetConfig( - name="covost2-mn-en", - base="covost2", - subset="mn_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=2_067), - types.DatasetSplitConfig(name="validation", num_samples=1_761), - types.DatasetSplitConfig(name="test", num_samples=1_759), - ], - user_template_args={"target": "English"}, -) - -CVST_NL_EN_CONFIG = types.DatasetConfig( - name="covost2-nl-en", - base="covost2", - subset="nl_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=7_108), - types.DatasetSplitConfig(name="validation", num_samples=1_699), - types.DatasetSplitConfig(name="test", num_samples=1_699), - ], - user_template_args={"target": "English"}, -) - -CVST_PT_EN_CONFIG = types.DatasetConfig( - name="covost2-pt-en", - base="covost2", - subset="pt_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=9_158), - types.DatasetSplitConfig(name="validation", num_samples=3_318), - types.DatasetSplitConfig(name="test", num_samples=4_023), - ], - user_template_args={"target": "English"}, -) - -CVST_RU_EN_CONFIG = types.DatasetConfig( - name="covost2-ru-en", - base="covost2", - subset="ru_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=12_112), - types.DatasetSplitConfig(name="validation", num_samples=6_110), - types.DatasetSplitConfig(name="test", num_samples=6_300), - ], - user_template_args={"target": "English"}, -) - -CVST_SL_EN_CONFIG = types.DatasetConfig( - name="covost2-sl-en", - base="covost2", - subset="sl_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=1_843), - types.DatasetSplitConfig(name="validation", num_samples=509), - types.DatasetSplitConfig(name="test", num_samples=360), - ], - user_template_args={"target": "English"}, -) - -CVST_SV_EN_CONFIG = types.DatasetConfig( - name="covost2-sv-en", - base="covost2", - subset="sv-SE_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=2_160), - types.DatasetSplitConfig(name="validation", num_samples=1_349), - types.DatasetSplitConfig(name="test", num_samples=1_595), - ], - user_template_args={"target": "English"}, -) - -CVST_TA_EN_CONFIG = types.DatasetConfig( - name="covost2-ta-en", - base="covost2", - subset="ta_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=1_358), - types.DatasetSplitConfig(name="validation", num_samples=384), - types.DatasetSplitConfig(name="test", num_samples=786), - ], - user_template_args={"target": "English"}, -) - -CVST_TR_EN_CONFIG = types.DatasetConfig( - name="covost2-tr-en", - base="covost2", - subset="tr_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=3_966), - types.DatasetSplitConfig(name="validation", num_samples=1_624), - types.DatasetSplitConfig(name="test", num_samples=1_629), - ], - user_template_args={"target": "English"}, -) - -CVST_ZH_EN_CONFIG = types.DatasetConfig( - name="covost2-zh-en", - base="covost2", - subset="zh-CN_en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=7_085), - types.DatasetSplitConfig(name="validation", num_samples=4_843), - types.DatasetSplitConfig(name="test", num_samples=4_898), - ], - user_template_args={"target": "English"}, -) - -configs = [ - CVST_BASE_CONFIG, - CVST_AR_EN_CONFIG, - CVST_CA_EN_CONFIG, - CVST_CY_EN_CONFIG, - CVST_DE_EN_CONFIG, - CVST_EN_AR_CONFIG, - CVST_EN_CA_CONFIG, - CVST_EN_CY_CONFIG, - CVST_EN_DE_CONFIG, - CVST_EN_ET_CONFIG, - CVST_EN_FA_CONFIG, - CVST_EN_ID_CONFIG, - CVST_EN_JA_CONFIG, - CVST_EN_LV_CONFIG, - CVST_EN_MN_CONFIG, - CVST_EN_SL_CONFIG, - CVST_EN_SV_CONFIG, - CVST_EN_TA_CONFIG, - CVST_EN_TR_CONFIG, - CVST_EN_ZH_CONFIG, - CVST_ES_EN_CONFIG, - CVST_ET_EN_CONFIG, - CVST_FA_EN_CONFIG, - CVST_FR_EN_CONFIG, - CVST_ID_EN_CONFIG, - CVST_IT_EN_CONFIG, - CVST_JA_EN_CONFIG, - CVST_LV_EN_CONFIG, - CVST_MN_EN_CONFIG, - CVST_NL_EN_CONFIG, - CVST_PT_EN_CONFIG, - CVST_RU_EN_CONFIG, - CVST_SL_EN_CONFIG, - CVST_SV_EN_CONFIG, - CVST_TA_EN_CONFIG, - CVST_TR_EN_CONFIG, - CVST_ZH_EN_CONFIG, -] diff --git a/ultravox/data/configs/datasets_gigaspeech.py b/ultravox/data/configs/datasets_gigaspeech.py deleted file mode 100644 index 17b889a4..00000000 --- a/ultravox/data/configs/datasets_gigaspeech.py +++ /dev/null @@ -1,27 +0,0 @@ -from ultravox.data import types - -GS_XL_CONFIG = types.DatasetConfig( - name="gigaspeech-xl", - path="fixie-ai/gigaspeech", - subset="xl-empty-audio-removed", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=8_266_422), - ], - transcript_template="{{text_proc.format_asr_text(text)}}", - assistant_template="{{text_proc.format_asr_text(text)}}", -) - -GS_XL_TRANS_CONFIG = types.DatasetConfig( - name="gigaspeech-xl-transcription", - base="gigaspeech-xl", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) - -GS_XL_CONT_CONFIG = types.DatasetConfig( - name="gigaspeech-xl-continuation", - base="gigaspeech-xl", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) - -configs = [GS_XL_CONFIG, GS_XL_TRANS_CONFIG, GS_XL_CONT_CONFIG] diff --git a/ultravox/data/configs/datasets_librispeech.py b/ultravox/data/configs/datasets_librispeech.py deleted file mode 100644 index 2f186336..00000000 --- a/ultravox/data/configs/datasets_librispeech.py +++ /dev/null @@ -1,62 +0,0 @@ -from ultravox.data import types - -LS_BASE_CONFIG = types.DatasetConfig( - name="librispeech", - path="fixie-ai/librispeech_asr", - transcript_template="{{text_proc.format_asr_text(text)}}", - assistant_template="{{text_proc.format_asr_text(text)}}", -) - -LS_CLEAN_CONFIG = types.DatasetConfig( - name="librispeech-clean", - base="librispeech", - subset="clean", - splits=[ - types.DatasetSplitConfig(name="train.100", num_samples=28_539), - types.DatasetSplitConfig(name="train.360", num_samples=104_014), - ], -) - -LS_OTHER_CONFIG = types.DatasetConfig( - name="librispeech-other", - base="librispeech", - subset="other", - splits=[ - types.DatasetSplitConfig(name="train.500", num_samples=148_688), - ], -) - -LS_CLEAN_TRANS_CONFIG = types.DatasetConfig( - name="librispeech-clean-transcription", - base="librispeech-clean", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) - -LS_OTHER_TRANS_CONFIG = types.DatasetConfig( - name="librispeech-other-transcription", - base="librispeech-other", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) - -LS_CLEAN_CONT_CONFIG = types.DatasetConfig( - name="librispeech-clean-continuation", - base="librispeech-clean", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) -LS_OTHER_CONT_CONFIG = types.DatasetConfig( - name="librispeech-other-continuation", - base="librispeech-other", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) - -configs = [ - LS_BASE_CONFIG, - LS_CLEAN_CONFIG, - LS_OTHER_CONFIG, - LS_CLEAN_TRANS_CONFIG, - LS_OTHER_TRANS_CONFIG, - LS_CLEAN_CONT_CONFIG, - LS_OTHER_CONT_CONFIG, -] diff --git a/ultravox/data/configs/datasets_multilingual_librispeech.py b/ultravox/data/configs/datasets_multilingual_librispeech.py deleted file mode 100644 index 4209f27c..00000000 --- a/ultravox/data/configs/datasets_multilingual_librispeech.py +++ /dev/null @@ -1,58 +0,0 @@ -from ultravox.data import types - -ML_BASE_CONFIG = types.DatasetConfig( - name="multilingual_librispeech", - path="fixie-ai/multilingual_librispeech", - transcript_template="{{transcript}}", - assistant_template="{{transcript}}", -) - -ML_NL_CONFIG = types.DatasetConfig( - name="multilingual_librispeech-nl", - base="multilingual_librispeech", - subset="dutch", - splits=[types.DatasetSplitConfig(name="train", num_samples=37_533)], -) - -ML_PT_CONFIG = types.DatasetConfig( - name="multilingual_librispeech-pt", - base="multilingual_librispeech", - subset="portuguese", - splits=[types.DatasetSplitConfig(name="train", num_samples=37_533)], -) - -ML_NL_TRANS_CONFIG = types.DatasetConfig( - name="multilingual_librispeech-nl-transcription", - base="multilingual_librispeech-nl", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) - -ML_PT_TRANS_CONFIG = types.DatasetConfig( - name="multilingual_librispeech-pt-transcription", - base="multilingual_librispeech-pt", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) - -ML_NL_CONT_CONFIG = types.DatasetConfig( - name="multilingual_librispeech-nl-continuation", - base="multilingual_librispeech-nl", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) - -ML_PT_CONT_CONFIG = types.DatasetConfig( - name="multilingual_librispeech-pt-continuation", - base="multilingual_librispeech-pt", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) - -configs = [ - ML_BASE_CONFIG, - ML_NL_CONFIG, - ML_PT_CONFIG, - ML_NL_TRANS_CONFIG, - ML_PT_TRANS_CONFIG, - ML_NL_CONT_CONFIG, - ML_PT_CONT_CONFIG, -] diff --git a/ultravox/data/configs/datasets_peoplespeech.py b/ultravox/data/configs/datasets_peoplespeech.py deleted file mode 100644 index 36f8e8f8..00000000 --- a/ultravox/data/configs/datasets_peoplespeech.py +++ /dev/null @@ -1,34 +0,0 @@ -from ultravox.data import types - -PS_BASE_CONFIG = types.DatasetConfig( - name="peoplespeech", - path="fixie-ai/peoples_speech", - subset="clean", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=1_501_271), - types.DatasetSplitConfig( - name="test", num_samples=34_898, split_type=types.DatasetSplit.VALIDATION - ), - ], - assistant_template="{{text_proc.format_asr_text(text)}}", - transcript_template="{{text_proc.format_asr_text(text)}}", -) - -PS_TRANS_CONFIG = types.DatasetConfig( - name="peoplespeech-clean-transcription", - base="peoplespeech", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) - -PS_CONT_CONFIG = types.DatasetConfig( - name="peoplespeech-clean-continuation", - base="peoplespeech", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) - -configs = [ - PS_BASE_CONFIG, - PS_TRANS_CONFIG, - PS_CONT_CONFIG, -] diff --git a/ultravox/data/configs/datasets_test.py b/ultravox/data/configs/datasets_test.py deleted file mode 100644 index 98e4c6f6..00000000 --- a/ultravox/data/configs/datasets_test.py +++ /dev/null @@ -1,377 +0,0 @@ -from typing import Optional - -import datasets as hf_datasets -import numpy as np -import pytest -import torch -from torch.utils import data -from transformers.feature_extraction_utils import BatchFeature - -from ultravox.data import data_sample -from ultravox.data import datasets -from ultravox.data import registry -from ultravox.data import types - - -class FakeSizedIterableDataset(datasets.SizedIterableDataset): - """Fake version of datasets.SizedIterableDataset""" - - def __init__(self, n, start=0, length=0): - self.data = range(start, start + n) - self._length = length or n - - def __iter__(self): - for sample in self.data: - yield sample - - def __len__(self): - return self._length - - -class FakeHuggingFaceIterableDataset(hf_datasets.IterableDataset): - """Fake version of an ASR Hugging Face IterableDataset.""" - - def __init__(self, n): - self.data = [ - { - "text": str(i), - "audio": {"array": np.full(256, float(i)), "sampling_rate": 16000}, - } - for i in range(n) - ] - self._split = "fake" - - def __iter__(self): - return (i for i in self.data) - - -class FakeTranscribeDataset(datasets.VoiceDataset): - """Fake version of our VoiceDataset.""" - - def __init__(self, n: int, args: Optional[types.VoiceDatasetArgs] = None): - super().__init__(args or types.VoiceDatasetArgs()) - self._init_dataset(FakeHuggingFaceIterableDataset(n), n) - - def _get_sample(self, row: BatchFeature) -> Optional[data_sample.VoiceSample]: - messages = self._make_messages("<|audio|>", row["text"]) - return self._make_sample(messages, np.zeros(256), row["text"]) - - -class FakeGenericDataset(datasets.GenericDataset): - """Fake version of GenericDataset, hooked to return a FakeHuggingFaceIterableDataset.""" - - def __init__( - self, - n: int, - config: types.DatasetConfig, - args: Optional[types.VoiceDatasetArgs] = None, - ): - self._n = n - super().__init__(args or types.VoiceDatasetArgs(), config) - - def _load_hf_dataset( - self, - path: str, - name: Optional[str] = None, - *, - split: Optional[str] = None, - streaming: bool = True, - audio_field: Optional[str] = None, - ) -> data.Dataset: - return FakeHuggingFaceIterableDataset(self._n) - - -class FakeDataproc(datasets.Dataproc): - def __init__(self, dataset): - super().__init__(dataset) - - def _process(self, sample): - return -sample - - -def test_dataproc(): - ds = FakeSizedIterableDataset(5) - s = FakeDataproc(ds) - assert list(s) == [0, -1, -2, -3, -4] - - -def test_interleaved_empty(): - s = datasets.InterleaveDataset([]) - assert list(s) == [] - - -def test_interleaved_single_set(): - ds1 = FakeSizedIterableDataset(4) - s = datasets.InterleaveDataset([ds1]) - assert list(s) == [0, 1, 2, 3] - - -def test_interleaved_normal_weights(): - ds1 = FakeSizedIterableDataset(4) - ds2 = FakeSizedIterableDataset(8, start=10) - ds3 = FakeSizedIterableDataset(2, start=100) - s = datasets.InterleaveDataset([ds1, ds2, ds3]) - assert list(s) == [0, 10, 100, 11, 1, 12, 13, 2, 14, 101, 15, 3, 16, 17] - - -def test_interleaved_specific_weights(): - ds1 = FakeSizedIterableDataset(4) - ds2 = FakeSizedIterableDataset(2, start=10) - s = datasets.InterleaveDataset([ds1, ds2], [0.5, 2.0]) - assert list(s) == [0, 10, 11, 1, 10, 11] - - -def test_interleaved_zero_weights(): - ds1 = FakeSizedIterableDataset(4) - ds2 = FakeSizedIterableDataset(2, start=10) - s = datasets.InterleaveDataset([ds1, ds2], [0.0, 0.0]) - assert list(s) == [] - - -def test_interleaved_with_multiprocessing(): - ds = FakeSizedIterableDataset(5) - s = datasets.InterleaveDataset([ds]) - dl = data.DataLoader(s, num_workers=1, batch_size=5) - batch = next(iter(dl)) - assert torch.allclose(batch, torch.tensor([0, 1, 2, 3, 4])) - - -def test_range(): - ds = FakeSizedIterableDataset(10, length=10) - s = datasets.Range(ds, 5) - assert len(s) == 5 - assert list(s) == [0, 1, 2, 3, 4] - with pytest.raises(ValueError, match="exceeds dataset length"): - s = datasets.Range(ds, 100) - s = datasets.Range(ds, 10) - assert list(s) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - s = datasets.Range(ds) - assert len(s) == 10 - assert list(s) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - - -def test_transcribe_dataset(): - ds = FakeTranscribeDataset(5) - assert len(ds) == 5 - sample = next(iter(ds)) - assert isinstance(sample, data_sample.VoiceSample) - assert sample.messages == [ - {"role": "user", "content": "<|audio|>"}, - {"role": "assistant", "content": "0"}, - ] - assert np.array_equal(sample.audio, np.zeros(256)) - assert sample.sample_rate == 16000 - assert sample.audio_transcript == "0" - - -def test_dataset_config(): - config = types.DatasetConfig( - name="fake_dataset", - path="mock_path", - splits=[ - types.DatasetSplitConfig(name="clean", num_samples=5000), - types.DatasetSplitConfig(name="other", num_samples=10000), - types.DatasetSplitConfig(name="validation", num_samples=1000), - types.DatasetSplitConfig( - name="another_validation", - num_samples=1000, - split_type=types.DatasetSplit.VALIDATION, - ), - ], - ) - assert config.name == "fake_dataset" - assert config.path == "mock_path" - assert len(config.splits) == 4 - assert config.splits[0].name == "clean" - assert config.splits[0].num_samples == 5000 - assert config.splits[0].split_type == types.DatasetSplit.TRAIN - assert config.splits[1].name == "other" - assert config.splits[1].num_samples == 10000 - assert config.splits[1].split_type == types.DatasetSplit.TRAIN - assert config.splits[2].name == "validation" - assert config.splits[2].num_samples == 1000 - assert config.splits[2].split_type == types.DatasetSplit.VALIDATION - assert config.splits[3].name == "another_validation" - assert config.splits[3].num_samples == 1000 - assert config.splits[3].split_type == types.DatasetSplit.VALIDATION - - -def test_dataset_config_serialization(): - config = types.DatasetConfig( - name="fake_dataset", - path="fake_path", - splits=[ - types.DatasetSplitConfig(name="clean", num_samples=5000), - types.DatasetSplitConfig(name="other", num_samples=10000), - ], - ) - serialized = config.dumps_yaml() - deserialized = types.DatasetConfig.loads_yaml(serialized) - assert isinstance(deserialized, types.DatasetConfig) - assert deserialized.name == "fake_dataset" - assert deserialized.path == "fake_path" - assert len(deserialized.splits) == 2 - assert deserialized.splits[0].name == "clean" - assert deserialized.splits[0].num_samples == 5000 - assert deserialized.splits[1].name == "other" - assert deserialized.splits[1].num_samples == 10000 - - -def test_generic_dataset(): - config = types.DatasetConfig( - name="fake_dataset", - path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], - ) - ds = FakeGenericDataset(5, config) - assert len(ds) == 5 - sample = next(iter(ds)) - assert isinstance(sample, data_sample.VoiceSample) - assert sample.messages == [ - {"role": "user", "content": "<|audio|>"}, - {"role": "assistant", "content": "0"}, - ] - assert np.array_equal(sample.audio, np.zeros(256)) - assert sample.sample_rate == 16000 - assert sample.audio_transcript == "0" - - -def test_generic_dataset_custom_templates(): - config = types.DatasetConfig( - name="fake_dataset", - path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], - user_template="Listen to the following and respond with 'xyzzy':\n<|audio|>", - assistant_template="xyzzy", - transcript_template="{{text}}", - ) - ds = FakeGenericDataset(5, config) - assert len(ds) == 5 - sample = next(iter(ds)) - assert isinstance(sample, data_sample.VoiceSample) - assert sample.messages == [ - { - "role": "user", - "content": "Listen to the following and respond with 'xyzzy':\n<|audio|>", - }, - {"role": "assistant", "content": "xyzzy"}, - ] - assert np.array_equal(sample.audio, np.zeros(256)) - assert sample.sample_rate == 16000 - assert sample.audio_transcript == "0" - - -def test_generic_dataset_text_only(): - config = types.DatasetConfig( - name="fake_dataset", - path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], - user_template="Transcribe\n<|audio|>", - ) - ds = FakeGenericDataset(5, config, types.VoiceDatasetArgs(include_audio=False)) - assert len(ds) == 5 - sample = next(iter(ds)) - assert isinstance(sample, data_sample.VoiceSample) - assert sample.messages == [ - {"role": "user", "content": 'Transcribe\n"0"'}, - {"role": "assistant", "content": "0"}, - ] - assert sample.audio is None - - -def test_generic_dataset_merge_configs(): - base_config = types.DatasetConfig( - name="fake_base", - path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], - ) - mid_config = types.DatasetConfig( - name="fake_mid", - base="fake_base", - user_template="fake_user_template", - user_template_args={"a": 1}, - transcript_template="fake_transcript_template", - ) - leaf_config = types.DatasetConfig( - name="fake_leaf", - base="fake_mid", - audio_field="fake_audio_field", - ) - config = registry._merge_configs([base_config, mid_config, leaf_config]) - assert config.name == "fake_leaf" - assert config.base is None - assert config.path == "fake_path" - assert config.splits[0].name == "fake" - assert config.splits[0].num_samples == 5 - assert config.splits[0].split_type == types.DatasetSplit.TRAIN - assert config.user_template == "fake_user_template" - assert config.user_template_args == {"a": 1} - assert config.assistant_template == "{{text}}" # the default - assert config.transcript_template == "fake_transcript_template" - assert config.audio_field == "fake_audio_field" - - -def test_generic_dataset_length_mismatch(): - config = types.DatasetConfig( - name="fake_dataset", - path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], - ) - ds = FakeGenericDataset(10, config) - assert len(ds) == 5 - - pattern = r"(has been exceeded|Mismatch between presumed length)" - with pytest.warns(UserWarning, match=pattern): - list(ds) - - config = types.DatasetConfig( - name="fake_dataset", - path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=10)], - ) - ds = FakeGenericDataset(5, config) - assert len(ds) == 10 - - with pytest.warns(UserWarning, match="Mismatch between presumed length"): - list(ds) - - -def test_generic_dataset_multiple_splits(): - config = types.DatasetConfig( - name="fake_dataset", - path="fake_path", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=90), - types.DatasetSplitConfig(name="validation", num_samples=10), - ], - ) - ds = FakeGenericDataset(100, config) - assert len(ds) == 90 - ds = FakeGenericDataset( - 100, config, types.VoiceDatasetArgs(split=types.DatasetSplit.VALIDATION) - ) - assert len(ds) == 10 - - -def test_get_messages(): - messages = datasets._get_messages("Yo!", "Hi!") - assert messages == [ - {"role": "user", "content": "Yo!"}, - {"role": "assistant", "content": "Hi!"}, - ] - - messages = datasets._get_messages( - "Yo!", "Hi!", assistant_last=False, sys_prompt="Be nice!" - ) - assert messages == [ - {"role": "system", "content": "Be nice!"}, - {"role": "assistant", "content": "Yo!"}, - {"role": "user", "content": "Hi!"}, - ] - - messages = datasets._get_messages("A", "B", "C") - assert messages == [ - {"role": "assistant", "content": "A"}, - {"role": "user", "content": "B"}, - {"role": "assistant", "content": "C"}, - ] diff --git a/ultravox/data/configs/datasets_voxpopuli.py b/ultravox/data/configs/datasets_voxpopuli.py deleted file mode 100644 index dd62b862..00000000 --- a/ultravox/data/configs/datasets_voxpopuli.py +++ /dev/null @@ -1,17 +0,0 @@ -from ultravox.data import types - -VP_EN_CONFIG = types.DatasetConfig( - name="voxpopuli-en", - path="facebook/voxpopuli", - subset="en", - splits=[ - types.DatasetSplitConfig(name="train", num_samples=1_000_000), - types.DatasetSplitConfig(name="validation", num_samples=10_000), - ], - assistant_template="{{raw_text}}", - transcript_template="{{raw_text}}", -) - -configs = [ - VP_EN_CONFIG, -] diff --git a/ultravox/data/configs/datasets_wenetspeech.py b/ultravox/data/configs/datasets_wenetspeech.py deleted file mode 100644 index a487a3cc..00000000 --- a/ultravox/data/configs/datasets_wenetspeech.py +++ /dev/null @@ -1,28 +0,0 @@ -from ultravox.data import types - -WS_BASE_CONFIG = types.DatasetConfig( - name="wenetspeech", - path="fixie-ai/wenetspeech", - subset="L_fixed", - splits=[types.DatasetSplitConfig(name="train", num_samples=14_621_415)], - transcript_template="{{text}}", -) - -WS_TRANS_CONFIG = types.DatasetConfig( - name="wenetspeech-transcription", - base="wenetspeech", - user_template=types.TRANSCRIPTION_USER_TEMPLATE, -) - -WS_CONT_CONFIG = types.DatasetConfig( - name="wenetspeech-continuation", - base="wenetspeech", - user_template=types.CONTINUATION_USER_TEMPLATE, - assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, -) - -configs = [ - WS_BASE_CONFIG, - WS_TRANS_CONFIG, - WS_CONT_CONFIG, -] diff --git a/ultravox/data/registry.py b/ultravox/data/registry.py index fa6b3275..4788e132 100644 --- a/ultravox/data/registry.py +++ b/ultravox/data/registry.py @@ -3,15 +3,15 @@ from ultravox.data import datasets from ultravox.data import types -from ultravox.data.configs import datasets_boolq -from ultravox.data.configs import datasets_commonvoice -from ultravox.data.configs import datasets_covost2 -from ultravox.data.configs import datasets_gigaspeech -from ultravox.data.configs import datasets_librispeech -from ultravox.data.configs import datasets_multilingual_librispeech -from ultravox.data.configs import datasets_peoplespeech -from ultravox.data.configs import datasets_voxpopuli -from ultravox.data.configs import datasets_wenetspeech +from ultravox.data.configs import boolq +from ultravox.data.configs import commonvoice +from ultravox.data.configs import covost2 +from ultravox.data.configs import gigaspeech +from ultravox.data.configs import librispeech +from ultravox.data.configs import multilingual_librispeech +from ultravox.data.configs import peoplespeech +from ultravox.data.configs import voxpopuli +from ultravox.data.configs import wenetspeech DATASET_MAP: Dict[str, types.DatasetConfig] = {} @@ -63,12 +63,12 @@ def create_dataset( return datasets.GenericDataset(args, merged_config) -register_datasets(datasets_boolq.configs) -register_datasets(datasets_commonvoice.configs) -register_datasets(datasets_covost2.configs) -register_datasets(datasets_gigaspeech.configs) -register_datasets(datasets_librispeech.configs) -register_datasets(datasets_multilingual_librispeech.configs) -register_datasets(datasets_peoplespeech.configs) -register_datasets(datasets_voxpopuli.configs) -register_datasets(datasets_wenetspeech.configs) +register_datasets(boolq.configs) +register_datasets(commonvoice.configs) +register_datasets(covost2.configs) +register_datasets(gigaspeech.configs) +register_datasets(librispeech.configs) +register_datasets(multilingual_librispeech.configs) +register_datasets(peoplespeech.configs) +register_datasets(voxpopuli.configs) +register_datasets(wenetspeech.configs) From b5d5e2e52440860de7513bf3de38650cef8df59e Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Sat, 2 Nov 2024 00:11:20 -0400 Subject: [PATCH 06/13] update --- ultravox/data/datasets_test.py | 377 +++++++++++++++++++++++++++++++++ 1 file changed, 377 insertions(+) create mode 100644 ultravox/data/datasets_test.py diff --git a/ultravox/data/datasets_test.py b/ultravox/data/datasets_test.py new file mode 100644 index 00000000..98e4c6f6 --- /dev/null +++ b/ultravox/data/datasets_test.py @@ -0,0 +1,377 @@ +from typing import Optional + +import datasets as hf_datasets +import numpy as np +import pytest +import torch +from torch.utils import data +from transformers.feature_extraction_utils import BatchFeature + +from ultravox.data import data_sample +from ultravox.data import datasets +from ultravox.data import registry +from ultravox.data import types + + +class FakeSizedIterableDataset(datasets.SizedIterableDataset): + """Fake version of datasets.SizedIterableDataset""" + + def __init__(self, n, start=0, length=0): + self.data = range(start, start + n) + self._length = length or n + + def __iter__(self): + for sample in self.data: + yield sample + + def __len__(self): + return self._length + + +class FakeHuggingFaceIterableDataset(hf_datasets.IterableDataset): + """Fake version of an ASR Hugging Face IterableDataset.""" + + def __init__(self, n): + self.data = [ + { + "text": str(i), + "audio": {"array": np.full(256, float(i)), "sampling_rate": 16000}, + } + for i in range(n) + ] + self._split = "fake" + + def __iter__(self): + return (i for i in self.data) + + +class FakeTranscribeDataset(datasets.VoiceDataset): + """Fake version of our VoiceDataset.""" + + def __init__(self, n: int, args: Optional[types.VoiceDatasetArgs] = None): + super().__init__(args or types.VoiceDatasetArgs()) + self._init_dataset(FakeHuggingFaceIterableDataset(n), n) + + def _get_sample(self, row: BatchFeature) -> Optional[data_sample.VoiceSample]: + messages = self._make_messages("<|audio|>", row["text"]) + return self._make_sample(messages, np.zeros(256), row["text"]) + + +class FakeGenericDataset(datasets.GenericDataset): + """Fake version of GenericDataset, hooked to return a FakeHuggingFaceIterableDataset.""" + + def __init__( + self, + n: int, + config: types.DatasetConfig, + args: Optional[types.VoiceDatasetArgs] = None, + ): + self._n = n + super().__init__(args or types.VoiceDatasetArgs(), config) + + def _load_hf_dataset( + self, + path: str, + name: Optional[str] = None, + *, + split: Optional[str] = None, + streaming: bool = True, + audio_field: Optional[str] = None, + ) -> data.Dataset: + return FakeHuggingFaceIterableDataset(self._n) + + +class FakeDataproc(datasets.Dataproc): + def __init__(self, dataset): + super().__init__(dataset) + + def _process(self, sample): + return -sample + + +def test_dataproc(): + ds = FakeSizedIterableDataset(5) + s = FakeDataproc(ds) + assert list(s) == [0, -1, -2, -3, -4] + + +def test_interleaved_empty(): + s = datasets.InterleaveDataset([]) + assert list(s) == [] + + +def test_interleaved_single_set(): + ds1 = FakeSizedIterableDataset(4) + s = datasets.InterleaveDataset([ds1]) + assert list(s) == [0, 1, 2, 3] + + +def test_interleaved_normal_weights(): + ds1 = FakeSizedIterableDataset(4) + ds2 = FakeSizedIterableDataset(8, start=10) + ds3 = FakeSizedIterableDataset(2, start=100) + s = datasets.InterleaveDataset([ds1, ds2, ds3]) + assert list(s) == [0, 10, 100, 11, 1, 12, 13, 2, 14, 101, 15, 3, 16, 17] + + +def test_interleaved_specific_weights(): + ds1 = FakeSizedIterableDataset(4) + ds2 = FakeSizedIterableDataset(2, start=10) + s = datasets.InterleaveDataset([ds1, ds2], [0.5, 2.0]) + assert list(s) == [0, 10, 11, 1, 10, 11] + + +def test_interleaved_zero_weights(): + ds1 = FakeSizedIterableDataset(4) + ds2 = FakeSizedIterableDataset(2, start=10) + s = datasets.InterleaveDataset([ds1, ds2], [0.0, 0.0]) + assert list(s) == [] + + +def test_interleaved_with_multiprocessing(): + ds = FakeSizedIterableDataset(5) + s = datasets.InterleaveDataset([ds]) + dl = data.DataLoader(s, num_workers=1, batch_size=5) + batch = next(iter(dl)) + assert torch.allclose(batch, torch.tensor([0, 1, 2, 3, 4])) + + +def test_range(): + ds = FakeSizedIterableDataset(10, length=10) + s = datasets.Range(ds, 5) + assert len(s) == 5 + assert list(s) == [0, 1, 2, 3, 4] + with pytest.raises(ValueError, match="exceeds dataset length"): + s = datasets.Range(ds, 100) + s = datasets.Range(ds, 10) + assert list(s) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + s = datasets.Range(ds) + assert len(s) == 10 + assert list(s) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + + +def test_transcribe_dataset(): + ds = FakeTranscribeDataset(5) + assert len(ds) == 5 + sample = next(iter(ds)) + assert isinstance(sample, data_sample.VoiceSample) + assert sample.messages == [ + {"role": "user", "content": "<|audio|>"}, + {"role": "assistant", "content": "0"}, + ] + assert np.array_equal(sample.audio, np.zeros(256)) + assert sample.sample_rate == 16000 + assert sample.audio_transcript == "0" + + +def test_dataset_config(): + config = types.DatasetConfig( + name="fake_dataset", + path="mock_path", + splits=[ + types.DatasetSplitConfig(name="clean", num_samples=5000), + types.DatasetSplitConfig(name="other", num_samples=10000), + types.DatasetSplitConfig(name="validation", num_samples=1000), + types.DatasetSplitConfig( + name="another_validation", + num_samples=1000, + split_type=types.DatasetSplit.VALIDATION, + ), + ], + ) + assert config.name == "fake_dataset" + assert config.path == "mock_path" + assert len(config.splits) == 4 + assert config.splits[0].name == "clean" + assert config.splits[0].num_samples == 5000 + assert config.splits[0].split_type == types.DatasetSplit.TRAIN + assert config.splits[1].name == "other" + assert config.splits[1].num_samples == 10000 + assert config.splits[1].split_type == types.DatasetSplit.TRAIN + assert config.splits[2].name == "validation" + assert config.splits[2].num_samples == 1000 + assert config.splits[2].split_type == types.DatasetSplit.VALIDATION + assert config.splits[3].name == "another_validation" + assert config.splits[3].num_samples == 1000 + assert config.splits[3].split_type == types.DatasetSplit.VALIDATION + + +def test_dataset_config_serialization(): + config = types.DatasetConfig( + name="fake_dataset", + path="fake_path", + splits=[ + types.DatasetSplitConfig(name="clean", num_samples=5000), + types.DatasetSplitConfig(name="other", num_samples=10000), + ], + ) + serialized = config.dumps_yaml() + deserialized = types.DatasetConfig.loads_yaml(serialized) + assert isinstance(deserialized, types.DatasetConfig) + assert deserialized.name == "fake_dataset" + assert deserialized.path == "fake_path" + assert len(deserialized.splits) == 2 + assert deserialized.splits[0].name == "clean" + assert deserialized.splits[0].num_samples == 5000 + assert deserialized.splits[1].name == "other" + assert deserialized.splits[1].num_samples == 10000 + + +def test_generic_dataset(): + config = types.DatasetConfig( + name="fake_dataset", + path="fake_path", + splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], + ) + ds = FakeGenericDataset(5, config) + assert len(ds) == 5 + sample = next(iter(ds)) + assert isinstance(sample, data_sample.VoiceSample) + assert sample.messages == [ + {"role": "user", "content": "<|audio|>"}, + {"role": "assistant", "content": "0"}, + ] + assert np.array_equal(sample.audio, np.zeros(256)) + assert sample.sample_rate == 16000 + assert sample.audio_transcript == "0" + + +def test_generic_dataset_custom_templates(): + config = types.DatasetConfig( + name="fake_dataset", + path="fake_path", + splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], + user_template="Listen to the following and respond with 'xyzzy':\n<|audio|>", + assistant_template="xyzzy", + transcript_template="{{text}}", + ) + ds = FakeGenericDataset(5, config) + assert len(ds) == 5 + sample = next(iter(ds)) + assert isinstance(sample, data_sample.VoiceSample) + assert sample.messages == [ + { + "role": "user", + "content": "Listen to the following and respond with 'xyzzy':\n<|audio|>", + }, + {"role": "assistant", "content": "xyzzy"}, + ] + assert np.array_equal(sample.audio, np.zeros(256)) + assert sample.sample_rate == 16000 + assert sample.audio_transcript == "0" + + +def test_generic_dataset_text_only(): + config = types.DatasetConfig( + name="fake_dataset", + path="fake_path", + splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], + user_template="Transcribe\n<|audio|>", + ) + ds = FakeGenericDataset(5, config, types.VoiceDatasetArgs(include_audio=False)) + assert len(ds) == 5 + sample = next(iter(ds)) + assert isinstance(sample, data_sample.VoiceSample) + assert sample.messages == [ + {"role": "user", "content": 'Transcribe\n"0"'}, + {"role": "assistant", "content": "0"}, + ] + assert sample.audio is None + + +def test_generic_dataset_merge_configs(): + base_config = types.DatasetConfig( + name="fake_base", + path="fake_path", + splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], + ) + mid_config = types.DatasetConfig( + name="fake_mid", + base="fake_base", + user_template="fake_user_template", + user_template_args={"a": 1}, + transcript_template="fake_transcript_template", + ) + leaf_config = types.DatasetConfig( + name="fake_leaf", + base="fake_mid", + audio_field="fake_audio_field", + ) + config = registry._merge_configs([base_config, mid_config, leaf_config]) + assert config.name == "fake_leaf" + assert config.base is None + assert config.path == "fake_path" + assert config.splits[0].name == "fake" + assert config.splits[0].num_samples == 5 + assert config.splits[0].split_type == types.DatasetSplit.TRAIN + assert config.user_template == "fake_user_template" + assert config.user_template_args == {"a": 1} + assert config.assistant_template == "{{text}}" # the default + assert config.transcript_template == "fake_transcript_template" + assert config.audio_field == "fake_audio_field" + + +def test_generic_dataset_length_mismatch(): + config = types.DatasetConfig( + name="fake_dataset", + path="fake_path", + splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], + ) + ds = FakeGenericDataset(10, config) + assert len(ds) == 5 + + pattern = r"(has been exceeded|Mismatch between presumed length)" + with pytest.warns(UserWarning, match=pattern): + list(ds) + + config = types.DatasetConfig( + name="fake_dataset", + path="fake_path", + splits=[types.DatasetSplitConfig(name="fake", num_samples=10)], + ) + ds = FakeGenericDataset(5, config) + assert len(ds) == 10 + + with pytest.warns(UserWarning, match="Mismatch between presumed length"): + list(ds) + + +def test_generic_dataset_multiple_splits(): + config = types.DatasetConfig( + name="fake_dataset", + path="fake_path", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=90), + types.DatasetSplitConfig(name="validation", num_samples=10), + ], + ) + ds = FakeGenericDataset(100, config) + assert len(ds) == 90 + ds = FakeGenericDataset( + 100, config, types.VoiceDatasetArgs(split=types.DatasetSplit.VALIDATION) + ) + assert len(ds) == 10 + + +def test_get_messages(): + messages = datasets._get_messages("Yo!", "Hi!") + assert messages == [ + {"role": "user", "content": "Yo!"}, + {"role": "assistant", "content": "Hi!"}, + ] + + messages = datasets._get_messages( + "Yo!", "Hi!", assistant_last=False, sys_prompt="Be nice!" + ) + assert messages == [ + {"role": "system", "content": "Be nice!"}, + {"role": "assistant", "content": "Yo!"}, + {"role": "user", "content": "Hi!"}, + ] + + messages = datasets._get_messages("A", "B", "C") + assert messages == [ + {"role": "assistant", "content": "A"}, + {"role": "user", "content": "B"}, + {"role": "assistant", "content": "C"}, + ] From 9a57002c7b116cc0f359bd152bac39f617cdeeeb Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Sat, 2 Nov 2024 00:11:32 -0400 Subject: [PATCH 07/13] update --- ultravox/data/configs/boolq.py | 17 + ultravox/data/configs/commonvoice.py | 337 ++++++++++++ ultravox/data/configs/covost2.py | 481 ++++++++++++++++++ ultravox/data/configs/gigaspeech.py | 27 + ultravox/data/configs/librispeech.py | 62 +++ .../data/configs/multilingual_librispeech.py | 58 +++ ultravox/data/configs/peoplespeech.py | 34 ++ ultravox/data/configs/voxpopuli.py | 17 + ultravox/data/configs/wenetspeech.py | 28 + 9 files changed, 1061 insertions(+) create mode 100644 ultravox/data/configs/boolq.py create mode 100644 ultravox/data/configs/commonvoice.py create mode 100644 ultravox/data/configs/covost2.py create mode 100644 ultravox/data/configs/gigaspeech.py create mode 100644 ultravox/data/configs/librispeech.py create mode 100644 ultravox/data/configs/multilingual_librispeech.py create mode 100644 ultravox/data/configs/peoplespeech.py create mode 100644 ultravox/data/configs/voxpopuli.py create mode 100644 ultravox/data/configs/wenetspeech.py diff --git a/ultravox/data/configs/boolq.py b/ultravox/data/configs/boolq.py new file mode 100644 index 00000000..b99ca104 --- /dev/null +++ b/ultravox/data/configs/boolq.py @@ -0,0 +1,17 @@ +from ultravox.data import types + +BOOLQ_CONFIG = types.DatasetConfig( + name="boolq", + path="fixie-ai/boolq-audio", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=10000), + types.DatasetSplitConfig(name="validation", num_samples=1000), + ], + user_template=f"{{{{passage}}}}\n\n{types.AUDIO_PLACEHOLDER}", + assistant_template="{{'True' if answer else 'False'}}", + transcript_template="{{question}}", +) + +configs = [ + BOOLQ_CONFIG, +] diff --git a/ultravox/data/configs/commonvoice.py b/ultravox/data/configs/commonvoice.py new file mode 100644 index 00000000..ef823d14 --- /dev/null +++ b/ultravox/data/configs/commonvoice.py @@ -0,0 +1,337 @@ +from ultravox.data import types + +CV_BASE_CONFIG = types.DatasetConfig( + name="commonvoice", + path="fixie-ai/common_voice_17_0", + transcript_template="{{sentence}}", + assistant_template="{{sentence}}", +) + +CV_EN_CONFIG = types.DatasetConfig( + name="commonvoice-en", + base="commonvoice", + subset="en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_101_170), + types.DatasetSplitConfig(name="validation", num_samples=16_393), + ], + transcript_template="{{text_proc.format_asr_text(sentence)}}", + assistant_template="{{text_proc.format_asr_text(sentence)}}", +) + +CV_AR_CONFIG = types.DatasetConfig( + name="commonvoice-ar", + base="commonvoice", + subset="ar", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=28_369), + types.DatasetSplitConfig(name="validation", num_samples=10_470), + ], +) + +CV_DE_CONFIG = types.DatasetConfig( + name="commonvoice-de", + base="commonvoice", + subset="de", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=589_100), + types.DatasetSplitConfig(name="validation", num_samples=16_183), + ], +) + +CV_ES_CONFIG = types.DatasetConfig( + name="commonvoice-es", + base="commonvoice", + subset="es", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=336_846), + types.DatasetSplitConfig(name="validation", num_samples=15_857), + ], +) + +CV_FR_CONFIG = types.DatasetConfig( + name="commonvoice-fr", + base="commonvoice", + subset="fr", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=558_054), + types.DatasetSplitConfig(name="validation", num_samples=16_159), + ], +) + +CV_IT_CONFIG = types.DatasetConfig( + name="commonvoice-it", + base="commonvoice", + subset="it", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=169_771), + types.DatasetSplitConfig(name="validation", num_samples=15_149), + ], +) + +CV_JA_CONFIG = types.DatasetConfig( + name="commonvoice-ja", + base="commonvoice", + subset="ja", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=10_039), + types.DatasetSplitConfig(name="validation", num_samples=6_261), + ], +) + +CV_PT_CONFIG = types.DatasetConfig( + name="commonvoice-pt", + base="commonvoice", + subset="pt", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=21_968), + types.DatasetSplitConfig(name="validation", num_samples=9_464), + ], +) + +CV_RU_CONFIG = types.DatasetConfig( + name="commonvoice-ru", + base="commonvoice", + subset="ru", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=26_377), + types.DatasetSplitConfig(name="validation", num_samples=10_203), + ], +) + +CV_HI_CONFIG = types.DatasetConfig( + name="commonvoice-hi", + base="commonvoice", + subset="hi", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=9_378), + types.DatasetSplitConfig(name="validation", num_samples=4_856), + ], +) + +CV_TR_CONFIG = types.DatasetConfig( + name="commonvoice-tr", + base="commonvoice", + subset="tr", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=35_147), + types.DatasetSplitConfig(name="validation", num_samples=11_258), + ], +) + +CV_SV_CONFIG = types.DatasetConfig( + name="commonvoice-sv", + base="commonvoice", + subset="sv-SE", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=7_744), + types.DatasetSplitConfig(name="validation", num_samples=5_210), + ], +) + +CV_UK_CONFIG = types.DatasetConfig( + name="commonvoice-uk", + base="commonvoice", + subset="uk", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=25_137), + types.DatasetSplitConfig(name="validation", num_samples=10_007), + ], +) + +CV_EN_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-en-transcription", + base="commonvoice-en", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_AR_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-ar-transcription", + base="commonvoice-ar", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_DE_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-de-transcription", + base="commonvoice-de", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_ES_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-es-transcription", + base="commonvoice-es", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_FR_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-fr-transcription", + base="commonvoice-fr", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_IT_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-it-transcription", + base="commonvoice-it", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_JA_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-ja-transcription", + base="commonvoice-ja", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_PT_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-pt-transcription", + base="commonvoice-pt", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) +CV_RU_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-ru-transcription", + base="commonvoice-ru", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +CV_HI_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-hi-transcription", + base="commonvoice-hi", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +CV_TR_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-tr-transcription", + base="commonvoice-tr", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +CV_SV_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-sv-transcription", + base="commonvoice-sv", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +CV_UK_TRANS_CONFIG = types.DatasetConfig( + name="commonvoice-uk-transcription", + base="commonvoice-uk", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +CV_EN_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-en-continuation", + base="commonvoice-en", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_AR_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-ar-continuation", + base="commonvoice-ar", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_DE_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-de-continuation", + base="commonvoice-de", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_ES_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-es-continuation", + base="commonvoice-es", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_FR_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-fr-continuation", + base="commonvoice-fr", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_IT_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-it-continuation", + base="commonvoice-it", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_JA_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-ja-continuation", + base="commonvoice-ja", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_PT_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-pt-continuation", + base="commonvoice-pt", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +CV_RU_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-ru-continuation", + base="commonvoice-ru", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + + +CV_HI_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-hi-continuation", + base="commonvoice-hi", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +CV_TR_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-tr-continuation", + base="commonvoice-tr", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +CV_SV_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-sv-continuation", + base="commonvoice-sv", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +CV_UK_CONT_CONFIG = types.DatasetConfig( + name="commonvoice-uk-continuation", + base="commonvoice-uk", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +configs = [ + CV_BASE_CONFIG, + CV_EN_CONFIG, + CV_AR_CONFIG, + CV_DE_CONFIG, + CV_ES_CONFIG, + CV_FR_CONFIG, + CV_IT_CONFIG, + CV_JA_CONFIG, + CV_PT_CONFIG, + CV_RU_CONFIG, + CV_HI_CONFIG, + CV_TR_CONFIG, + CV_SV_CONFIG, + CV_UK_CONFIG, + CV_EN_TRANS_CONFIG, + CV_AR_TRANS_CONFIG, + CV_DE_TRANS_CONFIG, + CV_ES_TRANS_CONFIG, + CV_FR_TRANS_CONFIG, + CV_IT_TRANS_CONFIG, + CV_JA_TRANS_CONFIG, + CV_PT_TRANS_CONFIG, + CV_RU_TRANS_CONFIG, + CV_HI_TRANS_CONFIG, + CV_TR_TRANS_CONFIG, + CV_SV_TRANS_CONFIG, + CV_UK_TRANS_CONFIG, + CV_EN_CONT_CONFIG, + CV_AR_CONT_CONFIG, + CV_DE_CONT_CONFIG, + CV_ES_CONT_CONFIG, + CV_FR_CONT_CONFIG, + CV_IT_CONT_CONFIG, + CV_JA_CONT_CONFIG, + CV_PT_CONT_CONFIG, + CV_RU_CONT_CONFIG, + CV_HI_CONT_CONFIG, + CV_TR_CONT_CONFIG, + CV_SV_CONT_CONFIG, + CV_UK_CONT_CONFIG, +] diff --git a/ultravox/data/configs/covost2.py b/ultravox/data/configs/covost2.py new file mode 100644 index 00000000..42091199 --- /dev/null +++ b/ultravox/data/configs/covost2.py @@ -0,0 +1,481 @@ +from ultravox.data import types + +CVST_BASE_CONFIG = types.DatasetConfig( + name="covost2", + path="fixie-ai/covost2", + user_template=types.TRANSLATION_USER_TEMPLATE, + transcript_template="{{sentence}}", + assistant_template="{{translation}}", +) + +CVST_AR_EN_CONFIG = types.DatasetConfig( + name="covost2-ar-en", + base="covost2", + subset="ar_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=2_283), + types.DatasetSplitConfig(name="validation", num_samples=1_758), + types.DatasetSplitConfig(name="test", num_samples=1_695), + ], + user_template_args={"target": "English"}, +) + +CVST_CA_EN_CONFIG = types.DatasetConfig( + name="covost2-ca-en", + base="covost2", + subset="ca_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=95_854), + types.DatasetSplitConfig(name="validation", num_samples=12_730), + types.DatasetSplitConfig(name="test", num_samples=12_730), + ], + user_template_args={"target": "English"}, +) + +CVST_CY_EN_CONFIG = types.DatasetConfig( + name="covost2-cy-en", + base="covost2", + subset="cy_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_241), + types.DatasetSplitConfig(name="validation", num_samples=690), + types.DatasetSplitConfig(name="test", num_samples=690), + ], + user_template_args={"target": "English"}, +) + +CVST_DE_EN_CONFIG = types.DatasetConfig( + name="covost2-de-en", + base="covost2", + subset="de_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=127_834), + types.DatasetSplitConfig(name="validation", num_samples=13_511), + types.DatasetSplitConfig(name="test", num_samples=13_511), + ], + user_template_args={"target": "English"}, +) + +CVST_EN_AR_CONFIG = types.DatasetConfig( + name="covost2-en-ar", + base="covost2", + subset="en_ar", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Arabic"}, +) + +CVST_EN_CA_CONFIG = types.DatasetConfig( + name="covost2-en-ca", + base="covost2", + subset="en_ca", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Catalan"}, +) + +CVST_EN_CY_CONFIG = types.DatasetConfig( + name="covost2-en-cy", + base="covost2", + subset="en_cy", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Welsh"}, +) + +CVST_EN_DE_CONFIG = types.DatasetConfig( + name="covost2-en-de", + base="covost2", + subset="en_de", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "German"}, +) + +CVST_EN_ET_CONFIG = types.DatasetConfig( + name="covost2-en-et", + base="covost2", + subset="en_et", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Estonian"}, +) + +CVST_EN_FA_CONFIG = types.DatasetConfig( + name="covost2-en-fa", + base="covost2", + subset="en_fa", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Persian"}, +) + +CVST_EN_ID_CONFIG = types.DatasetConfig( + name="covost2-en-id", + base="covost2", + subset="en_id", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Indonesian"}, +) + +CVST_EN_JA_CONFIG = types.DatasetConfig( + name="covost2-en-ja", + base="covost2", + subset="en_ja", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Japanese"}, +) + +CVST_EN_LV_CONFIG = types.DatasetConfig( + name="covost2-en-lv", + base="covost2", + subset="en_lv", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Latvian"}, +) + +CVST_EN_MN_CONFIG = types.DatasetConfig( + name="covost2-en-mn", + base="covost2", + subset="en_mn", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Mongolian"}, +) + +CVST_EN_SL_CONFIG = types.DatasetConfig( + name="covost2-en-sl", + base="covost2", + subset="en_sl", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Slovenian"}, +) + +CVST_EN_SV_CONFIG = types.DatasetConfig( + name="covost2-en-sv", + base="covost2", + subset="en_sv-SE", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Swedish"}, +) + +CVST_EN_TA_CONFIG = types.DatasetConfig( + name="covost2-en-ta", + base="covost2", + subset="en_ta", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Tamil"}, +) + +CVST_EN_TR_CONFIG = types.DatasetConfig( + name="covost2-en-tr", + base="covost2", + subset="en_tr", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Turkish"}, +) + +CVST_EN_ZH_CONFIG = types.DatasetConfig( + name="covost2-en-zh", + base="covost2", + subset="en_zh-CN", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=289_430), + types.DatasetSplitConfig(name="validation", num_samples=15_531), + types.DatasetSplitConfig(name="test", num_samples=15_531), + ], + user_template_args={"target": "Chinese"}, +) + +CVST_ES_EN_CONFIG = types.DatasetConfig( + name="covost2-es-en", + base="covost2", + subset="es_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=79_015), + types.DatasetSplitConfig(name="validation", num_samples=13_221), + types.DatasetSplitConfig(name="test", num_samples=13_221), + ], + user_template_args={"target": "English"}, +) + +CVST_ET_EN_CONFIG = types.DatasetConfig( + name="covost2-et-en", + base="covost2", + subset="et_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_782), + types.DatasetSplitConfig(name="validation", num_samples=1_576), + types.DatasetSplitConfig(name="test", num_samples=1_571), + ], + user_template_args={"target": "English"}, +) + +CVST_FA_EN_CONFIG = types.DatasetConfig( + name="covost2-fa-en", + base="covost2", + subset="fa_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=53_949), + types.DatasetSplitConfig(name="validation", num_samples=3_445), + types.DatasetSplitConfig(name="test", num_samples=3_445), + ], + user_template_args={"target": "English"}, +) + +CVST_FR_EN_CONFIG = types.DatasetConfig( + name="covost2-fr-en", + base="covost2", + subset="fr_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=207_374), + types.DatasetSplitConfig(name="validation", num_samples=14_760), + types.DatasetSplitConfig(name="test", num_samples=14_760), + ], + user_template_args={"target": "English"}, +) + +CVST_ID_EN_CONFIG = types.DatasetConfig( + name="covost2-id-en", + base="covost2", + subset="id_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_243), + types.DatasetSplitConfig(name="validation", num_samples=792), + types.DatasetSplitConfig(name="test", num_samples=844), + ], + user_template_args={"target": "English"}, +) + +CVST_IT_EN_CONFIG = types.DatasetConfig( + name="covost2-it-en", + base="covost2", + subset="it_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=31_698), + types.DatasetSplitConfig(name="validation", num_samples=8_940), + types.DatasetSplitConfig(name="test", num_samples=8_951), + ], + user_template_args={"target": "English"}, +) + +CVST_JA_EN_CONFIG = types.DatasetConfig( + name="covost2-ja-en", + base="covost2", + subset="ja_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_119), + types.DatasetSplitConfig(name="validation", num_samples=635), + types.DatasetSplitConfig(name="test", num_samples=684), + ], + user_template_args={"target": "English"}, +) + +CVST_LV_EN_CONFIG = types.DatasetConfig( + name="covost2-lv-en", + base="covost2", + subset="lv_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=2_337), + types.DatasetSplitConfig(name="validation", num_samples=1_125), + types.DatasetSplitConfig(name="test", num_samples=1_629), + ], + user_template_args={"target": "English"}, +) + +CVST_MN_EN_CONFIG = types.DatasetConfig( + name="covost2-mn-en", + base="covost2", + subset="mn_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=2_067), + types.DatasetSplitConfig(name="validation", num_samples=1_761), + types.DatasetSplitConfig(name="test", num_samples=1_759), + ], + user_template_args={"target": "English"}, +) + +CVST_NL_EN_CONFIG = types.DatasetConfig( + name="covost2-nl-en", + base="covost2", + subset="nl_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=7_108), + types.DatasetSplitConfig(name="validation", num_samples=1_699), + types.DatasetSplitConfig(name="test", num_samples=1_699), + ], + user_template_args={"target": "English"}, +) + +CVST_PT_EN_CONFIG = types.DatasetConfig( + name="covost2-pt-en", + base="covost2", + subset="pt_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=9_158), + types.DatasetSplitConfig(name="validation", num_samples=3_318), + types.DatasetSplitConfig(name="test", num_samples=4_023), + ], + user_template_args={"target": "English"}, +) + +CVST_RU_EN_CONFIG = types.DatasetConfig( + name="covost2-ru-en", + base="covost2", + subset="ru_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=12_112), + types.DatasetSplitConfig(name="validation", num_samples=6_110), + types.DatasetSplitConfig(name="test", num_samples=6_300), + ], + user_template_args={"target": "English"}, +) + +CVST_SL_EN_CONFIG = types.DatasetConfig( + name="covost2-sl-en", + base="covost2", + subset="sl_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_843), + types.DatasetSplitConfig(name="validation", num_samples=509), + types.DatasetSplitConfig(name="test", num_samples=360), + ], + user_template_args={"target": "English"}, +) + +CVST_SV_EN_CONFIG = types.DatasetConfig( + name="covost2-sv-en", + base="covost2", + subset="sv-SE_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=2_160), + types.DatasetSplitConfig(name="validation", num_samples=1_349), + types.DatasetSplitConfig(name="test", num_samples=1_595), + ], + user_template_args={"target": "English"}, +) + +CVST_TA_EN_CONFIG = types.DatasetConfig( + name="covost2-ta-en", + base="covost2", + subset="ta_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_358), + types.DatasetSplitConfig(name="validation", num_samples=384), + types.DatasetSplitConfig(name="test", num_samples=786), + ], + user_template_args={"target": "English"}, +) + +CVST_TR_EN_CONFIG = types.DatasetConfig( + name="covost2-tr-en", + base="covost2", + subset="tr_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=3_966), + types.DatasetSplitConfig(name="validation", num_samples=1_624), + types.DatasetSplitConfig(name="test", num_samples=1_629), + ], + user_template_args={"target": "English"}, +) + +CVST_ZH_EN_CONFIG = types.DatasetConfig( + name="covost2-zh-en", + base="covost2", + subset="zh-CN_en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=7_085), + types.DatasetSplitConfig(name="validation", num_samples=4_843), + types.DatasetSplitConfig(name="test", num_samples=4_898), + ], + user_template_args={"target": "English"}, +) + +configs = [ + CVST_BASE_CONFIG, + CVST_AR_EN_CONFIG, + CVST_CA_EN_CONFIG, + CVST_CY_EN_CONFIG, + CVST_DE_EN_CONFIG, + CVST_EN_AR_CONFIG, + CVST_EN_CA_CONFIG, + CVST_EN_CY_CONFIG, + CVST_EN_DE_CONFIG, + CVST_EN_ET_CONFIG, + CVST_EN_FA_CONFIG, + CVST_EN_ID_CONFIG, + CVST_EN_JA_CONFIG, + CVST_EN_LV_CONFIG, + CVST_EN_MN_CONFIG, + CVST_EN_SL_CONFIG, + CVST_EN_SV_CONFIG, + CVST_EN_TA_CONFIG, + CVST_EN_TR_CONFIG, + CVST_EN_ZH_CONFIG, + CVST_ES_EN_CONFIG, + CVST_ET_EN_CONFIG, + CVST_FA_EN_CONFIG, + CVST_FR_EN_CONFIG, + CVST_ID_EN_CONFIG, + CVST_IT_EN_CONFIG, + CVST_JA_EN_CONFIG, + CVST_LV_EN_CONFIG, + CVST_MN_EN_CONFIG, + CVST_NL_EN_CONFIG, + CVST_PT_EN_CONFIG, + CVST_RU_EN_CONFIG, + CVST_SL_EN_CONFIG, + CVST_SV_EN_CONFIG, + CVST_TA_EN_CONFIG, + CVST_TR_EN_CONFIG, + CVST_ZH_EN_CONFIG, +] diff --git a/ultravox/data/configs/gigaspeech.py b/ultravox/data/configs/gigaspeech.py new file mode 100644 index 00000000..17b889a4 --- /dev/null +++ b/ultravox/data/configs/gigaspeech.py @@ -0,0 +1,27 @@ +from ultravox.data import types + +GS_XL_CONFIG = types.DatasetConfig( + name="gigaspeech-xl", + path="fixie-ai/gigaspeech", + subset="xl-empty-audio-removed", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=8_266_422), + ], + transcript_template="{{text_proc.format_asr_text(text)}}", + assistant_template="{{text_proc.format_asr_text(text)}}", +) + +GS_XL_TRANS_CONFIG = types.DatasetConfig( + name="gigaspeech-xl-transcription", + base="gigaspeech-xl", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +GS_XL_CONT_CONFIG = types.DatasetConfig( + name="gigaspeech-xl-continuation", + base="gigaspeech-xl", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +configs = [GS_XL_CONFIG, GS_XL_TRANS_CONFIG, GS_XL_CONT_CONFIG] diff --git a/ultravox/data/configs/librispeech.py b/ultravox/data/configs/librispeech.py new file mode 100644 index 00000000..2f186336 --- /dev/null +++ b/ultravox/data/configs/librispeech.py @@ -0,0 +1,62 @@ +from ultravox.data import types + +LS_BASE_CONFIG = types.DatasetConfig( + name="librispeech", + path="fixie-ai/librispeech_asr", + transcript_template="{{text_proc.format_asr_text(text)}}", + assistant_template="{{text_proc.format_asr_text(text)}}", +) + +LS_CLEAN_CONFIG = types.DatasetConfig( + name="librispeech-clean", + base="librispeech", + subset="clean", + splits=[ + types.DatasetSplitConfig(name="train.100", num_samples=28_539), + types.DatasetSplitConfig(name="train.360", num_samples=104_014), + ], +) + +LS_OTHER_CONFIG = types.DatasetConfig( + name="librispeech-other", + base="librispeech", + subset="other", + splits=[ + types.DatasetSplitConfig(name="train.500", num_samples=148_688), + ], +) + +LS_CLEAN_TRANS_CONFIG = types.DatasetConfig( + name="librispeech-clean-transcription", + base="librispeech-clean", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +LS_OTHER_TRANS_CONFIG = types.DatasetConfig( + name="librispeech-other-transcription", + base="librispeech-other", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +LS_CLEAN_CONT_CONFIG = types.DatasetConfig( + name="librispeech-clean-continuation", + base="librispeech-clean", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) +LS_OTHER_CONT_CONFIG = types.DatasetConfig( + name="librispeech-other-continuation", + base="librispeech-other", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +configs = [ + LS_BASE_CONFIG, + LS_CLEAN_CONFIG, + LS_OTHER_CONFIG, + LS_CLEAN_TRANS_CONFIG, + LS_OTHER_TRANS_CONFIG, + LS_CLEAN_CONT_CONFIG, + LS_OTHER_CONT_CONFIG, +] diff --git a/ultravox/data/configs/multilingual_librispeech.py b/ultravox/data/configs/multilingual_librispeech.py new file mode 100644 index 00000000..4209f27c --- /dev/null +++ b/ultravox/data/configs/multilingual_librispeech.py @@ -0,0 +1,58 @@ +from ultravox.data import types + +ML_BASE_CONFIG = types.DatasetConfig( + name="multilingual_librispeech", + path="fixie-ai/multilingual_librispeech", + transcript_template="{{transcript}}", + assistant_template="{{transcript}}", +) + +ML_NL_CONFIG = types.DatasetConfig( + name="multilingual_librispeech-nl", + base="multilingual_librispeech", + subset="dutch", + splits=[types.DatasetSplitConfig(name="train", num_samples=37_533)], +) + +ML_PT_CONFIG = types.DatasetConfig( + name="multilingual_librispeech-pt", + base="multilingual_librispeech", + subset="portuguese", + splits=[types.DatasetSplitConfig(name="train", num_samples=37_533)], +) + +ML_NL_TRANS_CONFIG = types.DatasetConfig( + name="multilingual_librispeech-nl-transcription", + base="multilingual_librispeech-nl", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +ML_PT_TRANS_CONFIG = types.DatasetConfig( + name="multilingual_librispeech-pt-transcription", + base="multilingual_librispeech-pt", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +ML_NL_CONT_CONFIG = types.DatasetConfig( + name="multilingual_librispeech-nl-continuation", + base="multilingual_librispeech-nl", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +ML_PT_CONT_CONFIG = types.DatasetConfig( + name="multilingual_librispeech-pt-continuation", + base="multilingual_librispeech-pt", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +configs = [ + ML_BASE_CONFIG, + ML_NL_CONFIG, + ML_PT_CONFIG, + ML_NL_TRANS_CONFIG, + ML_PT_TRANS_CONFIG, + ML_NL_CONT_CONFIG, + ML_PT_CONT_CONFIG, +] diff --git a/ultravox/data/configs/peoplespeech.py b/ultravox/data/configs/peoplespeech.py new file mode 100644 index 00000000..36f8e8f8 --- /dev/null +++ b/ultravox/data/configs/peoplespeech.py @@ -0,0 +1,34 @@ +from ultravox.data import types + +PS_BASE_CONFIG = types.DatasetConfig( + name="peoplespeech", + path="fixie-ai/peoples_speech", + subset="clean", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_501_271), + types.DatasetSplitConfig( + name="test", num_samples=34_898, split_type=types.DatasetSplit.VALIDATION + ), + ], + assistant_template="{{text_proc.format_asr_text(text)}}", + transcript_template="{{text_proc.format_asr_text(text)}}", +) + +PS_TRANS_CONFIG = types.DatasetConfig( + name="peoplespeech-clean-transcription", + base="peoplespeech", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +PS_CONT_CONFIG = types.DatasetConfig( + name="peoplespeech-clean-continuation", + base="peoplespeech", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +configs = [ + PS_BASE_CONFIG, + PS_TRANS_CONFIG, + PS_CONT_CONFIG, +] diff --git a/ultravox/data/configs/voxpopuli.py b/ultravox/data/configs/voxpopuli.py new file mode 100644 index 00000000..dd62b862 --- /dev/null +++ b/ultravox/data/configs/voxpopuli.py @@ -0,0 +1,17 @@ +from ultravox.data import types + +VP_EN_CONFIG = types.DatasetConfig( + name="voxpopuli-en", + path="facebook/voxpopuli", + subset="en", + splits=[ + types.DatasetSplitConfig(name="train", num_samples=1_000_000), + types.DatasetSplitConfig(name="validation", num_samples=10_000), + ], + assistant_template="{{raw_text}}", + transcript_template="{{raw_text}}", +) + +configs = [ + VP_EN_CONFIG, +] diff --git a/ultravox/data/configs/wenetspeech.py b/ultravox/data/configs/wenetspeech.py new file mode 100644 index 00000000..a487a3cc --- /dev/null +++ b/ultravox/data/configs/wenetspeech.py @@ -0,0 +1,28 @@ +from ultravox.data import types + +WS_BASE_CONFIG = types.DatasetConfig( + name="wenetspeech", + path="fixie-ai/wenetspeech", + subset="L_fixed", + splits=[types.DatasetSplitConfig(name="train", num_samples=14_621_415)], + transcript_template="{{text}}", +) + +WS_TRANS_CONFIG = types.DatasetConfig( + name="wenetspeech-transcription", + base="wenetspeech", + user_template=types.TRANSCRIPTION_USER_TEMPLATE, +) + +WS_CONT_CONFIG = types.DatasetConfig( + name="wenetspeech-continuation", + base="wenetspeech", + user_template=types.CONTINUATION_USER_TEMPLATE, + assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE, +) + +configs = [ + WS_BASE_CONFIG, + WS_TRANS_CONFIG, + WS_CONT_CONFIG, +] From 996b2c0ed28b7167b1c0f688df7d47722fa43cd3 Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Tue, 5 Nov 2024 23:58:29 -0800 Subject: [PATCH 08/13] update --- ultravox/data/types.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ultravox/data/types.py b/ultravox/data/types.py index 58eccc1d..b0b78ab3 100644 --- a/ultravox/data/types.py +++ b/ultravox/data/types.py @@ -54,12 +54,12 @@ class DatasetSplitConfig(helpers.Serializable): def __post_init__(self): """Automatically set split type based on split name""" if self.split_type is None: - if self.name == "test": - self.split_type = DatasetSplit.TEST - elif self.name == "validation": - self.split_type = DatasetSplit.VALIDATION - else: - self.split_type = DatasetSplit.TRAIN + try: + self.split_type = DatasetSplit(self.name) + except ValueError: + raise ValueError( + f"Could not automatically determine split type for '{self.name}'. Please explicitly specify split_type for splits that are not named 'train', 'validation', or 'test'." + ) @dataclasses.dataclass From ab44c5c1a79076f98b5b0f097cecf251e22e81e8 Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Wed, 6 Nov 2024 00:03:07 -0800 Subject: [PATCH 09/13] update --- ultravox/data/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ultravox/data/types.py b/ultravox/data/types.py index b0b78ab3..3802f5d4 100644 --- a/ultravox/data/types.py +++ b/ultravox/data/types.py @@ -55,7 +55,7 @@ def __post_init__(self): """Automatically set split type based on split name""" if self.split_type is None: try: - self.split_type = DatasetSplit(self.name) + self.split_type = DatasetSplit(self.name.lower()) except ValueError: raise ValueError( f"Could not automatically determine split type for '{self.name}'. Please explicitly specify split_type for splits that are not named 'train', 'validation', or 'test'." From 2e6c153cd785ef387f1e29778e5bebd6e7726254 Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Wed, 6 Nov 2024 00:06:28 -0800 Subject: [PATCH 10/13] update --- ultravox/data/configs/librispeech.py | 6 +++--- ultravox/data/types.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ultravox/data/configs/librispeech.py b/ultravox/data/configs/librispeech.py index 2f186336..73e5cf9c 100644 --- a/ultravox/data/configs/librispeech.py +++ b/ultravox/data/configs/librispeech.py @@ -12,8 +12,8 @@ base="librispeech", subset="clean", splits=[ - types.DatasetSplitConfig(name="train.100", num_samples=28_539), - types.DatasetSplitConfig(name="train.360", num_samples=104_014), + types.DatasetSplitConfig(name="train.100", num_samples=28_539, split_type=types.DatasetSplit.TRAIN), + types.DatasetSplitConfig(name="train.360", num_samples=104_014, split_type=types.DatasetSplit.TRAIN), ], ) @@ -22,7 +22,7 @@ base="librispeech", subset="other", splits=[ - types.DatasetSplitConfig(name="train.500", num_samples=148_688), + types.DatasetSplitConfig(name="train.500", num_samples=148_688, split_type=types.DatasetSplit.TRAIN), ], ) diff --git a/ultravox/data/types.py b/ultravox/data/types.py index 3802f5d4..2d24559e 100644 --- a/ultravox/data/types.py +++ b/ultravox/data/types.py @@ -58,7 +58,7 @@ def __post_init__(self): self.split_type = DatasetSplit(self.name.lower()) except ValueError: raise ValueError( - f"Could not automatically determine split type for '{self.name}'. Please explicitly specify split_type for splits that are not named 'train', 'validation', or 'test'." + f"Could not automatically determine split type from split name '{self.name}'. Please explicitly specify split_type for splits that are not named 'train', 'validation', or 'test'." ) From 367938bf3862796738e7158a264d5eabc3340f07 Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Wed, 6 Nov 2024 00:15:06 -0800 Subject: [PATCH 11/13] update --- ultravox/data/configs/librispeech.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ultravox/data/configs/librispeech.py b/ultravox/data/configs/librispeech.py index 73e5cf9c..0fd9c083 100644 --- a/ultravox/data/configs/librispeech.py +++ b/ultravox/data/configs/librispeech.py @@ -12,8 +12,12 @@ base="librispeech", subset="clean", splits=[ - types.DatasetSplitConfig(name="train.100", num_samples=28_539, split_type=types.DatasetSplit.TRAIN), - types.DatasetSplitConfig(name="train.360", num_samples=104_014, split_type=types.DatasetSplit.TRAIN), + types.DatasetSplitConfig( + name="train.100", num_samples=28_539, split_type=types.DatasetSplit.TRAIN + ), + types.DatasetSplitConfig( + name="train.360", num_samples=104_014, split_type=types.DatasetSplit.TRAIN + ), ], ) @@ -22,7 +26,9 @@ base="librispeech", subset="other", splits=[ - types.DatasetSplitConfig(name="train.500", num_samples=148_688, split_type=types.DatasetSplit.TRAIN), + types.DatasetSplitConfig( + name="train.500", num_samples=148_688, split_type=types.DatasetSplit.TRAIN + ), ], ) From f04598fc67f2f660801736fdc7669f211b5bd52b Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Wed, 6 Nov 2024 00:23:50 -0800 Subject: [PATCH 12/13] update --- ultravox/data/datasets_test.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ultravox/data/datasets_test.py b/ultravox/data/datasets_test.py index 98e4c6f6..7c81fe1e 100644 --- a/ultravox/data/datasets_test.py +++ b/ultravox/data/datasets_test.py @@ -169,8 +169,8 @@ def test_dataset_config(): name="fake_dataset", path="mock_path", splits=[ - types.DatasetSplitConfig(name="clean", num_samples=5000), - types.DatasetSplitConfig(name="other", num_samples=10000), + types.DatasetSplitConfig(name="clean", num_samples=5000, split_type=types.DatasetSplit.TRAIN), + types.DatasetSplitConfig(name="other", num_samples=10000, split_type=types.DatasetSplit.TRAIN), types.DatasetSplitConfig(name="validation", num_samples=1000), types.DatasetSplitConfig( name="another_validation", @@ -201,8 +201,8 @@ def test_dataset_config_serialization(): name="fake_dataset", path="fake_path", splits=[ - types.DatasetSplitConfig(name="clean", num_samples=5000), - types.DatasetSplitConfig(name="other", num_samples=10000), + types.DatasetSplitConfig(name="clean", num_samples=5000, split_type=types.DatasetSplit.TRAIN), + types.DatasetSplitConfig(name="other", num_samples=10000, split_type=types.DatasetSplit.TRAIN), ], ) serialized = config.dumps_yaml() @@ -221,7 +221,7 @@ def test_generic_dataset(): config = types.DatasetConfig( name="fake_dataset", path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], + splits=[types.DatasetSplitConfig(name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN)], ) ds = FakeGenericDataset(5, config) assert len(ds) == 5 @@ -240,7 +240,7 @@ def test_generic_dataset_custom_templates(): config = types.DatasetConfig( name="fake_dataset", path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], + splits=[types.DatasetSplitConfig(name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN)], user_template="Listen to the following and respond with 'xyzzy':\n<|audio|>", assistant_template="xyzzy", transcript_template="{{text}}", @@ -265,7 +265,7 @@ def test_generic_dataset_text_only(): config = types.DatasetConfig( name="fake_dataset", path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], + splits=[types.DatasetSplitConfig(name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN)], user_template="Transcribe\n<|audio|>", ) ds = FakeGenericDataset(5, config, types.VoiceDatasetArgs(include_audio=False)) @@ -283,7 +283,7 @@ def test_generic_dataset_merge_configs(): base_config = types.DatasetConfig( name="fake_base", path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], + splits=[types.DatasetSplitConfig(name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN)], ) mid_config = types.DatasetConfig( name="fake_mid", @@ -315,7 +315,7 @@ def test_generic_dataset_length_mismatch(): config = types.DatasetConfig( name="fake_dataset", path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5)], + splits=[types.DatasetSplitConfig(name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN)], ) ds = FakeGenericDataset(10, config) assert len(ds) == 5 @@ -327,7 +327,7 @@ def test_generic_dataset_length_mismatch(): config = types.DatasetConfig( name="fake_dataset", path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=10)], + splits=[types.DatasetSplitConfig(name="fake", num_samples=10, split_type=types.DatasetSplit.TRAIN)], ) ds = FakeGenericDataset(5, config) assert len(ds) == 10 From 96d58c6090bbd579c9693b0dfae2582dd518b8ec Mon Sep 17 00:00:00 2001 From: Zhongqiang Huang Date: Wed, 6 Nov 2024 00:29:51 -0800 Subject: [PATCH 13/13] update --- ultravox/data/datasets_test.py | 52 +++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/ultravox/data/datasets_test.py b/ultravox/data/datasets_test.py index 7c81fe1e..3fd9689d 100644 --- a/ultravox/data/datasets_test.py +++ b/ultravox/data/datasets_test.py @@ -169,8 +169,12 @@ def test_dataset_config(): name="fake_dataset", path="mock_path", splits=[ - types.DatasetSplitConfig(name="clean", num_samples=5000, split_type=types.DatasetSplit.TRAIN), - types.DatasetSplitConfig(name="other", num_samples=10000, split_type=types.DatasetSplit.TRAIN), + types.DatasetSplitConfig( + name="clean", num_samples=5000, split_type=types.DatasetSplit.TRAIN + ), + types.DatasetSplitConfig( + name="other", num_samples=10000, split_type=types.DatasetSplit.TRAIN + ), types.DatasetSplitConfig(name="validation", num_samples=1000), types.DatasetSplitConfig( name="another_validation", @@ -201,8 +205,12 @@ def test_dataset_config_serialization(): name="fake_dataset", path="fake_path", splits=[ - types.DatasetSplitConfig(name="clean", num_samples=5000, split_type=types.DatasetSplit.TRAIN), - types.DatasetSplitConfig(name="other", num_samples=10000, split_type=types.DatasetSplit.TRAIN), + types.DatasetSplitConfig( + name="clean", num_samples=5000, split_type=types.DatasetSplit.TRAIN + ), + types.DatasetSplitConfig( + name="other", num_samples=10000, split_type=types.DatasetSplit.TRAIN + ), ], ) serialized = config.dumps_yaml() @@ -221,7 +229,11 @@ def test_generic_dataset(): config = types.DatasetConfig( name="fake_dataset", path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN)], + splits=[ + types.DatasetSplitConfig( + name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN + ) + ], ) ds = FakeGenericDataset(5, config) assert len(ds) == 5 @@ -240,7 +252,11 @@ def test_generic_dataset_custom_templates(): config = types.DatasetConfig( name="fake_dataset", path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN)], + splits=[ + types.DatasetSplitConfig( + name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN + ) + ], user_template="Listen to the following and respond with 'xyzzy':\n<|audio|>", assistant_template="xyzzy", transcript_template="{{text}}", @@ -265,7 +281,11 @@ def test_generic_dataset_text_only(): config = types.DatasetConfig( name="fake_dataset", path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN)], + splits=[ + types.DatasetSplitConfig( + name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN + ) + ], user_template="Transcribe\n<|audio|>", ) ds = FakeGenericDataset(5, config, types.VoiceDatasetArgs(include_audio=False)) @@ -283,7 +303,11 @@ def test_generic_dataset_merge_configs(): base_config = types.DatasetConfig( name="fake_base", path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN)], + splits=[ + types.DatasetSplitConfig( + name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN + ) + ], ) mid_config = types.DatasetConfig( name="fake_mid", @@ -315,7 +339,11 @@ def test_generic_dataset_length_mismatch(): config = types.DatasetConfig( name="fake_dataset", path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN)], + splits=[ + types.DatasetSplitConfig( + name="fake", num_samples=5, split_type=types.DatasetSplit.TRAIN + ) + ], ) ds = FakeGenericDataset(10, config) assert len(ds) == 5 @@ -327,7 +355,11 @@ def test_generic_dataset_length_mismatch(): config = types.DatasetConfig( name="fake_dataset", path="fake_path", - splits=[types.DatasetSplitConfig(name="fake", num_samples=10, split_type=types.DatasetSplit.TRAIN)], + splits=[ + types.DatasetSplitConfig( + name="fake", num_samples=10, split_type=types.DatasetSplit.TRAIN + ) + ], ) ds = FakeGenericDataset(5, config) assert len(ds) == 10