From 27de8458bbfe77258235d077eb55cb68e7701d59 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Tue, 11 Jun 2024 01:02:26 +0300 Subject: [PATCH 01/17] cherry pick of #9266 (#9411) * add deprecation warnings for non-mcore models Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * change warning default time Signed-off-by: dimapihtar * remove unused import Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * remove deprecated tests Signed-off-by: dimapihtar * set mcore_gpt to True Signed-off-by: dimapihtar * set mcore_bert to True Signed-off-by: dimapihtar * remove deprecated tests Signed-off-by: dimapihtar * remove deprecated unit tests Signed-off-by: dimapihtar * add deprecation warning Signed-off-by: dimapihtar * Apply isort and black reformatting Signed-off-by: dimapihtar * remove deprecated playbook Signed-off-by: dimapihtar * remove deprecated tutorial Signed-off-by: dimapihtar * turn off FA for Bert Signed-off-by: dimapihtar * turn of FA for Bert Signed-off-by: dimapihtar * change mcore commit Signed-off-by: dimapihtar * adjustments * update TE commit Signed-off-by: dimapihtar * fix mcore precision issue Signed-off-by: dimapihtar * change precision for bert Signed-off-by: dimapihtar * change precision for fine-tuning Signed-off-by: dimapihtar * turn off fused attention for bert Signed-off-by: dimapihtar * fix bert test Signed-off-by: dimapihtar * revert tests Signed-off-by: dimapihtar * fix typo Signed-off-by: dimapihtar * remove unnecessary Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Signed-off-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: Pablo Garay --- .github/workflows/cicd-main.yml | 2065 ++++++----------- .../conf/megatron_bert_config.yaml | 8 +- 
.../conf/megatron_gpt_config.yaml | 6 +- .../assistant_data_processor.py | 19 +- .../dialogue/data_processor/data_processor.py | 8 +- .../data_processor/design_data_processor.py | 6 +- .../mellon_qa_data_processor.py | 15 +- .../data_processor/ms_marco_data_processor.py | 12 +- .../data_processor/sgd_data_processor.py | 34 +- .../dialogue/dataset/dialogue_bert_dataset.py | 15 +- .../dialogue_gpt_classification_dataset.py | 15 +- .../dialogue_gpt_generation_dataset.py | 15 +- .../dialogue_nearest_neighbour_dataset.py | 4 + .../dialogue_s2s_generation_dataset.py | 15 +- .../dialogue_zero_shot_intent_dataset.py | 21 +- .../megatron/base_prompt_learning_dataset.py | 20 +- .../megatron/gpt_prompt_learning_dataset.py | 32 +- .../dataset/qa_bert_dataset.py | 14 +- .../question_answering/dataset/qa_dataset.py | 32 +- .../dataset/qa_gpt_dataset.py | 21 +- .../dataset/qa_s2s_dataset.py | 35 +- .../question_answering_squad/qa_dataset.py | 24 +- .../bert_example.py | 104 +- .../dialogue_gpt_classification_model.py | 26 +- .../dialogue/dialogue_gpt_generation_model.py | 19 +- .../dialogue_nearest_neighbour_model.py | 11 +- .../dialogue/dialogue_s2s_generation_model.py | 14 +- .../dialogue_zero_shot_intent_model.py | 10 +- .../intent_slot_classification_model.py | 15 +- .../nlp/models/dialogue/sgdqa_model.py | 16 +- .../entity_linking/entity_linking_model.py | 6 +- .../glue_benchmark/glue_benchmark_model.py | 3 + .../megatron/bert/bert_model.py | 22 +- .../language_modeling/megatron/gpt_model.py | 16 +- .../megatron_base_prompt_learning_model.py | 4 + .../megatron_gpt_prompt_learning_model.py | 65 +- .../question_answering/qa_base_model.py | 11 +- .../question_answering/qa_bert_model.py | 32 +- .../models/question_answering/qa_gpt_model.py | 34 +- .../nlp/models/question_answering/qa_model.py | 6 +- .../models/question_answering/qa_s2s_model.py | 44 +- .../spellchecking_model.py | 11 +- nemo/utils/decorators/__init__.py | 2 +- nemo/utils/decorators/deprecated.py | 39 +- 
tests/collections/nlp/test_dialogue.py | 278 --- .../nlp/test_entity_linking_model.py | 84 - tests/collections/nlp/test_megatron.py | 81 - tests/collections/nlp/test_mem_map_dataset.py | 133 -- tests/collections/nlp/test_prompt_learning.py | 142 -- tests/collections/nlp/test_qna.py | 240 -- .../nlp/test_question_answering.py | 185 -- .../test_spellchecking_asr_customization.py | 1102 --------- tutorials/nlp/Dialogue.ipynb | 717 ------ tutorials/nlp/Entity_Linking_Medical.ipynb | 632 ----- tutorials/nlp/GLUE_Benchmark.ipynb | 566 ----- tutorials/nlp/MegatronBert_export.ipynb | 280 --- tutorials/nlp/Question_Answering.ipynb | 1163 ---------- ...pellMapper_English_ASR_Customization.ipynb | 1412 ----------- 58 files changed, 1252 insertions(+), 8709 deletions(-) delete mode 100644 tests/collections/nlp/test_dialogue.py delete mode 100644 tests/collections/nlp/test_entity_linking_model.py delete mode 100644 tests/collections/nlp/test_megatron.py delete mode 100644 tests/collections/nlp/test_mem_map_dataset.py delete mode 100644 tests/collections/nlp/test_prompt_learning.py delete mode 100644 tests/collections/nlp/test_qna.py delete mode 100644 tests/collections/nlp/test_question_answering.py delete mode 100644 tests/collections/nlp/test_spellchecking_asr_customization.py delete mode 100644 tutorials/nlp/Dialogue.ipynb delete mode 100644 tutorials/nlp/Entity_Linking_Medical.ipynb delete mode 100644 tutorials/nlp/GLUE_Benchmark.ipynb delete mode 100644 tutorials/nlp/MegatronBert_export.ipynb delete mode 100644 tutorials/nlp/Question_Answering.ipynb delete mode 100644 tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 12b8cdcb8eedf..01a8cfc4b0df6 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -871,318 +871,6 @@ jobs: pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ output_manifest=preds.json 
- # L2: Dialogue Classification - - # TODO: pleasefixme - # L2_Dialogue_Classification_Dialogue_Intent_and_slot_classification_using_GPT: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure-gpus-1 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # cd examples/nlp/dialogue && \ - # python dialogue.py \ - # model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin\ - # model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - # model.dataset.dialogues_example_dir=sgd_gen_outputs \ - # model.dataset.task_name=debug_sample \ - # trainer.max_steps=1 \ - # trainer.max_epochs=1 \ - # model.train_ds.batch_size=2 \ - # model.validation_ds.batch_size=2 \ - # model.test_ds.batch_size=2 \ - # model.nemo_path=null \ - # trainer.val_check_interval=0.0 \ - # trainer.devices=1 \ - # model.dataset.use_cache=false \ - # model.tokenizer.special_tokens={pad_token:"endoftext"} \ - # model.tokenizer.tokenizer_name=gpt2 \ - # model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - # model.language_model.pretrained_model_name=/home/TestData/nlp/gpt2 \ - # trainer.accelerator=gpu \ - # exp_manager=null && \ - # rm -rf sgd_gen_outputs - - L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ - model.dataset.task_name=debug_sample \ - 
trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.dataset.num_tasks=6 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-cased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_outputs - - L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ - model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ - model.dataset.task=assistant \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_intent_classification_outputs - - L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ - model.dataset.task=zero_shot \ - model.dataset.prompt_template="This example is" \ - 
trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_zero_shot_intent_classification_outputs - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=megatron \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_outputs - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - 
model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_bart_outputs - - L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_dialogue_nearest_neighbour_classification_outputs - - # L2: Dialogue Generation - L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender_s2s \ - model.dataset.task=ms_marco \ - model.library=huggingface \ - model.dataset.debug_mode=True \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - 
model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender_s2s - - L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ - model.dataset.task_name=debug_sample \ - model.dataset.task=sgd_generation \ - model.dataset.input_field=utterance+system_actions \ - model.dataset.output_field=system_utterance \ - model.dataset.use_cache=false \ - model.dataset.system_utterance=next_turn \ - model.dataset.debug_mode=True \ - model.dataset.prompt_template=slots_values \ - model.library=huggingface \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null - AFTER_SCRIPT: | - rm -rf sgd_answer_extender_s2s - -# - name: L2: Dialogue Generation Part 2 -# when { -# anyOf { -# branch main -# changeRequest target: main -# } -# } -# failFast true -# parallel { -# - name: Dialogue: Answer Extender using DialogueGPTGenerationModel -# - run: | -# cd examples/nlp/dialogue && \ -# python dialogue.py \ -# do_training=False \ -# model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ -# model.dataset.dialogues_example_dir=answer_extender \ -# 
model.library=huggingface \ -# model.dataset.task=ms_marco \ -# model.dataset.debug_mode=True \ -# trainer.val_check_interval=0.0 \ -# trainer.devices=1 \ -# model.dataset.use_cache=false \ -# model.language_model.pretrained_model_name=gpt2 \ -# trainer.accelerator=gpu \ -# exp_manager=null && \ -# rm -rf answer_extender -# } -# } -# } -# } - - # L2: COPY - L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender \ - model.library=huggingface \ - model.dataset.task=ms_marco \ - model.dataset.debug_mode=True \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender - # L2: Duplex Text Normalization L2_Duplex_Text_Normalization_with_Tarred_dataset: needs: [cicd-test-container-setup] @@ -1212,216 +900,6 @@ jobs: data.test_ds.use_cache=false \ data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv -# Runs out of memory on the 12G TITAN V (GPU 0 on main CI) -# TODO: add when megatron bert is supported again in NeMo -# - name: L2: MegaBERT Token Classification -# when { -# anyOf { -# branch main -# changeRequest target: main -# } -# } -# failFast true -# - run: | -# cd examples/nlp/token_classification && \ -# python token_classification_train.py \ -# model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ -# model.language_model.pretrained_model_name=megatron-bert-345m-uncased \ -# model.train_ds.batch_size=10 \ -# model.dataset.max_seq_length=50 \ -# model.dataset.use_cache=false \ -# trainer.accelerator=gpu \ -# trainer.strategy=ddp \ 
-# trainer.precision=16 \ -# trainer.devices=1 \ -# trainer.accelerator="gpu" \ -# +trainer.fast_dev_run=true \ -# exp_manager=null -# } -# } - - # L2: BERT Text Classification - L2_BERT_Text_Classification_with_BERT_Test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/text_classification && \ - python text_classification_with_bert.py \ - model.dataset.num_classes=6 \ - model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.train_ds.batch_size=10 \ - model.dataset.max_seq_length=50 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null - - # L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0 - L2_Parallel_BERT_Question-Answering_SQUAD_v1_1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - 
trainer.accelerator="gpu" \ - exp_manager=null - - L2_Parallel_BERT_Question-Answering_SQUAD_v2_0: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - # L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0 - L2_Parallel_BART_Question-Answering_SQUAD_v1_1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - 
trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - L2_Parallel_BART_Question-Answering_SQUAD_v2_0: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - # L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0 - L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - 
trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null # L2: Intent and Slot Classification Tasks L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: @@ -1653,241 +1131,7 @@ jobs: pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; rm -rf "${data_dir}" - - - L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/token_classification && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir}" \ - model.validation_ds.ds_item="${tmp_data_dir}" \ - model.test_ds.ds_item="${tmp_data_dir}" \ - 
model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=true && \ - tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ - mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ - rm -rf "${tmp_data_dir}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir_2}" \ - model.validation_ds.ds_item="${tmp_data_dir_2}" \ - model.test_ds.ds_item="${tmp_data_dir_2}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null; - rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ - "${tmp_data_dir_2}" \ - "${output_dir}" - - # Punctuation & Capitalization tarred dataset: - Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ - /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ - "${data_dir}"/ && \ - usual_data=${data_dir}/wmt_wiki_10000 && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tarred_data=${output_dir}/train_tarred && \ - tokens_in_batch=2000 && \ - max_seq_length=512 && \ - lm_model=distilbert-base-uncased && \ - python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ - --text 
${usual_data}/input.txt \ - --labels ${usual_data}/labels.txt \ - --output_dir ${tarred_data} \ - --tokens_in_batch ${tokens_in_batch} \ - --max_seq_length 512 \ - --lines_per_dataset_fragment 2000 \ - --num_batches_per_tarfile 5 \ - --tar_file_prefix punctuation_capitalization \ - --tokenizer_name ${lm_model} \ - --use_fast_tokenizer \ - --pad_label O \ - --n_jobs 3 && \ - echo "Number of tarred files in dataset:" && \ - ls ${tarred_data}/*.tar | wc -l && \ - echo "Label id files in dataset:" && \ - ls ${tarred_data}/*.csv && \ - metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.train_ds.ds_item=${tarred_data} \ - model.language_model.pretrained_model_name=${lm_model} \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.tar_metadata_file=${metadata_file} \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir=${output_dir}/output; - - rm -rf "${output_dir}" "${data_dir}" - - # Punctuation_Capitalization_Different_ways_of_passing_labels_to_model - Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - label_vocab_dir="${work_dir}/labels" && \ - mkdir -p ${label_vocab_dir} && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - 
output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ - capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ - printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ - printf "O\nU\n" > "${capit_label_vocab}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ - model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ - model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}" - - # TODO: pleasefixme - # Punctuation_Capitalization_Using_model-common_datasets_parameters-punct-capit-_label_ids: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # 
--env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # cd examples/nlp/token_classification && \ - # work_dir="$(mktemp -d -p "$(pwd)")" && \ - # output_dir="${work_dir}/output" && \ - # mkdir -p "${output_dir}" && \ - # data_dir="${work_dir}/data" && \ - # mkdir -p "${data_dir}" && \ - # cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - # conf_name=punctuation_capitalization_config_with_ids && \ - # cp conf/punctuation_capitalization_config.yaml "${work_dir}/${conf_name}.yaml" && \ - # sed -i $\'s/punct_label_ids: null/punct_label_ids: {O: 0, \\\',\\\': 1, .: 2, \\\'?\\\': 3}/\' \ - # "${work_dir}/${conf_name}.yaml" && \ - # sed -i $\'s/capit_label_ids: null/capit_label_ids: {O: 0, U: 1}/\' \ - # "${work_dir}/${conf_name}.yaml" && \ - # python punctuation_capitalization_train_evaluate.py \ - # --config-path "${work_dir}" \ - # --config-name "${conf_name}" \ - # model.train_ds.use_tarred_dataset=false \ - # model.train_ds.ds_item="${data_dir}" \ - # model.validation_ds.ds_item="${data_dir}" \ - # model.test_ds.ds_item="${data_dir}" \ - # model.language_model.pretrained_model_name=distilbert-base-uncased \ - # +model.train_ds.use_cache=false \ - # +model.validation_ds.use_cache=false \ - # +model.test_ds.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.strategy=ddp \ - # trainer.max_epochs=1 \ - # +exp_manager.explicit_log_dir="${output_dir}" \ - # +do_testing=false && \ - # python punctuation_capitalization_train_evaluate.py \ - # +do_training=false \ - # +do_testing=true \ - # ~model.train_ds \ - # ~model.validation_ds \ - # model.test_ds.ds_item="${data_dir}" \ - # pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - # +model.train_ds.use_cache=false \ - # +model.validation_ds.use_cache=false \ - # +model.test_ds.use_cache=false \ - # trainer.devices=[0,1] \ - # 
trainer.strategy=ddp \ - # trainer.max_epochs=1 \ - # exp_manager=null && \ - # rm -rf "${work_dir}" - - # Punctuation & Capitalization inference - Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - output_dir="$(mktemp -d -p "$(pwd)")" && \ - python examples/nlp/token_classification/punctuate_capitalize_infer.py \ - --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ - --output_text "${output_dir}/iwslt_inference_result.txt" \ - --max_seq_length 92 \ - --step 8 \ - --margin 16 \ - --pretrained_name punctuation_en_bert \ - --batch_size 32; - rm -rf "${output_dir}" # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed L2_Pretraining_BERT_pretraining_from_Text: @@ -1947,23 +1191,6 @@ jobs: #rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed - # L2: Entity Linking - L2_Entity_Linking_Self_Alignment_Pretraining_BERT: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/entity_linking && \ - python self_alignment_pretraining.py \ - project_dir=. 
\ - trainer.val_check_interval=3 \ - model.raw_data=None \ - model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \ - model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \ - model.train_ds.batch_size=8 \ - model.validation_ds.batch_size=8 \ - exp_manager.exp_dir=null # TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 # is in the release container @@ -2581,211 +1808,250 @@ jobs: L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python 
examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + 
trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + 
model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings L2_Megatron_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - 
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python 
examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + 
model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - 
model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + 
container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + 
trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_RETRO_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] @@ -3086,168 +2352,189 @@ jobs: L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - 
model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - 
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - 
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # trainer.devices=2 \ - # trainer.accelerator=gpu \ - # trainer.log_every_n_steps=1 \ - # trainer.val_check_interval=2 \ - # trainer.limit_val_batches=1 \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=6 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # exp_manager.resume_if_exists=True \ - # model.tensor_model_parallel_size=2 \ - # model.optim.name=fused_adam \ - # model.optim.lr=2e-4 \ - # model.optim.sched.warmup_steps=2 \ - # model.optim.sched.constant_steps=2 \ - # model.optim.sched.min_lr=8e-5 \ - # model.max_position_embeddings=128 \ - # model.encoder_seq_length=128 \ - # model.data.seq_length=128 \ - # model.position_embedding_type=rope \ - # model.rotary_percentage=0.5 \ - # model.normalization=rmsnorm \ - # model.bias=False \ - # model.bias_activation_fusion=False \ - # model.bias_dropout_add_fusion=False \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # model.num_layers=8 \ - # model.hidden_size=256 \ - # model.num_attention_heads=8 \ - # 
model.activations_checkpoint_method=block \ - # model.activations_checkpoint_granularity=full \ - # model.activations_checkpoint_num_layers=1 \ - # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + 
model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf 
examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + + L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=rope \ + model.rotary_percentage=0.5 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + # trainer.devices=2 \ + # trainer.accelerator=gpu \ + # trainer.log_every_n_steps=1 \ + # trainer.val_check_interval=2 \ + # trainer.limit_val_batches=1 \ + # trainer.accumulate_grad_batches=1 \ + # trainer.max_steps=6 \ + # trainer.gradient_clip_val=1.0 \ + # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + # exp_manager.resume_if_exists=True \ + # model.tensor_model_parallel_size=2 \ + # model.optim.name=fused_adam \ + # model.optim.lr=2e-4 \ + # model.optim.sched.warmup_steps=2 \ + # model.optim.sched.constant_steps=2 \ + # model.optim.sched.min_lr=8e-5 \ + # model.max_position_embeddings=128 \ + # model.encoder_seq_length=128 \ + # model.data.seq_length=128 \ + # model.position_embedding_type=rope \ + # model.rotary_percentage=0.5 \ + # model.normalization=rmsnorm \ + # model.bias=False \ + # model.bias_activation_fusion=False \ + # model.bias_dropout_add_fusion=False \ + # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + # model.num_layers=8 \ + # model.hidden_size=256 \ + # model.num_attention_heads=8 \ + # model.activations_checkpoint_method=block \ + # model.activations_checkpoint_granularity=full \ + # model.activations_checkpoint_num_layers=1 \ + # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + # 
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" # This test requires Ampere but some of the test GPUs are Volta # Need to add a check for compute capability before uncommenting this test @@ -3343,169 +2630,192 @@ jobs: L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # not testing resume functionality to save time on ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=alibi \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf 
examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=alibi \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + 
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # not testing resume functionality to save time on ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=alibi \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" 
L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - 
#trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=kerple \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python 
examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=kerple \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.precision=16 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + 
#exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=kerple \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] @@ -3663,36 +2973,50 @@ jobs: L2_Megatron_GPT_Finetuning_StarCoder_PP1: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.precision=32 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - 
+trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.peft.peft_scheme=none \ - model.optim.name=distributed_fused_adam \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0] - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_sft_results - + runs-on: self-hosted-azure-gpus-1 + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.precision=bf16 \ + trainer.max_steps=4 \ + trainer.val_check_interval=4 \ + trainer.enable_checkpointing=False \ + +trainer.limit_val_batches=2 \ + +trainer.limit_test_batches=2 \ + exp_manager.checkpoint_callback_params.save_best_model=False \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.peft.peft_scheme=none \ + model.optim.name=distributed_fused_adam \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ + 
model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.num_workers=0 \ + model.data.train_ds.concat_sampling_probabilities=[1.0] + + rm -rf examples/nlp/language_modeling/gpt_sft_results + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4545,75 +3869,7 @@ jobs: AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/bart_pretrain_results - # L2: Megatron T5 GLUE/XNLI Finetuning - # TODO(Oktai15): update it in 1.8.0 version - L2_Megatron_T5_GLUE_RTE: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - 
model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=rte \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_glue_results - - L2_Megatron_T5_GLUE_XNLI: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - -cn megatron_t5_config_finetune_glue_xnli \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.test_ds.global_batch_size=2 \ - model.data.test_ds.micro_batch_size=2 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=xnli \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ - model.data.test_ds.task_name=xnli \ - model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_xnli_results - + L2_Megatron_T5_PEFT_Lora_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4941,23 +4197,7 @@ jobs: - 
L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference - - L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA - - L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel - - L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier - - L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel - - L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel - - L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel - - L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel - L2_Duplex_Text_Normalization_with_Tarred_dataset - - L2_BERT_Text_Classification_with_BERT_Test - - L2_Parallel_BERT_Question-Answering_SQUAD_v1_1 - - L2_Parallel_BERT_Question-Answering_SQUAD_v2_0 - - L2_Parallel_BART_Question-Answering_SQUAD_v1_1 - - L2_Parallel_BART_Question-Answering_SQUAD_v2_0 - - L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1 - - L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0 - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test @@ -4965,13 +4205,8 @@ jobs: - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1 - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation - - L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data - - 
Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset - - Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir - - Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text - L2_Pretraining_BERT_pretraining_from_Text - L2_Pretraining_BERT_from_Preprocessed - - L2_Entity_Linking_Self_Alignment_Pretraining_BERT - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN - L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation @@ -5013,8 +4248,6 @@ jobs: - L2_Megatron_T5_Eval - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2 - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2 - - L2_Megatron_T5_GLUE_RTE - - L2_Megatron_T5_GLUE_XNLI - L2_Megatron_T5_PEFT_Lora_TP2 - L2_Megatron_Mock_Data_Generation_MockGPTDataset - L2_Megatron_Mock_Data_Generation_MockT5Dataset diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index bc66ae717ebb3..4eef38e715d45 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -5,7 +5,7 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -41,7 +41,7 @@ exp_manager: model: # model parallelism - mcore_bert: False + mcore_bert: True micro_batch_size: 4 global_batch_size: 8 tensor_model_parallel_size: 1 @@ -85,7 +85,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 
grad_div_ar_fusion: False @@ -158,4 +158,4 @@ model: name: CosineAnnealing warmup_steps: 500 constant_steps: 50000 - min_lr: 2e-5 \ No newline at end of file + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ca0c3f74e4c83..1f63f7742ea06 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,7 +9,7 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: False + mcore_gpt: True # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -121,7 +121,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion diff --git a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py index 98d24802189e6..92c56a4c20df6 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py @@ -17,6 +17,7 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = 
['DialogueAssistantDataProcessor'] @@ -31,6 +32,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg): data_dir: path to data directory tokenizer: tokenizer object """ + # deprecation warning + deprecated_warning("DialogueAssistantDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -69,16 +73,15 @@ def open_file(self, filename): @staticmethod def get_continuous_slots(slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids): - """ Extract continuous spans of slot_ids - To accomodate slots with distinct labels for B-label1 and I-label1, + To accomodate slots with distinct labels for B-label1 and I-label1, slot_id = self.bio_slot_ids_to_unified_slot_ids[slot_id] is called to map them both to label1 - + Args: Slot: list of int representing slot of each word token - For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 + For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 Corresponds to "please set an alarm clock for my next meeting with the team at three pm next friday" Except for the empty_slot_id (54 in this case), we hope to extract the continuous spans of tokens, each containing a start position and an exclusive end position @@ -124,7 +127,7 @@ def map_bio_format_slots_to_unified_slots(slots): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the assistant dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. 
@@ -177,7 +180,11 @@ def get_dialog_examples(self, dataset_split: str): "labels": {"service": intent.split('_')[0], "intent": intent, "slots": slot_to_words}, "label_positions": { "slots": { - slot: {"start": position[0], "exclusive_end": position[1], "slot": slot,} + slot: { + "start": position[0], + "exclusive_end": position[1], + "slot": slot, + } for slot, position in slot_to_start_and_exclusive_end.items() } }, diff --git a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py index 2a4b21c705353..c41c1f5e04ca2 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py @@ -17,6 +17,7 @@ import random from nemo.collections.nlp.data.data_utils.data_preprocessing import DataProcessor +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueDataProcessor'] @@ -40,6 +41,9 @@ class DialogueDataProcessor(DataProcessor): """ def __init__(self): + # deprecation warning + deprecated_warning("DialogueDataProcessor") + raise NotImplementedError() def get_train_examples(self): @@ -58,8 +62,8 @@ def get_test_examples(self): def get_relevant_idxs(dataset_split, n_samples, dev_proportion): """ Obtain indexes for each dataset_split, when train and dev sets are not in separate files - - Args: + + Args: dataset_split: train, dev or test n_samples: total number of samples dev_proportion: value from 1 to 99 that represent proportion of data in dev set diff --git a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py index 5e58919b76522..56e99c4bcfe91 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py @@ -19,6 +19,7 @@ from 
nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueDesignDataProcessor'] @@ -34,6 +35,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueDesignDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -50,7 +54,7 @@ def open_csv(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set Test set contains the whole dataset (Dev + Train) as this dataset is small (~100) and primarily used in a zero shot setting diff --git a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py index 58814a8eee90d..67d58ff5d21ec 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py @@ -19,13 +19,13 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueMellonQADataProcessor'] class DialogueMellonQADataProcessor(DialogueDataProcessor): - """Data Processor for Mellon QA dialogues. 
- """ + """Data Processor for Mellon QA dialogues.""" def __init__(self, data_dir: str, tokenizer: object, cfg=None): """ @@ -35,6 +35,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueMellonQADataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -51,7 +54,7 @@ def open_csv(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the Mellon QA dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. @@ -82,7 +85,11 @@ def get_dialog_examples(self, dataset_split: str): input_example = { "utterance": utterance, "example_id": i, - "labels": {"response": answer, "fluent_response": well_formed_answer, "passage": passage,}, + "labels": { + "response": answer, + "fluent_response": well_formed_answer, + "passage": passage, + }, } example = DialogueInputExample(input_example) examples.append(example) diff --git a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py index 78f434c1d5dda..d09960a35d690 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py @@ -19,15 +19,16 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueMSMarcoDataProcessor'] class DialogueMSMarcoDataProcessor(DialogueDataProcessor): """Data Processor for MS Marco dialogues. 
(https://github.com/microsoft/MSMARCO-Question-Answering) - Please agree to the Terms of Use before downloading data at - https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz - https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz + Please agree to the Terms of Use before downloading data at + https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz + https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz """ def __init__(self, data_dir: str, tokenizer: object, cfg=None): @@ -39,6 +40,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): debug_mode: reduce number of samples to load in order to increase speed of processing cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueMSMarcoDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -55,7 +59,7 @@ def open_json(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the MS Marco dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. 
diff --git a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py index a78e1973e55fa..1d37c26f1c45e 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py @@ -28,6 +28,7 @@ from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample from nemo.collections.nlp.data.dialogue.sgd.schema import Schema from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning from nemo.utils.get_rank import is_global_rank_zero __all__ = ['DialogueSGDDataProcessor'] @@ -51,7 +52,7 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): # git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git ***Data format*** - SGD data comes with a JSON schema file and dialogue files for each dataset split. + SGD data comes with a JSON schema file and dialogue files for each dataset split. In the following we will show an example for a service entry in the schema file. * service_name @@ -70,7 +71,7 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): * result_slots (not used) - In the following we will show an example for a dialogue. + In the following we will show an example for a dialogue. 
* dialogue_id * services * turns @@ -87,14 +88,18 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): * state * active_intent * requeste_slots - * slot_values + * slot_values * speaker - [USER, SYSTEM] * utterance """ def __init__( - self, data_dir: str, dialogues_example_dir: str, tokenizer: object, cfg=None, + self, + data_dir: str, + dialogues_example_dir: str, + tokenizer: object, + cfg=None, ): """ Constructs DialogueSGDDataProcessor @@ -104,6 +109,9 @@ def __init__( tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueSGDDataProcessor") + self.data_dir = data_dir self.cfg = cfg @@ -213,7 +221,7 @@ def get_labels(self): def get_dialog_examples(self, dataset_split: str) -> List[object]: """ - Loads preprocessed dialogue examples from disk. + Loads preprocessed dialogue examples from disk. Args: dataset_split: dataset split Returns: @@ -260,7 +268,7 @@ def _generate_dialog_examples(self, dataset_split: str, schemas: object, subsamp Returns a list of `InputExample`s of the data splits' dialogues. Args: dataset_split: data split, can be "train", "dev", or "test". - schemas: schema for all services of all datasets + schemas: schema for all services of all datasets subsample: whether to balance postive and negative samples in the dataset Returns: examples: a list of `InputExample`s. 
@@ -447,9 +455,9 @@ def _create_examples_from_turn( "example_id_num": example_id_num, "utterance": user_utterance, "system_utterance": system_utterance, - "system_slots": {slot["slot"]: slot for slot in system_frame["slots"]} - if system_frame is not None - else None, + "system_slots": ( + {slot["slot"]: slot for slot in system_frame["slots"]} if system_frame is not None else None + ), "system_actions": system_frame["actions"] if system_frame is not None else None, "labels": { "service": service, @@ -464,9 +472,11 @@ def _create_examples_from_turn( for intent in schemas.get_service_schema(service).intents ], "slots": { - slot: schemas.get_service_schema(service).get_categorical_slot_values(slot) - if slot in categorical_slots - else [] + slot: ( + schemas.get_service_schema(service).get_categorical_slot_values(slot) + if slot in categorical_slots + else [] + ) for slot in all_possible_slots }, }, diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py index 0931fe383f943..33d46c308e810 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py @@ -21,12 +21,12 @@ from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueBERTDataset', 'DialogueIntentSlotInferenceDataset'] class DialogueBERTDataset(DialogueDataset): - """ Creates a dataset to use for the task of joint intent and slot classification with pretrained model. @@ -37,8 +37,7 @@ class DialogueBERTDataset(DialogueDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- """ + """Returns definitions of module output ports.""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), 'segment_ids': NeuralType(('B', 'T'), ChannelType()), @@ -57,6 +56,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: config container for dataset """ + # deprecation warning + deprecated_warning("DialogueBERTDataset") + self.cfg = cfg self.all_possible_labels = dialogues_processor.intents self.label_to_label_id = {self.all_possible_labels[i]: i for i in range(len(self.all_possible_labels))} @@ -183,7 +185,7 @@ def get_features( ignore_start_end=False, ): """ - Convert queries (utterance, intent label and slot labels) to BERT input format + Convert queries (utterance, intent label and slot labels) to BERT input format """ all_subtokens = [] @@ -297,7 +299,7 @@ class DialogueIntentSlotInferenceDataset(DialogueBERTDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: """ - Returns definitions of module output ports. + Returns definitions of module output ports. 
""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), @@ -308,6 +310,9 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: } def __init__(self, queries, max_seq_length, tokenizer, do_lower_case): + # deprecation warning + deprecated_warning("DialogueIntentSlotInferenceDataset") + if do_lower_case: queries = [query.lower() for query in queries] diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py index 1ac04a856a89c..f89a5013c2ae6 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py @@ -21,27 +21,31 @@ from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class DialogueGPTClassificationDataset(DialogueDataset): ''' Designed for classification tasks such as intent/domain classification as well as slot tagging - Dataset Class + Dataset Class 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) 2. This can allow the same model preprocessing for multiple datasources - 3. Users can configurate which labels to use for modelling + 3. Users can configurate which labels to use for modelling (e.g. 
intent classification, slot filling or both together etc) ''' def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor + """Constructor Args: dataset_split: dataset split dialogues_processor: Data generator for SGD dialogues tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueGPTClassificationDataset") + self.cfg = cfg if self.cfg.target_template == "with_slots" and self.cfg.eval_mode != "generation": @@ -229,19 +233,18 @@ def collate_fn(self, batch): return all_items def __getitem__(self, idx: int): - ''' State how the input and output samples look like This template can be changed - Training example: + Training example: e.g. service: restaurant e.g. service: restaurant e.g. \nintent: set alarm\nslots: (), () Generation example: - e.g. service: + e.g. service: ''' ex = self.features[idx].data diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py index 7de02d75c5744..8ddbc2e3925e4 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py @@ -18,12 +18,13 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning class DialogueGPTGenerationDataset(DialogueDataset): def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Designed for free form generation tasks such as Dialogue Response Generation + """Constructor + Designed for free form generation tasks such as Dialogue Response Generation Args: dataset_split: dataset split @@ -31,6 +32,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: cfg container for dataset """ + # 
deprecation warning + deprecated_warning("DialogueGPTGenerationDataset") + self.cfg = cfg self.input_label_type = self.cfg.input_field self.output_label_type = self.cfg.output_field @@ -80,7 +84,7 @@ def format_prompt(self, ex): ''' Formats training prompt based on self.input_field_type - Training example: + Training example: e.g. response: # input_label_type = response e.g. utterance: # input_label_type = utterance e.g. passage: utterance: # input_label_type = passage+utterance @@ -91,7 +95,6 @@ def format_prompt(self, ex): return input_sentence def __getitem__(self, idx: int): - ''' For each example, this function determines the format of input and output sequences based on user-specified conguration. This is controlled by model.dataset.input_field and model.dataset.output_field @@ -99,9 +102,9 @@ def __getitem__(self, idx: int): If model.dataset.input_field == response and model.dataset.output_field == fluent_response: Input = "response: " and output = "response: fluent_response: " (with loss calculated from only) If model.dataset.input_field == utterance and model.dataset.output_field == response: - Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) + Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) If model.dataset.input_field == passage+utterance and model.dataset.output_field == response: - Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) + Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) ''' ex = self.features[idx].data diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py index 8618f2f8c7b4b..dc123ca0e3d73 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py +++ 
b/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py @@ -17,6 +17,7 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueNearestNeighbourDataset'] @@ -33,6 +34,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c dialogues_processor: Data generator for dialogues tokenizer: tokenizer to split text into sub-word tokens """ + # deprecation warning + deprecated_warning("DialogueNearestNeighbourDataset") + self.cfg = cfg self.tokenizer = tokenizer self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py index 78fda55edd2ea..df522b74e8614 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py @@ -16,12 +16,13 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning class DialogueS2SGenerationDataset(DialogueDataset): def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Designed for free form generation tasks such as Dialogue Response Generation + """Constructor + Designed for free form generation tasks such as Dialogue Response Generation Args: dataset_split: dataset split @@ -29,6 +30,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueS2SGenerationDataset") + self.cfg = cfg self.input_label_type = self.cfg.input_field self.output_label_type = self.cfg.output_field @@ -45,7 +49,7 @@ def 
__init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c @staticmethod def format_actions(prompt_template, actions): """ - Formats actions based on prompt_template + Formats actions based on prompt_template Args: prompt_template: determines whether acts, slot-names, slot-values are necessary in formatted actions @@ -118,7 +122,7 @@ def format_prompt(self, ex): ''' Formats training prompt based on self.input_field_type - Training example: + Training example: e.g. response: # input_label_type = response e.g. utterance: # input_label_type = utterance e.g. passage: utterance: # input_label_type = passage+utterance @@ -128,13 +132,12 @@ def format_prompt(self, ex): return input_sentence def __getitem__(self, idx: int): - ''' State how the input and output samples look like This template can be changed - Training example: + Training example: e.g. INPUT - "response: " OUTPUT - "" # input_label_type = response, output_label_type = fluent_response e.g. INPUT - "utterance: " OUTPUT - "" # input_label_type = utterance, output_label_type = response e.g. 
INPUT - "passage: utterance: " OUTPUT - "" # input_label_type = passage+utterance, output_label_type = response diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py index f2a0f58bcfac2..c1308238bea14 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py @@ -23,6 +23,7 @@ from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueZeroShotIntentDataset'] @@ -36,8 +37,7 @@ class DialogueZeroShotIntentDataset(GLUEDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- """ + """Returns definitions of module output ports.""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), 'segment_ids': NeuralType(('B', 'T'), ChannelType()), @@ -55,6 +55,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c num_classes: number of classes in the data (should be either 2 or 3, corresponding to labels ['entailment', 'not_entailment'] or ["contradiction", "entailment", "neutral"]) """ + # deprecation warning + deprecated_warning("DialogueZeroShotIntentDataset") + self.cfg = cfg self.tokenizer = tokenizer if self.cfg.num_classes not in [2, 3]: @@ -69,9 +72,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c 'eos_token': tokenizer.eos_token, 'pad_token': tokenizer.pad_token, 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token - if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() - else None, + 'sep_token_extra': ( + tokenizer.eos_token if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() else None + ), } self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) @@ -128,9 +131,9 @@ def convert_examples_to_features( * True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] The `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - + The convention in BERT is: - + a. For sequence pairs: * tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] * type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 @@ -148,9 +151,9 @@ def convert_examples_to_features( For classification tasks, the first vector (corresponding to [CLS]) is used as as the "sentence vector". Note that this only makes sense because the entire model is fine-tuned. - + The convention for NMT is: - + a. For sequence pairs: * tokens: is this jack ##ville ? no it is not . 
* type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 diff --git a/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py index 5d985466ff6cb..bbd14f47a6514 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py @@ -17,6 +17,7 @@ from nemo.collections.nlp.modules.common import VirtualPromptSource from nemo.core import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['BasePromptLearningDataset'] @@ -41,6 +42,9 @@ def __init__( add_eos: bool = True, for_train: bool = True, ): + # deprecation warning + deprecated_warning("BasePromptLearningDataset") + self.tokenizer = tokenizer self.virtual_prompt_source = virtual_prompt_source self.task_templates = task_templates @@ -72,7 +76,7 @@ def __init__( raise ValueError("Datasets must be a list of dicts or a list of filepath strings") def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): - """ Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers """ + """Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers""" total_inserted_tokens = 0 for idx in range(len(virtual_token_splits)): @@ -85,7 +89,7 @@ def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits return input_example def _truncate_input(self, truncation_field, input_ids, taskname, doc, total_virtual_tokens=0): - """ Try to truncate input text to fit into the max sequence length """ + """Try to truncate input text to fit into the max sequence length""" logging.info( f"Input greater than max sequence length. 
Attempting to truncate: '{truncation_field}' in task: '{taskname}'" ) @@ -115,7 +119,7 @@ def _truncate_input(self, truncation_field, input_ids, taskname, doc, total_virt return input_ids def _add_leading_space(self, taskname, field_name, field_text): - """ Add leading space to text if there is a space before it in the template """ + """Add leading space to text if there is a space before it in the template""" prompt_template = self.task_templates[taskname]["prompt_template"] field_text_start = prompt_template.find("{" + field_name + "}") if field_text_start != 0 and prompt_template[field_text_start - 1] == " ": @@ -187,11 +191,11 @@ def pad_taskname_ids(self, taskname_ids): def find_subsequence_location(sequence, subsequence): - """ Finds the start and end index of the first occurance - of a given subsequence within a larger list. Returns - the two indices corresponding to the postition of - the first and last token of the subseqeunce. - Assumes subsequence is known to be in sequence. + """Finds the start and end index of the first occurance + of a given subsequence within a larger list. Returns + the two indices corresponding to the postition of + the first and last token of the subseqeunce. + Assumes subsequence is known to be in sequence. 
""" assert len(sequence) >= len(subsequence), "subsequence too long" diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py index 4b1b4f61d4391..11795bd150f11 100755 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py @@ -23,6 +23,7 @@ from nemo.collections.nlp.modules.common.megatron.utils import build_position_ids from nemo.core import Dataset from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning __all__ = ['GPTPromptLearningDataset'] @@ -30,7 +31,7 @@ class GPTPromptLearningDataset(Dataset): """ The dataset class for prompt-tuning or p-tuning pretrained GPT models. - + Args: data (list[strings], list[dicts]): (1) paths to .jsonl or .json files, (2) dict objects corresponding to each input example tokenizer (tokenizer): Tokenizer from frozen language model @@ -39,7 +40,7 @@ class GPTPromptLearningDataset(Dataset): pseudo_tokens (list[strings]): A list of virtual prompt token placeholders e.g [, , ...] up to max num virtual tokens pad_token_id (int): ID of pad token from tokenizer max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. + min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. 
add_bos (bool): Whether to add a beginning of sentence token to each data example add_eos (bool): Whether to add an end of sentence token to each data example for_train (bool): Whether you're creating a dataset for training or inference @@ -63,6 +64,9 @@ def __init__( cache_data_path: str = None, # the cache file load_cache: bool = True, # whether to load from the cache if it is available ): + # deprecation warning + deprecated_warning("GPTPromptLearningDataset") + self.tokenizer = tokenizer self.virtual_prompt_source = virtual_prompt_source self.task_templates = task_templates @@ -112,9 +116,9 @@ def __init__( def load_data(self, dataset): """ Loads a dataset by filling in the task templates specified in the config file - with the information from each training/inference example. Converts all input - text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in - the task templates with the actual virtual prompt token ids. + with the information from each training/inference example. Converts all input + text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in + the task templates with the actual virtual prompt token ids. 
params: dataset: A list of json objects or a dictionary objects each @@ -241,7 +245,7 @@ def _input_sanity_checks( assert prompt_template[placeholder_start:] == answer_placeholder, "Answer field must be at prompt end" def _insert_text_in_template(self, input_example, prompt_template_fields, doc): - """ Format the input example according to the template """ + """Format the input example according to the template""" for field in prompt_template_fields: if field in doc.keys(): field_text = doc[field] @@ -255,7 +259,7 @@ def _insert_text_in_template(self, input_example, prompt_template_fields, doc): return input_example.strip(" ") def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): - """ Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers """ + """Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers""" total_inserted_tokens = 0 for idx in range(len(virtual_token_splits)): @@ -270,7 +274,7 @@ def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits def _truncate_input( self, truncation_field, input_ids, taskname, doc, prompt_template, prompt_template_fields, virtual_token_splits ): - """ Try to truncate input text to fit into the max sequence length """ + """Try to truncate input text to fit into the max sequence length""" logging.info( f"Input greater than max sequence length. Attempting to truncate: '{truncation_field}' in task: '{taskname}'" ) @@ -297,8 +301,8 @@ def _truncate_input( return input_ids def _find_answer_start(self, taskname, input_ids, answer_field, doc): - """ Find the token ids corresponding to the answer start, for loss masking purposes. - Assumes the answer is always at the end of the prompt. + """Find the token ids corresponding to the answer start, for loss masking purposes. + Assumes the answer is always at the end of the prompt. 
""" answer_text = doc[answer_field] answer_text = self._add_leading_space(taskname, answer_field, answer_text) @@ -313,7 +317,7 @@ def _find_answer_start(self, taskname, input_ids, answer_field, doc): return answer_start_idx def _add_leading_space(self, taskname, field_name, field_text): - """ Add leading space to text if there is a space before it in the template """ + """Add leading space to text if there is a space before it in the template""" prompt_template = self.task_templates[taskname]["prompt_template"] field_text_start = prompt_template.find("{" + field_name + "}") if field_text_start != 0 and prompt_template[field_text_start - 1] == " ": @@ -331,7 +335,7 @@ def _ceil_to_nearest(self, n, m): return (n + m - 1) // m * m def collate_fn(self, batch, tp_workers=0): - """ Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch """ + """Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch""" taskname_ids, input_ids, answer_starts = zip(*batch) # Pad taskname_ids to be the same length for the prompt encoder @@ -380,7 +384,7 @@ def collate_fn(self, batch, tp_workers=0): return input_ids, labels, loss_mask, position_ids, attention_mask, taskname_ids def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts): - """ Pad input_ids in batch to max batch length while building loss mask """ + """Pad input_ids in batch to max batch length while building loss mask""" batch_loss_masks = [] padded_input_ids = [] for ids, answer_start_idx in zip(input_ids, answer_starts): @@ -410,7 +414,7 @@ def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts): def inference_collate_fn(self, batch): """ - Used for loading inference data. + Used for loading inference data. 
""" task_id_nums, input_ids, answer_starts = zip(*batch) input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids]) diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py index 4070098b5e673..87174b69ffc2a 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py @@ -22,10 +22,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_bert_input_example import BERTQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BERTQADataset(QADataset): - """ Creates a Dataset for BERT architecture based Exractive QA """ + """Creates a Dataset for BERT architecture based Exractive QA""" def __init__( self, @@ -41,6 +42,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("BERTQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -92,7 +96,7 @@ def __init__( self.features[i] = BERTQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -110,7 +114,7 @@ def _set_cached_features_filename(self): ) def _convert_examples_to_features(self): - """ Converts loaded examples to features """ + """Converts loaded examples to features""" logging.info(f"Preprocessing data into features.") @@ -161,7 +165,7 @@ def _convert_examples_to_features(self): example.doc_tokens = doc_tokens # the text to tokens step is the slowest step - for (i, token) in 
enumerate(doc_tokens): + for i, token in enumerate(doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) if token not in text_to_tokens_dict: text_to_tokens_dict[token] = self.tokenizer.text_to_tokens(token) @@ -199,7 +203,7 @@ def _convert_examples_to_features(self): # make compatible for hashing doc_spans = tuple(doc_spans) - for (doc_span_index, doc_span) in enumerate(doc_spans): + for doc_span_index, doc_span in enumerate(doc_spans): tokens = [self.tokenizer.cls_token] + query_tokens + [self.tokenizer.sep_token] segment_ids = [0 for i in range(len(tokens))] diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py index 783b2dd33f313..553f5984952ca 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py @@ -28,14 +28,24 @@ ) from nemo.core.classes import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class QADataset(Dataset): - ''' Abstract base class for QA Datasets with common utility methods ''' + '''Abstract base class for QA Datasets with common utility methods''' def __init__( - self, data_file: str, processor: object, tokenizer: object, mode: str, num_samples: int, **kwargs, + self, + data_file: str, + processor: object, + tokenizer: object, + mode: str, + num_samples: int, + **kwargs, ): + # deprecation warning + deprecated_warning("QADataset") + self.mode = mode self.data_file = data_file self.processor = processor @@ -100,7 +110,7 @@ def get_best_span_index(doc_spans, position): best_score = None best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): + for span_index, doc_span in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue @@ -150,7 +160,7 @@ def get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride): all_doc_tokens: list of all 
tokens in document max_tokens_for_doc: maximum number of tokens in each doc span doc_stride: stride size which sliding window moves with - + Returns: doc_spans: all possible doc_spans from document """ @@ -179,7 +189,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ doc_span tok_start_position: start position of answer in document tok_end_position: end position of answer in document - + Returns: average distance of doc_span to answer """ @@ -193,7 +203,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ @staticmethod def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode): """ - Filters out doc_spans, which might not be relevant to answering question, + Filters out doc_spans, which might not be relevant to answering question, which can be helpful when document is extremely long leading to many doc_spans with no answers Args: @@ -204,7 +214,7 @@ def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode all: do not filter only_positive: only keep doc_spans containing the answer limited_negative: only keep 10 doc_spans that are nearest to answer - + Returns: doc_spans: doc_spans after filtering """ @@ -282,9 +292,13 @@ def get_doc_tokens_and_offset_from_context_id( @staticmethod def improve_answer_span( - doc_tokens: List[str], input_start: int, input_end: int, tokenizer: object, orig_answer_text: str, + doc_tokens: List[str], + input_start: int, + input_end: int, + tokenizer: object, + orig_answer_text: str, ): - """ Returns tokenized answer spans that better match the annotated answer """ + """Returns tokenized answer spans that better match the annotated answer""" tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text)) diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py index d6484b33e202d..1eeb312a62a92 100644 --- 
a/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py @@ -24,10 +24,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_gpt_input_example import GPTQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class GPTQADataset(QADataset): - """ Creates a Dataset for GPT architecture based Generative QA """ + """Creates a Dataset for GPT architecture based Generative QA""" def __init__( self, @@ -44,6 +45,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("GPTQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -76,7 +80,7 @@ def __init__( self.features[i] = GPTQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -120,7 +124,11 @@ def _convert_examples_to_features(self): formatted_query, query_tokens_length = self._prep_query(query_prefix, example) formatted_answer, answer_tokens_length = self._prep_answer(example) context_tokens, context_spans = self._prep_context( - example, query_tokens_length, answer_tokens_length, context_prefix_tokens, answer_prefix_tokens, + example, + query_tokens_length, + answer_tokens_length, + context_prefix_tokens, + answer_prefix_tokens, ) unique_id = self._encode_all_context_spans( @@ -170,7 +178,12 @@ def _prep_answer(self, example): return self._get_truncated_sentence_and_len(target, self.max_answer_length) def _prep_context( - self, example, query_tokens_length, answer_tokens_length, context_prefix_tokens, 
answer_prefix_tokens, + self, + example, + query_tokens_length, + answer_tokens_length, + context_prefix_tokens, + answer_prefix_tokens, ): """ Calculates the maximum possible length for a given context given a question diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py index 1f9a8ef615a9a..c65c8a43c4404 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py @@ -23,10 +23,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_s2s_input_example import S2SQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class S2SQADataset(QADataset): - """ Creates a Dataset for T5/BART architecture based Generative QA """ + """Creates a Dataset for T5/BART architecture based Generative QA""" def __init__( self, @@ -43,6 +44,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("S2SQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -75,7 +79,7 @@ def __init__( self.features[i] = S2SQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -117,7 +121,12 @@ def _convert_examples_to_features(self): context_tokens, context_spans = self._prep_context(example, query_tokens, context_prefix_tokens) unique_id = self._encode_all_context_spans( - unique_id, context_spans, context_tokens, formatted_query, example, example_index, + unique_id, + context_spans, + 
context_tokens, + formatted_query, + example, + example_index, ) # delete self.examples during training mode to save memory @@ -155,7 +164,13 @@ def _prep_context(self, example, query_tokens, context_prefix_tokens): return context_tokens, context_spans def _encode_all_context_spans( - self, unique_id, context_spans, context_tokens, formatted_query, example, example_index, + self, + unique_id, + context_spans, + context_tokens, + formatted_query, + example, + example_index, ): """ Fromats all spans extracted from a single context as: @@ -173,7 +188,11 @@ def _encode_all_context_spans( # encode input encoded_input_dict = self.tokenizer.tokenizer( - source, truncation=True, max_length=self.max_seq_length, padding="max_length", return_tensors="pt", + source, + truncation=True, + max_length=self.max_seq_length, + padding="max_length", + return_tensors="pt", ) input_ids = torch.squeeze(encoded_input_dict["input_ids"]) input_attn_mask = torch.squeeze(encoded_input_dict["attention_mask"]) @@ -223,7 +242,11 @@ def _encode_answer(self, example, context_span_text): target = example.answer_text encoded_output_dict = self.tokenizer.tokenizer( - target, truncation=True, max_length=self.max_answer_length, padding="max_length", return_tensors="pt", + target, + truncation=True, + max_length=self.max_answer_length, + padding="max_length", + return_tensors="pt", ) labels = torch.squeeze(encoded_output_dict["input_ids"]) labels[labels == self.tokenizer.tokenizer.pad_token_id] = -100 diff --git a/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py b/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py index ee1a0957dbbb8..2abe9b7c0aaa6 100644 --- a/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py +++ b/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py @@ -46,6 +46,7 @@ ) from nemo.core.classes import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['SquadDataset', 
'InputFeatures', '_check_is_max_context'] @@ -114,7 +115,7 @@ def get_best_span_index(doc_spans, position): """ best_score = None best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): + for span_index, doc_span in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue @@ -165,6 +166,9 @@ def __init__( mode: str, use_cache: bool, ): + # deprecation warning + deprecated_warning("SquadDataset") + self.tokenizer = tokenizer self.version_2_with_negative = version_2_with_negative self.processor = SquadProcessor(data_file=data_file, mode=mode) @@ -337,7 +341,7 @@ def get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride): all_doc_tokens: list of all tokens in document max_tokens_for_doc: maximum number of tokens in each doc span doc_stride: stride size which sliding window moves with - + Returns: doc_spans: all possible doc_spans from document """ @@ -375,7 +379,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ doc_span tok_start_position: start position of answer in document tok_end_position: end position of answer in document - + Returns: average distance of doc_span to answer """ @@ -387,7 +391,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ @staticmethod def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode): """ - Filters out doc_spans, which might not be relevant to answering question, + Filters out doc_spans, which might not be relevant to answering question, which can be helpful when document is extremely long leading to many doc_spans with no answers Args: @@ -398,7 +402,7 @@ def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode all: do not filter only_positive: only keep doc_spans containing the answer limited_negative: only keep 10 doc_spans that are nearest to answer - + Returns: doc_spans: doc_spans after filtering """ @@ -481,7 +485,7 @@ def 
convert_examples_to_features( if self.mode != TRAINING_MODE: example.doc_tokens = doc_tokens # the text to tokens step is the slowest step - for (i, token) in enumerate(doc_tokens): + for i, token in enumerate(doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) if token not in text_to_tokens_dict: text_to_tokens_dict[token] = tokenizer.text_to_tokens(token) @@ -521,7 +525,7 @@ def convert_examples_to_features( # make compatible for hashing doc_spans = tuple(doc_spans) - for (doc_span_index, doc_span) in enumerate(doc_spans): + for doc_span_index, doc_span in enumerate(doc_spans): tokens = [tokenizer.cls_token] + query_tokens + [tokenizer.sep_token] segment_ids = [0 for i in range(len(tokens))] @@ -681,7 +685,7 @@ def get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(self.examples): + for example_index, example in enumerate(self.examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -706,7 +710,7 @@ def get_predictions( null_start_logit = 0 # end logit at the slice with min null score null_end_logit = 0 - for (feature_index, feature) in enumerate(features): + for feature_index, feature in enumerate(features): pos = unique_id_to_pos[feature.unique_id] start_indexes = get_best_indexes(start_logits[pos], n_best_size) end_indexes = get_best_indexes(end_logits[pos], n_best_size) @@ -825,7 +829,7 @@ def get_predictions( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["question"] = example.question_text output["text"] = entry.text diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py index 803d0eaf8aed2..c98abb300c64d 100644 --- 
a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py +++ b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py @@ -20,6 +20,8 @@ from transformers import PreTrainedTokenizerBase +from nemo.utils.decorators import deprecated_warning + """Build BERT Examples from asr hypothesis, customization candidates, target labels, span info. """ @@ -52,7 +54,7 @@ def __init__( input_ids: indices of single characters (treated as subwords) input_mask: list of bools with 0s in place of input_ids to be masked segment_ids: list of ints from 0 to 10 to denote the text segment type ( - 0 - for tokens of ASR hypothesis, + 0 - for tokens of ASR hypothesis, 1 - for tokens of the first candidate ... 10 - for tokens of the tenth candidate @@ -60,7 +62,7 @@ def __init__( input_ids_for_subwords: indices of real subwords (as tokenized by bert tokenizer) input_mask_for_subwords: list of bools with 0s in place of input_ids_for_subwords to be masked segment_ids_for_subwords: same as segment_ids but for input_ids_for_subwords - character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) + character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) fragment_indices: list of tuples (start_position, end_position, candidate_id), end is exclusive, candidate_id can be -1 if not set labels_mask: bool tensor with 0s in place of label tokens to be masked labels: indices of semiotic classes which should be predicted from each of the @@ -68,6 +70,9 @@ def __init__( spans: list of tuples (class_id, start_position, end_position), end is exclusive, class is always 1(CUSTOM) default_label: The default label """ + # deprecation warning + deprecated_warning("BertExample") + input_len = len(input_ids) if not ( input_len == len(input_mask) @@ -123,6 +128,9 @@ def __init__( tokenizer: Tokenizer object. 
max_seq_length: Maximum sequence length. """ + # deprecation warning + deprecated_warning("BertExampleBuilder") + self._label_map = label_map self._semiotic_classes = semiotic_classes self._tokenizer = tokenizer @@ -183,9 +191,15 @@ def build_bert_example( tags[start:end] = [t for i in range(end - start)] # get input features for characters - (input_ids, input_mask, segment_ids, labels_mask, labels, _, _,) = self._get_input_features( - hyp=hyp, ref=ref, tags=tags - ) + ( + input_ids, + input_mask, + segment_ids, + labels_mask, + labels, + _, + _, + ) = self._get_input_features(hyp=hyp, ref=ref, tags=tags) # get input features for words hyp_with_words = hyp.replace(" ", "").replace("_", " ") @@ -243,11 +257,11 @@ def build_bert_example( return example def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: - """ Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of starting and ending(exclusive) tokens in input_ids of BertExample - - Example: - span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] - result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] + """Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of starting and ending(exclusive) tokens in input_ids of BertExample + + Example: + span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] + result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] """ result_spans = [] @@ -267,26 +281,26 @@ def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: def _get_fragment_indices( self, hyp: str, targets: List[int], span_info_parts: List[str] ) -> Tuple[List[Tuple[int, int, int]]]: - """ Build fragment indices for real candidates. - This is used only at inference. - After external candidate retrieval we know approximately, where the candidate is located in the text (from the positions of matched n-grams). 
- In this function we - 1) adjust start/end positions to match word borders (possibly in multiple ways). - 2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment). - - Args: - hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore). - targets: list of candidate ids (only for real candidates, not dummy) - span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text. - Returns: - List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id. - Note that returned fragments can be unsorted and can overlap, it's ok. - Example: - hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - targets: [1 2 3 4 6 7 9] - span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this functuion. - fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)] - """ + """Build fragment indices for real candidates. + This is used only at inference. + After external candidate retrieval we know approximately, where the candidate is located in the text (from the positions of matched n-grams). + In this function we + 1) adjust start/end positions to match word borders (possibly in multiple ways). + 2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment). + + Args: + hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore). + targets: list of candidate ids (only for real candidates, not dummy) + span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text. 
+ Returns: + List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id. + Note that returned fragments can be unsorted and can overlap, it's ok. + Example: + hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" + targets: [1 2 3 4 6 7 9] + span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this functuion. + fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)] + """ fragment_indices = [] @@ -337,18 +351,18 @@ def _get_fragment_indices( return fragment_indices def _map_characters_to_subwords(self, input_ids: List[int], input_ids_for_subwords: List[int]) -> List[int]: - """ Maps each single character to the position of its corresponding subword. - - Args: - input_ids: List of character token ids. - input_ids_for_subwords: List of subword token ids. - Returns: - List of subword positions in input_ids_for_subwords. Its length is equal to len(input_ids) - - Example: - input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102] - input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102] - result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... , 45, 46, 46, 46, 46, 46, 47] + """Maps each single character to the position of its corresponding subword. + + Args: + input_ids: List of character token ids. + input_ids_for_subwords: List of subword token ids. + Returns: + List of subword positions in input_ids_for_subwords. 
Its length is equal to len(input_ids) + + Example: + input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102] + input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102] + result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... , 45, 46, 46, 46, 46, 46, 47] """ character_pos_to_subword_pos = [0 for _ in input_ids] @@ -453,7 +467,7 @@ def _get_input_features( ref: "didier saumon;astronomie;tristan guillot;tristesse;monade;christian;astronomer;solomon;dididididi;mercy" tags: None (not used for word-based case) - resulting token sequence: + resulting token sequence: '[CLS]', 'astronomers', 'did', '##ie', 'so', '##mon', 'and', 'tri', '##sti', '##an', 'g', '##llo', '[SEP]', 'did', '##ier', 'sa', '##um', '##on', '[SEP]', 'astro', '##no', '##mie', '[SEP]', 'tristan', 'gui', '##llo', '##t', '[SEP]', ..., '[SEP]', 'mercy', '[SEP]'] """ @@ -542,9 +556,9 @@ def read_input_file( infer: If true, input examples do not contain target info. Returns: - examples: List of converted examples (BertExample). + examples: List of converted examples (BertExample). 
or - (examples, hyps_refs): If infer==true, returns h + (examples, hyps_refs): If infer==true, returns h """ if not path.exists(input_filename): diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py index 7737bfa67f00c..07ca790866c7c 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py @@ -45,14 +45,19 @@ from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueGPTClassificationModel'] class DialogueGPTClassificationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueGPTClassificationModel") self.cfg = cfg self.eval_mode = cfg.dataset.eval_mode @@ -101,14 +106,14 @@ def __init__( def setup_optimizer_param_groups(self): """ - ModelPT override for prompt learning. - Optimizer will get self._optimizer_param_groups. + ModelPT override for prompt learning. + Optimizer will get self._optimizer_param_groups. Makes two optimizer param groups, one for the frozen model params - and one for the prompt-table/prompt-encoder params. The learning + and one for the prompt-table/prompt-encoder params. The learning rate for the frozen model's params will always be zero effectively freezing the model's params but still allowing for the needed gradients - to be passed around in pipeline parallel models. The prompt-encoder - and/or prompt table will use the learning rate set by the user. + to be passed around in pipeline parallel models. The prompt-encoder + and/or prompt table will use the learning rate set by the user. 
""" if not self.prompt_learning: super().setup_optimizer_param_groups() @@ -328,7 +333,10 @@ def forward(self, input_ids, attention_mask, labels, inference=True): len(self.language_model.pseudo_token_ids) if hasattr(self.language_model, 'pseudo_token_ids') else 0 ) position_ids = torch.arange( - start=0, end=num_prompt_tokens + input_ids.size(1), dtype=torch.long, device=input_ids.device, + start=0, + end=num_prompt_tokens + input_ids.size(1), + dtype=torch.long, + device=input_ids.device, ) prompt_ids = self.get_virtual_prompt_ids_for_megatron_gpt(input_ids) @@ -708,7 +716,9 @@ def prepare_data(self): ) elif self._cfg.dataset.task == 'design': self.dialogues_processor = DialogueDesignDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset, + data_dir=self._cfg.dataset.data_dir, + tokenizer=self.tokenizer, + cfg=self._cfg.dataset, ) else: raise ValueError("Only sgd, assistant, zero_shot, design supported for Dialogue GPT Classification Model") diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py index 602c15a50c761..116605b65d528 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py @@ -35,6 +35,7 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueGPTGenerationModel'] @@ -43,8 +44,12 @@ class DialogueGPTGenerationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueGPTGenerationModel") self.cfg = cfg self.data_prepared = False @@ -108,7 +113,10 @@ def eval_epoch_end(self, outputs, mode='val'): ) 
DialogueGenerationMetrics.save_predictions( - filename, generated_field, ground_truth_field, inputs, + filename, + generated_field, + ground_truth_field, + inputs, ) label_acc = np.mean([int(generated_field[i] == ground_truth_field[i]) for i in range(len(generated_field))]) @@ -155,7 +163,10 @@ def forward(self, input_ids, attention_mask, labels, inference=True): ) position_ids = torch.arange( - start=0, end=num_prompt_tokens + input_ids.size(1), dtype=torch.long, device=input_ids.device, + start=0, + end=num_prompt_tokens + input_ids.size(1), + dtype=torch.long, + device=input_ids.device, ) position_ids = position_ids.unsqueeze(0).repeat(input_ids.size(0), 1) @@ -228,7 +239,7 @@ def setup(self, stage=None): def prepare_megatron_generation(self, labels, input_ids, template_length): """ - # adapted from MegatronGPTModel._bucketize_gpt_inference + # adapted from MegatronGPTModel._bucketize_gpt_inference """ batch_size = labels.size(0) prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_learning else None diff --git a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py index 455b0fa17a856..29e2627fa038a 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py @@ -34,14 +34,18 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueNearestNeighbourModel'] class DialogueNearestNeighbourModel(NLPModel): - """Dialogue Nearest Neighbour Model identifies the intent of an utterance using the cosine similarity between sentence embeddings of the utterance and various label descriptions """ + """Dialogue Nearest Neighbour Model identifies the intent of an utterance using the cosine similarity between 
sentence embeddings of the utterance and various label descriptions""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("DialogueNearestNeighbourModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer) if self.cfg.library == "huggingface": @@ -155,7 +159,10 @@ def on_validation_epoch_end(self): filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") DialogueGenerationMetrics.save_predictions( - filename, predicted_labels, ground_truth_labels, decoded_inputs, + filename, + predicted_labels, + ground_truth_labels, + decoded_inputs, ) label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} diff --git a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py index 9655fbea2722a..73f09f62b1d5a 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py @@ -32,6 +32,7 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator @@ -46,8 +47,12 @@ class DialogueS2SGenerationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueS2SGenerationModel") self.cfg = cfg self.data_prepared = False @@ -120,7 +125,10 @@ def eval_epoch_end(self, outputs, mode='val'): ) DialogueGenerationMetrics.save_predictions( - filename, generated_field, ground_truth_field, inputs, + filename, + generated_field, + ground_truth_field, + inputs, ) label_acc = np.mean([int(generated_field[i] == 
ground_truth_field[i]) for i in range(len(generated_field))]) @@ -172,7 +180,7 @@ def forward(self, input_ids, attention_masks, labels): def prepare_megatron_generation(self, labels, input_ids, template_length): """ - # adapted from MegatronGPTModel._bucketize_gpt_inference + # adapted from MegatronGPTModel._bucketize_gpt_inference """ batch_size = labels.size(0) prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_tags else None diff --git a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py index 0e007a7bcdd1b..5298c060df089 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py @@ -36,6 +36,7 @@ from nemo.collections.nlp.models import TextClassificationModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueZeroShotIntentModel'] @@ -44,6 +45,9 @@ class DialogueZeroShotIntentModel(TextClassificationModel): """TextClassificationModel to be trained on two- or three-class textual entailment data, to be used for zero shot intent recognition.""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("DialogueZeroShotIntentModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer) @@ -275,7 +279,10 @@ def on_validation_epoch_end(self, split="val"): filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") DialogueGenerationMetrics.save_predictions( - filename, predicted_labels, ground_truth_labels, utterances, + filename, + predicted_labels, + ground_truth_labels, + utterances, ) label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} @@ -316,7 +323,6 @@ def predict( entailment_idx=1, contradiction_idx=0, ) 
-> List[Dict]: - """ Given a list of queries and a list of candidate labels, return a ranked list of labels and scores for each query. diff --git a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py index a34afa64674df..777d468084e22 100644 --- a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py @@ -35,12 +35,15 @@ from nemo.core.classes import typecheck from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class IntentSlotClassificationModel(NLPModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """ Initializes BERT Joint Intent and Slot model. - """ + """Initializes BERT Joint Intent and Slot model.""" + # deprecation warning + deprecated_warning("IntentSlotClassificationModel") + self.max_seq_length = cfg.dataset.max_seq_length self.cfg = cfg # Check the presence of data_dir. @@ -78,7 +81,7 @@ def _set_defaults_data_desc(self, cfg): OmegaConf.set_struct(cfg, True) def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): - """ Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc. """ + """Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc.""" # Save data from data desc to config - so it can be reused later, e.g. in inference. 
data_desc = IntentSlotDataDesc(data_dir=data_dir, modes=[train_ds.prefix, validation_ds.prefix]) OmegaConf.set_struct(cfg, False) @@ -112,7 +115,7 @@ def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): OmegaConf.set_struct(cfg, True) def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: - """ Saves label ids map to a file """ + """Saves label ids map to a file""" with open(filename, 'w') as out: labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) out.write('\n'.join(labels)) @@ -120,7 +123,7 @@ def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: logging.info(f'Labels mapping saved to : {out.name}') def _reconfigure_classifier(self): - """ Method reconfigures the classifier depending on the settings of model cfg.data_desc """ + """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" self.classifier = SequenceTokenClassifier( hidden_size=self.hidden_size, @@ -310,7 +313,7 @@ def get_utterance_tokens(self, token_ids, token_masks): Args: token_ids: IntTensor of size (max_seq_len, ) token_masks: BoolTensor of size (max_seq_len, ) - + Returns token_list: List of Str (list of tokens with len <= max_seq_len) """ diff --git a/nemo/collections/nlp/models/dialogue/sgdqa_model.py b/nemo/collections/nlp/models/dialogue/sgdqa_model.py index b350fd01fa090..3b30dfccd9cee 100644 --- a/nemo/collections/nlp/models/dialogue/sgdqa_model.py +++ b/nemo/collections/nlp/models/dialogue/sgdqa_model.py @@ -35,6 +35,7 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['SGDQAModel'] @@ -44,7 +45,7 @@ class SGDQAModel(NLPModel): Dialogue State Tracking Model SGD-QA (https://arxiv.org/abs/2105.08049) The SGD-QA model is a fast multi-pass schema-guided state-tracking model, that is trained on the Google 
schema-guided state tracking dataset (https://arxiv.org/abs/1909.05855). - The model takes dialogue as input and outputs the dialogue state, which includes slot-value pairs. + The model takes dialogue as input and outputs the dialogue state, which includes slot-value pairs. The model consists of two components: a neural natural language understanding model (NLU), and a rule-based state tracker. The NLU takes in a dialogue turn and different schema (entity) information options and outputs their match score. The state tracker takes the highest rated entities and composes the dialogue state across turns. @@ -55,6 +56,9 @@ def output_module(self): return self.decoder def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("SGDQAModel") + self.data_prepared = False super().__init__(cfg=cfg, trainer=trainer) self.encoder = SGDEncoder(hidden_size=self.bert_model.config.hidden_size, dropout=self._cfg.encoder.dropout) @@ -146,7 +150,7 @@ def validation_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_ Called at every validation step to aggregate and postprocess outputs on each GPU Args: batch: input batch at validation step - batch_idx: batch index + batch_idx: batch index dataloader_idx: dataloader index """ loss, tensors = self.eval_step_helper(batch=batch) @@ -163,7 +167,7 @@ def test_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_idx: i Called at every test step to aggregate and postprocess outputs on each GPU Args: batch: input batch at test step - batch_idx: batch index + batch_idx: batch index dataloader_idx: dataloader index """ loss, tensors = self.eval_step_helper(batch=batch) @@ -318,8 +322,8 @@ def eval_step_helper(self, batch: List[torch.Tensor]): torch.zeros(total_scores.size(), device=total_scores.get_device(), dtype=total_scores.dtype), total_scores, ) - max_span_index = torch.argmax(total_scores.view(-1, max_num_tokens ** 2), axis=-1) - max_span_p = 
torch.max(total_scores.view(-1, max_num_tokens ** 2), axis=-1)[0] + max_span_index = torch.argmax(total_scores.view(-1, max_num_tokens**2), axis=-1) + max_span_p = torch.max(total_scores.view(-1, max_num_tokens**2), axis=-1)[0] span_start_index = torch.floor_divide(max_span_index, max_num_tokens) span_end_index = torch.fmod(max_span_index, max_num_tokens) @@ -415,7 +419,7 @@ def format_turn_id(ex_id_num): def combine_predictions_in_example(predictions: dict, batch_size: int): ''' - Combines predicted values to a single example. + Combines predicted values to a single example. Args: predictions: predictions ordered by keys then batch batch_size: batch size diff --git a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py index f3ef3ccb87f99..4afae81e38939 100644 --- a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py +++ b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py @@ -26,6 +26,7 @@ from nemo.core.classes.exportable import Exportable from nemo.core.neural_types import LogitsType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['EntityLinkingModel'] @@ -44,6 +45,9 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: def __init__(self, cfg: DictConfig, trainer: Trainer = None): """Initializes the SAP-BERT model for entity linking.""" + # deprecation warning + deprecated_warning("EntityLinkingModel") + # tokenizer needed before super().__init__() so dataset and loader can process data self._setup_tokenizer(cfg.tokenizer) @@ -123,7 +127,7 @@ def on_validation_epoch_end(self): Args: outputs: list of individual outputs of each validation step. 
Returns: - + """ if self.validation_step_outputs: avg_loss = torch.stack( diff --git a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py index 4a073e2ada1ca..4447ebb893862 100644 --- a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py +++ b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py @@ -31,6 +31,7 @@ from nemo.core.classes import typecheck from nemo.core.neural_types import NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['GLUEModel'] @@ -78,6 +79,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): """ Initializes model to use BERT model for GLUE tasks. """ + # deprecation warning + deprecated_warning("GLUEModel") if cfg.task_name not in cfg.supported_tasks: raise ValueError(f'{cfg.task_name} not in supported task. Choose from {cfg.supported_tasks}') diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py index e7ae529fe4e28..67a4802d83f6c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py @@ -14,7 +14,6 @@ """BERT model.""" -import warnings from dataclasses import dataclass import torch @@ -33,6 +32,7 @@ parallel_lm_logits, scaled_init_method_normal, ) +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.enums import AttnMaskType @@ -142,7 +142,13 @@ def forward(self, hidden_states, word_embeddings_weight): def post_language_model_processing( - lm_output, pooled_output, lm_head, binary_head, lm_labels, logit_weights, fp16_lm_cross_entropy, + lm_output, + pooled_output, + lm_head, + binary_head, + lm_labels, + logit_weights, + fp16_lm_cross_entropy, ): # lm_logits: [s, b, vocab_size] lm_logits = lm_head(lm_output, 
logit_weights) @@ -348,7 +354,10 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw if self.post_process: # TODO: Make sure you are passing in the mpu_vocab_size properly - self.lm_head = MCoreBertLMHead(self.config.hidden_size, self.config,) + self.lm_head = MCoreBertLMHead( + self.config.hidden_size, + self.config, + ) self.output_layer = tensor_parallel.ColumnParallelLinear( self.config.hidden_size, @@ -476,10 +485,9 @@ def __init__( sequence_parallel=False, position_embedding_type='learned_absolute', ): - warnings.warn( - "NeMoBertModel will be deprecated mid 2024. Use MCoreBertModelWrapperWithPostLNSupport instead.", - DeprecationWarning, - ) + # deprecation warning + deprecated_warning("NeMoBertModel", "MCoreBertModelWrapperWithPostLNSupport") + super(NeMoBertModel, self).__init__(config=config) self.fp16_lm_cross_entropy = fp16_lm_cross_entropy self.add_binary_head = add_binary_head diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py index 19fafb796fd73..c572d94acd110 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py @@ -24,6 +24,7 @@ parallel_lm_logits, scaled_init_method_normal, ) +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.enums import AttnMaskType @@ -167,6 +168,9 @@ def __init__( seq_len_interpolation_factor=None, rotary_base=10000, ): + # deprecation warning + deprecated_warning("GPTModel", "McoreGPTModel") + super(GPTModel, self).__init__(config=config, share_token_embeddings=share_embeddings_and_output_weights) self.parallel_output = parallel_output @@ -250,7 +254,9 @@ def __init__( if self.share_embeddings_and_output_weights: self.initialize_word_embeddings( - init_method=init_method_normal(init_method_std), vocab_size=vocab_size, hidden_size=hidden_size, + 
init_method=init_method_normal(init_method_std), + vocab_size=vocab_size, + hidden_size=hidden_size, ) def set_input_tensor(self, input_tensor): @@ -299,9 +305,11 @@ def forward( post_process_result = post_language_model_processing( loss_lm_output, loss_labels, - self.language_model.output_layer.weight - if not self.share_embeddings_and_output_weights - else self.word_embeddings_weight(), + ( + self.language_model.output_layer.weight + if not self.share_embeddings_and_output_weights + else self.word_embeddings_weight() + ), get_key_value, self.parallel_output, forward_method_parallel_output, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py index d151925635ab6..f6ee4b20183c5 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py @@ -37,6 +37,7 @@ from nemo.collections.nlp.modules.common.transformer.text_generation import TextGeneration from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator @@ -82,6 +83,9 @@ class MegatronBasePromptLearningModel(MegatronBaseModel, TextGeneration): """ def __init__(self, cfg: DictConfig, trainer: Trainer): + # deprecation warning + deprecated_warning("MegatronBasePromptLearningModel") + super().__init__(cfg, trainer) self.init_model(cfg, trainer) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 5ee7a3fcf4806..acfc22439a7db 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ 
b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -44,6 +44,7 @@ from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPSaveRestoreConnector from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches @@ -72,25 +73,28 @@ class MegatronGPTPromptLearningModel(MegatronBasePromptLearningModel): """ - Model class for prompt-tuning or p-tuning a pretrained Megatron GPT model. + Model class for prompt-tuning or p-tuning a pretrained Megatron GPT model. Prompt Tuning initalizes virtual prompt embeddings directly from a copy of certain token embeddings from the the pretrained GPT model's vocabulary - and directly tunes these embedding weights. The token embeddings used in - initalization are specified by the user in the config file. The model can - be prompt-tuned for multiple tasks at once. virtual prompts are stored in a - prompt table and can be added or deleted without disrupting virtual prompts - for other tasks. + and directly tunes these embedding weights. The token embeddings used in + initalization are specified by the user in the config file. The model can + be prompt-tuned for multiple tasks at once. virtual prompts are stored in a + prompt table and can be added or deleted without disrupting virtual prompts + for other tasks. P-tuning initializes an LSTM encoder model that generates virtual prompt embeddings for every task. Each task shares the same encoder. After ptuning is compelete, the learned virtual prompts can be saved to the prompt table - using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a - new virtual prompt via p-tuning, they do not need to retrain on all previous + using add_ptuned_prompts_to_prompt_table(). 
Thus, if a user wants to add a + new virtual prompt via p-tuning, they do not need to retrain on all previous tasks. This gives p-tuning the same task flexiblity as prompt-tuning. """ def __init__(self, cfg: DictConfig, trainer: Trainer): + # deprecation warning + deprecated_warning("MegatronGPTPromptLearningModel") + super().__init__(cfg, trainer) self.inference_params = None @@ -305,8 +309,8 @@ def forward( def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): """ - Dataloader produces a global batch which is turned into an iterator of microbatches. - The iterator of microbatches is then piped through the pipeline using Core's fwd/bwd functions. + Dataloader produces a global batch which is turned into an iterator of microbatches. + The iterator of microbatches is then piped through the pipeline using Core's fwd/bwd functions. """ # Get seq length of batch batch, _, _ = next(dataloader_iter) @@ -361,15 +365,15 @@ def training_step(self, dataloader_iter): return loss_mean def backward(self, *args, **kwargs): - """ LightningModule hook to do backward. - We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. - No need to call it here. + """LightningModule hook to do backward. + We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. + No need to call it here. """ return def optimizer_zero_grad(self, *args, **kwargs): - """ LightningModule hook to zero grad. - We want this to do nothing as we are zeroing grads during the training_step. + """LightningModule hook to zero grad. + We want this to do nothing as we are zeroing grads during the training_step. 
""" return @@ -415,11 +419,19 @@ def validation_step(self, dataloader_iter): labels_text.append(label) if mode == 'val': self.validation_step_outputs.append( - {'loss': loss_mean, 'preds': preds_text, 'labels': labels_text,} + { + 'loss': loss_mean, + 'preds': preds_text, + 'labels': labels_text, + } ) else: self.test_step_outputs.append( - {'loss': loss_mean, 'preds': preds_text, 'labels': labels_text,} + { + 'loss': loss_mean, + 'preds': preds_text, + 'labels': labels_text, + } ) return { 'loss': loss_mean, @@ -427,8 +439,10 @@ def validation_step(self, dataloader_iter): 'labels': labels_text, } - self.validation_step_outputs.append({'loss': loss_mean}) if mode == 'val' else self.test_step_outputs.append( - {'loss': loss_mean} + ( + self.validation_step_outputs.append({'loss': loss_mean}) + if mode == 'val' + else self.test_step_outputs.append({'loss': loss_mean}) ) return {'loss': loss_mean} @@ -481,7 +495,8 @@ def on_validation_epoch_end(self): gather_results_dedup = list(set(itertools.chain(*gather_results))) val_metric_dict = self.validation_metric.get_score( - [i[1] for i in gather_results_dedup], [i[0] for i in gather_results_dedup], + [i[1] for i in gather_results_dedup], + [i[0] for i in gather_results_dedup], ) for metric, val in val_metric_dict.items(): @@ -638,9 +653,9 @@ def build_virtual_prompt_dataset( drop_last=drop_last, num_workers=num_workers, pin_memory=pin_memory, - persistent_workers=True - if num_workers > 0 - else False, # (@adithyare and @eharper) We need this to make spawn=True to work. + persistent_workers=( + True if num_workers > 0 else False + ), # (@adithyare and @eharper) We need this to make spawn=True to work. ) return dataset, dataloader @@ -815,7 +830,7 @@ def list_available_models(cls): def get_pseudo_tokens(num_virtual_tokens): """ Takes in an integer and returns a list of strings where each string - is a numbered virtual token placeholder. If + is a numbered virtual token placeholder. 
If num_virtual_tokens = 3, then this function returns: ["", "", ""] @@ -823,7 +838,7 @@ def get_pseudo_tokens(num_virtual_tokens): Args: num_virtual_tokens: (int) Number of virtual token strings you want to make - returns a list of string. + returns a list of string. """ pseudo_tokens = [ diff --git a/nemo/collections/nlp/models/question_answering/qa_base_model.py b/nemo/collections/nlp/models/question_answering/qa_base_model.py index bfb45f51b6ac7..7ca78f2e136e4 100644 --- a/nemo/collections/nlp/models/question_answering/qa_base_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_base_model.py @@ -25,10 +25,14 @@ ) from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BaseQAModel(NLPModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None, no_lm_init=True): + # deprecation warning + deprecated_warning("BaseQAModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer, no_lm_init=no_lm_init) @@ -82,10 +86,13 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): @torch.no_grad() def _get_per_sample_perplexity(self, logits, labels): - """ Returns average perplexity for each sample in the batch """ + """Returns average perplexity for each sample in the batch""" loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none') - unreduced_loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1),) + unreduced_loss = loss_fct( + logits.view(-1, logits.size(-1)), + labels.view(-1), + ) unreduced_loss = unreduced_loss.reshape(labels.shape) mask_0 = unreduced_loss != 0 per_sample_perplexity = torch.exp((unreduced_loss * mask_0).sum(axis=1) / mask_0.sum(axis=1)) diff --git a/nemo/collections/nlp/models/question_answering/qa_bert_model.py b/nemo/collections/nlp/models/question_answering/qa_bert_model.py index 196fab4e3a046..d4bdef6d871dc 100644 --- a/nemo/collections/nlp/models/question_answering/qa_bert_model.py +++ 
b/nemo/collections/nlp/models/question_answering/qa_bert_model.py @@ -31,12 +31,15 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BERTQAModel(BaseQAModel): - """ BERT model with a QA (token classification) head """ + """BERT model with a QA (token classification) head""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("BERTQAModel") super().__init__(cfg=cfg, trainer=trainer, no_lm_init=False) self.classifier = TokenClassifier( @@ -190,7 +193,7 @@ def inference( num_samples: number of samples to use of inference data. Default: -1 if all data should be used. output_nbest_file: optional output file for writing out nbest list output_prediction_file: optional output file for writing out predictions - + Returns: model predictions, model nbest list """ @@ -209,7 +212,10 @@ def inference( logging.set_verbosity(logging.WARNING) infer_datalayer = self.setup_inference_data( - file, batch_size=batch_size, num_samples=num_samples, num_workers=2, + file, + batch_size=batch_size, + num_samples=num_samples, + num_workers=2, ) all_logits = [] @@ -244,7 +250,9 @@ def inference( if output_prediction_file: QAMetrics.dump_predicted_answers_to_file( - output_prediction_file, infer_datalayer.dataset.examples, all_predictions, + output_prediction_file, + infer_datalayer.dataset.examples, + all_predictions, ) if output_nbest_file: @@ -324,7 +332,7 @@ def get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -349,7 +357,7 @@ def get_predictions( null_start_logit = 
0 # end logit at the slice with min null score null_end_logit = 0 - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] start_indexes = self._get_best_indexes(start_logits[pos], n_best_size) end_indexes = self._get_best_indexes(end_logits[pos], n_best_size) @@ -468,7 +476,7 @@ def get_predictions( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["question"] = example.question_text output["text"] = entry.text @@ -531,7 +539,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_best_indexes(self, logits, n_best_size): - """ Get the n-best logits from a list """ + """Get the n-best logits from a list""" best_indices = np.argsort(logits)[::-1] @@ -570,7 +578,7 @@ def _get_final_text(self, pred_text: str, orig_text: str, do_lower_case: bool, v def _strip_spaces(text): ns_chars = [] ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): + for i, c in enumerate(text): if c == " ": continue ns_to_s_map[len(ns_chars)] = i @@ -599,14 +607,16 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: logging.warning( - "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text, + "Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, + tok_ns_text, ) return orig_text # We then project the characters in `pred_text` back to `orig_text` using # the character-to-character alignment. 
tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): + for i, tok_index in tok_ns_to_s_map.items(): tok_s_to_ns_map[tok_index] = i orig_start_position = None diff --git a/nemo/collections/nlp/models/question_answering/qa_gpt_model.py b/nemo/collections/nlp/models/question_answering/qa_gpt_model.py index 405b9a1e05ade..059cf5625f150 100644 --- a/nemo/collections/nlp/models/question_answering/qa_gpt_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_gpt_model.py @@ -27,10 +27,14 @@ from nemo.collections.nlp.models.question_answering.qa_base_model import BaseQAModel from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class GPTQAModel(BaseQAModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("GPTQAModel") + self.cfg = cfg self.setup_tokenizer(cfg.tokenizer) @@ -102,7 +106,11 @@ def on_validation_epoch_end(self): eval_dataset = self._test_dl.dataset if self.trainer.testing else self._validation_dl.dataset eval_results, _, _ = self.evaluate( - eval_dataset.features, eval_dataset.examples, unique_ids, per_sample_perplexity, generated_answers, + eval_dataset.features, + eval_dataset.examples, + unique_ids, + per_sample_perplexity, + generated_answers, ) self.log(f'{prefix}_loss', avg_loss) @@ -185,10 +193,19 @@ def inference( return all_predictions, all_nbest_perdictions def evaluate( - self, features, examples, unique_ids, per_sample_perplexity, generated_texts, + self, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ): all_predictions, all_nbest_predictions = self._get_predictions( - features, examples, unique_ids, per_sample_perplexity, generated_texts, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ) eval_results = QAMetrics.evaluate_predictions(examples, all_predictions) @@ -226,7 +243,12 @@ def 
_setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_predictions( - self, features, examples: List, unique_ids: List[int], per_sample_perplexity: List, generated_texts: List, + self, + features, + examples: List, + unique_ids: List[int], + per_sample_perplexity: List, + generated_texts: List, ): unique_id_to_pos = {} for index, unique_id in enumerate(unique_ids): @@ -242,7 +264,7 @@ def _get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -250,7 +272,7 @@ def _get_predictions( curr_features = example_index_to_features[example_index] prelim_predictions = [] - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] curr_perplexity = per_sample_perplexity[pos] curr_generated_text = generated_texts[pos] diff --git a/nemo/collections/nlp/models/question_answering/qa_model.py b/nemo/collections/nlp/models/question_answering/qa_model.py index 6fb2054a22370..2147d7d6a5bfc 100644 --- a/nemo/collections/nlp/models/question_answering/qa_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_model.py @@ -32,6 +32,7 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['QAModel'] @@ -42,6 +43,9 @@ class QAModel(NLPModel): """ def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("QAModel") + super().__init__(cfg=cfg, trainer=trainer) self.classifier = TokenClassifier( hidden_size=self.hidden_size, @@ -186,7 +190,7 @@ def inference( num_samples: number 
of samples to use of inference data. Default: -1 if all data should be used. output_nbest_file: optional output file for writing out nbest list output_prediction_file: optional output file for writing out predictions - + Returns: model predictions, model nbest list """ diff --git a/nemo/collections/nlp/models/question_answering/qa_s2s_model.py b/nemo/collections/nlp/models/question_answering/qa_s2s_model.py index 81001fb66da7b..5ad959fd1b6f6 100644 --- a/nemo/collections/nlp/models/question_answering/qa_s2s_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_s2s_model.py @@ -28,10 +28,13 @@ from nemo.collections.nlp.models.question_answering.qa_base_model import BaseQAModel from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class S2SQAModel(BaseQAModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("S2SQAModel") self.cfg = cfg @@ -120,7 +123,11 @@ def on_validation_epoch_end(self): eval_dataset = self._test_dl.dataset if self.trainer.testing else self._validation_dl.dataset eval_results, _, _ = self.evaluate( - eval_dataset.features, eval_dataset.examples, unique_ids, per_sample_perplexity, generated_answers, + eval_dataset.features, + eval_dataset.examples, + unique_ids, + per_sample_perplexity, + generated_answers, ) self.log(f'{prefix}_loss', avg_loss) @@ -145,7 +152,11 @@ def forward(self, input_ids, input_attn_mask, labels): labels = torch.where(labels != -100, labels, torch.zeros_like(labels)) output_attn_masks = torch.where(labels > 0, torch.ones_like(labels), torch.zeros_like(labels)) unmasked_unreduced_loss = self.language_model( - input_ids, labels[:, :-1], input_attn_mask, output_attn_masks[:, :-1], lm_labels=labels[:, 1:], + input_ids, + labels[:, :-1], + input_attn_mask, + output_attn_masks[:, :-1], + lm_labels=labels[:, 1:], ) loss = 
self.language_model.loss_func(output_attn_masks[:, 1:], unmasked_unreduced_loss) per_sample_perplexity = torch.exp(unmasked_unreduced_loss) @@ -210,10 +221,19 @@ def inference( return all_predictions, all_nbest_predictions def evaluate( - self, features, examples, unique_ids, per_sample_perplexity, generated_texts, + self, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ): all_predictions, all_nbest_json = self._get_predictions( - features, examples, unique_ids, per_sample_perplexity, generated_texts, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ) eval_results = QAMetrics.evaluate_predictions(examples, all_predictions) @@ -251,7 +271,12 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_predictions( - self, features, examples: List, unique_ids: List[int], per_sample_perplexity: List, generated_texts: List, + self, + features, + examples: List, + unique_ids: List[int], + per_sample_perplexity: List, + generated_texts: List, ): unique_id_to_pos = {} @@ -268,7 +293,7 @@ def _get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -276,7 +301,7 @@ def _get_predictions( curr_features = example_index_to_features[example_index] prelim_predictions = [] - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] curr_perplexity = per_sample_perplexity[pos] curr_generated_text = generated_texts[pos] @@ -339,7 +364,10 @@ def _generate_candidates(self, input_ids, input_attn_mask): "max_length": num_tokens_to_generate, } generated_tokens = self.language_model.generate(**param_dict) - generated_answers = 
self.tokenizer.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True,) + generated_answers = self.tokenizer.tokenizer.batch_decode( + generated_tokens, + skip_special_tokens=True, + ) generated_answers = [ans.strip() for ans in generated_answers] elif self.cfg.library == 'megatron': diff --git a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py index eed94f2e1e315..d9e08f6764fc2 100644 --- a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py +++ b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py @@ -35,7 +35,7 @@ from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.core.neural_types import LogitsType, NeuralType from nemo.utils import logging -from nemo.utils.decorators import experimental +from nemo.utils.decorators import deprecated_warning, experimental __all__ = ["SpellcheckingAsrCustomizationModel"] @@ -48,7 +48,7 @@ class SpellcheckingAsrCustomizationModel(NLPModel): It takes as input ASR hypothesis and candidate customization entries. It labels the hypothesis with correct entry index or 0. Example input: [CLS] a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o [SEP] d i d i e r _ s a u m o n [SEP] a s t r o n o m i e [SEP] t r i s t a n _ g u i l l o t [SEP] ... - Input segments: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 + Input segments: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 Example output: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 0 ... 
""" @@ -67,6 +67,9 @@ def output_module(self): return self def __init__(self, cfg: DictConfig, trainer: Trainer = None) -> None: + # deprecation warning + deprecated_warning("SpellcheckingAsrCustomizationModel") + super().__init__(cfg=cfg, trainer=trainer) # Label map contains 11 labels: 0 for nothing, 1..10 for target candidate ids @@ -321,7 +324,7 @@ def on_test_epoch_end(self): @torch.no_grad() def infer(self, dataloader_cfg: DictConfig, input_name: str, output_name: str) -> None: - """ Main function for Inference + """Main function for Inference Args: dataloader_cfg: config for dataloader @@ -517,7 +520,7 @@ def _setup_infer_dataloader(self, cfg: DictConfig, input_name: str) -> 'torch.ut Setup function for a infer data loader. Args: cfg: config dictionary containing data loader params like batch_size, num_workers and pin_memory - input_name: path to input file. + input_name: path to input file. Returns: A pytorch DataLoader. """ diff --git a/nemo/utils/decorators/__init__.py b/nemo/utils/decorators/__init__.py index 4468a3bc09b5e..2cfec9e40d648 100644 --- a/nemo/utils/decorators/__init__.py +++ b/nemo/utils/decorators/__init__.py @@ -13,6 +13,6 @@ # limitations under the License. -from nemo.utils.decorators.deprecated import deprecated +from nemo.utils.decorators.deprecated import deprecated, deprecated_warning from nemo.utils.decorators.experimental import experimental from nemo.utils.decorators.port_docs import add_port_docs diff --git a/nemo/utils/decorators/deprecated.py b/nemo/utils/decorators/deprecated.py index 65f92e62563e0..40957bb343d42 100644 --- a/nemo/utils/decorators/deprecated.py +++ b/nemo/utils/decorators/deprecated.py @@ -30,14 +30,14 @@ def deprecated(wrapped=None, version=None, explanation=None, wait_seconds=0): """ - Decorator which can be used for indicating that a function/class is deprecated and going to be removed. - Tracks down which function/class printed the warning and will print it only once per call. 
- - Args: - version: Version in which the function/class will be removed (optional). - explanation: Additional explanation, e.g. "Please, ``use another_function`` instead." (optional). - wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned - with subsequent logging messages. + Decorator which can be used for indicating that a function/class is deprecated and going to be removed. + Tracks down which function/class printed the warning and will print it only once per call. + + Args: + version: Version in which the function/class will be removed (optional). + explanation: Additional explanation, e.g. "Please, ``use another_function`` instead." (optional). + wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned + with subsequent logging messages. """ if wrapped is None: @@ -71,3 +71,26 @@ def wrapper(wrapped, instance, args, kwargs): return wrapped(*args, **kwargs) return wrapper(wrapped) + + +def deprecated_warning(old_method=None, new_method=None, wait_seconds=2): + """ + Function which can be used for indicating that a function/class is deprecated and going to be removed. + + Args: + old_method: Name of deprecated class/function. + new_method: Name of new class/function to use. + wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned + with subsequent logging messages. + """ + + # Create a banner + if new_method is not None: + msg = f"***** {old_method} is deprecated. Please, use {new_method} instead. *****" + else: + msg = f"***** {old_method} is deprecated and will be removed soon. 
*****" + banner = '\n'.join(['*' * len(msg)] * 2 + [msg] + ['*' * len(msg)] * 2) + + logging.warning(f"\n\n{banner}\n") + logging.warning(f"Waiting for {wait_seconds} seconds before this message disappears.") + time.sleep(wait_seconds) diff --git a/tests/collections/nlp/test_dialogue.py b/tests/collections/nlp/test_dialogue.py deleted file mode 100644 index 9c227f737d988..0000000000000 --- a/tests/collections/nlp/test_dialogue.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -import torch - -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.sgd_data_processor import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_gpt_classification_dataset import ( - DialogueGPTClassificationDataset, -) -from nemo.collections.nlp.data.dialogue.dataset.dialogue_s2s_generation_dataset import DialogueS2SGenerationDataset -from nemo.collections.nlp.data.dialogue.dataset.dialogue_sgd_bert_dataset import DialogueSGDBERTDataset -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueClassificationMetrics, DialogueGenerationMetrics -from nemo.collections.nlp.models.dialogue.dialogue_nearest_neighbour_model import DialogueNearestNeighbourModel - - -@pytest.mark.unit -def test_dialogue_metric_generation_f1(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - precision, recall, f1 = DialogueGenerationMetrics._get_one_f1(generated_field, ground_truth_field) - assert precision == 75 - assert recall == 75 - assert f1 == 75 - - -@pytest.mark.unit -def test_dialogue_metric_split_label_and_slots(): - fields = ["reserve_restaurant\nslots: time_of_day(7pm), number_of_people(3)", "time_of_day(7pm)"] - labels, slots_list = DialogueClassificationMetrics.split_label_and_slots(fields, with_slots=True) - assert labels == ["reserve_restaurant", 'none'] - assert slots_list == [["time_of_day(7pm)", "number_of_people(3)"], ["time_of_day(7pm)"]] - - -@pytest.mark.unit -def test_dialogue_metric_slot_filling_metrics(): - generated_slots = [["time_of_day(7pm)", "number_of_people(3)"], ["time_of_day(7pm)"]] - ground_truth_slots = [["time_of_day(7pm)"], ["time_of_day(7pm)", "number_of_people(3)"]] - - ( - avg_precision, - avg_recall, - avg_f1, - 
avg_joint_goal_accuracy, - ) = DialogueClassificationMetrics.get_slot_filling_metrics(generated_slots, ground_truth_slots) - - assert avg_precision == 75 - assert avg_recall == 75 - assert avg_f1 == 75 - assert avg_joint_goal_accuracy == 0 - - -@pytest.mark.unit -def test_dialogue_assistant_data_processor_normalize_zero_shot_intent(): - label0 = 'food_ordering.contextual_query' - normalized_label0 = 'contextual query' - - label1 = 'food_ordering.nomatch' - normalized_label1 = 'no match' - - label2 = 'food_ordering.no' - normalized_label2 = 'no' - - assert normalized_label0 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label0) - assert normalized_label1 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label1) - assert normalized_label2 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label2) - - -@pytest.mark.unit -def test_dialogue_assistant_data_processor_get_continuous_slots(): - slot_ids = [54, 54, 54, 19, 19, 18, 54, 54, 54] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 19, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {19: [3, 5], 18: [5, 6]} - - # here 18 and 19 maps to the same slot (originally variants of B-slot and I-slot) - slot_ids = [54, 54, 54, 19, 19, 18, 54, 54, 54] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 18, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {18: [3, 6]} - - # test if function works when non-empty slots are at boundary - slot_ids = [18, 54, 54, 19, 19] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 19, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert 
continuous_slots == {18: [0, 1], 19: [3, 5]} - - -@pytest.mark.unit -def test_dialogue_assistant_map_bio_format_slots_to_unified_slots(): - - slots = ['B-time', 'I-time', 'B-alarm', 'I-alarm', 'O'] - gt_bio_slot_ids_to_unified_slot_ids = {'0': '0', '1': '0', '2': '1', '3': '1', '4': '2'} - gt_unified_slots = ['time', 'alarm', 'O'] - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(slots) - assert gt_bio_slot_ids_to_unified_slot_ids == bio_slot_ids_to_unified_slot_ids - assert gt_unified_slots == unified_slots - - # case in which BIOS scheme was not used in annotation - slots = ['time', 'alarm', 'O'] - gt_bio_slot_ids_to_unified_slot_ids = {'0': '0', '1': '1', '2': '2'} - gt_unified_slots = ['time', 'alarm', 'O'] - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(slots) - - assert gt_bio_slot_ids_to_unified_slot_ids == bio_slot_ids_to_unified_slot_ids - assert gt_unified_slots == unified_slots - - -@pytest.mark.unit -def test_dialogue_data_processor_get_relevant_idxs(): - - dataset_split = 'train' - dev_proportion = 10 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 900 - assert idxs != list(range(900)) - - dataset_split = 'dev' - dev_proportion = 40 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 400 - assert idxs != list(range(400)) - - dataset_split = 'test' - dev_proportion = 40 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 1000 - assert idxs == list(range(1000)) - - -@pytest.mark.unit -def test_dialogue_sgd_data_processor_convert_camelcase_to_lower(): - label = 'none' - gt_converted_label = 'none' - - assert gt_converted_label == 
DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - label = 'ReserveRestaurant' - gt_converted_label = 'reserve restaurant' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - label = 'Alarm' - gt_converted_label = 'alarm' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - -@pytest.mark.unit -def test_dialogue_gpt_classification_dataset_linearize_slots(): - - slots = [] - linearized_slots = 'None' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - slots = {'time': '7pm', 'place': 'field'} - linearized_slots = 'time(7pm), place(field)' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - slots = {'time': ['7pm', '1900'], 'place': 'field'} - linearized_slots = 'time(7pm), place(field)' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - -@pytest.mark.unit -def test_dialogue_gpt_classification_dataset_linearize_slots(): - - actions = [ - {'act': 'inform', 'slot': 'time', 'values': ['7pm', '1900']}, - {'act': 'confirm', 'slot': 'place', 'values': ['hall']}, - ] - - prompt_template = 'values' - formatted_actions = '7pm hall' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - prompt_template = 'slots_values' - formatted_actions = 'time (7pm) place (hall)' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - prompt_template = 'acts_slots_values' - formatted_actions = 'inform time (7pm) confirm place (hall)' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - -@pytest.mark.unit -def test_dialogue_sgd_dataset_naive_tokenize(): - - utterance = 'I am feeling hungry so I would like to find a place to eat.' 
- tokens = [ - 'I', - ' ', - 'am', - ' ', - 'feeling', - ' ', - 'hungry', - ' ', - 'so', - ' ', - 'I', - ' ', - 'would', - ' ', - 'like', - ' ', - 'to', - ' ', - 'find', - ' ', - 'a', - ' ', - 'place', - ' ', - 'to', - ' ', - 'eat', - '.', - ] - assert tokens == DialogueSGDBERTDataset._naive_tokenize(utterance) - - -@pytest.mark.unit -def test_dialogue_nearest_neighbour_mean_pooling(): - - model_output = [torch.ones(8, 512, 768)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.ones(8, 768).float(), DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) - - model_output = [torch.zeros(8, 512, 768)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.zeros(8, 768).float(), DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) - - model_output = [torch.cat([torch.zeros(8, 256, 768), torch.ones(8, 256, 768)], axis=1)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.ones(8, 768).float() * 0.5, DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) diff --git a/tests/collections/nlp/test_entity_linking_model.py b/tests/collections/nlp/test_entity_linking_model.py deleted file mode 100644 index 16b7681842964..0000000000000 --- a/tests/collections/nlp/test_entity_linking_model.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import shutil -import tempfile - -import pytest -import wget -from omegaconf import OmegaConf - -from nemo.collections.nlp.models import EntityLinkingModel - - -def get_cfg(): - - language_model = OmegaConf.create( - {"pretrained_model_name": "bert-base-uncased", "config_file": None, "config": None, "lm_checkpoint": None} - ) - - tokenizer = OmegaConf.create( - {"tokenizer_name": "bert-base-uncased", "vocab_file": None, "tokenizer_model": None, "do_lower_case": True} - ) - - model = OmegaConf.create( - { - "nemo_path": "sap_entity_linking.nemo", - "max_seq_length": 128, - "language_model": language_model, - "tokenizer": tokenizer, - "train_ds": None, - "validation_ds": None, - } - ) - - cfg = OmegaConf.create({"model": model}) - - return cfg - - -class TestEntityLinkingModel: - @pytest.mark.with_downloads() - @pytest.mark.unit - def test_creation_saving_restoring(self): - # Create a new temporary directory - with tempfile.TemporaryDirectory() as restore_dir: - with tempfile.TemporaryDirectory() as save_dir: - model = EntityLinkingModel(cfg=get_cfg().model) - assert isinstance(model, EntityLinkingModel) - - save_dir_path = save_dir - - # Where model will be saved - model_save_path = os.path.join(save_dir, f"{model.__class__.__name__}.nemo") - model.save_to(save_path=model_save_path) - - # Where model will be restored from - model_restore_path = os.path.join(restore_dir, f"{model.__class__.__name__}.nemo") - shutil.copy(model_save_path, model_restore_path) - - # at this point save_dir should not exist - assert save_dir_path is not None and not os.path.exists(save_dir_path) - assert not os.path.exists(model_save_path) - assert os.path.exists(model_restore_path) - - # attempt to restore - model_copy = model.__class__.restore_from(restore_path=model_restore_path) - assert model.num_weights == model_copy.num_weights - - -if __name__ == "__main__": - t = TestEntityLinkingModel() - t.test_creation_saving_restoring() diff --git 
a/tests/collections/nlp/test_megatron.py b/tests/collections/nlp/test_megatron.py deleted file mode 100644 index 8206457ec6eee..0000000000000 --- a/tests/collections/nlp/test_megatron.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -try: - import apex - - apex_available = True -except Exception: - apex_available = False - -import os -import tempfile - -import onnx -import pytest -import torch -from omegaconf import OmegaConf - -import nemo.collections.nlp as nemo_nlp -from nemo.core.classes import typecheck - - -def get_pretrained_bert_345m_uncased_model(): - model_name = "megatron-bert-345m-uncased" - config = {"language_model": {"pretrained_model_name": model_name}, "tokenizer": {}} - omega_conf = OmegaConf.create(config) - model = nemo_nlp.modules.get_lm_model(cfg=omega_conf) - if torch.cuda.is_available(): - model = model.cuda() - return model - - -class TestMegatron: - @pytest.mark.skip("This test was written for megatron-lm") - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_list_pretrained_models(self): - pretrained_lm_models = nemo_nlp.modules.get_pretrained_lm_models_list() - assert len(pretrained_lm_models) > 0 - - @pytest.mark.with_downloads() - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - @pytest.mark.skip("Only one Megatron model is allowed") - def test_get_model(self): - model = get_pretrained_bert_345m_uncased_model() - assert 
isinstance(model, nemo_nlp.modules.MegatronBertEncoder) - - typecheck.set_typecheck_enabled(enabled=False) - inp = model.input_example() - out = model.forward(*inp) - typecheck.set_typecheck_enabled(enabled=True) - - @pytest.mark.skipif(not os.path.exists('/home/TestData'), reason='Not a Jenkins machine') - @pytest.mark.with_downloads() - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - @pytest.mark.skip("Megatron-LM BERT support deprecated. Supported in NeMo < 1.5") - def test_onnx_export(self): - model = get_pretrained_bert_345m_uncased_model() - assert model - with tempfile.TemporaryDirectory() as tmpdir: - # Generate filename in the temporary directory. - # Test export. - model.export(os.path.join(".", "megatron.onnx")) - - -if __name__ == "__main__": - t = TestMegatron() - t.test_onnx_export() diff --git a/tests/collections/nlp/test_mem_map_dataset.py b/tests/collections/nlp/test_mem_map_dataset.py deleted file mode 100644 index 20932b6c4e0df..0000000000000 --- a/tests/collections/nlp/test_mem_map_dataset.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import csv -import json -import os - -import pytest - -from nemo.collections.nlp.data.language_modeling import text_memmap_dataset - - -@pytest.fixture -def jsonl_file(tmp_path): - # Create a temporary file path - file_path = tmp_path / "data.jsonl" - - # Generate data to write to the JSONL file - data = [ - {"name": "John", "age": 30}, - {"name": "Jane", "age": 25}, - {"name": "Bob", "age": 35}, - ] - - # Write data to the JSONL file - with open(file_path, mode="w") as file: - for item in data: - json.dump(item, file) - file.write("\n") - - # Provide the file path to the test function - yield str(file_path) - - # Optional: Clean up the temporary file after the test - file_path.unlink() - - -@pytest.fixture -def csv_file(tmp_path): - # Create a temporary file path - file_path = tmp_path / "data.csv" - - # Generate data to write to the CSV file - data = [["ID", "Name"], [1, "John"], [2, "Jane"], [3, "Bob"]] - - # Write data to the CSV file - with open(file_path, mode="w", newline="") as file: - writer = csv.writer(file) - writer.writerows(data) - - # Provide the file path to the test function - yield str(file_path) - - # Optional: Clean up the temporary file after the test - file_path.unlink() - - -def test_jsonl_mem_map_dataset(jsonl_file): - """Test for JSONL memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.JSONLMemMapDataset(dataset_paths=[jsonl_file], header_lines=0) - assert indexed_dataset[0] == {"name": "John", "age": 30} - assert indexed_dataset[1] == {"name": "Jane", "age": 25} - assert indexed_dataset[2] == {"name": "Bob", "age": 35} - - -def test_csv_mem_map_dataset(csv_file): - """Test for CSV memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.CSVMemMapDataset(dataset_paths=[csv_file], data_col=1, header_lines=1) - assert indexed_dataset[0].strip() == "John" - assert indexed_dataset[1].strip() == "Jane" - assert indexed_dataset[2].strip() == "Bob" - - -def test_csv_fields_mem_map_dataset(csv_file): - """Test for 
CSV memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.CSVFieldsMemmapDataset( - dataset_paths=[csv_file], data_fields={"ID": 0, "Name": 1}, header_lines=1 - ) - assert isinstance(indexed_dataset[0], dict) - assert sorted(indexed_dataset[0].keys()) == ["ID", "Name"] - assert indexed_dataset[0]["ID"] == "1" and indexed_dataset[1]["ID"] == "2" and indexed_dataset[2]["ID"] == "3" - assert ( - indexed_dataset[0]["Name"].strip() == "John" - and indexed_dataset[1]["Name"].strip() == "Jane" - and indexed_dataset[2]["Name"].strip() == "Bob" - ) - - -@pytest.mark.parametrize( - "dataset_class", [text_memmap_dataset.JSONLMemMapDataset, text_memmap_dataset.CSVMemMapDataset], -) -@pytest.mark.parametrize("use_alternative_index_mapping_dir", [True, False]) -@pytest.mark.parametrize("relative_index_fn", [True, False]) -def test_mem_map_dataset_index_mapping_dir( - tmp_path, dataset_class, jsonl_file, use_alternative_index_mapping_dir, relative_index_fn, -): - """Test for index_mapping_dir.""" - if relative_index_fn: - jsonl_file = os.path.relpath(jsonl_file) - else: - jsonl_file = os.path.abspath(jsonl_file) - - if use_alternative_index_mapping_dir: - index_mapping_dir = tmp_path / "subdir" - dataset_class(dataset_paths=[jsonl_file], header_lines=0, index_mapping_dir=str(index_mapping_dir)) - # Index files should not be created in default location. - assert not os.path.isfile(f"{jsonl_file}.idx.npy") - assert not os.path.isfile(f"{jsonl_file}.idx.info") - if relative_index_fn: - # Remove leading ".." sequences. 
- while jsonl_file.startswith(("../")): - jsonl_file = jsonl_file.lstrip("../") - idx_fn = f"{str(index_mapping_dir)}/{jsonl_file}.idx" - assert os.path.isfile(f"{idx_fn}.npy") - assert os.path.isfile(f"{idx_fn}.info") - else: - text_memmap_dataset.JSONLMemMapDataset(dataset_paths=[jsonl_file], header_lines=0) - assert os.path.isfile(f"{jsonl_file}.idx.npy") - assert os.path.isfile(f"{jsonl_file}.idx.info") diff --git a/tests/collections/nlp/test_prompt_learning.py b/tests/collections/nlp/test_prompt_learning.py deleted file mode 100644 index 4597fe9ecef08..0000000000000 --- a/tests/collections/nlp/test_prompt_learning.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import os - -import pytest -import torch - -from nemo.collections.nlp.data.language_modeling.megatron.gpt_prompt_learning_dataset import GPTPromptLearningDataset -from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import get_pseudo_tokens -from nemo.collections.nlp.modules.common import VirtualPromptSource -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.core import Dataset - - -def get_prompt_tuning_dataset( - dataset_path, tokenizer, virtual_prompt_source, task_templates, pseudo_tokens, -): - dataset = GPTPromptLearningDataset( - data=[dataset_path], - tokenizer=tokenizer, - virtual_prompt_source=virtual_prompt_source, - task_templates=task_templates, - pseudo_tokens=pseudo_tokens, - pad_token_id=tokenizer.unk_id, - max_seq_length=512, - min_seq_length=1, - ) - - return dataset - - -def create_temp_dataset(): - example_dataset_a = [ - {'taskname': 'task name A', 'text': 'Test sentence one, Answer: ', 'answer': 'test'} for i in range(24) - ] - example_dataset_b = [ - {'taskname': 'task name B', 'question': 'This is a question', 'answer': 'test'} for i in range(13) - ] - example_dataset = example_dataset_a + example_dataset_b - temp_file_name = 'temp_dataset_file.jsonl' - - with open(temp_file_name, 'w') as temp: - for example in example_dataset: - temp.write(json.dumps(example) + '\n') - - return temp_file_name - - -def get_task_templates(): - task_templates = {} - task_templates['task name A'] = { - "prompt_template": "<|VIRTUAL_PROMPT_0|>{text}{answer}", - "prompt_template_fields": ['text', 'answer'], - "total_virtual_tokens": 5, - "virtual_token_splits": [5], - "truncate_field": None, - "answer_only_loss": True, - "answer_field": "answer", - "task_id_num": 0, - } - task_templates['task name B'] = { - "prompt_template": "<|VIRTUAL_PROMPT_0|>{question}<|VIRTUAL_PROMPT_1|>{answer}{extra}", - "prompt_template_fields": ['question', 'answer', 'extra'], - 
"total_virtual_tokens": 10, - "virtual_token_splits": [7, 3], - "truncate_field": None, - "answer_only_loss": False, - "answer_field": None, - "task_id_num": 1, - } - return task_templates - - -class TestMegatronGPTPromptLearningDataset: - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_init_prompt_learning_dataset(self): - tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer') - task_templates = get_task_templates() - dataset_path = create_temp_dataset() - - # Setup virtual token place holders - total_virtual_tokens = 10 - pseudo_tokens = get_pseudo_tokens(total_virtual_tokens) - tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens}) - - dataset = get_prompt_tuning_dataset( - dataset_path, tokenizer, VirtualPromptSource.PROMPT_ENCODER, task_templates, pseudo_tokens, - ) - - print(type(dataset)) - - assert isinstance(dataset, Dataset) - - os.remove(dataset_path) - - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_prompt_learning_dataset_collate_fn_prompt_encoder(self): - tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer') - task_templates = get_task_templates() - dataset_path = create_temp_dataset() - - # Setup virtual token place holders - total_virtual_tokens = 10 - pseudo_tokens = get_pseudo_tokens(total_virtual_tokens) - tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens}) - - dataset = get_prompt_tuning_dataset( - dataset_path, tokenizer, VirtualPromptSource.PROMPT_ENCODER, task_templates, pseudo_tokens, - ) - - batch = [dataset[i] for i in range(8)] - batch = dataset.collate_fn(batch) - - assert len(batch) == 6 - - _, _, _, _, _, taskname_ids = batch - - assert list(taskname_ids[0].numpy()) == tokenizer.text_to_ids("task name A") - - os.remove(dataset_path) - - -if __name__ == "__main__": - t = TestMegatronGPTPromptLearningDataset() - t.test_init_prompt_learning_dataset() - t.test_prompt_learning_dataset_collate_fn_prompt_encoder() - 
print('-' * 50 + '\nALL PROMPT TUNING UNIT TESTS PASS!\n' + '-' * 50) diff --git a/tests/collections/nlp/test_qna.py b/tests/collections/nlp/test_qna.py deleted file mode 100644 index 4a470cacb7113..0000000000000 --- a/tests/collections/nlp/test_qna.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections - -import pytest -import torch - -from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset -from nemo.collections.nlp.data.question_answering.dataset.qa_gpt_dataset import GPTQADataset -from nemo.collections.nlp.metrics.qa_metrics import QAMetrics - - -@pytest.mark.unit -def test_remove_articles(): - sentences = [ - "this is an apple", - "this is the apple", - "this is a fruit", - ] - - expected_article_removed_sents = ["this is apple", "this is apple", "this is fruit"] - - article_removed_sents = [QAMetrics.remove_articles(sent) for sent in sentences] - - assert article_removed_sents == expected_article_removed_sents - - -@pytest.mark.unit -def test_white_space_fix(): - sentences = [ - "sentence with a space", - "sentence with multiple spaces", - ] - - expected_white_space_fixed_sents = [ - "sentence with a space", - "sentence with multiple spaces", - ] - - white_space_fixed_sents = [QAMetrics.white_space_fix(sent) for sent in sentences] - - assert white_space_fixed_sents == expected_white_space_fixed_sents - - 
-@pytest.mark.unit -def test_remove_punc(): - sentence = "this, is. a! sentence: with; punctuations?" - expected_punc_removed_sent = "this is a sentence with punctuations" - - punc_removed_sent = QAMetrics.remove_punc(sentence) - - assert punc_removed_sent == expected_punc_removed_sent - - -@pytest.mark.unit -def test_get_normalized_tokens(): - sentence = 'I am happy' - tokens = ['i', 'am', 'happy'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - sentence = 'I am a person' - tokens = ['i', 'am', 'person'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - sentence = 'I am a person.' - tokens = ['i', 'am', 'person'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - -@pytest.mark.unit -def test_get_one_f1(): - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - f1 = QAMetrics.get_one_f1(generated_field, ground_truth_field) - assert f1 == 0.75 - - generated_field = '' - ground_truth_field = 'That' - - f1 = QAMetrics.get_one_f1(generated_field, ground_truth_field) - assert f1 == 0 - - -@pytest.mark.unit -def test_get_one_exact_match(): - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 0 - - generated_field = 'That is so good!' - ground_truth_field = 'That is so good.' 
- - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 1 - - generated_field = 'That is so good' - ground_truth_field = 'that is so good' - - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 1 - - -@pytest.mark.unit -def test_split_into_words(): - text = 'hi yo' - char_to_word_offset = [0, 0, 0, 1, 1] - doc_tokens = ["hi", "yo"] - output = QADataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - text = 'i am good' - char_to_word_offset = [0, 0, 1, 1, 1, 2, 2, 2, 2] - doc_tokens = ["i", "am", 'good'] - output = QADataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - -@pytest.mark.unit -def test_get_doc_spans(): - all_doc_tokens = ['a'] * 15 - max_tokens_for_doc = 10 - doc_stride = 5 - doc_spans = QADataset.get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride) - - assert len(doc_spans) == 2 - assert doc_spans[0].start == 0 - assert doc_spans[0].length == 10 - assert doc_spans[1].start == 5 - assert doc_spans[1].length == 10 - - -@pytest.mark.unit -def test_get_average_dist_to_tok_start_and_end(): - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_span = _DocSpan(start=0, length=5) - - tok_start_position = 1 - tok_end_position = 3 - - assert 2 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=5) - - tok_start_position = 1 - tok_end_position = 2 - - assert 6 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=4) - - tok_start_position = 1 - tok_end_position = 2 - - assert 5 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - -@pytest.mark.unit -def test_keep_relevant_docspans(): - - _DocSpan = collections.namedtuple("DocSpan", 
["start", "length"]) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'all' - assert doc_spans == QADataset.keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = -1 - tok_end_position = -1 - - mode = 'only_positive' - - expected_doc_spans = [] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'only_positive' - - expected_doc_spans = [_DocSpan(start=0, length=5), _DocSpan(start=1, length=5)] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'limited_negative' - - expected_doc_spans = [_DocSpan(start=start, length=5) for start in range(10)] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - -@pytest.mark.unit -def test_gpt_no_pad_loss_masking(): - input_ids = [1] * 15 + [50257] * 15 - input_ids = torch.tensor(input_ids) - - input_attn_mask = [1] * 16 + [0] * 14 - input_attn_mask = torch.Tensor(input_attn_mask) - - training_mask_end = 10 - - expected_labels = [-100] * 10 + [1] * 5 + [50257] + [-100] * 14 - expected_labels = torch.tensor(expected_labels) - - labels = GPTQADataset.update_labels_for_no_pad_loss(input_ids, training_mask_end, input_attn_mask) - - assert torch.all(labels.eq(expected_labels)) diff --git a/tests/collections/nlp/test_question_answering.py b/tests/collections/nlp/test_question_answering.py deleted file mode 100644 index c4aacf449c501..0000000000000 --- 
a/tests/collections/nlp/test_question_answering.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -from pydoc import doc - -import pytest - -from nemo.collections.nlp.data.question_answering_squad.qa_dataset import SquadDataset -from nemo.collections.nlp.data.question_answering_squad.qa_squad_processing import ( - _get_tokens, - exact_match_score, - f1_score, -) - - -@pytest.mark.unit -def test_get_tokens(): - sentence = 'I am happy' - tokens = ['i', 'am', 'happy'] - assert tokens == _get_tokens(sentence) - - sentence = 'I am a person' - tokens = ['i', 'am', 'person'] - assert tokens == _get_tokens(sentence) - - sentence = 'I am a person.' - tokens = ['i', 'am', 'person'] - assert tokens == _get_tokens(sentence) - - -@pytest.mark.unit -def test_f1_score(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - f1 = f1_score(generated_field, ground_truth_field) - assert f1 == 0.75 - - generated_field = '' - ground_truth_field = 'That' - - f1 = f1_score(generated_field, ground_truth_field) - assert f1 == 0 - - -@pytest.mark.unit -def test_exact_match_score(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - em = exact_match_score(generated_field, ground_truth_field) - assert em == 0 - - generated_field = 'That is so good!' - ground_truth_field = 'That is so good.' 
- - em = exact_match_score(generated_field, ground_truth_field) - assert em == 1 - - generated_field = 'That is so good' - ground_truth_field = 'that is so good' - - em = exact_match_score(generated_field, ground_truth_field) - assert em == 1 - - -@pytest.mark.unit -def test_split_into_words(): - text = 'hi yo' - char_to_word_offset = [0, 0, 0, 1, 1] - doc_tokens = ["hi", "yo"] - output = SquadDataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - text = 'i am good' - char_to_word_offset = [0, 0, 1, 1, 1, 2, 2, 2, 2] - doc_tokens = ["i", "am", 'good'] - output = SquadDataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - -@pytest.mark.unit -def test_get_doc_spans(): - all_doc_tokens = ['a'] * 15 - max_tokens_for_doc = 10 - doc_stride = 5 - doc_spans = SquadDataset.get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride) - - assert len(doc_spans) == 2 - assert doc_spans[0].start == 0 - assert doc_spans[0].length == 10 - assert doc_spans[1].start == 5 - assert doc_spans[1].length == 10 - - -@pytest.mark.unit -def test_get_average_dist_to_tok_start_and_end(): - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_span = _DocSpan(start=0, length=5) - - tok_start_position = 1 - tok_end_position = 3 - - assert 2 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=5) - - tok_start_position = 1 - tok_end_position = 2 - - assert 6 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=4) - - tok_start_position = 1 - tok_end_position = 2 - - assert 5 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - -@pytest.mark.unit -def test_keep_relevant_docspans(): - - _DocSpan = collections.namedtuple("DocSpan", ["start", 
"length"]) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'all' - assert doc_spans == SquadDataset.keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = -1 - tok_end_position = -1 - - mode = 'only_positive' - - expected_doc_spans = [] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'only_positive' - - expected_doc_spans = [_DocSpan(start=0, length=5), _DocSpan(start=1, length=5)] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'limited_negative' - - expected_doc_spans = [_DocSpan(start=start, length=5) for start in range(10)] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) diff --git a/tests/collections/nlp/test_spellchecking_asr_customization.py b/tests/collections/nlp/test_spellchecking_asr_customization.py deleted file mode 100644 index 8e4d6e9a7b8f6..0000000000000 --- a/tests/collections/nlp/test_spellchecking_asr_customization.py +++ /dev/null @@ -1,1102 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from transformers import AutoTokenizer - -from nemo.collections.nlp.data.spellchecking_asr_customization.bert_example import BertExampleBuilder -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( - apply_replacements_to_text, - substitute_replacements_in_text, -) - - -@pytest.mark.unit -def test_substitute_replacements_in_text(): - text = "we began the further diversification of our revenue base with the protterra supply agreement and the navastar joint development agreement" - replacements = [(66, 75, 'pro-terra', 0.99986), (101, 109, 'navistar', 0.996)] - gold_text = "we began the further diversification of our revenue base with the pro-terra supply agreement and the navistar joint development agreement" - corrected_text = substitute_replacements_in_text(text, replacements, replace_hyphen_to_space=False) - assert corrected_text == gold_text - - gold_text_no_hyphen = "we began the further diversification of our revenue base with the pro terra supply agreement and the navistar joint development agreement" - corrected_text = substitute_replacements_in_text(text, replacements, replace_hyphen_to_space=True) - assert corrected_text == gold_text_no_hyphen - - -@pytest.mark.unit -def test_apply_replacements_to_text(): - - # min_prob = 0.5 - # dp_data = None, - # min_dp_score_per_symbol: float = -99.9 - - # test more than one fragment to replace, test multiple same replacements - text = "we began the further diversification of our revenue base with the protterra supply agreement and the navastar joint development 
agreement" - replacements = [ - (66, 75, 'proterra', 0.99986), - (66, 75, 'proterra', 0.9956), - (101, 109, 'navistar', 0.93), - (101, 109, 'navistar', 0.91), - (101, 109, 'navistar', 0.92), - ] - gold_text = "we began the further diversification of our revenue base with the proterra supply agreement and the navistar joint development agreement" - corrected_text = apply_replacements_to_text( - text, replacements, min_prob=0.5, replace_hyphen_to_space=False, dp_data=None - ) - assert corrected_text == gold_text - - # test that min_prob works - gold_text = "we began the further diversification of our revenue base with the proterra supply agreement and the navastar joint development agreement" - corrected_text = apply_replacements_to_text( - text, replacements, min_prob=0.95, replace_hyphen_to_space=False, dp_data=None - ) - assert corrected_text == gold_text - - -@pytest.fixture() -def bert_example_builder(): - tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_6L_768D") - label_map = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10} - semiotic_classes = {"PLAIN": 0, "CUSTOM": 1} - max_seq_len = 256 - builder = BertExampleBuilder(label_map, semiotic_classes, tokenizer, max_seq_len) - return builder - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_creation(bert_example_builder): - assert bert_example_builder._tokenizer is not None - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_spans(bert_example_builder): - span_info_parts = ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] - gold_sorted_spans = [(1, 1, 8), (1, 38, 42), (1, 43, 47), (1, 48, 53)] - spans = bert_example_builder._get_spans(span_info_parts) - spans.sort() - assert spans == gold_sorted_spans - - -@pytest.mark.skip("Doesn't 
work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_fragment_indices(bert_example_builder): - hyp = "a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w" - targets = [1] - # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w - # 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 - span_info_parts = ["CUSTOM 8 17"] - gold_sorted_fragment_indices = [(7, 18, 1), (11, 18, 1)] - fragment_indices = bert_example_builder._get_fragment_indices(hyp, targets, span_info_parts) - fragment_indices.sort() - assert fragment_indices == gold_sorted_fragment_indices - - # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w - # 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - span_info_parts = ["CUSTOM 10 16"] - gold_sorted_fragment_indices = [(11, 18, 1)] - fragment_indices = bert_example_builder._get_fragment_indices(hyp, targets, span_info_parts) - fragment_indices.sort() - assert fragment_indices == gold_sorted_fragment_indices - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_input_features(bert_example_builder): - hyp = "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - ref = "d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y" - targets = [1, 3] - span_info_parts = ["CUSTOM 12 23", "CUSTOM 28 41"] - - gold_tags = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - ] - gold_input_ids = [ - 101, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1041, - 1054, - 1055, - 1035, - 1040, - 1045, - 1040, - 1045, - 
1041, - 1035, - 1055, - 1051, - 1049, - 1051, - 1050, - 1035, - 1037, - 1050, - 1040, - 1035, - 1056, - 1054, - 1045, - 1055, - 1056, - 1045, - 1037, - 1050, - 1035, - 1043, - 1048, - 1048, - 1051, - 102, - 1040, - 1045, - 1040, - 1045, - 1041, - 1054, - 1035, - 1055, - 1037, - 1057, - 1049, - 1051, - 1050, - 102, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1045, - 1041, - 102, - 1056, - 1054, - 1045, - 1055, - 1056, - 1037, - 1050, - 1035, - 1043, - 1057, - 1045, - 1048, - 1048, - 1051, - 1056, - 102, - 1056, - 1054, - 1045, - 1055, - 1056, - 1041, - 1055, - 1055, - 1041, - 102, - 1049, - 1051, - 1050, - 1037, - 1040, - 1041, - 102, - 1039, - 1044, - 1054, - 1045, - 1055, - 1056, - 1045, - 1037, - 1050, - 102, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1041, - 1054, - 102, - 1055, - 1051, - 1048, - 1051, - 1049, - 1051, - 1050, - 102, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 102, - 1049, - 1041, - 1054, - 1039, - 1061, - 102, - ] - gold_input_mask = [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - ] - gold_segment_ids = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, 
- 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 5, - 5, - 5, - 5, - 5, - 5, - 5, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 10, - 10, - 10, - 10, - 10, - 10, - ] - gold_labels_mask = [ - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - ] - gold_input_ids_for_subwords = [ - 101, - 26357, - 2106, - 2666, - 2061, - 8202, - 1998, - 13012, - 16643, - 2319, - 1043, - 7174, - 102, - 2106, - 3771, - 7842, - 2819, - 2239, - 102, - 28625, - 3630, - 9856, - 102, - 9822, - 26458, - 7174, - 2102, - 102, - 13012, - 13473, - 11393, - 102, - 13813, - 3207, - 102, - 3017, - 102, - 15211, - 102, - 9168, - 102, - 2106, - 28173, - 4305, - 4305, - 102, - 8673, - 102, - ] - gold_input_mask_for_subwords = [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 
1, - 1, - 1, - 1, - ] - gold_segment_ids_for_subwords = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 3, - 3, - 3, - 3, - 3, - 4, - 4, - 4, - 4, - 5, - 5, - 5, - 6, - 6, - 7, - 7, - 8, - 8, - 9, - 9, - 9, - 9, - 9, - 10, - 10, - ] - gold_character_pos_to_subword_pos = [ - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 3, - 3, - 3, - 4, - 4, - 5, - 5, - 5, - 5, - 6, - 6, - 6, - 6, - 7, - 7, - 7, - 8, - 8, - 8, - 9, - 9, - 9, - 10, - 11, - 11, - 11, - 12, - 13, - 13, - 13, - 14, - 14, - 14, - 14, - 15, - 15, - 16, - 16, - 17, - 17, - 18, - 19, - 19, - 19, - 19, - 19, - 20, - 20, - 21, - 21, - 21, - 22, - 23, - 23, - 23, - 23, - 23, - 23, - 23, - 23, - 24, - 24, - 24, - 25, - 25, - 25, - 26, - 27, - 28, - 28, - 28, - 29, - 29, - 29, - 30, - 30, - 30, - 31, - 32, - 32, - 32, - 32, - 33, - 33, - 34, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 36, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 38, - 39, - 39, - 39, - 39, - 39, - 39, - 39, - 40, - 41, - 41, - 41, - 42, - 42, - 42, - 43, - 43, - 44, - 44, - 45, - 46, - 46, - 46, - 46, - 46, - 47, - ] - - tags = [0 for _ in hyp.split()] - for p, t in zip(span_info_parts, targets): - c, start, end = p.split(" ") - start = int(start) - end = int(end) - tags[start:end] = [t for i in range(end - start)] - - # get input features for characters - (input_ids, input_mask, segment_ids, labels_mask, labels, _, _,) = bert_example_builder._get_input_features( - hyp=hyp, ref=ref, tags=tags - ) - - # get input features for words - hyp_with_words = hyp.replace(" ", "").replace("_", " ") - ref_with_words = ref.replace(" ", "").replace("_", " ") - ( - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - _, - _, - _, - _, - ) = bert_example_builder._get_input_features(hyp=hyp_with_words, ref=ref_with_words, tags=None) - - character_pos_to_subword_pos = 
bert_example_builder._map_characters_to_subwords(input_ids, input_ids_for_subwords) - - assert tags == gold_tags - assert input_ids == gold_input_ids - assert input_mask == gold_input_mask - assert segment_ids == gold_segment_ids - assert labels_mask == gold_labels_mask - assert input_ids_for_subwords == gold_input_ids_for_subwords - assert input_mask_for_subwords == gold_input_mask_for_subwords - assert segment_ids_for_subwords == gold_segment_ids_for_subwords - assert character_pos_to_subword_pos == gold_character_pos_to_subword_pos diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb deleted file mode 100644 index ddd3bdd4f9298..0000000000000 --- a/tutorials/nlp/Dialogue.ipynb +++ /dev/null @@ -1,717 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "jaosjY4rGRNH" - }, - "source": [ - "# Installing NeMo from source\n", - "\n", - "\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run the cell below to set up dependencies.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "goQzOSflEq27" - }, - "outputs": [], - "source": [ - "import os \n", - "BRANCH = 'main'\n", - "!apt-get update && apt-get install -y libsndfile1 ffmpeg\n", - "!git clone https://github.com/NVIDIA/NeMo --branch $BRANCH\n", - "os.chdir('NeMo')\n", - "!./reinstall.sh\n", - "os.chdir('..')\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GjQ_z_xQMDIb" - }, - "source": [ - "# Overview\n", - "\n", - "There are three tasks as part of this tutorial\n", - "\n", - "1. 
Intent and Slot Classification using Assistant Dataset and a BERT model\n", - "2. Intent Classification using Schema Guided Dialogue Dataset and a GPT2 model\n", - "3. Answer Extender using MS Marco NLGen Dataset and a BART model\n", - "\n", - "Feel free to skip to the task that interests you most after installing NeMo from source." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AS-zwy8tEq2_" - }, - "source": [ - "# 1. Intent and Slot Classification using Assistant Dataset\n", - "\n", - "## 1.1 Task Description\n", - "\n", - "**Joint Intent and Slot classification** - is a task of classifying an Intent and detecting all relevant Slots (Entities)\n", - "for this Intent in a query.\n", - "For example, in the query: `What is the weather in Santa Clara tomorrow morning?`, we would like to classify the query\n", - "as a `weather` Intent, and detect `Santa Clara` as a `location` slot and `tomorrow morning` as a `date_time` slot.\n", - "Intents and Slots names are usually task specific and defined as labels in the training data.\n", - "This is a fundamental step that is executed in any task-driven Conversational Assistant.\n", - "\n", - "Our model enables to train and then detect both of these tasks together.\n", - "\n", - "Note: There is a similar model available at [Joint Intent Slot Classification Colab](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb). However, this model only support BERT style models while the model in this tutorial supports other types of models such as GPT2. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FJk_UAyeEq3B" - }, - "source": [ - "\n", - "## 1.2 Download Assistant dataset and convert to NeMo format\n", - "\n", - "This is a virtual assistant interaction data set that can be downloaded from here: https://github.com/xliuhw/NLU-Evaluation-Data.\n", - "There are about 10K training and 1K testing queries which cover 64 various Intents and 55 Slots. \n", - "\n", - "An example is:\n", - "\n", - "* utterance: what alarms have i set for tomorrow \n", - "* intent: alarm_query\n", - "* slots: date(tomorrow)\n", - "\n", - "\n", - "Note: While only the assistant dataset is used here, import_dataset.py is also compatible with ATIS and SNIPS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jjOVdGX2Eq3D" - }, - "outputs": [], - "source": [ - "# download and unzip the example dataset from github\n", - "!wget https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip\n", - "!unzip master.zip\n", - "# convert the dataset to the NeMo format\n", - "!python NeMo/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5n81deZsEq3G" - }, - "source": [ - "## 1.3 Training and/or Testing the model\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eoYc_8jhEq3G" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./assistant' \\\n", - " model.dataset.dialogues_example_dir='./assistant_bert_examples' \\\n", - " model.dataset.task='assistant' \\\n", - " 
model.language_model.pretrained_model_name='bert-base-uncased' \\\n", - " exp_manager.create_wandb_logger=False)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GaPmHjayEbg8" - }, - "source": [ - "**Results after 3 epochs**\n", - "\n", - "Intent report: \n", - "```\n", - " label precision recall f1 support \n", - " alarm_query (label_id: 0) 100.00 94.44 97.14 18\n", - " alarm_remove (label_id: 1) 100.00 90.91 95.24 11\n", - " alarm_set (label_id: 2) 94.12 94.12 94.12 17\n", - " audio_volume_down (label_id: 3) 75.00 42.86 54.55 7\n", - " audio_volume_mute (label_id: 4) 100.00 92.86 96.30 14\n", - " audio_volume_up (label_id: 5) 72.22 100.00 83.87 13\n", - " calendar_query (label_id: 6) 87.50 77.78 82.35 18\n", - " calendar_remove (label_id: 7) 94.44 100.00 97.14 17\n", - " calendar_set (label_id: 8) 94.44 94.44 94.44 18\n", - " cooking_recipe (label_id: 9) 85.71 70.59 77.42 17\n", - " datetime_convert (label_id: 10) 88.89 100.00 94.12 8\n", - " datetime_query (label_id: 11) 89.47 100.00 94.44 17\n", - " email_addcontact (label_id: 12) 80.00 100.00 88.89 8\n", - " email_query (label_id: 13) 100.00 83.33 90.91 18\n", - " email_querycontact (label_id: 14) 78.95 88.24 83.33 17\n", - " email_sendemail (label_id: 15) 94.44 94.44 94.44 18\n", - " general_affirm (label_id: 16) 100.00 100.00 100.00 17\n", - " general_commandstop (label_id: 17) 100.00 100.00 100.00 18\n", - " general_confirm (label_id: 18) 100.00 100.00 100.00 17\n", - " general_dontcare (label_id: 19) 100.00 100.00 100.00 18\n", - " general_explain (label_id: 20) 100.00 100.00 100.00 17\n", - " general_joke (label_id: 21) 91.67 100.00 95.65 11\n", - " general_negate (label_id: 22) 100.00 100.00 100.00 18\n", - " general_praise (label_id: 23) 100.00 100.00 100.00 17\n", - " general_quirky (label_id: 24) 60.00 50.00 54.55 18\n", - " general_repeat (label_id: 25) 100.00 100.00 100.00 17\n", - " iot_cleaning (label_id: 26) 100.00 100.00 100.00 15\n", - " iot_coffee (label_id: 27) 85.71 100.00 
92.31 18\n", - " iot_hue_lightchange (label_id: 28) 100.00 94.12 96.97 17\n", - " iot_hue_lightdim (label_id: 29) 100.00 100.00 100.00 12\n", - " iot_hue_lightoff (label_id: 30) 100.00 100.00 100.00 17\n", - " iot_hue_lighton (label_id: 31) 100.00 50.00 66.67 4\n", - " iot_hue_lightup (label_id: 32) 84.62 91.67 88.00 12\n", - " iot_wemo_off (label_id: 33) 100.00 100.00 100.00 9\n", - " iot_wemo_on (label_id: 34) 100.00 85.71 92.31 7\n", - " lists_createoradd (label_id: 35) 90.00 100.00 94.74 18\n", - " lists_query (label_id: 36) 100.00 94.12 96.97 17\n", - " lists_remove (label_id: 37) 88.89 88.89 88.89 18\n", - " music_likeness (label_id: 38) 100.00 93.75 96.77 16\n", - " music_query (label_id: 39) 100.00 100.00 100.00 17\n", - " music_settings (label_id: 40) 77.78 100.00 87.50 7\n", - " news_query (label_id: 41) 72.73 88.89 80.00 18\n", - " play_audiobook (label_id: 42) 100.00 100.00 100.00 17\n", - " play_game (label_id: 43) 93.75 83.33 88.24 18\n", - " play_music (label_id: 44) 85.00 100.00 91.89 17\n", - " play_podcasts (label_id: 45) 100.00 88.89 94.12 18\n", - " play_radio (label_id: 46) 84.21 94.12 88.89 17\n", - " qa_currency (label_id: 47) 85.00 94.44 89.47 18\n", - " qa_definition (label_id: 48) 89.47 100.00 94.44 17\n", - " qa_factoid (label_id: 49) 64.00 88.89 74.42 18\n", - " qa_maths (label_id: 50) 84.62 84.62 84.62 13\n", - " qa_stock (label_id: 51) 87.50 77.78 82.35 18\n", - " recommendation_events (label_id: 52) 87.50 82.35 84.85 17\n", - " recommendation_locations (label_id: 53) 83.33 83.33 83.33 18\n", - " recommendation_movies (label_id: 54) 100.00 60.00 75.00 10\n", - " social_post (label_id: 55) 100.00 94.12 96.97 17\n", - " social_query (label_id: 56) 100.00 82.35 90.32 17\n", - " takeaway_order (label_id: 57) 92.31 70.59 80.00 17\n", - " takeaway_query (label_id: 58) 93.75 83.33 88.24 18\n", - " transport_query (label_id: 59) 81.25 76.47 78.79 17\n", - " transport_taxi (label_id: 60) 100.00 100.00 100.00 16\n", - " transport_ticket 
(label_id: 61) 85.00 94.44 89.47 18\n", - " transport_traffic (label_id: 62) 93.75 88.24 90.91 17\n", - " weather_query (label_id: 63) 89.47 100.00 94.44 17\n", - " -------------------\n", - " micro avg 91.16 91.16 91.16 996\n", - " macro avg 91.66 90.44 90.48 996\n", - " weighted avg 91.72 91.16 91.04 996\n", - "```\n", - "Slot report: \n", - "```\n", - " label precision recall f1 support \n", - " alarm_type (label_id: 0) 0.00 0.00 0.00 2\n", - " app_name (label_id: 1) 0.00 0.00 0.00 1\n", - " artist_name (label_id: 2) 17.39 80.00 28.57 5\n", - " audiobook_author (label_id: 3) 0.00 0.00 0.00 0\n", - " audiobook_name (label_id: 4) 64.52 74.07 68.97 27\n", - " business_name (label_id: 5) 81.48 84.62 83.02 52\n", - " business_type (label_id: 6) 80.00 80.00 80.00 20\n", - " change_amount (label_id: 7) 57.14 66.67 61.54 6\n", - " coffee_type (label_id: 8) 100.00 33.33 50.00 3\n", - " color_type (label_id: 9) 75.00 92.31 82.76 13\n", - " cooking_type (label_id: 10) 0.00 0.00 0.00 1\n", - " currency_name (label_id: 11) 100.00 96.43 98.18 28\n", - " date (label_id: 12) 87.88 87.22 87.55 133\n", - " definition_word (label_id: 13) 85.00 85.00 85.00 20\n", - " device_type (label_id: 14) 84.75 76.92 80.65 65\n", - " drink_type (label_id: 15) 0.00 0.00 0.00 0\n", - " email_address (label_id: 16) 64.29 100.00 78.26 9\n", - " email_folder (label_id: 17) 100.00 50.00 66.67 2\n", - " event_name (label_id: 18) 80.00 75.00 77.42 64\n", - " food_type (label_id: 19) 84.38 77.14 80.60 35\n", - " game_name (label_id: 20) 93.55 78.38 85.29 37\n", - " game_type (label_id: 21) 0.00 0.00 0.00 0\n", - " general_frequency (label_id: 22) 0.00 0.00 0.00 9\n", - " house_place (label_id: 23) 80.95 91.89 86.08 37\n", - " ingredient (label_id: 24) 0.00 0.00 0.00 1\n", - " joke_type (label_id: 25) 100.00 100.00 100.00 5\n", - " list_name (label_id: 26) 89.29 69.44 78.12 36\n", - " meal_type (label_id: 27) 0.00 0.00 0.00 3\n", - " media_type (label_id: 28) 78.95 83.33 81.08 36\n", - " movie_name 
(label_id: 29) 0.00 0.00 0.00 1\n", - " movie_type (label_id: 30) 0.00 0.00 0.00 0\n", - " music_album (label_id: 31) 0.00 0.00 0.00 0\n", - " music_descriptor (label_id: 32) 0.00 0.00 0.00 2\n", - " music_genre (label_id: 33) 81.82 90.00 85.71 10\n", - " news_topic (label_id: 34) 80.00 30.77 44.44 13\n", - " order_type (label_id: 35) 100.00 42.11 59.26 19\n", - " person (label_id: 36) 70.79 100.00 82.89 63\n", - " personal_info (label_id: 37) 76.19 94.12 84.21 17\n", - " place_name (label_id: 38) 82.86 84.47 83.65 103\n", - " player_setting (label_id: 39) 75.00 42.86 54.55 7\n", - " playlist_name (label_id: 40) 0.00 0.00 0.00 3\n", - " podcast_descriptor (label_id: 41) 92.31 54.55 68.57 22\n", - " podcast_name (label_id: 42) 66.67 16.67 26.67 12\n", - " radio_name (label_id: 43) 94.87 94.87 94.87 39\n", - " relation (label_id: 44) 90.91 90.91 90.91 11\n", - " song_name (label_id: 45) 100.00 6.67 12.50 15\n", - " time (label_id: 46) 77.57 84.69 80.98 98\n", - " time_zone (label_id: 47) 44.44 100.00 61.54 4\n", - " timeofday (label_id: 48) 86.96 80.00 83.33 25\n", - " transport_agency (label_id: 49) 80.00 57.14 66.67 7\n", - " transport_descriptor (label_id: 50) 0.00 0.00 0.00 5\n", - " transport_name (label_id: 51) 0.00 0.00 0.00 0\n", - " transport_type (label_id: 52) 88.89 100.00 94.12 40\n", - " weather_descriptor (label_id: 53) 87.50 87.50 87.50 8\n", - " O (label_id: 54) 97.07 97.52 97.30 5408\n", - " -------------------\n", - " micro avg 94.24 94.24 94.24 6582\n", - " macro avg 64.87 59.93 59.17 6582\n", - " weighted avg 94.23 94.24 93.95 6582\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-44x5PqyrOeQ" - }, - "source": [ - "## 1.4 (Optional) To train/ test a GPT2 model on the assistant dataset, run the cell below " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QyqQbpR4rNHT" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# 
model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "# model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\": gpt2 doesn't specify a pad token, therefore using its EOS token as the pad token\n", - "# model.dataset.target_template=with_slots: this perform slot filling with intent classification\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./assistant' \\\n", - " model.dataset.dialogues_example_dir='./assistant_gpt2_examples' \\\n", - " model.dataset.task='assistant' \\\n", - " model.language_model.pretrained_model_name='gpt2' \\\n", - " trainer.max_epochs=1 \\\n", - " model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\" \\\n", - " model.dataset.target_template=with_slots \\\n", - " model.dataset.eval_mode=generation \\\n", - " exp_manager.create_wandb_logger=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FbQ-6TVM1yQg" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "More epochs would be helpful\n", - "\n", - "Intent report:\n", - "\n", - " ```\n", - " label precision recall f1 support \n", - " transport query (label_id: 0) 72.73 84.21 78.05 19\n", - " weather query (label_id: 1) 94.74 94.74 94.74 19\n", - " play game (label_id: 2) 92.86 68.42 78.79 19\n", - " qa currency (label_id: 3) 100.00 100.00 100.00 19\n", - " qa maths (label_id: 4) 100.00 100.00 100.00 14\n", - " iot wemo off (label_id: 5) 75.00 100.00 85.71 9\n", - " datetime convert (label_id: 6) 46.67 87.50 60.87 8\n", - " email addcontact (label_id: 7) 70.00 87.50 77.78 8\n", - " music likeness (label_id: 8) 57.89 61.11 59.46 18\n", - " music query (label_id: 9) 78.57 57.89 66.67 19\n", - " general negate (label_id: 10) 95.00 100.00 97.44 19\n", - " email sendemail (label_id: 11) 92.86 68.42 78.79 19\n", - " general affirm (label_id: 12) 95.00 100.00 97.44 19\n", - " play audiobook (label_id: 13) 57.69 78.95 66.67 19\n", - " general praise 
(label_id: 14) 100.00 94.74 97.30 19\n", - " alarm set (label_id: 15) 85.71 94.74 90.00 19\n", - " general explain (label_id: 16) 100.00 89.47 94.44 19\n", - " iot wemo on (label_id: 17) 83.33 71.43 76.92 7\n", - " cooking recipe (label_id: 18) 90.00 94.74 92.31 19\n", - " music settings (label_id: 19) 60.00 42.86 50.00 7\n", - " social post (label_id: 20) 84.21 84.21 84.21 19\n", - " recommendation events (label_id: 21) 72.73 84.21 78.05 19\n", - " audio volume up (label_id: 22) 76.47 100.00 86.67 13\n", - " lists remove (label_id: 23) 73.08 100.00 84.44 19\n", - " transport ticket (label_id: 24) 94.74 94.74 94.74 19\n", - " general joke (label_id: 25) 100.00 100.00 100.00 12\n", - " play podcasts (label_id: 26) 94.12 84.21 88.89 19\n", - " iot hue lightchange (label_id: 27) 85.71 63.16 72.73 19\n", - " audio volume mute (label_id: 28) 84.62 73.33 78.57 15\n", - " general dontcare (label_id: 29) 95.00 100.00 97.44 19\n", - " qa definition (label_id: 30) 77.27 89.47 82.93 19\n", - " email querycontact (label_id: 31) 58.33 73.68 65.12 19\n", - " general commandstop (label_id: 32) 100.00 100.00 100.00 19\n", - " calendar remove (label_id: 33) 94.44 89.47 91.89 19\n", - " news query (label_id: 34) 100.00 57.89 73.33 19\n", - " calendar query (label_id: 35) 63.16 63.16 63.16 19\n", - " social query (label_id: 36) 88.24 83.33 85.71 18\n", - " transport traffic (label_id: 37) 90.48 100.00 95.00 19\n", - " transport taxi (label_id: 38) 100.00 94.44 97.14 18\n", - " alarm query (label_id: 39) 100.00 94.74 97.30 19\n", - " iot hue lightoff (label_id: 40) 88.89 84.21 86.49 19\n", - " takeaway order (label_id: 41) 81.25 68.42 74.29 19\n", - " iot coffee (label_id: 42) 100.00 94.74 97.30 19\n", - " recommendation movies (label_id: 43) 75.00 90.00 81.82 10\n", - " iot hue lightup (label_id: 44) 78.57 78.57 78.57 14\n", - " email query (label_id: 45) 85.71 94.74 90.00 19\n", - " lists createoradd (label_id: 46) 82.35 73.68 77.78 19\n", - " play radio (label_id: 47) 84.21 84.21 
84.21 19\n", - " audio volume down (label_id: 48) 100.00 87.50 93.33 8\n", - " general quirky (label_id: 49) 30.00 15.79 20.69 19\n", - " play music (label_id: 50) 71.43 52.63 60.61 19\n", - " qa stock (label_id: 51) 90.48 100.00 95.00 19\n", - " iot cleaning (label_id: 52) 93.33 87.50 90.32 16\n", - " iot hue lightdim (label_id: 53) 100.00 100.00 100.00 12\n", - " recommendation locations (label_id: 54) 100.00 89.47 94.44 19\n", - " general repeat (label_id: 55) 100.00 100.00 100.00 19\n", - " takeaway query (label_id: 56) 77.27 89.47 82.93 19\n", - " alarm remove (label_id: 57) 100.00 100.00 100.00 11\n", - " datetime query (label_id: 58) 75.00 63.16 68.57 19\n", - " iot hue lighton (label_id: 59) 60.00 100.00 75.00 3\n", - " qa factoid (label_id: 60) 50.00 57.89 53.66 19\n", - " calendar set (label_id: 61) 75.00 78.95 76.92 19\n", - " general confirm (label_id: 62) 100.00 100.00 100.00 19\n", - " lists query (label_id: 63) 66.67 73.68 70.00 19\n", - " label_id: 64 0.00 0.00 0.00 0\n", - " -------------------\n", - " micro avg 83.55 83.55 83.55 1076\n", - " macro avg 83.53 83.93 83.01 1076\n", - " weighted avg 84.26 83.55 83.30 1076\n", - " \n", - "```\n", - "\n", - "```\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " Test metric DataLoader 0\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " intent_f1 83.55018615722656\n", - " intent_precision 83.55018615722656\n", - " intent_recall 83.55018615722656\n", - " slot_f1 73.99985919756773\n", - "slot_joint_goal_accuracy 65.89219330855019\n", - " slot_precision 73.85223048327137\n", - " slot_recall 74.14807930607186\n", - " test_intent_accuracy 83.55018587360595\n", - " test_loss_epoch 0.019178826361894608\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - "```" - ] 
- }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gd42arYoEq3J" - }, - "source": [ - "# 2. Schema Guided Dialogue (SGD)\n", - "\n", - "## 2.1 Task Description\n", - "---\n", - "\n", - "SGD is a multi-domain intent classification dataset from Google with close to 100k examples.\n", - "\n", - "An example is:\n", - "\n", - "* utterance: I will be eating there at 11:30 am so make the reservation for then.\n", - "* intent: ReserveRestaurant\n", - "* slots: {\"time\": \"11:30 am\"}\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "neH8rXwjEq3J" - }, - "source": [ - "## 2.2 Download the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "IgD8eavfJ5pi" - }, - "outputs": [], - "source": [ - "!git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7G7uPrUpEq3J" - }, - "source": [ - "## 2.3 Training and/or Testing the model\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gqo-rwQlEq3K" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "# model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\": gpt2 doesn't specify a pad token, therefore using its EOS token as the pad token\n", - "\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./dstc8-schema-guided-dialogue' \\\n", - " model.dataset.dialogues_example_dir='./sgd_gpt2_predictions' \\\n", - " model.dataset.task='sgd' \\\n", - " model.language_model.pretrained_model_name='gpt2' \\\n", - " trainer.max_epochs=1 \\\n", - " model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\" \\\n", - " exp_manager.create_wandb_logger=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": { - "id": "kGDlV5HvI2PQ" - }, - "outputs": [], - "source": [ - "!ls sgd_gpt2_predictions" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "p8g0f5KDTu9K" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "More epochs would needed to reach convergence.\n", - "\n", - "\n", - "```\n", - " label precision recall f1 support \n", - " check balance (label_id: 0) 0.00 0.00 0.00 0\n", - " find trains (label_id: 1) 80.20 91.95 85.68 348\n", - " make payment (label_id: 2) 83.12 28.07 41.97 228\n", - " book appointment (label_id: 3) 86.93 87.15 87.04 397\n", - " get cars available (label_id: 4) 96.88 90.51 93.58 274\n", - " get event dates (label_id: 5) 0.00 0.00 0.00 0\n", - " buy bus ticket (label_id: 6) 78.61 91.33 84.49 173\n", - " add event (label_id: 7) 0.00 0.00 0.00 0\n", - " get alarms (label_id: 8) 58.33 77.78 66.67 45\n", - " reserve car (label_id: 9) 83.75 72.43 77.68 185\n", - " get events (label_id: 10) 0.00 0.00 0.00 0\n", - " reserve roundtrip flights (label_id: 11) 0.00 0.00 0.00 0\n", - " lookup music (label_id: 12) 89.83 86.89 88.33 61\n", - " book house (label_id: 13) 91.13 92.50 91.81 200\n", - " search oneway flight (label_id: 14) 74.77 47.70 58.25 174\n", - " buy event tickets (label_id: 15) 72.19 95.31 82.15 128\n", - " find apartment (label_id: 16) 0.00 0.00 0.00 0\n", - " schedule visit (label_id: 17) 77.27 66.06 71.23 386\n", - " play media (label_id: 18) 92.94 86.81 89.77 91\n", - " get ride (label_id: 19) 99.41 98.82 99.12 170\n", - " reserve oneway flight (label_id: 20) 0.00 0.00 0.00 0\n", - " find bus (label_id: 21) 96.64 87.53 91.86 361\n", - " find restaurants (label_id: 22) 77.14 91.22 83.59 148\n", - " get times for movie (label_id: 23) 0.00 0.00 0.00 0\n", - " transfer money (label_id: 24) 0.00 0.00 0.00 0\n", - " request payment (label_id: 25) 46.71 63.39 53.79 112\n", - " play movie (label_id: 26) 100.00 65.11 78.87 321\n", - " search house (label_id: 27) 97.91 91.83 94.77 306\n", - " search 
roundtrip flights (label_id: 28) 67.49 82.41 74.21 199\n", - " find provider (label_id: 29) 95.11 90.53 92.77 602\n", - " find attractions (label_id: 30) 100.00 89.01 94.19 91\n", - " reserve hotel (label_id: 31) 56.75 97.04 71.62 169\n", - " lookup song (label_id: 32) 0.00 0.00 0.00 0\n", - " add alarm (label_id: 33) 95.68 60.18 73.89 221\n", - " find home by area (label_id: 34) 48.95 59.79 53.83 194\n", - " get available time (label_id: 35) 0.00 0.00 0.00 0\n", - " buy movie tickets (label_id: 36) 100.00 29.39 45.42 473\n", - " reserve restaurant (label_id: 37) 95.71 84.80 89.92 342\n", - " find movies (label_id: 38) 62.40 97.61 76.14 335\n", - " get weather (label_id: 39) 100.00 87.69 93.44 195\n", - " search hotel (label_id: 40) 99.35 52.60 68.78 289\n", - " find events (label_id: 41) 99.57 82.56 90.27 281\n", - " play song (label_id: 42) 0.00 0.00 0.00 0\n", - " rent movie (label_id: 43) 0.00 0.00 0.00 0\n", - " get train tickets (label_id: 44) 45.83 5.56 9.91 198\n", - " none (label_id: 45) 55.77 98.90 71.32 728\n", - " label_id: 46 0.00 0.00 0.00 0\n", - " -------------------\n", - " micro avg 77.23 77.23 77.23 8425\n", - " macro avg 82.01 76.68 76.56 8425\n", - " weighted avg 83.23 77.23 76.86 8425\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jUJb-9VLLBXo" - }, - "source": [ - "# 3. 
MS Marco\n", - "\n", - "## Task Description\n", - "\n", - "MS Marco NLGen is a dataset from Microsoft that takes extracted answers and questions and output fluent answers.\n", - "\n", - "An example is \n", - "\n", - "\n", - "* question: What county is Nine Mile in?\n", - "* extracted_answer: Onondaga\n", - "* fluent_answer: Nine Mile is in Onondaga county.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VtXEKG_UQU9u" - }, - "source": [ - "## Download and unzip files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "b9avsZ1CEq3K" - }, - "outputs": [], - "source": [ - "!mkdir ms_marco\n", - "os.chdir('ms_marco')\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz\n", - "\n", - "!gunzip train_v2.1.json.gz\n", - "!gunzip dev_v2.1.json.gz\n", - "\n", - "!python ../NeMo/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py --filename train_v2.1.json \n", - "!python ../NeMo/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py --filename dev_v2.1.json \n", - "\n", - "os.chdir('..')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "h7UZ9R8gQTFo" - }, - "source": [ - "## Training and/or Testing the model\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fwGQCwbvRf2m" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.dialogues_example_dir='./marco_bart_predictions' \\\n", - " model.dataset.data_dir='./ms_marco' \\\n", - " model.save_model=True \\\n", - " model.dataset.debug_mode=True \\\n", - " model.dataset.task='ms_marco' \\\n", - " 
model.language_model.pretrained_model_name='facebook/bart-base' \\\n", - " trainer.max_epochs=1 \\\n", - " model.dataset.debug_mode=False \\\n", - " exp_manager.create_wandb_logger=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UL7ekAOZ2abi" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "Train more epochs for optimal performance\n", - "\n", - "```\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " Test metric DataLoader 0\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " bleu 65.46179962158203\n", - " f1 78.24439835896995\n", - " precision 81.92473076099847\n", - " recall 76.72508929408436\n", - " test_accuracy 25.563487607283225\n", - " test_loss 0.4419259166606655\n", - " test_loss_epoch 0.4420809745788574\n", - " test_ppl 1.5557004846779854\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - "```" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Dialogue.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/tutorials/nlp/Entity_Linking_Medical.ipynb b/tutorials/nlp/Entity_Linking_Medical.ipynb deleted file mode 100644 index dfdf594e6804f..0000000000000 --- a/tutorials/nlp/Entity_Linking_Medical.ipynb +++ /dev/null @@ -1,632 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": 
[], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "\n", - "## Install NeMo if using google collab or if its not installed locally\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Install dependencies\n", - "!pip install wget\n", - "!pip install faiss-gpu" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import faiss\n", - "import torch\n", - "import wget\n", - "import os\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from omegaconf import OmegaConf\n", - "from pytorch_lightning import Trainer\n", - "from IPython.display import display\n", - "from tqdm import tqdm\n", - "\n", - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Entity Linking" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Task Description\n", - "[Entity linking](https://en.wikipedia.org/wiki/Entity_linking) is the process of connecting concepts mentioned in natural language to their canonical forms stored in a knowledge base. 
For example, say a knowledge base contained the entity 'ID3452 influenza' and we wanted to process some natural language containing the sentence \"The patient has flu like symptoms\". An entity linking model would match the word 'flu' to the knowledge base entity 'ID3452 influenza', allowing for disambiguation and normalization of concepts referenced in text. Entity linking applications range from helping automate data ingestion to assisting in real time dialogue concept normalization. We will be focusing on entity linking in the medical domain for this demo, but the entity linking model, dataset, and training code within NVIDIA NeMo can be applied to other domains like finance and retail.\n", - "\n", - "Within NeMo and this tutorial we use the entity linking approach described in Liu et. al's NAACL 2021 \"[Self-alignment Pre-training for Biomedical Entity Representations](https://arxiv.org/abs/2010.11784v2)\". The main idea behind this approach is to reshape an initial concept embedding space such that synonyms of the same concept are pulled closer together and unrelated concepts are pushed further apart. The concept embeddings from this reshaped space can then be used to build a knowledge base embedding index. This index stores concept IDs mapped to their respective concept embeddings in a format conducive to efficient nearest neighbor search. We can link query concepts to their canonical forms in the knowledge base by performing a nearest neighbor search- matching concept query embeddings to the most similar concepts embeddings in the knowledge base index. \n", - "\n", - "In this tutorial we will be using the [faiss](https://github.com/facebookresearch/faiss) library to build our concept index." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Self Alignment Pretraining\n", - "Self-Alignment pretraining is a second stage pretraining of an existing encoder (called second stage because the encoder model can be further finetuned after this more general pretraining step). The dataset used during training consists of pairs of concept synonyms that map to the same ID. At each training iteration, we only select *hard* examples present in the mini batch to calculate the loss and update the model weights. In this context, a hard example is an example where a concept is closer to an unrelated concept in the mini batch than it is to the synonym concept it is paired with by some margin. I encourage you to take a look at [section 2 of the paper](https://arxiv.org/pdf/2010.11784.pdf) for a more formal and in depth description of how hard examples are selected.\n", - "\n", - "We then use a [metric learning loss](https://openaccess.thecvf.com/content_CVPR_2019/papers/Wang_Multi-Similarity_Loss_With_General_Pair_Weighting_for_Deep_Metric_Learning_CVPR_2019_paper.pdf) calculated from the hard examples selected. This loss helps reshape the embedding space. The concept representation space is rearranged to be more suitable for entity matching via embedding cosine similarity. \n", - "\n", - "Now that we have idea of what's going on, let's get started!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dataset Preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Download data into project directory\n", - "PROJECT_DIR = \".\" #Change if you don't want the current directory to be the project dir\n", - "DATA_DIR = os.path.join(PROJECT_DIR, \"tiny_example_data\")\n", - "\n", - "if not os.path.isdir(os.path.join(DATA_DIR)):\n", - " wget.download('https://dldata-public.s3.us-east-2.amazonaws.com/tiny_example_data.zip',\n", - " os.path.join(PROJECT_DIR, \"tiny_example_data.zip\"))\n", - "\n", - " !unzip {PROJECT_DIR}/tiny_example_data.zip -d {PROJECT_DIR}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial we will be using a tiny toy dataset to demonstrate how to use NeMo's entity linking model functionality. The dataset includes synonyms for 12 medical concepts. Entity phrases with the same ID are synonyms for the same concept. For example, \"*chronic kidney failure*\", \"*gradual loss of kidney function*\", and \"*CKD*\" are all synonyms of concept ID 5. Here's the dataset before preprocessing:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_data = pd.read_csv(os.path.join(DATA_DIR, \"tiny_example_dev_data.csv\"), names=[\"ID\", \"CONCEPT\"], index_col=False)\n", - "print(raw_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We've already paired off the concepts for this dataset with the format `ID concept_synonym1 concept_synonym2`. 
Here are the first ten rows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "training_data = pd.read_table(os.path.join(DATA_DIR, \"tiny_example_train_pairs.tsv\"), names=[\"ID\", \"CONCEPT_SYN1\", \"CONCEPT_SYN2\"], delimiter='\\t')\n", - "print(training_data.head(10))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the [Unified Medical Language System (UMLS)](https://www.nlm.nih.gov/research/umls/index.html) dataset for full medical domain entity linking training. The data contains over 9 million entities and is a table of medical concepts with their corresponding concept IDs (CUI). After [requesting a free license and making a UMLS Terminology Services (UTS) account](https://www.nlm.nih.gov/research/umls/index.html), the [entire UMLS dataset](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html) can be downloaded from the NIH's website. If you've cloned the NeMo repo you can run the data processing script located in `examples/nlp/entity_linking/data/umls_dataset_processing.py` on the full dataset. This script will take in the initial table of UMLS concepts and produce a .tsv file with each row formatted as `CUI\\tconcept_synonym1\\tconcept_synonym2`. Once the UMLS dataset .RRF file is downloaded, the script can be run from the `examples/nlp/entity_linking` directory like so: \n", - "```\n", - "python data/umls_dataset_processing.py\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model Training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Second stage pretrain a BERT Base encoder on the self-alignment pretraining task (SAP) for improved entity linking. Using a GPU, the model should take 5 minutes or less to train on this example dataset and training progress will be output below the cell." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Download config\n", - "wget.download(f\"https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml\",\n", - " os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Load in config file\n", - "cfg = OmegaConf.load(os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Set config file variables\n", - "cfg.project_dir = PROJECT_DIR\n", - "cfg.model.nemo_path = os.path.join(PROJECT_DIR, \"tiny_example_sap_bert_model.nemo\")\n", - "cfg.model.train_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_train_pairs.tsv\")\n", - "cfg.model.validation_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_validation_pairs.tsv\")\n", - "\n", - "# remove distributed training flags\n", - "cfg.trainer.strategy = 'auto'\n", - "cfg.trainer.accelerator = 'auto'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the trainer and model\n", - "trainer = Trainer(**cfg.trainer)\n", - "exp_manager(trainer, cfg.get(\"exp_manager\", None))\n", - "model = nemo_nlp.models.EntityLinkingModel(cfg=cfg.model, trainer=trainer)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Train and save the model\n", - "trainer.fit(model)\n", - "model.save_to(cfg.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can run the script at `examples/nlp/entity_linking/self_alignment_pretraining.py` to train a model on a larger dataset. Run\n", - "\n", - "```\n", - "python self_alignment_pretraining.py project_dir=.\n", - "```\n", - "from the `examples/nlp/entity_linking` directory." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model Evaluation\n", - "\n", - "Let's evaluate our freshly trained model and compare its performance with a BERT Base encoder that hasn't undergone self-alignment pretraining. We first need to restore our trained model and load our BERT Base Baseline model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", - "\n", - "# Restore second stage pretrained model\n", - "sap_model_cfg = cfg\n", - "sap_model_cfg.index.index_save_name = os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_index\")\n", - "sap_model_cfg.index.index_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_index_data.tsv\")\n", - "sap_model = nemo_nlp.models.EntityLinkingModel.restore_from(sap_model_cfg.model.nemo_path).to(device)\n", - "\n", - "# Load original model\n", - "base_model_cfg = OmegaConf.load(os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Set train/val datasets to None to avoid loading datasets associated with training\n", - "base_model_cfg.model.train_ds = None\n", - "base_model_cfg.model.validation_ds = None\n", - "base_model_cfg.index.index_save_name = os.path.join(PROJECT_DIR, \"base_model_index\")\n", - "base_model_cfg.index.index_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_index_data.tsv\")\n", - "base_model = nemo_nlp.models.EntityLinkingModel(base_model_cfg.model).to(device)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are going evaluate our model on a nearest neighbor task using top 1 and top 5 accuracies as our metric. We will be using a tiny example test knowledge base and test queries. For this evaluation we are going to be comparing every test query with every concept vector in our test set knowledge base. 
We will rank each item in the knowledge base by its cosine similarity with the test query. We'll then compare the IDs of the predicted most similar test knowledge base concepts with our ground truth query IDs to calculate top 1 and top 5 accuracies. For this metric higher is better." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Helper function to get data embeddings\n", - "def get_embeddings(model, dataloader):\n", - " embeddings, cids = [], []\n", - "\n", - " with torch.no_grad():\n", - " for batch in tqdm(dataloader):\n", - " input_ids, token_type_ids, attention_mask, batch_cids = batch\n", - " batch_embeddings = model.forward(input_ids=input_ids.to(device), \n", - " token_type_ids=token_type_ids.to(device), \n", - " attention_mask=attention_mask.to(device))\n", - "\n", - " # Accumulate index embeddings and their corresponding IDs\n", - " embeddings.extend(batch_embeddings.cpu().detach().numpy())\n", - " cids.extend(batch_cids)\n", - " \n", - " return embeddings, cids" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(model, test_kb, test_queries, ks):\n", - " # Initialize knowledge base and query data loaders\n", - " test_kb_dataloader = model.setup_dataloader(test_kb, is_index_data=True)\n", - " test_query_dataloader = model.setup_dataloader(test_queries, is_index_data=True)\n", - " \n", - " # Get knowledge base and query embeddings\n", - " test_kb_embs, test_kb_cids = get_embeddings(model, test_kb_dataloader)\n", - " test_query_embs, test_query_cids = get_embeddings(model, test_query_dataloader)\n", - "\n", - " # Calculate the cosine distance between each query and knowledge base concept\n", - " score_matrix = np.matmul(np.array(test_query_embs), np.array(test_kb_embs).T)\n", - " accs = {k : 0 for k in ks}\n", - " \n", - " # Compare the knowledge base IDs of the knowledge base entities with \n", - " # the 
smallest cosine distance from the query \n", - " for query_idx in tqdm(range(len(test_query_cids))):\n", - " query_emb = test_query_embs[query_idx]\n", - " query_cid = test_query_cids[query_idx]\n", - " query_scores = score_matrix[query_idx]\n", - "\n", - " for k in ks:\n", - " topk_idxs = np.argpartition(query_scores, -k)[-k:]\n", - " topk_cids = [test_kb_cids[idx] for idx in topk_idxs]\n", - " \n", - " # If the correct query ID is among the top k closest kb IDs\n", - " # the model correctly linked the entity\n", - " match = int(query_cid in topk_cids)\n", - " accs[k] += match\n", - "\n", - " for k in ks:\n", - " accs[k] /= len(test_query_cids)\n", - " \n", - " return accs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create configs for our test data\n", - "test_kb = OmegaConf.create({\n", - " \"data_file\": os.path.join(DATA_DIR, \"tiny_example_test_kb.tsv\"),\n", - " \"max_seq_length\": 128,\n", - " \"batch_size\": 10,\n", - " \"shuffle\": False,\n", - "})\n", - "\n", - "test_queries = OmegaConf.create({\n", - " \"data_file\": os.path.join(DATA_DIR, \"tiny_example_test_queries.tsv\"),\n", - " \"max_seq_length\": 128,\n", - " \"batch_size\": 10,\n", - " \"shuffle\": False,\n", - "})\n", - "\n", - "ks = [1, 5]\n", - "\n", - "# Evaluate both models on our test data\n", - "base_accs = evaluate(base_model, test_kb, test_queries, ks)\n", - "base_accs[\"Model\"] = \"BERT Base Baseline\"\n", - "\n", - "sap_accs = evaluate(sap_model, test_kb, test_queries, ks)\n", - "sap_accs[\"Model\"] = \"BERT + SAP\"\n", - "\n", - "print(\"Top 1 and Top 5 Accuracy Comparison:\")\n", - "results_df = pd.DataFrame([base_accs, sap_accs], columns=[\"Model\", 1, 5])\n", - "results_df = results_df.style.set_properties(**{'text-align': 'left', }).set_table_styles([dict(selector='th', props=[('text-align', 'left')])])\n", - "display(results_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The 
purpose of this section was to show an example of evaluating your entity linking model. This evaluation set contains very little data, and no serious conclusions should be drawn about model performance. Top 1 accuracy should be between 0.7 and 1.0 for both models and top 5 accuracy should be between 0.8 and 1.0. When evaluating a model trained on a larger dataset, you can use a nearest neighbors index to speed up the evaluation time." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building an Index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To qualitatively observe the improvement we gain from the second stage pretraining, let's build two indices. One will be built with BERT base embeddings before self-alignment pretraining and one will be built with the model we just trained. Our knowledge base in this tutorial will be in the same domain and have some overlapping concepts as the training set. This data file is formatted as `ID\\tconcept`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `EntityLinkingDataset` class can load the data used for training the entity linking encoder as well as for building the index if the `is_index_data` flag is set to true. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def build_index(cfg, model):\n", - " # Setup index dataset loader\n", - " index_dataloader = model.setup_dataloader(cfg.index.index_ds, is_index_data=True)\n", - " \n", - " # Get index dataset embeddings\n", - " embeddings, _ = get_embeddings(model, index_dataloader)\n", - " \n", - " # Train IVFFlat index using faiss\n", - " embeddings = np.array(embeddings)\n", - " quantizer = faiss.IndexFlatL2(cfg.index.dims)\n", - " index = faiss.IndexIVFFlat(quantizer, cfg.index.dims, cfg.index.nlist)\n", - " index = faiss.index_cpu_to_all_gpus(index)\n", - " index.train(embeddings)\n", - " \n", - " # Add concept embeddings to index\n", - " for i in tqdm(range(0, embeddings.shape[0], cfg.index.index_batch_size)):\n", - " index.add(embeddings[i:i+cfg.index.index_batch_size])\n", - "\n", - " # Save index\n", - " faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index.index_save_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "build_index(sap_model_cfg, sap_model.to(device))\n", - "build_index(base_model_cfg, base_model.to(device))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Entity Linking via Nearest Neighbor Search" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now it's time to query our indices! We are going to query both our index built with embeddings from BERT Base, and our index with embeddings built from the SAP BERT model we trained. Our sample query phrases will be \"*high blood sugar*\" and \"*head pain*\". \n", - "\n", - "To query our indices, we first need to get the embedding of each query from the corresponding encoder model. 
We can then pass these query embeddings into the faiss index which will perform a nearest neighbor search, using cosine distance to compare the query embedding with embeddings present in the index. Once we get a list of knowledge base index concept IDs most closely matching our query, all that is left to do is map the IDs to a representative string describing the concept. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def query_index(cfg, model, index, queries, id2string):\n", - " # Get query embeddings from our entity linking encoder model\n", - " query_embs = get_query_embedding(queries, model).cpu().detach().numpy()\n", - " \n", - " # Use query embedding to find closest concept embedding in knowledge base\n", - " distances, neighbors = index.search(query_embs, cfg.index.top_n)\n", - " \n", - " # Get the canonical strings corresponding to the IDs of the query's nearest neighbors in the kb \n", - " neighbor_concepts = [[id2string[concept_id] for concept_id in query_neighbor] \\\n", - " for query_neighbor in neighbors]\n", - " \n", - " # Display most similar concepts in the knowledge base. 
\n", - " for query_idx in range(len(queries)):\n", - " print(f\"\\nThe most similar concepts to {queries[query_idx]} are:\")\n", - " for cid, concept, dist in zip(neighbors[query_idx], neighbor_concepts[query_idx], distances[query_idx]):\n", - " print(cid, concept, 1 - dist)\n", - "\n", - " \n", - "def get_query_embedding(queries, model):\n", - " # Tokenize our queries\n", - " model_input = model.tokenizer(queries,\n", - " add_special_tokens = True,\n", - " padding = True,\n", - " truncation = True,\n", - " max_length = 512,\n", - " return_token_type_ids = True,\n", - " return_attention_mask = True)\n", - " \n", - " # Pass tokenized input into model\n", - " query_emb = model.forward(input_ids=torch.LongTensor(model_input[\"input_ids\"]).to(device),\n", - " token_type_ids=torch.LongTensor(model_input[\"token_type_ids\"]).to(device),\n", - " attention_mask=torch.LongTensor(model_input[\"attention_mask\"]).to(device))\n", - " \n", - " return query_emb" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load indices\n", - "sap_index = faiss.read_index(sap_model_cfg.index.index_save_name)\n", - "base_index = faiss.read_index(base_model_cfg.index.index_save_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Map concept IDs to one canonical string\n", - "index_data = open(sap_model_cfg.index.index_ds.data_file, \"r\", encoding='utf-8-sig')\n", - "id2string = {}\n", - "\n", - "for line in index_data:\n", - " cid, concept = line.split(\"\\t\")\n", - " id2string[int(cid) - 1] = concept.strip()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id2string" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Some sample queries\n", - "queries = [\"high blood sugar\", \"head pain\"]\n", - "\n", - "# Query BERT 
Base\n", - "print(\"BERT Base output before Self Alignment Pretraining:\")\n", - "query_index(base_model_cfg, base_model, base_index, queries, id2string)\n", - "print(\"\\n\" + \"-\" * 50 + \"\\n\")\n", - "\n", - "# Query SAP BERT\n", - "print(\"SAP BERT output after Self Alignment Pretraining:\")\n", - "query_index(sap_model_cfg, sap_model, sap_index, queries, id2string)\n", - "print(\"\\n\" + \"-\" * 50 + \"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Even after only training on this tiny amount of data, the qualitative performance boost from self-alignment pretraining is visible. The baseline model links \"*high blood sugar*\" to the entity \"*6 diabetes*\" while our SAP BERT model accurately links \"*high blood sugar*\" to \"*Hyperinsulinemia*\". Similarly, \"*head pain*\" and \"*Myocardial infraction*\" are not the same concept, but \"*head pain*\" and \"*Headache*\" are." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For larger knowledge bases keeping the default embedding size might be too large and cause out of memory issues. You can apply PCA or some other dimensionality reduction method to your data to reduce its memory footprint. Code for creating a text file of all the UMLS entities in the correct format needed to build an index and creating a dictionary mapping concept ids to canonical concept strings can be found here `examples/nlp/entity_linking/data/umls_dataset_processing.py`. \n", - "\n", - "The code for extracting knowledge base concept embeddings, training and applying a PCA transformation to the embeddings, building a faiss index and querying the index from the command line is located at `examples/nlp/entity_linking/build_index.py` and `examples/nlp/entity_linking/query_index.py`. 
\n", - "\n", - "If you've cloned the NeMo repo, both of these steps can be run as follows on the command line from the `examples/nlp/entity_linking/` directory.\n", - "\n", - "```\n", - "python data/umls_dataset_processing.py --index\n", - "python build_index.py --restore\n", - "python query_index.py --restore\n", - "```\n", - "By default the project directory will be \".\" but can be changed by adding the flag `--project_dir=` after each of the above commands. Intermediate steps of the index building process are saved. In the occurrence of an error, previously completed steps do not need to be rerun. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Command Recap" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is a recap of the commands and steps to repeat this process on the full UMLS dataset. \n", - "\n", - "1) Download the UMLS dataset file `MRCONSO.RRF` from the NIH website and place it in the `examples/nlp/entity_linking/data` directory.\n", - "\n", - "2) Run the following commands from the `examples/nlp/entity_linking` directory\n", - "```\n", - "python data/umls_dataset_processing.py\n", - "python self_alignment_pretraining.py project_dir=. \n", - "python data/umls_dataset_processing.py --index\n", - "python build_index.py --restore\n", - "python query_index.py --restore\n", - "```\n", - "The model will take ~24hrs to train on two GPUs and ~48hrs to train on one GPU. By default the project directory will be \".\" but can be changed by adding the flag `--project_dir=` after each of the above commands and changing `project_dir=` in the `self_alignment_pretraining.py` command. If you change the project directory, you should also move the `MRCONOSO.RRF` file to a `data` sub directory within the one you've specified. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As mentioned in the introduction, entity linking within NVIDIA NeMo is not limited to the medical domain. 
The same data processing and training steps can be applied to a variety of domains and use cases. You can edit the datasets used as well as training and loss function hyperparameters within your config file to better suit your domain." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tutorials/nlp/GLUE_Benchmark.ipynb b/tutorials/nlp/GLUE_Benchmark.ipynb deleted file mode 100644 index b77b3439b444c..0000000000000 --- a/tutorials/nlp/GLUE_Benchmark.ipynb +++ /dev/null @@ -1,566 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "GLUE_Benchmark.ipynb", - "provenance": [], - "private_outputs": true, - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU", - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "source": [], - "metadata": { - "collapsed": false - } - } - } - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "o_0K1lsW1dj9", - "colab_type": "code", - "colab": {} - }, - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. 
Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "BRANCH = 'main'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "id": "JFWG-jYCfvD7", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# If you're not using Colab, you might need to upgrade jupyter notebook to avoid the following error:\n", - "# 'ImportError: IProgress not found. Please update jupyter and ipywidgets.'\n", - "\n", - "! pip install ipywidgets\n", - "! jupyter nbextension enable --py widgetsnbextension\n", - "\n", - "# Please restart the kernel after running this cell" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "dzqD2WDFOIN-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "import os\n", - "import wget \n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "daYw_Xll2ZR9", - "colab_type": "text" - }, - "source": [ - "In this tutorial, we are going to describe how to finetune a BERT-like model based on [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) on [GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding](https://openreview.net/pdf?id=rJ4km2R5t7). 
\n", - "\n", - "# GLUE tasks\n", - "GLUE Benchmark includes 9 natural language understanding tasks:\n", - "\n", - "## Single-Sentence Tasks\n", - "\n", - "* CoLA - [The Corpus of Linguistic Acceptability](https://arxiv.org/abs/1805.12471) is a set of English sentences from published linguistics literature. The task is to predict whether a given sentence is grammatically correct or not.\n", - "* SST-2 - [The Stanford Sentiment Treebank](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf) consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence: positive or negative.\n", - "\n", - "## Similarity and Paraphrase tasks\n", - "\n", - "* MRPC - [The Microsoft Research Paraphrase Corpus](https://www.aclweb.org/anthology/I05-5002.pdf) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.\n", - "* QQP - [The Quora Question Pairs](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs) dataset is a collection of question pairs from the community question-answering website Quora. The task is to determine whether a pair of questions are semantically equivalent.\n", - "* STS-B - [The Semantic Textual Similarity Benchmark](https://arxiv.org/abs/1708.00055) is a collection of sentence pairs drawn from news headlines, video, and image captions, and natural language inference data. The task is to determine how similar two sentences are.\n", - "\n", - "## Inference Tasks\n", - "\n", - "* MNLI - [The Multi-Genre Natural Language Inference Corpus](https://cims.nyu.edu/~sbowman/multinli/multinli_0.9.pdf) is a crowdsourced collection of sentence pairs with textual entailment annotations. 
Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). The task has the matched (in-domain) and mismatched (cross-domain) sections.\n", - "* QNLI - [The Stanford Question Answering Dataset](https://nlp.stanford.edu/pubs/rajpurkar2016squad.pdf) is a question-answering dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding question. The task is to determine whether the context sentence contains the answer to the question.\n", - "* RTE The Recognizing Textual Entailment (RTE) datasets come from a series of annual [textual entailment challenges](https://aclweb.org/aclwiki/Recognizing_Textual_Entailment). The task is to determine whether the second sentence is the entailment of the first one or not.\n", - "* WNLI - The Winograd Schema Challenge is a reading comprehension task in which a system must read a sentence with a pronoun and select the referent of that pronoun from a list of choices (Hector Levesque, Ernest Davis, and Leora Morgenstern. The winograd schema challenge. In Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning. 2012).\n", - "\n", - "All tasks are classification tasks, except for the STS-B task which is a regression task. All classification tasks are 2-class problems, except for the MNLI task which has 3-classes.\n", - "\n", - "More details about GLUE benchmark could be found [here](https://gluebenchmark.com/)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZnuziSwJ1yEB", - "colab_type": "text" - }, - "source": [ - "# Datasets\n", - "\n", - "**To proceed further, you need to download the GLUE data.** For example, you can download [this script](https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py) using `wget` and then execute it by running:\n", - "\n", - "`python download_glue_data.py`\n", - "\n", - "use `--tasks TASK` if datasets for only selected GLUE tasks are needed\n", - "\n", - "After running the above commands, you will have a folder `glue_data` with data folders for every GLUE task. For example, data for MRPC task would be under glue_data/MRPC.\n", - "\n", - "This tutorial and [examples/nlp/glue_benchmark/glue_benchmark.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/nlp/glue_benchmark/glue_benchmark.py) work with all GLUE tasks without any modifications. For this tutorial, we are going to use MRPC task.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "--wJ2891aIIE", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# supported task names: [\"cola\", \"sst-2\", \"mrpc\", \"sts-b\", \"qqp\", \"mnli\", \"qnli\", \"rte\", \"wnli\"]\n", - "TASK = 'mrpc'\n", - "DATA_DIR = 'glue_data/MRPC'\n", - "WORK_DIR = \"WORK_DIR\"\n", - "MODEL_CONFIG = 'glue_benchmark_config.yaml'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "qB0oLE4R9EhJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "! ls -l $DATA_DIR" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gMWuU69pbUDe", - "colab_type": "text" - }, - "source": [ - "For each task, there are 3 files: `train.tsv, dev.tsv, and test.tsv`. Note, MNLI has 2 dev sets: matched and mismatched, evaluation on both dev sets will be done automatically." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6UDPgadLN6SG", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# let's take a look at the training data \n", - "! head -n 5 {DATA_DIR}/train.tsv" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_whKCxfTMo6Y", - "colab_type": "text" - }, - "source": [ - "# Model configuration\n", - "\n", - "Now, let's take a closer look at the model's configuration and learn to train the model.\n", - "\n", - "GLUE model is comprised of the pretrained [BERT](https://arxiv.org/pdf/1810.04805.pdf) model followed by a Sequence Regression module (for STS-B task) or Sequence classifier module (for the rest of the tasks).\n", - "\n", - "The model is defined in a config file which declares multiple important sections. They are:\n", - "- **model**: All arguments that are related to the Model - language model, a classifier, optimizer and schedulers, datasets and any other related information\n", - "\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "T1gA8PsJ13MJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# download the model's configuration file \n", - "config_dir = WORK_DIR + '/configs/'\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + MODEL_CONFIG):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/glue_benchmark/' + MODEL_CONFIG, config_dir)\n", - "else:\n", - " print ('config file is already exists')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "mX3KmWMvSUQw", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# this line will print the entire config of the model\n", - "config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n", - "print(config_path)\n", - "config = 
OmegaConf.load(config_path)\n", - "print(OmegaConf.to_yaml(config))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZCgWzNBkaQLZ", - "colab_type": "text" - }, - "source": [ - "# Model Training\n", - "## Setting up Data within the config\n", - "\n", - "Among other things, the config file contains dictionaries called **dataset**, **train_ds** and **validation_ds**. These are configurations used to setup the Dataset and DataLoaders of the corresponding config.\n", - "\n", - "We assume that both training and evaluation files are located in the same directory, and use the default names mentioned during the data download step. \n", - "So, to start model training, we simply need to specify `model.dataset.data_dir`, like we are going to do below.\n", - "\n", - "Also notice that some config lines, including `model.dataset.data_dir`, have `???` in place of paths, this means that values for these fields are required to be specified by the user.\n", - "\n", - "Let's now add the data directory path, task name and output directory for saving predictions to the config." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "LQHCJN-ZaoLp", - "colab_type": "code", - "colab": {} - }, - "source": [ - "config.model.task_name = TASK\n", - "config.model.output_dir = WORK_DIR\n", - "config.model.dataset.data_dir = DATA_DIR" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nB96-3sTc3yk", - "colab_type": "text" - }, - "source": [ - "## Building the PyTorch Lightning Trainer\n", - "\n", - "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem.\n", - "\n", - "Let's first instantiate a Trainer object" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "1tG4FzZ4Ui60", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(\"Trainer config - \\n\")\n", - "print(OmegaConf.to_yaml(config.trainer))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "knF6QeQQdMrH", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# lets modify some trainer configs\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "config.trainer.precision = 16 if torch.cuda.is_available() else 32\n", - "\n", - "# for mixed precision training, uncomment the line below (precision should be set to 16 and amp_level to O1):\n", - "# config.trainer.amp_level = O1\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "# setup max number of steps to reduce training time for demonstration purposes of this tutorial\n", - "config.trainer.max_steps = 128\n", - "\n", - "trainer = pl.Trainer(**config.trainer)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8IlEMdVxdr6p", - "colab_type": "text" - }, - 
"source": [ - "## Setting up a NeMo Experiment\n", - "\n", - "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "8uztqGAmdrYt", - "colab_type": "code", - "colab": {} - }, - "source": [ - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "\n", - "# the exp_dir provides a path to the current experiment for easy access\n", - "exp_dir = str(exp_dir)\n", - "exp_dir" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8tjLhUvL_o7_", - "colab_type": "text" - }, - "source": [ - "Before initializing the model, we might want to modify some of the model configs. For example, we might want to modify the pretrained BERT model and use [Megatron-LM BERT](https://arxiv.org/abs/1909.08053) or [AlBERT model](https://arxiv.org/abs/1909.11942):" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Xeuc2i7Y_nP5", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# get the list of supported BERT-like models, for the complete list of HugginFace models, see https://huggingface.co/models\n", - "print(nemo_nlp.modules.get_pretrained_lm_models_list(include_external=True))\n", - "\n", - "# specify BERT-like model, you want to use, for example, \"megatron-bert-345m-uncased\" or 'bert-base-uncased'\n", - "PRETRAINED_BERT_MODEL = \"albert-base-v1\"" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "RK2xglXyAUOO", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# add the specified above model parameters to the config\n", - "config.model.language_model.pretrained_model_name = PRETRAINED_BERT_MODEL" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fzNZNAVRjDD-", - "colab_type": "text" - }, - "source": [ - "Now, we are ready to initialize our model. 
During the model initialization call, the dataset and data loaders we'll be prepared for training and evaluation.\n", - "Also, the pretrained BERT model will be downloaded, note it can take up to a few minutes depending on the size of the chosen BERT model." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NgsGLydWo-6-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "model = nemo_nlp.models.GLUEModel(cfg=config.model, trainer=trainer)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kQ592Tx4pzyB", - "colab_type": "text" - }, - "source": [ - "## Monitoring training progress\n", - "Optionally, you can create a Tensorboard visualization to monitor training progress." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "mTJr16_pp0aS", - "colab_type": "code", - "colab": {} - }, - "source": [ - "try:\n", - " from google import colab\n", - " COLAB_ENV = True\n", - "except (ImportError, ModuleNotFoundError):\n", - " COLAB_ENV = False\n", - "\n", - "# Load the TensorBoard notebook extension\n", - "if COLAB_ENV:\n", - " %load_ext tensorboard\n", - " %tensorboard --logdir {exp_dir}\n", - "else:\n", - " print(\"To use tensorboard, please use this notebook in a Google Colab environment.\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CFgAlaIdndjW", - "colab_type": "text" - }, - "source": [ - "Note, it’s recommended to finetune the model on each task separately. Also, based on [GLUE Benchmark FAQ#12](https://gluebenchmark.com/faq), there are might be some differences in dev/test distributions for QQP task and in train/dev for WNLI task." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "hUvnSpyjp0Dh", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# start model training\n", - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ref1qSonGNhP", - "colab_type": "text" - }, - "source": [ - "## Training Script\n", - "\n", - "If you have NeMo installed locally, you can also train the model with [examples/nlp/glue_benchmark/glue_benchmark.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/nlp/glue_benchmark/glue_benchmark.py).\n", - "\n", - "To run training script, use:\n", - "\n", - "`python glue_benchmark.py \\\n", - " model.dataset.data_dir=PATH_TO_DATA_DIR \\\n", - " model.task_name=TASK`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KVPFofXaoKNE", - "colab_type": "text" - }, - "source": [ - "Average results after 3 runs:\n", - "\n", - "| Task | Metric | ALBERT-large | ALBERT-xlarge | Megatron-345m | BERT base paper | BERT large paper |\n", - "|-------|--------------------------|--------------|---------------|---------------|-----------------|------------------|\n", - "| CoLA | Matthew's correlation | 54.94 | 61.72 | 64.56 | 52.1 | 60.5 |\n", - "| SST-2 | Accuracy | 92.74 | 91.86 | 95.87 | 93.5 | 94.9 |\n", - "| MRPC | F1/Accuracy | 92.05/88.97 | 91.87/88.61 | 92.36/89.46 | 88.9/- | 89.3/- |\n", - "| STS-B | Person/Spearman corr. | 90.41/90.21 | 90.07/90.10 | 91.51/91.61 | -/85.8 | -/86.5 |\n", - "| QQP | F1/Accuracy | 88.26/91.26 | 88.80/91.65 | 89.18/91.91 | 71.2/- | 72.1/- |\n", - "| MNLI | Matched /Mismatched acc. 
| 86.69/86.81 | 88.66/88.73 | 89.86/89.81 | 84.6/83.4 | 86.7/85.9 |\n", - "| QNLI | Accuracy | 92.68 | 93.66 | 94.33 | 90.5 | 92.7 |\n", - "| RTE | Accuracy | 80.87 | 82.86 | 83.39 | 66.4 | 70.1 |\n", - "\n", - "WNLI task was excluded from the experiments due to the problematic WNLI set.\n", - "The dev sets were used for evaluation for ALBERT and Megatron models, and the test sets results for [the BERT paper](https://arxiv.org/abs/1810.04805).\n", - "\n", - "Hyperparameters used to get the results from the above table, could be found in the table below. Some tasks could be further finetuned to improve performance numbers, the tables are for a baseline reference only.\n", - "Each cell in the table represents the following parameters:\n", - "Number of GPUs used/ Batch Size/ Learning Rate/ Number of Epochs. For not specified parameters, please refer to the default parameters in the training script.\n", - "\n", - "| Task | ALBERT-large | ALBERT-xlarge | Megatron-345m |\n", - "|-------|--------------|---------------|---------------|\n", - "| CoLA | 1 / 32 / 1e-5 / 3 | 1 / 32 / 1e-5 / 10 | 4 / 16 / 2e-5 / 12 |\n", - "| SST-2 | 4 / 16 / 2e-5 / 5 | 4 / 16 / 2e-5 /12 | 4 / 16 / 2e-5 / 12 |\n", - "| MRPC | 1 / 32 / 1e-5 / 5 | 1 / 16 / 2e-5 / 5 | 1 / 16 / 2e-5 / 10 |\n", - "| STS-B | 1 / 16 / 2e-5 / 5 | 1 / 16 / 4e-5 / 12 | 4 / 16 / 3e-5 / 12 |\n", - "| QQP | 1 / 16 / 2e-5 / 5 | 4 / 16 / 1e-5 / 12 | 4 / 16 / 1e-5 / 12 |\n", - "| MNLI | 4 / 64 / 1e-5 / 5 | 4 / 32 / 1e-5 / 5 | 4 / 32 / 1e-5 / 5 | \n", - "| QNLI | 4 / 16 / 1e-5 / 5 | 4 / 16 / 1e-5 / 5 | 4 / 16 / 2e-5 / 5 | \n", - "| RTE | 1 / 16 / 1e-5 / 5 | 1 / 16 / 1e-5 / 12 | 4 / 16 / 3e-5 / 12 |\n" - ] - } - ] -} diff --git a/tutorials/nlp/MegatronBert_export.ipynb b/tutorials/nlp/MegatronBert_export.ipynb deleted file mode 100644 index c19c07b670051..0000000000000 --- a/tutorials/nlp/MegatronBert_export.ipynb +++ /dev/null @@ -1,280 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": 
"8046e96a", - "metadata": {}, - "outputs": [], - "source": [ - "BRANCH='main'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38bfe8ea", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98c00a93", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import wget \n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ] - }, - { - "cell_type": "markdown", - "id": "e9fb1a66", - "metadata": {}, - "source": [ - "### Deprecation Notice\n", - "\n", - "This tutorial is deprecated as of r1.23.0 and will be removed in the next release.\n", - "\n", - "---\n", - "\n", - "# Task Description\n", - "In this tutorial, we are going to describe how to export NeMo NLP models with BERT based models as the pre-trained model." 
- ] - }, - { - "cell_type": "markdown", - "id": "dd0fb016", - "metadata": {}, - "source": [ - "## Convert the Megatron-LM Weights to Nemo file\n", - "\n", - "If you prefer to use the Huggingface BERT models, please skip this section and refer to `Setting up a NeMo Experiment` section to load a model from `nemo_nlp.modules.get_pretrained_lm_models_list()`\n", - "\n", - "NeMo Megatron BERT can [load from a pretrained model](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/core/core.html?highlight=nemo%20file#restore) using `.nemo` file. We can convert the Megatron-LM checkpoint to the `.nemo` file. Let's first download the pretrained model weights and vocabulary file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e451f219", - "metadata": {}, - "outputs": [], - "source": [ - "from nemo.collections.nlp.modules.common.megatron.megatron_utils import MEGATRON_CONFIG_MAP\n", - "import pathlib\n", - "\n", - "PRETRAINED_BERT_MODEL = \"megatron-bert-345m-uncased\" # specify BERT-like model from MEGATRON_CONFIG_MAP.keys()\n", - "nemo_out_path = \"qa_pretrained.nemo\" # the nemo output file name\n", - "\n", - "checkpoint_url = MEGATRON_CONFIG_MAP[PRETRAINED_BERT_MODEL]['checkpoint']\n", - "vocab_url = MEGATRON_CONFIG_MAP[PRETRAINED_BERT_MODEL]['vocab']\n", - "checkpoint_filename = pathlib.Path(checkpoint_url).name\n", - "vocab_filename = pathlib.Path(vocab_url).name\n", - "if not pathlib.Path(checkpoint_filename).exists():\n", - " print('downloading from checkpoint url', checkpoint_url)\n", - " !wget $checkpoint_url\n", - "if not pathlib.Path(vocab_filename).exists():\n", - " print('downloading from vocab url', vocab_url)\n", - " !wget $vocab_url" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7586b5c0", - "metadata": {}, - "outputs": [], - "source": [ - "WORK_DIR = \"WORK_DIR\"\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "\n", - "# Prepare the model parameters \n", - "# download the model's 
configuration file \n", - "config_dir = WORK_DIR + '/configs/'\n", - "MODEL_CONFIG = \"megatron_bert_config.yaml\"\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + MODEL_CONFIG):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/language_modeling/conf/' + MODEL_CONFIG, config_dir)\n", - "else:\n", - " print ('config file is already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0dd3124", - "metadata": {}, - "outputs": [], - "source": [ - "# this line will print the entire config of the model\n", - "config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "\n", - "config.model.megatron_legacy = True # set to true if you trained the NLP model on NeMo < 1.5.0\n", - "config.model.bias_gelu_fusion = False # set to true if you want the MegatronLM to NeMo conversion for training; and set to false to use the converted model at time of export \n", - "config.model.masked_softmax_fusion = False # set to true if you want the MegatronLM to NeMo conversion for training; and set to false to use the converted model at time of export\n", - "\n", - "config.model.num_layers = 24\n", - "config.model.hidden_size = 1024\n", - "config.model.ffn_hidden_size = 4096\n", - "config.model.num_attention_heads = 16\n", - "config.model.tokenizer.vocab_file = vocab_filename\n", - "config.model.tokenizer.type = 'BertWordPieceLowerCase' # change this to BertWordPieceCase if you are using a cased pretrained model\n", - "config.model.tensor_model_parallel_size = 1\n", - "config.model.data.data_prefix = ''\n", - "config.model.max_position_embeddings = 512\n", - "config.model.data.seq_length = 512\n", - "config.cfg = {}\n", - "config.cfg.cfg = config.model\n", - "with open('hparams.yaml', 'w') as f:\n", - " f.write(OmegaConf.to_yaml(config.cfg))\n", - 
"if(config.model.megatron_legacy):\n", - " checkpoint_filename = \"model_optim_rng_ca.pt\" #provide path to the pretrained pt file you used during training on NeMo < 1.5.0, for NeMo >= 1.5.0\n", - "print(checkpoint_filename)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47dca6de", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "PWD = os.getcwd()\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py')\n", - "!python -m torch.distributed.run --nproc_per_node=1 megatron_lm_ckpt_to_nemo.py --checkpoint_folder=$PWD --checkpoint_name=$checkpoint_filename --hparams_file=$PWD/hparams.yaml --nemo_file_path=$PWD/$nemo_out_path --model_type=bert --tensor_model_parallel_size=1" - ] - }, - { - "cell_type": "markdown", - "id": "1ae8d31b", - "metadata": {}, - "source": [ - "# Legacy NLP Bert based model conversion\n", - "\n", - "Step 1: Convert legacy nemo checkpoint to a checkpoint which is currently supported by nemo\n", - "\n", - "Step 2: Use the converted model from step 1 to export the nemo file to the required format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86639a3d", - "metadata": {}, - "outputs": [], - "source": [ - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/nemo_legacy_import/nlp_checkpoint_port.py')\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/export.py')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48820d57", - "metadata": {}, - "outputs": [], - "source": [ - "legacy_nemo_file_path = \"/NeMo/megatron_multiqa.nemo\" #path to you model trained on NeMo < 1.5\n", - "nemo_converted_out_path = \"converted_megatron_multiqa.nemo\"\n", - "megatron_absolute_language_model_path = \"/NeMo/tutorials/nlp/qa_pretrained.nemo\" # Give the absolute path of the model you obtained using megatron_lm_ckpt_to_nemo\n", - 
"onnx_export_out_path = \"onnx_megatron_multiqa.onnx\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7191e0cb", - "metadata": {}, - "outputs": [], - "source": [ - "os.system(f\"python nlp_checkpoint_port.py {legacy_nemo_file_path} {nemo_converted_out_path} --megatron-legacy=True --megatron-checkpoint {megatron_absolute_language_model_path}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ccc720ef", - "metadata": {}, - "outputs": [], - "source": [ - "os.system(f\"python export.py {nemo_converted_out_path} {onnx_export_out_path} --autocast --runtime-check\")" - ] - }, - { - "cell_type": "markdown", - "id": "f10461f2", - "metadata": {}, - "source": [ - "# Convert a NLP model with BERT based pre-trained model trained on NeMo >= 1.5.0\n", - "\n", - "For models trained on NeMo >= 1.5.0, you just run the export script and skip the legacy conversion part" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0514ab37", - "metadata": {}, - "outputs": [], - "source": [ - "nemo_file_path = \"\"\n", - "onnx_export_out_path = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d6b5db4", - "metadata": {}, - "outputs": [], - "source": [ - "python export.py $nemo_converted_out_path $onnx_export_out_path --autocast --runtime-check" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/nlp/Question_Answering.ipynb b/tutorials/nlp/Question_Answering.ipynb deleted file mode 100644 index 054928245d9d7..0000000000000 --- a/tutorials/nlp/Question_Answering.ipynb +++ /dev/null @@ -1,1163 +0,0 
@@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "tiIOhb7iVC3J" - }, - "source": [ - "# Overview" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PucJwfbhVC3L" - }, - "source": [ - "### Deprecation Notice\n", - "\n", - "This tutorial is deprecated as of r1.23.0 and will be removed in the next release.\n", - "\n", - "---\n", - "\n", - "This tutorial will demonstrate how to train, evaluate, and test three types of models for Question-Answering -\n", - "1. BERT-like models for Extractive Question-Answering\n", - "2. Sequence-to-Sequence (S2S) models for Generative Question-Answering (ex. T5/BART-like)\n", - "3. GPT-like models for Generative Question-Answering\n", - "\n", - "## Task Description\n", - "\n", - "- Given a context and a natural language query, we want to generate an answer for the query\n", - "- Depending on how the answer is generated, the task can be broadly divided into two types:\n", - " 1. Extractive Question Answering\n", - " 2. Generative Question Answering\n", - "\n", - "\n", - "### Extractive Question-Answering with BERT-like models\n", - "\n", - "Given a question and a context, both in natural language, predict the span within the context with a start and end position which indicates the answer to the question.\n", - "For every word in our training dataset we’re going to predict:\n", - "- likelihood this word is the start of the span \n", - "- likelihood this word is the end of the span\n", - "\n", - "We are using a BERT encoder with 2 span prediction heads for predicting start and end position of the answer. The span predictions are token classifiers consisting of a single linear layer.\n", - "\n", - "### Generative Question-Answering with S2S and GPT-like models\n", - "\n", - "Given a question and a context, both in natural language, generate an answer for the question. Unlike the BERT-like models, there is no constraint that the answer should be a span within the context." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IpX0w2PtVC3M" - }, - "source": [ - "# Installing NeMo" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "72XWYFQYVC3M" - }, - "source": [ - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run the cell below to set up dependencies." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_xQBtr0KVC3M" - }, - "outputs": [], - "source": [ - "BRANCH = 'main'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9R1D6W58VC3N" - }, - "outputs": [], - "source": [ - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fof5-57iVC3N" - }, - "source": [ - "# Imports and constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KqKD-wReVC3O" - }, - "outputs": [], - "source": [ - "import os\n", - "import wget\n", - "import gc\n", - "\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf\n", - "\n", - "from nemo.collections.nlp.models.question_answering.qa_bert_model import BERTQAModel\n", - "from nemo.collections.nlp.models.question_answering.qa_gpt_model import GPTQAModel\n", - "from nemo.collections.nlp.models.question_answering.qa_s2s_model import S2SQAModel\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "pl.seed_everything(42)\n", - "gc.disable()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": 
"xhPr9Jf_VC3O" - }, - "outputs": [], - "source": [ - "# set the following paths\n", - "DATA_DIR = \"data_dir\" # directory for storing datasets\n", - "WORK_DIR = \"work_dir\" # directory for storing trained models, logs, additionally downloaded scripts\n", - "\n", - "os.makedirs(DATA_DIR, exist_ok=True)\n", - "os.makedirs(WORK_DIR, exist_ok=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dWymW8e0VC3O" - }, - "source": [ - "# Configuration" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0YhKTkuXVC3P" - }, - "source": [ - "The model is defined in a config file which declares multiple important sections:\n", - "- **model**: All arguments that will relate to the Model - language model, span prediction, optimizer and schedulers, datasets and any other related information\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning\n", - "- **exp_manager**: All arguments used for setting up the experiment manager - target directory, name, logger information\n", - "\n", - "We will download the default config file provided at `NeMo/examples/nlp/question_answering/conf/qa_conf.yaml` and edit necessary values for training different models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WOIWJqQ0VC3P" - }, - "outputs": [], - "source": [ - "# download the model's default configuration file \n", - "config_dir = WORK_DIR + '/conf/'\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + \"qa_conf.yaml\"):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/conf/qa_conf.yaml', config_dir)\n", - "else:\n", - " print ('config file already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cvD-gv-FVC3P" - }, - "outputs": [], - "source": [ - "# this will print the entire default config of the model\n", - "config_path 
= f'{WORK_DIR}/conf/qa_conf.yaml'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "print(\"Default Config - \\n\")\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "E08e-ItPVC3P" - }, - "source": [ - "# Training and testing models on SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xn022MsKVC3Q" - }, - "source": [ - "## Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "c356CGL1VC3Q" - }, - "source": [ - "For this example, we are going to download the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset to showcase how to do training and inference. There are two datasets, SQuAD1.0 and SQuAD2.0. SQuAD 1.1, the previous version of the SQuAD dataset, contains 100,000+ question-answer pairs on 500+ articles. SQuAD2.0 dataset combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gaju1h_bVC3Q" - }, - "source": [ - "To download both datasets, we use `NeMo/examples/nlp/question_answering/get_squad.py`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nb840_bZVC3Q" - }, - "outputs": [], - "source": [ - "# download get_squad.py script to download and preprocess the SQuAD data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/get_squad.py'):\n", - " print('Downloading get_squad.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/get_squad.py', WORK_DIR)\n", - "else:\n", - " print ('get_squad.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sOgY0tRzVC3Q" - }, - "outputs": [], - "source": [ - "# download and preprocess the data\n", - "!python $WORK_DIR/get_squad.py --destDir $DATA_DIR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nprGkyvRVC3Q" - }, - "source": [ - "After execution of the above cell, your data folder will contain a subfolder \"squad\" the following four files for training and evaluation\n", - "\n", - "```\n", - "squad \n", - "│\n", - "└───v1.1\n", - "│ │ - train-v1.1.json\n", - "│ │ - dev-v1.1.json\n", - "│\n", - "└───v2.0\n", - " │ - train-v2.0.json\n", - " │ - dev-v2.0.json\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GX0KWQXKVC3Q" - }, - "outputs": [], - "source": [ - "!ls -LR {DATA_DIR}/squad" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RFVcvseOVC3R" - }, - "source": [ - "## Set dataset config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Grb0EeRqVC3R" - }, - "outputs": [], - "source": [ - "# if True, model will load features from cache if file is present, or\n", - "# create features and dump to cache file if not already 
present\n", - "config.model.dataset.use_cache = False\n", - "\n", - "# indicates whether the dataset has unanswerable questions\n", - "config.model.dataset.version_2_with_negative = True\n", - "\n", - "# indicates whether the dataset is of extractive nature or not\n", - "# if True, context spans/chunks that do not contain answer are treated as unanswerable \n", - "config.model.dataset.check_if_answer_in_context = True\n", - "\n", - "# set file paths for train, validation, and test datasets\n", - "config.model.train_ds.file = f\"{DATA_DIR}/squad/v2.0/train-v2.0.json\"\n", - "config.model.validation_ds.file = f\"{DATA_DIR}/squad/v2.0/dev-v2.0.json\"\n", - "config.model.test_ds.file = f\"{DATA_DIR}/squad/v2.0/dev-v2.0.json\"\n", - "\n", - "# set batch sizes for train, validation, and test datasets\n", - "config.model.train_ds.batch_size = 8\n", - "config.model.validation_ds.batch_size = 8\n", - "config.model.test_ds.batch_size = 8\n", - "\n", - "# set number of samples to be used from dataset. setting to -1 uses entire dataset\n", - "config.model.train_ds.num_samples = 5000\n", - "config.model.validation_ds.num_samples = 1000\n", - "config.model.test_ds.num_samples = 100" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rFWF41VwVC3R" - }, - "source": [ - "## Set trainer config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "42yif-GIVC3R" - }, - "outputs": [], - "source": [ - "config.trainer.max_epochs = 1\n", - "config.trainer.max_steps = -1 # takes precedence over max_epochs\n", - "config.trainer.precision = 16\n", - "config.trainer.devices = [0] # 0 for CPU, or list of the GPUs to use [0] this tutorial does not support multiple GPUs. 
If needed please use NeMo/examples/nlp/question_answering/question_answering.py\n", - "config.trainer.accelerator = \"gpu\"\n", - "config.trainer.strategy=\"auto\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EDQzMBlbVC3R" - }, - "source": [ - "## Set experiment manager config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pxY4rnJBVC3R" - }, - "outputs": [], - "source": [ - "config.exp_manager.exp_dir = WORK_DIR\n", - "config.exp_manager.name = \"QA-SQuAD2\"\n", - "config.exp_manager.create_wandb_logger=False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "N2_C8reNVC3R" - }, - "source": [ - "## BERT model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4Mf-_rioVC3R" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gtlGHzVJVC3R" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"bert-base-uncased\"\n", - "config.model.tokenizer.tokenizer_name = \"bert-base-uncased\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bert_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 3e-5" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RaM7fe8rVC3R" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ukLzGmy9VC3R" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(**config.trainer)\n", - "model = BERTQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qZIA69rlVC3R" - }, - "source": [ - "### Train, 
test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "asutB9ZzVC3R" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "n5AIv0SEVC3S" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7k5kD6tvVC3S" - }, - "outputs": [], - "source": [ - "model = BERTQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zyh0SNiyVC3S" - }, - "source": [ - "## S2S BART model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Sy9IYgVYVC3S" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PKNmHKV5VC3S" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# 
tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"facebook/bart-base\"\n", - "config.model.tokenizer.tokenizer_name = \"facebook/bart-base\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bart_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 5e-5\n", - "\n", - "#remove vocab_file from gpt model\n", - "config.model.tokenizer.vocab_file = None" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "S_0glS4yVC3S" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8jWyHY1oVC3S" - }, - "outputs": [], - "source": [ - "# uncomment below line and run if you get an error while initializing tokenizer on Colab (reference: https://github.com/huggingface/transformers/issues/8690)\n", - "# !rm -r /root/.cache/huggingface/\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "model = S2SQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xg-j39b4VC3S" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ocsf0EBDVC3S" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Vs3pl0VMVC3S" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NoW6_GO_VC3S" - }, - "outputs": [], - "source": [ - "model = S2SQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - 
"model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a7-iInbPVC3S" - }, - "source": [ - "## GPT2 model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VaIC0l2aVC3S" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5j6SVk6fVC3S" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"gpt2\"\n", - "config.model.tokenizer.tokenizer_name = \"gpt2\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/gpt2_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 1e-4" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rWhhEuvzVC3S" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vBtP3ukDVC3S" - }, - "outputs": [], - "source": [ - "# uncomment 
below line and run if you get an error while initializing tokenizer on Colab (reference: https://github.com/huggingface/transformers/issues/8690)\n", - "# !rm -r /root/.cache/huggingface/\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "model = GPTQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EApFrJh8VC3T" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zYo2JDdOVC3T" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6aNEt06fVC3T" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ioLT4DVbVC3T" - }, - "outputs": [], - "source": [ - "model = GPTQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - 
"metadata": { - "id": "hTWOlD9AVC3T" - }, - "source": [ - "# Training and testing models on MS-MARCO" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lZWsMwnGVC3T" - }, - "source": [ - "## Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pRUAwgAbVC3T" - }, - "source": [ - "### Downloading the data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qz3DO9JGVC3T" - }, - "source": [ - "MS-MARCO(Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. MS-MARCO consists of 1,010,916 queries generated from real, anonymized Bing user queries. The contexts are extracted from real web documents and the answers are generated by humans.\n", - "\n", - "Please agree to the Terms of Use at https://microsoft.github.io/msmarco/ before downloading the data\n", - "\n", - "The data can be downloaded at:\n", - "- https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz\n", - "- https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Fm5MzZ91inP5" - }, - "outputs": [], - "source": [ - "os.makedirs(os.path.join(DATA_DIR, \"msmarco\"), exist_ok=True)\n", - "\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz -P $DATA_DIR/msmarco\n", - "!gunzip $DATA_DIR/msmarco/train_v2.1.json.gz\n", - "\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz -P $DATA_DIR/msmarco\n", - "!gunzip $DATA_DIR/msmarco/dev_v2.1.json.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nDmFHzBtVC3T" - }, - "source": [ - "### Converting to SQuAD format\n", - "\n", - "The script for converting MS-MARCO dataset to SQuAD can be found at `NeMo/examples/nlp/question_answering/convert_msmarco_to_squad_format.py`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": 
"tJtNIzZQVC3T" - }, - "outputs": [], - "source": [ - "# download convert_msmarco_to_squad_format.py script to format the MS-MARCO data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/convert_msmarco_to_squad_format.py'):\n", - " print('Downloading convert_msmarco_to_squad_format.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/convert_msmarco_to_squad_format.py', WORK_DIR)\n", - "else:\n", - " print ('convert_msmarco_to_squad_format.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Io_esJPSuBcW" - }, - "outputs": [], - "source": [ - "# we will exclude examples from MS-MARCO dataset that do not have a wellFormedAnswer using a utility script\n", - "# download remove_ms_marco_samples_without_wellFormedAnswers.py script to format the MS-MARCO data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/remove_ms_marco_samples_without_wellFormedAnswers.py'):\n", - " print('Downloading remove_ms_marco_samples_without_wellFormedAnswers.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py', WORK_DIR)\n", - "else:\n", - " print ('remove_ms_marco_samples_without_wellFormedAnswers.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cs_CXkfXuYVQ" - }, - "outputs": [], - "source": [ - "!python $WORK_DIR/remove_ms_marco_samples_without_wellFormedAnswers.py --filename $DATA_DIR/msmarco/train_v2.1.json\n", - "!python $WORK_DIR/remove_ms_marco_samples_without_wellFormedAnswers.py --filename $DATA_DIR/msmarco/dev_v2.1.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AUAKI086VC3T" - }, - "outputs": [], - "source": [ - "!(python $WORK_DIR/convert_msmarco_to_squad_format.py \\\n", - " 
--msmarco_train_input_filepath=$DATA_DIR/msmarco/train_v2.1.json \\\n", - " --msmarco_dev_input_filepath=$DATA_DIR/msmarco/dev_v2.1.json \\\n", - " --converted_train_save_path=$DATA_DIR/msmarco/msmarco-squad-format-train-v2.1.json \\\n", - " --converted_dev_save_path=$DATA_DIR/msmarco/msmarco-squad-format-dev-v2.1.json \\\n", - " --exclude_negative_samples=False \\\n", - " --keep_only_relevant_passages=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AeHesaFcVC3T" - }, - "source": [ - "## Set dataset config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rhx-_1X3VC3T" - }, - "outputs": [], - "source": [ - "# if True, model will load features from cache if file is present, or\n", - "# create features and dump to cache file if not already present\n", - "config.model.dataset.use_cache = False\n", - "\n", - "# indicates whether the dataset has unanswerable questions\n", - "config.model.dataset.version_2_with_negative = True\n", - "\n", - "# if True, context spans/chunks that do not contain answer are treated as unanswerable \n", - "# should be False for MS-MARCO dataset, or other datasets of generative nature\n", - "config.model.dataset.check_if_answer_in_context = False\n", - "\n", - "# set file paths for train, validation, and test datasets\n", - "config.model.train_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-train-v2.1.json\"\n", - "config.model.validation_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-dev-v2.1.json\"\n", - "config.model.test_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-dev-v2.1.json\"\n", - "\n", - "# set batch sizes for train, validation, and test datasets\n", - "config.model.train_ds.batch_size = 16\n", - "config.model.validation_ds.batch_size = 16\n", - "config.model.test_ds.batch_size = 16\n", - "\n", - "# set number of samples to be used from dataset. 
setting to -1 uses entire dataset\n", - "config.model.train_ds.num_samples = 5000\n", - "config.model.validation_ds.num_samples = 1000\n", - "config.model.test_ds.num_samples = 100" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "X43k_EeqVC3T" - }, - "source": [ - "## Set trainer config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HavpkQLPVC3U" - }, - "outputs": [], - "source": [ - "config.trainer.max_epochs = 1\n", - "config.trainer.max_steps = -1 # takes precedence over max_epochs\n", - "config.trainer.precision = 16\n", - "config.trainer.devices = [0] # 0 for CPU, or list of the GPUs to use e.g. [0, 1] or [0]\n", - "config.trainer.accelerator = \"gpu\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R-_FIZE2VC3U" - }, - "source": [ - "## Set experiment manager config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "10TT3okiVC3U" - }, - "outputs": [], - "source": [ - "config.exp_manager.exp_dir = WORK_DIR\n", - "config.exp_manager.name = \"QA-MSMARCO\"\n", - "config.exp_manager.create_wandb_logger=False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MKIq6YT-VC3U" - }, - "source": [ - "## S2S BART model for MS-MARCO" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvf-QpYLVC3U" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DDVZ1a5fVC3U" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"facebook/bart-base\"\n", - "config.model.tokenizer.tokenizer_name = \"facebook/bart-base\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bart_msmarco_v2_0.nemo\"\n", - "\n", - 
"config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 5e-5" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3N75cdLRVC3U" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Bv9UMkfxVC3U" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(**config.trainer)\n", - "model = S2SQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BhVuV9sWVC3U" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1JeaJ_OgVC3U" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yj0dGexaVC3U" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "l1elN-WDVC3U" - }, - "outputs": [], - "source": [ - "model = S2SQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 
will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Question_Answering.ipynb", - "provenance": [] - }, - "gpuClass": "standard", - "kernelspec": { - "display_name": "Python 3.8.0 ('test_ptl_1.7')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "e987a19b1bc60996a600adb5d563aa4a4c022e7b31abb2e65c324714934e8ea9" - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb deleted file mode 100644 index 71c7ca5051443..0000000000000 --- a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb +++ /dev/null @@ -1,1412 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "PiRuohn_FQco" - }, - "source": [ - "# Overview\n", - "This tutorial demonstrates how to run inference with [SpellMapper](https://arxiv.org/abs/2306.02317) - a model for Spellchecking ASR (Automatic Speech Recognition) Customization.\n", - "\n", - "Estimated time: 10-15 min.\n", - "\n", - "SpellMapper is a non-autoregressive (NAR) model based on transformer architecture ([BERT](https://arxiv.org/pdf/1810.04805.pdf) with multiple separators).\n", - "It gets as input a single ASR hypothesis (text) and a **custom vocabulary** and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any.\n", - "\n", - "This model is an alternative to word boosting/shallow fusion approaches:\n", - " - does not 
require retraining ASR model;\n", - " - does not require beam-search/language model(LM);\n", - " - can be applied on top of any English ASR model output;" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qm5wmxVEGXgH" - }, - "source": [ - "## What is custom vocabulary?\n", - "**Custom vocabulary** is a list of words/phrases that are important for a particular user. For example, user's contact names, playlist, selected terminology and so on. The size of the custom vocabulary can vary from several hundreds to **several thousand entries** - but this is not an equivalent to ngram language model.\n", - "\n", - "![Scope of customization with user vocabulary](images/spellmapper_customization_vocabulary.png)\n", - "\n", - "Note that unlike traditional spellchecking approaches, which aim to correct known words using language models, the goal of contextual spelling correction is to correct highly specific user terms, most of which can be 1) out-of-vocabulary (OOV) words, 2) spelling variations (e.g., \"John Koehn\", \"Jon Cohen\") and language models cannot help much with that." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D5_XwuXDOKho" - }, - "source": [ - "## Tutorial Plan\n", - "\n", - "1. Create a sample custom vocabulary using some medical terminology.\n", - "2. Study what customization does - a detailed analysis of a small example.\n", - "3. 
Run a bigger example:\n", - " * Create sample ASR results by running TTS (text-to-speech synthesis) + ASR on some medical paper abstracts.\n", - " * Run SpellMapper inference and show how it can improve ASR results using custom vocabulary.\n", - "\n", - "TL;DR We reduce WER from `14.3%` to `11.4%` by correcting medical terms, e.g.\n", - "* `puramesin` => `puromycin`\n", - "* `parromsin` => `puromycin`\n", - "* `and hydrod` => `anhydride`\n", - "* `lesh night and` => `lesch-nyhan`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "agz8B2CxXBBG" - }, - "source": [ - "# Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "koRPpYISNPuH" - }, - "source": [ - "## Installing NeMo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HCnnz3cgVc4Q" - }, - "outputs": [], - "source": [ - "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n", - "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", - "GITHUB_ACCOUNT = \"NVIDIA\"\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/{GITHUB_ACCOUNT}/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]\n", - "\n", - "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", - "# comment out the below lines and set NEMO_DIR to your local path.\n", - "NEMO_DIR = 'nemo'\n", - "!git clone -b {BRANCH} https://github.com/{GITHUB_ACCOUNT}/NeMo.git $NEMO_DIR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_M92gCn_NW1_" - }, - "source": [ - "## Additional installs\n", - "We will use `sentence_splitter` to split abstracts to sentences." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ddyJA3NtGl9C" - }, - "outputs": [], - "source": [ - "!pip install sentence_splitter" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qVa91rGkeFje" - }, - "source": [ - "Clone the SpellMapper model from HuggingFace.\n", - "Note that we will need not only the checkpoint itself, but also the ngram mapping vocabulary `replacement_vocab_filt.txt` from the same folder." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JiI9dkEm5cpW" - }, - "outputs": [], - "source": [ - "!git clone https://huggingface.co/bene-ges/spellmapper_asr_customization_en" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8saqFOePVfFf" - }, - "source": [ - "## Imports\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tAJyiYn_VnrF" - }, - "outputs": [], - "source": [ - "import IPython.display as ipd\n", - "import json\n", - "import random\n", - "import re\n", - "import soundfile as sf\n", - "import torch\n", - "\n", - "from collections import Counter, defaultdict\n", - "from difflib import SequenceMatcher\n", - "from matplotlib.pyplot import imshow\n", - "from matplotlib import pyplot as plt\n", - "from sentence_splitter import SentenceSplitter\n", - "from typing import List, Set, Tuple\n", - "\n", - "from nemo.collections.tts.models import FastPitchModel\n", - "from nemo.collections.tts.models import HifiGanModel\n", - "\n", - "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest\n", - "\n", - "from nemo.collections.nlp.data.spellchecking_asr_customization.utils import (\n", - " get_all_candidates_coverage,\n", - " get_index,\n", - " load_ngram_mappings,\n", - " search_in_index,\n", - " get_candidates,\n", - " read_spellmapper_predictions,\n", - " apply_replacements_to_text,\n", - " load_ngram_mappings_for_dp,\n", - " get_alignment_by_dp,\n", - ")" - ] - }, - { - "cell_type": 
"markdown", - "metadata": { - "id": "mfAaOdAWUGUV" - }, - "source": [ - "Use seed to get a reproducible behaviour." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UlGnNKTuT_6A" - }, - "outputs": [], - "source": [ - "random.seed(0)\n", - "torch.manual_seed(0)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RPPHI7Zd_fDz" - }, - "source": [ - "## Download data\n", - "\n", - "File `pubmed24n0009.xml` taken from public ftp server of https://www.ncbi.nlm.nih.gov/pmc/ contains information about 5593 medical papers, from which we extract only their abstracts. We will feed sentences from there to TTS + ASR to get initial ASR results.\n", - "\n", - "File `wordlist.txt` contains 100k **single-word** medical terms.\n", - "\n", - "File `valid_adam.txt` contains 24k medical abbreviations with their full forms. We will use those full forms as examples of **multi-word** medical terms.\n", - "\n", - "File `count_1w.txt` contains 330k single words with their frequencies from Google Ngrams corpus. 
We will use this file to filter out frequent words from our custom vocabulary.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mX6cvE8xw2n1" - }, - "outputs": [], - "source": [ - "!wget https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n0009.xml.gz\n", - "!gunzip pubmed24n0009.xml.gz\n", - "!grep \"AbstractText\" pubmed24n0009.xml > abstract.txt\n", - "\n", - "!wget https://raw.githubusercontent.com/McGill-NLP/medal/master/toy_data/valid_adam.txt\n", - "!wget https://raw.githubusercontent.com/glutanimate/wordlist-medicalterms-en/master/wordlist.txt\n", - "!wget https://norvig.com/ngrams/count_1w.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mBm9BeqNaRlC" - }, - "source": [ - "## Auxiliary functions\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kVUKhSh48Ypi" - }, - "outputs": [], - "source": [ - "CHARS_TO_IGNORE_REGEX = re.compile(r\"[\\.\\,\\?\\:!;()«»…\\]\\[/\\*–‽+&_\\\\½√>€™$•¼}{~—=“\\\"”″‟„]\")\n", - "\n", - "\n", - "def get_medical_vocabulary() -> Tuple[Set[str], Set[str]]:\n", - " \"\"\"This function builds a vocabulary of medical terms using downloaded sources:\n", - " wordlist.txt - 100k single-word medical terms.\n", - " valid_adam.txt - 24k medical abbreviations with their full forms. We use those full forms as examples of multi-word medical terms.\n", - " count_1w.txt - 330k single words with their frequencies from Google Ngrams corpus. 
We will use this file to filter out frequent words from our custom vocabulary.\n", - " \"\"\"\n", - " common_words = set()\n", - " with open(\"count_1w.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " word, freq = line.strip().casefold().split(\"\\t\")\n", - " if int(freq) < 500000:\n", - " break\n", - " common_words.add(word)\n", - " print(\"Size of common words vocabulary:\", len(common_words))\n", - "\n", - " abbreviations = defaultdict(set)\n", - " medical_vocabulary = set()\n", - " with open(\"valid_adam.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " lines = f.readlines()\n", - " # first line is header\n", - " for line in lines[1:]:\n", - " abbrev, _, phrase = line.strip().split(\"\\t\")\n", - " # skip phrases longer than 3 words because some of them are long explanations\n", - " if phrase.count(\" \") > 2:\n", - " continue\n", - " if phrase in common_words:\n", - " continue\n", - " medical_vocabulary.add(phrase)\n", - " abbrev = abbrev.lower()\n", - " abbreviations[abbrev].add(phrase)\n", - "\n", - " with open(\"wordlist.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " word = line.strip().casefold()\n", - " # skip words containing digits\n", - " if re.match(r\".*\\d.*\", word):\n", - " continue\n", - " if re.match(r\".*[\\[\\]\\(\\)\\+\\,\\.].*\", word):\n", - " continue\n", - " if word in common_words:\n", - " continue\n", - " medical_vocabulary.add(word)\n", - "\n", - " print(\"Size of medical vocabulary:\", len(medical_vocabulary))\n", - " print(\"Size of abbreviation vocabulary:\", len(abbreviations))\n", - " return medical_vocabulary, abbreviations\n", - "\n", - "\n", - "def read_abstracts(medical_vocabulary: Set[str]) -> Tuple[List[str], Set[str], Set[str]]:\n", - " \"\"\"This function reads the downloaded medical abstracts, and extracts sentences containing any word/phrase from the medical vocabulary.\n", - " Args:\n", - " medical_vocabulary: set of known medical words or phrases\n", - " Returns:\n", - " 
sentences: list of extracted sentences\n", - " all_found_singleword: set of single words from medical vocabulary that occurred at least in one sentence\n", - " all_found_multiword: set of multi-word phrases from medical vocabulary that occurred at least in one sentence\n", - " \"\"\"\n", - " splitter = SentenceSplitter(language='en')\n", - "\n", - " all_sentences = []\n", - " all_found_singleword = set()\n", - " all_found_multiword = set()\n", - " with open(\"abstract.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " text = line.strip().replace(\"\", \"\").replace(\"\", \"\")\n", - " sents = splitter.split(text)\n", - " found_singleword = set()\n", - " found_multiword = set()\n", - " for sent in sents:\n", - " # remove anything in brackets from text\n", - " sent = re.sub(r\"\\(.+\\)\", r\"\", sent)\n", - " # remove quotes from text\n", - " sent = sent.replace(\"\\\"\", \"\")\n", - " # skip sentences containing digits because normalization is out of scope of this tutorial\n", - " if re.match(r\".*\\d.*\", sent):\n", - " continue\n", - " # skip sentences containing abbreviations with period inside the sentence (for the same reason)\n", - " if \". 
\" in sent:\n", - " continue\n", - " # skip long sentences as they may cause OOM issues\n", - " if len(sent) > 150:\n", - " continue\n", - " # replace all punctuation to space and convert to lowercase\n", - " sent_clean = CHARS_TO_IGNORE_REGEX.sub(\" \", sent).lower()\n", - " sent_clean = \" \".join(sent_clean.split(\" \"))\n", - " words = sent_clean.split(\" \")\n", - "\n", - " found_phrases = set()\n", - " for begin in range(len(words)):\n", - " for end in range(begin + 1, min(begin + 4, len(words))):\n", - " phrase = \" \".join(words[begin:end])\n", - " if phrase in medical_vocabulary:\n", - " found_phrases.add(phrase)\n", - " if end - begin == 1:\n", - " found_singleword.add(phrase)\n", - " else:\n", - " found_multiword.add(phrase)\n", - " if len(found_phrases) > 0:\n", - " all_sentences.append((sent, \";\".join(found_phrases)))\n", - " all_found_singleword = all_found_singleword.union(found_singleword)\n", - " all_found_multiword = all_found_multiword.union(found_multiword)\n", - "\n", - " print(\"Sentences:\", len(all_sentences))\n", - " print(\"Unique single-word terms found:\", len(all_found_singleword))\n", - " print(\"Unique multi-word terms found:\", len(all_found_multiword))\n", - " print(\"Examples of multi-word terms\", str(list(all_found_multiword)[0:10]))\n", - " \n", - " return all_sentences, all_found_singleword, all_found_multiword" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XU3xeCBVpWOL" - }, - "outputs": [], - "source": [ - "def get_fragments(i_words: List[str], j_words: List[str]) -> List[Tuple[str, str, str, int, int, int, int]]:\n", - " \"\"\"This function is used to compare two word sequences to find minimal fragments that differ.\n", - " Args:\n", - " i_words: list of words in first sequence\n", - " j_words: list of words in second sequence\n", - " Returns:\n", - " list of tuples (difference_type, fragment1, fragment2, begin_of_fragment1, end_of_fragment1, begin_of_fragment2, 
end_of_fragment2)\n", - " \"\"\"\n", - " s = SequenceMatcher(None, i_words, j_words)\n", - " result = []\n", - " for tag, i1, i2, j1, j2 in s.get_opcodes():\n", - " result.append((tag, \" \".join(i_words[i1:i2]), \" \".join(j_words[j1:j2]), i1, i2, j1, j2))\n", - " result = sorted(result, key=lambda x: x[3])\n", - " return result" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2ydXp_pFYmYu" - }, - "source": [ - "## Read medical data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WAeauax0SV1-" - }, - "outputs": [], - "source": [ - "medical_vocabulary, _ = get_medical_vocabulary()\n", - "sentences, found_singleword, found_multiword = read_abstracts(medical_vocabulary)\n", - "# in case if we need random candidates from a big sample - we will use full medical vocabulary for that purpose.\n", - "big_sample = list(medical_vocabulary)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FRli7-Kx7sOO" - }, - "outputs": [], - "source": [ - "for sent, phrases in sentences[0:10]:\n", - " print(sent, \"\\t\", phrases)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rL1VqH2_dk93" - }, - "source": [ - "# SpellMapper ASR Customization\n", - "\n", - "SpellMapper model relies on two offline preparation steps:\n", - "1. Collecting n-gram mappings from a large corpus (this mappings vocabulary had been collected once on a large corpus and is supplied with the model).\n", - "2. Indexing of user vocabulary by n-grams.\n", - "\n", - "![Offline data preparation](images/spellmapper_data_preparation.png)\n", - "\n", - "At inference time we take as input an ASR hypothesis and an n-gram-indexed user vocabulary and perform following steps:\n", - "1. Retrieve the top 10 candidate phrases from the user vocabulary that are likely to be contained in the given ASR-hypothesis, possibly in a misspelled form.\n", - "2. 
Run the neural model that tags the input characters with correct candidate labels or 0 if no match is found.\n", - "3. Do post-processing to combine results.\n", - "\n", - "![Inference pipeline](images/spellmapper_inference_pipeline.png)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OeJpsMwslmrd" - }, - "source": [ - "## N-gram mappings\n", - "Note that n-gram mappings vocabulary had been collected from a large corpus and is supplied with the model. It is supposed to be \"universal\" for English language.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uH6p0mOd12pi" - }, - "source": [ - "Let's see what n-gram mappings are like, for example, for an n-gram `l u c`.\n", - "Note that n-grams in `replacement_vocab_filt.txt` preserve one-to-one correspondence between original letters and misspelled fragments (this additional markup is handled during loading). \n", - "* `+` means that adjacent letters are concatenated and correspond to a single source letter. \n", - "* `` means that the original letter is deleted. \n", - "This auxiliary markup will be removed automatically during loading.\n", - "\n", - "`_` is used instead of real space symbol.\n", - "\n", - "Last three columns are:\n", - "* joint frequency\n", - "* frequency of original n-gram\n", - "* frequency of misspelled n-gram\n", - "\n", - "$$\\frac{JointFrequency}{SourceFrequency}=TranslationProbability$$\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qul163dB1sKp" - }, - "outputs": [], - "source": [ - "!awk 'BEGIN {FS=\"\\t\"} ($1==\"l u c\"){print $0}' < spellmapper_asr_customization_en/replacement_vocab_filt.txt | sort -t$'\\t' -k3nr" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eWxcrVWZ3Pfq" - }, - "source": [ - "Now we read n-gram mappings from the file. Parameter `max_misspelled_freq` controls maximum frequency of misspelled n-grams. 
N-grams more frequent than that are put in the list of banned n-grams and won't be used in indexing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WHKhE945-N7o" - }, - "outputs": [], - "source": [ - "print(\"load n-gram mappings...\")\n", - "ngram_mapping_vocab, ban_ngram = load_ngram_mappings(\"spellmapper_asr_customization_en/replacement_vocab_filt.txt\", max_misspelled_freq=125000)\n", - "# CAUTION: entries in ban_ngram end with a space and can contain \"+\" \"=\"\n", - "print(\"Size of ngram mapping vocabulary:\", len(ngram_mapping_vocab))\n", - "print(\"Size of banned ngrams:\", len(ban_ngram))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "49IcMBfllvXN" - }, - "source": [ - "## Indexing of custom vocabulary" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b1K6paeee2Iu" - }, - "source": [ - "As we mentioned earlier, this model pipeline is intended to work with custom vocabularies up to several thousand entries. Since the whole medical vocabulary contains 110k entries, we restrict our custom vocabulary to 5000+ terms that occurred in given corpus of abstracts.\n", - "\n", - "The goal of indexing our custom vocabulary is to build an index where key is a letter n-gram and value is the whole phrase. The keys are n-grams in the given user phrase and their misspelled variants taken from our collection of n-\n", - "gram mappings (see Index of custom vocabulary in Fig. 
1)\n", - "\n", - "*Though it is possible to index and search the whole 110k vocabulary, it will require additional optimizations and is beyond the scope of this tutorial.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xWb0jGqw6Woi" - }, - "outputs": [], - "source": [ - "custom_phrases = []\n", - "for phrase in medical_vocabulary:\n", - " if phrase not in found_singleword and phrase not in found_multiword:\n", - " continue\n", - " custom_phrases.append(\" \".join(list(phrase.replace(\" \", \"_\"))))\n", - "print(\"Size of customization vocabulary:\", len(custom_phrases))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UHWor5pD2Eyb" - }, - "source": [ - "Now we build the index for our custom phrases.\n", - "\n", - "Parameter `min_log_prob` controls minimum log probability, after which we stop growing this n-gram.\n", - "\n", - "Parameter `max_phrases_per_ngram` controls maximum number of phrases that can be indexed by one ngram. 
N-grams exceeding this limit are also banned and not used in indexing.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hs4RDXj0-xW9" - }, - "outputs": [], - "source": [ - "phrases, ngram2phrases = get_index(custom_phrases, ngram_mapping_vocab, ban_ngram, min_log_prob=-4.0, max_phrases_per_ngram=600)\n", - "print(\"Size of phrases:\", len(phrases))\n", - "print(\"Size of ngram2phrases:\", len(ngram2phrases))\n", - "\n", - "# Save index to file - later we will use it in other script\n", - "with open(\"index.txt\", \"w\", encoding=\"utf-8\") as out:\n", - " for ngram in ngram2phrases:\n", - " for phrase_id, begin, size, logprob in ngram2phrases[ngram]:\n", - " phrase = phrases[phrase_id]\n", - " out.write(ngram + \"\\t\" + phrase + \"\\t\" + str(begin) + \"\\t\" + str(size) + \"\\t\" + str(logprob) + \"\\n\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RV1sdQ9rvar8" - }, - "source": [ - "## Small detailed example\n", - "\n", - "Let's consider, for example, one custom phrase `thoracic aorta` and an incorrect ASR-hypothesis `the tarasic oorda is a part of the aorta located in the thorax`, containing a misspelled phrase `tarasic_oorda`. \n", - "\n", - "We will see \n", - "1. How this custom phrase is indexed.\n", - "2. How candidate retrieval works, given ASR-hypothesis.\n", - "3. How inference and post-processing work.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kGBTTJXixnrG" - }, - "source": [ - "### N-grams in index" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ryfUlqNMl4vQ" - }, - "source": [ - "Let's look, for example, by what n-grams a custom phrase `thoracic aorta` is indexed. \n", - "Columns: \n", - "1. n-gram\n", - "2. beginning position in the phrase\n", - "3. length\n", - "4. log probability\n", - "\n", - "Note that many n-grams are not from n-gram mappings file. Those are derived by growing previous n-grams with new replacements. 
In this case log probabilities are summed up. Growing stops, when minimum log prob is exceeded.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "x0ZVsXGBo8pt" - }, - "outputs": [], - "source": [ - "for ngram in ngram2phrases:\n", - " for phrase_id, b, length, lprob in ngram2phrases[ngram]:\n", - " if phrases[phrase_id] == \"t h o r a c i c _ a o r t a\":\n", - " print(ngram.ljust(16) + \"\\t\" + str(b).rjust(4) + \"\\t\" + str(length).rjust(4) + \"\\t\" + str(lprob))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "20ov23ze4xeQ" - }, - "source": [ - "### Candidate retrieval\n", - "Candidate retrieval tasks are:\n", - " - Given an input sentence and an index of custom vocabulary find all n-grams from the index matching the sentence. \n", - " - Find which sentence fragments and which custom phrases have most \"hits\" - potential candidates.\n", - " - Find approximate starting position for each candidate phrase. \n", - "\n", - "\n", - "Let's look at the hits, that phrase \"thoracic aorta\" gets by searching all ngrams in the input text. We can see some hits in different part of the sentence, but a moving window can find a fragment with most hits." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "t_rhKQ3Xqa8A" - }, - "outputs": [], - "source": [ - "sent = \"the_tarasic_oorda_is_a_part_of_the_aorta_located_in_the_thorax\"\n", - "phrases2positions, position2ngrams = search_in_index(ngram2phrases, phrases, sent)\n", - "print(\" \".join(list(sent)))\n", - "print(\" \".join(list(map(str, phrases2positions[phrases.index(\"t h o r a c i c _ a o r t a\")].astype(int)))))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "orkRapbjF4aZ" - }, - "source": [ - "`phrases2positions` is a matrix of size (len(phrases), len(ASR_hypothesis)).\n", - "It is filled with 1.0 (hits) on intersection of letter n-grams and phrases that are indexed by these n-grams, 0.0 - elsewhere.\n", - "It is used to find phrases with many hits within a contiguous window - potential matching candidates.\n", - "\n", - "`position2ngrams` is a list of sets of ngrams. List index is the starting position in the ASR-hypothesis.\n", - "It is used later to check how well each found candidate is covered by n-grams (to avoid cases where some repeating n-gram gives many hits to a phrase, but the phrase itself is not well covered)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JF7u4_iiHLyI" - }, - "outputs": [], - "source": [ - "candidate2coverage, candidate2position = get_all_candidates_coverage(phrases, phrases2positions)\n", - "print(\"Coverage=\", candidate2coverage[phrases.index(\"t h o r a c i c _ a o r t a\")])\n", - "print(\"Starting position=\", candidate2position[phrases.index(\"t h o r a c i c _ a o r t a\")])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "45mvKg8ZyNbr" - }, - "source": [ - "`candidate2coverage` is a list of size len(phrases) containing coverage (0.0 to 1.0) in best window.\n", - "Coverage is a smoothed percentage of hits in the window of size of the given phrase.\n", - "\n", - "`candidate2position` is a list of size len(phrases) containing starting position of best window.\n", - "\n", - "Starting position is approximate, it's ok. If it is not at the beginning of some word, SpellMapper will try to adjust it later. In this particular example we get 5 as starting position instead of 4, missing the first letter." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Sjyn9I98udL9" - }, - "source": [ - "### Inference\n", - "\n", - "Now let's generate input for SpellMapper inference. \n", - "An input line should consist of 4 tab-separated columns:\n", - " - text of ASR-hypothesis\n", - " - texts of 10 candidates separated by semicolon\n", - " - 1-based ids of non-dummy candidates\n", - " - approximate start/end coordinates of non-dummy candidates (correspond to ids)\n", - "Note that candidate retrieval is done inside the function `get_candidates`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cJnusVfBRhRX" - }, - "outputs": [], - "source": [ - "out = open(\"spellmapper_input.txt\", \"w\", encoding=\"utf-8\")\n", - "letters = list(sent)\n", - "candidates = get_candidates(ngram2phrases, phrases, letters, big_sample)\n", - "# We add two columns with targets and span_info. \n", - "# They have same format as during training, but start and end positions are APPROXIMATE, they will be adjusted when constructing BertExample.\n", - "targets = []\n", - "span_info = []\n", - "for idx, c in enumerate(candidates):\n", - " if c[1] == -1:\n", - " continue\n", - " targets.append(str(idx + 1)) # targets are 1-based\n", - " start = c[1]\n", - " end = min(c[1] + c[2], len(letters)) # ensure that end is not outside sentence length (it can happen because c[2] is candidate length used as approximation)\n", - " span_info.append(\"CUSTOM \" + str(start) + \" \" + str(end))\n", - "\n", - "out.write(\" \".join(letters) + \"\\t\" + \";\".join([x[0] for x in candidates]) + \"\\t\" + \" \".join(targets) + \"\\t\" + \";\".join(span_info) + \"\\n\")\n", - "out.close()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Qpei5o89SmaU" - }, - "outputs": [], - "source": [ - "!cat spellmapper_input.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9rAmO15SS6go" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \\\n", - " pretrained_model=spellmapper_asr_customization_en/training_10m_5ep.nemo \\\n", - " model.max_sequence_len=512 \\\n", - " inference.from_file=spellmapper_input.txt \\\n", - " inference.out_file=spellmapper_output.txt \\\n", - " inference.batch_size=16 \\\n", - " lang=en\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wd2aq4T1N5cs" - }, - "source": [ - "Each line in SpellMapper output is tab-separated 
and consists of 4 columns:\n", - "1. ASR-hypothesis (same as in input)\n", - "2. 10 candidates separated with semicolon (same as in input)\n", - "3. fragment predictions, separated with semicolon, each prediction is a tuple (start, end, candidate_id, probability)\n", - "4. letter predictions - candidate_id predicted for each letter (this is only for debug purposes)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ravgEX8cTFty" - }, - "outputs": [], - "source": [ - "!cat spellmapper_output.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "az26364-PHb2" - }, - "source": [ - "We can use some utility functions to apply found replacements and get actual corrected text." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lPtFa_EhK8pb" - }, - "outputs": [], - "source": [ - "spellmapper_results = read_spellmapper_predictions(\"spellmapper_output.txt\")\n", - "text, replacements, _ = spellmapper_results[0]\n", - "corrected_text = apply_replacements_to_text(text, replacements, replace_hyphen_to_space=False)\n", - "print(\"Text before correction:\\n\", text)\n", - "print(\"Text after correction:\\n\", corrected_text)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "efF7O-D91FLX" - }, - "source": [ - "# Bigger customization example\n", - "\n", - "Let's test customization on more data. 
The plan is\n", - " * Get baseline ASR transcriptions by running TTS + ASR on some medical paper abstracts.\n", - " * Run SpellMapper inference and show how it can improve ASR results using custom vocabulary.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r_EFPnyDcXZt" - }, - "source": [ - "## Run TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "i9F5SBhmr8rk" - }, - "outputs": [], - "source": [ - "# create a folder for wav files (TTS output)\n", - "!rm -r audio\n", - "!mkdir audio" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JMbkNVt7YBAO" - }, - "outputs": [], - "source": [ - "if torch.cuda.is_available():\n", - " device = \"cuda\"\n", - "else:\n", - " device = \"cpu\"\n", - "\n", - "# Load FastPitch from HuggingFace\n", - "spectrogram_generator = FastPitchModel.from_pretrained(\"nvidia/tts_en_fastpitch\").eval().to(device)\n", - "# Load HifiGan vocoder from HuggingFace\n", - "vocoder = HifiGanModel.from_pretrained(model_name=\"nvidia/tts_hifigan\").eval().to(device)\n", - "\n", - "# Write sentences that we want to feed to TTS\n", - "with open(\"tts_input.txt\", \"w\", encoding=\"utf-8\") as out:\n", - " for sent, _ in sentences[0:100]:\n", - " out.write(sent + \"\\n\")\n", - "\n", - "out_manifest = open(\"manifest.json\", \"w\", encoding=\"utf-8\")\n", - "i = 0\n", - "with open(\"tts_input.txt\", \"r\", encoding=\"utf-8\") as inp:\n", - " for line in inp:\n", - " text = line.strip()\n", - " text_clean = CHARS_TO_IGNORE_REGEX.sub(\" \", text).lower() #replace all punctuation to space and convert to lowercase\n", - " text_clean = \" \".join(text_clean.split())\n", - "\n", - " parsed = spectrogram_generator.parse(text, normalize=True)\n", - "\n", - " spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)\n", - " audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)\n", - "\n", - " # Note that vocoder return a batch of audio. 
In this example, we just take the first and only sample.\n", - " filename = \"audio/\" + str(i) + \".wav\"\n", - " sf.write(filename, audio.to('cpu').detach().numpy()[0], 16000)\n", - " out_manifest.write(\n", - " \"{\\\"audio_filepath\\\": \\\"\" + filename + \"\\\", \\\"text\\\": \\\"\" + text_clean + \"\\\", \\\"orig_text\\\": \\\"\" + text + \"\\\"}\\n\"\n", - " )\n", - " i += 1\n", - "\n", - " # display some examples\n", - " if i < 10:\n", - " print(f'\"{text}\"\\n')\n", - " ipd.display(ipd.Audio(audio.to('cpu').detach(), rate=22050))\n", - "\n", - "out_manifest.close()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9T3CZcCAmxCz" - }, - "source": [ - "Now we have a folder with generated audios `audio/*.wav` and a nemo manifest with json records like `{\"audio_filepath\": \"audio/0.wav\", \"text\": \"no renal auditory or vestibular toxicity was observed\", \"orig_text\": \"No renal, auditory, or vestibular toxicity was observed.\"}`.", - "\n", - "Note that TTS model may mispronounce some unknown words, for example, abbreviations like `tRNAs`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pR_T1HnttVjm" - }, - "outputs": [], - "source": [ - "lines = []\n", - "with open(\"manifest.json\", \"r\", encoding=\"utf-8\") as f:\n", - " lines = f.readlines()\n", - "\n", - "for line in lines:\n", - " try:\n", - " data = json.loads(line.strip())\n", - " except:\n", - " print(line)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bt2TMLLvdUHm" - }, - "source": [ - "Free GPU memory to avoid OOM." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZwEpAOCaRH7s" - }, - "outputs": [], - "source": [ - "del spectrogram_generator\n", - "del vocoder\n", - "torch.cuda.empty_cache()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HrensakWdLkt" - }, - "source": [ - "## Run baseline ASR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IQNIo2M_mqJc" - }, - "source": [ - "Next we transcribe our .wav files with a general domain [ASR model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_conformer_ctc_large). It will generate an output file `ctc_baseline_transcript.json` where the predicted transcriptions are stored in the field `pred_text` of each record.\n", - "\n", - "Note that this ASR model was not trained or fine-tuned on medical domain, so we expect it to make mistakes on medical terms." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NMN63ux1mJiG" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/transcribe_speech.py \\\n", - " pretrained_name=\"stt_en_conformer_ctc_large\" \\\n", - " dataset_manifest=manifest.json \\\n", - " output_filename=ctc_baseline_transcript_tmp.json \\\n", - " batch_size=2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L3swQ8uqqgnp" - }, - "source": [ - "ATTENTION: SpellMapper relies on words to be separated by _single_ space\n", - "\n", - "There is a bug with multiple space, observed in ASR results produced by Conformer-CTC, probably connected to this issue: https://github.com/NVIDIA/NeMo/issues/4034.\n", - "\n", - "So we need to correct the manifests to ensure that all spaces are single." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "z17sxkmXrXpJ" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_baseline_transcript_tmp.json\")\n", - "\n", - "for i in range(len(test_data)):\n", - " # if there are multiple spaces in the string they will be merged to one\n", - " test_data[i][\"pred_text\"] = \" \".join(test_data[i][\"pred_text\"].split())\n", - "\n", - "with open(\"ctc_baseline_transcript.json\", \"w\", encoding=\"utf-8\") as out:\n", - " for d in test_data:\n", - " line = json.dumps(d)\n", - " out.write(line + \"\\n\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PuKtfhbVkVJY" - }, - "outputs": [], - "source": [ - "!head -n 4 ctc_baseline_transcript.json" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aCJw9NEXqRg8" - }, - "source": [ - "### Calculating WER of baseline transcript\n", - "We use the standard script from NeMo to calculate WER and CER of our baseline transcript. Internally it compares the text in `pred_text` (predicted transcript) to `text` (reference transcript). " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZmNEGVWQsGo2" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_baseline_transcript.json \\\n", - " only_score_manifest=True\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AvPwJr0ZqdkN" - }, - "source": [ - "### See fragments that differ\n", - "We use SequenceMatcher to see fragments that differ. 
(Another option is to use a more powerful analytics tool [Speech Data Explorer](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tools/speech_data_explorer.html))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RAeaVCpMv78y" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_baseline_transcript.json\")\n", - "pred_text = [data['pred_text'] for data in test_data]\n", - "ref_text = [data['text'] for data in test_data]\n", - "audio_filepath = [data['audio_filepath'] for data in test_data]\n", - "\n", - "diff_vocab = Counter()\n", - "\n", - "for i in range(len(test_data)):\n", - " ref_sent = \" \" + ref_text[i] + \" \"\n", - " pred_sent = \" \" + pred_text[i] + \" \"\n", - "\n", - " pred_words = pred_sent.strip().split()\n", - " ref_words = ref_sent.strip().split()\n", - "\n", - " for tag, hyp_fragment, ref_fragment, i1, i2, j1, j2 in get_fragments(pred_words, ref_words):\n", - " if tag != \"equal\":\n", - " diff_vocab[(tag, hyp_fragment, ref_fragment)] += 1\n", - "\n", - "sum_ = 0\n", - "print(\"PRED vs REF\")\n", - "for k, v in diff_vocab.most_common(1000000):\n", - " sum_ += v\n", - " print(k, v, \"sum=\", sum_)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dUSOF7iD1w_9" - }, - "source": [ - "## Run SpellMapper" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "x39BQhYB6_Fr" - }, - "source": [ - "Now we run retrieval on our input manifest and prepare input for SpellMapper inference. Note that we use index of custom vocabulary (file `index.txt` that we saved earlier)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y8x-yT5WqfFz" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py \\\n", - " --manifest ctc_baseline_transcript.json \\\n", - " --custom_vocab_index index.txt \\\n", - " --big_sample spellmapper_asr_customization_en/big_sample.txt \\\n", - " --short2full_name short2full.txt \\\n", - " --output_name spellmapper_input.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ueq_JAPWGs_Y" - }, - "source": [ - "Run the inference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zgkqiiZtJjcB" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \\\n", - " pretrained_model=spellmapper_asr_customization_en/training_10m_5ep.nemo \\\n", - " model.max_sequence_len=512 \\\n", - " inference.from_file=spellmapper_input.txt \\\n", - " inference.out_file=spellmapper_output.txt \\\n", - " inference.batch_size=16 \\\n", - " lang=en\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RPQWJX8dFLfX" - }, - "source": [ - "Now we postprocess SpellMapper output and create output corrected manifest." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3eFU515yKvXP" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \\\n", - " --input_manifest ctc_baseline_transcript.json \\\n", - " --short2full_name short2full.txt \\\n", - " --output_manifest ctc_corrected_transcript.json \\\n", - " --spellmapper_result spellmapper_output.txt \\\n", - " --replace_hyphen_to_space \\\n", - " --field_name pred_text \\\n", - " --ngram_mappings \"\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hRoIhhGh17tp" - }, - "source": [ - "### Calculating WER of corrected transcript." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qIT957bGo9AY" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_corrected_transcript.json \\\n", - " only_score_manifest=True\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NYXIPusupqOQ" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_corrected_transcript.json\")\n", - "pred_text = [data['pred_text'] for data in test_data]\n", - "ref_text = [data['pred_text_before_correction'] for data in test_data]\n", - "\n", - "diff_vocab = Counter()\n", - "\n", - "for i in range(len(test_data)):\n", - " ref_sent = \" \" + ref_text[i] + \" \"\n", - " pred_sent = \" \" + pred_text[i] + \" \"\n", - "\n", - " pred_words = pred_sent.strip().split()\n", - " ref_words = ref_sent.strip().split()\n", - "\n", - " for tag, hyp_fragment, ref_fragment, i1, i2, j1, j2 in get_fragments(pred_words, ref_words):\n", - " if tag != \"equal\":\n", - " diff_vocab[(tag, hyp_fragment, ref_fragment)] += 1\n", - "\n", - "sum_ = 0\n", - "print(\"Corrected vs baseline\")\n", - "for k, v in diff_vocab.most_common(1000000):\n", - " sum_ += v\n", - " print(k, v, \"sum=\", sum_)\n" - ] - }, - { - 
"cell_type": "markdown", - "metadata": { - "id": "DJtXlqXbTD6M" - }, - "source": [ - "### Filtering by Dynamic Programming(DP) score\n", - "\n", - "What else can be done?\n", - "Given a fragment and its potential replacement, we can apply **dynamic programming** to find the most probable \"translation\" path between them. We will use the same n-gram mapping vocabulary, because its frequencies give us \"translation probability\" of each n-gram pair. The final path score can be calculated as maximum sum of log probabilities of matching n-grams along this path.\n", - "Let's look at an example. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "05Qf9wgHU_UR" - }, - "outputs": [], - "source": [ - "joint_vocab, orig_vocab, misspelled_vocab, max_len = load_ngram_mappings_for_dp(\"spellmapper_asr_customization_en/replacement_vocab_filt.txt\")\n", - "\n", - "fragment = \"and hydrod\"\n", - "replacement = \"anhydride\"\n", - "fragment_spaced = \" \".join(list(fragment.replace(\" \", \"_\")))\n", - "replacement_spaced = \" \".join(list(replacement.replace(\" \", \"_\")))\n", - "path = get_alignment_by_dp(\n", - " replacement_spaced,\n", - " fragment_spaced,\n", - " dp_data=(joint_vocab, orig_vocab, misspelled_vocab, max_len)\n", - ")\n", - "print(\"Dynamic Programming path:\")\n", - "for fragment_ngram, replacement_ngram, score, sum_score, joint_freq, orig_freq, misspelled_freq in path:\n", - " print(\n", - " \"\\t\",\n", - " \"frag=\",\n", - " fragment_ngram,\n", - " \"; repl=\",\n", - " replacement_ngram,\n", - " \"; score=\",\n", - " score,\n", - " \"; sum_score=\",\n", - " sum_score,\n", - " \"; joint_freq=\",\n", - " joint_freq,\n", - " \"; orig_freq=\",\n", - " orig_freq,\n", - " \"; misspelled_freq=\",\n", - " misspelled_freq,\n", - " )\n", - "\n", - "print(\"Final path score is in path[-1][3]: \", path[-1][3])\n", - "print(\"Dynamic programming(DP) score per symbol is final score divided by len(fragment): \", path[-1][3] / 
(len(fragment)))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hgfKPKckaLnc" - }, - "source": [ - "The idea is that we can skip replacements whose average DP score per symbol is below some predefined minimum, say -1.5.\n", - "Note that dynamic programming works slow because of quadratic complexity, but it allows to get rid of some false positives. Let's apply it on the same test set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UhSXh7ht_JRn" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \\\n", - " --input_manifest ctc_baseline_transcript.json \\\n", - " --short2full_name short2full.txt \\\n", - " --output_manifest ctc_corrected_transcript_dp.json \\\n", - " --spellmapper_result spellmapper_output.txt \\\n", - " --replace_hyphen_to_space \\\n", - " --field_name pred_text \\\n", - " --use_dp \\\n", - " --ngram_mappings spellmapper_asr_customization_en/replacement_vocab_filt.txt \\\n", - " --min_dp_score_per_symbol -1.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "u8R5YHB3vPC8" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_corrected_transcript_dp.json \\\n", - " only_score_manifest=True" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "upvTbkFAeYtR" - }, - "source": [ - "# Final notes\n", - "1. Bash-script with example of inference pipeline [run_infer.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_infer.sh)\n", - "\n", - "2. Check our paper: [SpellMapper: A non-autoregressive neural spellchecker for ASR customization with candidate retrieval based on n-gram mappings](https://arxiv.org/abs/2306.02317)\n", - "\n", - "3. 
To reproduce evaluation experiments from this paper see these scripts:\n", - " - [test_on_kensho.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - " - [test_on_userlibri.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - " - [test_on_spoken_wikipedia.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - "\n", - "4. To reproduce creation of training data see [README.md](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/README.md)\n", - "\n", - "5. To run training see [run_training.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_training.sh)\n", - "\n", - "6. Promising future research directions would be:\n", - " - add a simple trainable classifier on top of SpellMapper predictions instead of using multiple thresholds\n", - " - retrain with adding more various false positives to the training data" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From ebba8b14263ca513c4453fcde0472785c19f46c1 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Mon, 10 Jun 2024 15:36:17 -0700 Subject: [PATCH 02/17] Add Dev Container Bug Report (#9430) * Add dev_container_bug_report.md Signed-off-by: Pablo Garay * Date field refactor --------- Signed-off-by: Pablo Garay --- .../dev_container_bug_report.md | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/dev_container_bug_report.md diff --git a/.github/ISSUE_TEMPLATE/dev_container_bug_report.md b/.github/ISSUE_TEMPLATE/dev_container_bug_report.md new file mode 100644 
index 0000000000000..fe81ec6252d87 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/dev_container_bug_report.md @@ -0,0 +1,35 @@ +--- +container pulled on date: mm/dd/yyyy +name: Dev container - Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: '' + +--- + +**Describe the bug** + +A clear and concise description of what the bug is. + +**Steps/Code to reproduce bug** + +Please list *minimal* steps or code snippet for us to be able to reproduce the bug. + +A helpful guide on how to craft a minimal bug report http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports. + + +**Expected behavior** + +A clear and concise description of what you expected to happen. + +**Environment overview (please complete the following information)** + + - Environment location: Docker + - Method of install: Please specify exact commands you used to install. + - If method of install is [Docker], provide `docker pull` & `docker run` commands used + +**Additional context** + +Add any other context about the problem here. 
+Example: GPU model From 97aa7322a5de430a908f4bcafac371521c3116c0 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 11 Jun 2024 16:27:08 +0200 Subject: [PATCH 03/17] Enable specyfing alpha for SQ (#9423) Signed-off-by: Jan Lasek --- examples/nlp/language_modeling/conf/megatron_quantization.yaml | 1 + nemo/export/quantize/quantizer.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/examples/nlp/language_modeling/conf/megatron_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_quantization.yaml index 88d10ae0a66cd..52454f5c89061 100644 --- a/examples/nlp/language_modeling/conf/megatron_quantization.yaml +++ b/examples/nlp/language_modeling/conf/megatron_quantization.yaml @@ -26,6 +26,7 @@ quantization: calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset num_calib_size: 512 # number of samples used for calibration awq_block_size: 128 # block size for scaling factors in AWQ algorithm + alpha: 1.0 # alpha parameter in SmoothQuant algorithm export: decoder_type: llama # gptnext, gpt2, llama diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 4748f4957a52b..e25d529ec62cb 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -116,6 +116,9 @@ def __init__( "axis": None, "enable": enable_quant_kv_cache, } + if quantization_config.algorithm == "int8_sq": + logging.info(f"Using int8_sq alpha = {quantization_config.alpha}") + quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": quantization_config.alpha} self.quant_cfg = quant_cfg else: From 91ab412e484e29cf9ebe0286c428281b8e599523 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Tue, 11 Jun 2024 18:27:07 +0300 Subject: [PATCH 04/17] add support for new mcore ds features (#9388) * add validation_drop_last and add_extra_token params support for mcore ds Signed-off-by: dimapihtar * pad samples with dummy tokens only Signed-off-by: dimapihtar * Apply isort 
and black reformatting Signed-off-by: dimapihtar * use no_seqlen_plus_one_input_tokens as mcore's add_extra_token Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * set train_valid_test_num_samples[1] to None Signed-off-by: dimapihtar * add test case when validation_drop_last is False Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * set validation_drop_last as True by default Signed-off-by: dimapihtar * Update nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py Co-authored-by: jbaczek <45043825+jbaczek@users.noreply.github.com> Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> * Update nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py Co-authored-by: jbaczek <45043825+jbaczek@users.noreply.github.com> Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> --------- Signed-off-by: dimapihtar Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: jbaczek <45043825+jbaczek@users.noreply.github.com> --- .github/workflows/cicd-main.yml | 2 ++ .../nlp/data/language_modeling/megatron/data_samplers.py | 5 ++--- .../nlp/models/language_modeling/megatron_gpt_model.py | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 01a8cfc4b0df6..6cf60271e0d7f 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -2398,6 +2398,7 @@ jobs: model.activations_checkpoint_method=block \ model.activations_checkpoint_granularity=full \ model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ 
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings @@ -2432,6 +2433,7 @@ jobs: model.activations_checkpoint_method=block \ model.activations_checkpoint_granularity=full \ model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings diff --git a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py index 6818f99d0e4f4..4a8b989a7b6d6 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py @@ -91,8 +91,7 @@ def __len__(self): return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 @abc.abstractmethod - def __iter__(self): - ... + def __iter__(self): ... 
class MegatronPretrainingSampler(BaseMegatronSampler): @@ -107,7 +106,7 @@ def __iter__(self): indices = range(self.consumed_samples, self.total_samples) if (not self.drop_last) and self.pad_samples_to_global_batch_size: pad_samples_num = -len(indices) % self.global_batch_size - pad_indices = range(-1, -pad_samples_num - 1, -1) + pad_indices = [None] * pad_samples_num indices = chain(indices, pad_indices) for idx in indices: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 718991dc203da..8cb8d95150c97 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1472,8 +1472,7 @@ def build_train_valid_test_datasets(self): # Where N_d is the total number of samples in a dataset (files), and N is the requested number of samples (provided for every split in the list below). # Setting N = 1 we force E to be 1 as well if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): - train_valid_test_num_samples[1] = 1 - + train_valid_test_num_samples[1] = None # Add extra FIM tokens to tokenizer if self.cfg.data.get('add_fim', False) and self.cfg.tokenizer.library == 'megatron': fim_tokens = self.cfg.data.fim.extra_tokens @@ -1498,6 +1497,7 @@ def build_train_valid_test_datasets(self): is_dataset_built_on_rank = lambda: True mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False + add_extra_token = not self.cfg.data.get("no_seqlen_plus_one_input_tokens", False) kwargs = { "random_seed": self.cfg.seed, "sequence_length": self.cfg.data.seq_length, @@ -1508,6 +1508,8 @@ def build_train_valid_test_datasets(self): "eod_mask_loss": self.eod_mask_loss, "create_attention_mask": not self.get_attention_mask_from_fusion, "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True), + 
"drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", True), + "add_extra_token_to_sequence": add_extra_token, } data_prefix = self.cfg.data.data_prefix From df5f8cb0a16caadf319f8ebe96c2199fcb8594b2 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 11 Jun 2024 10:54:14 -0700 Subject: [PATCH 05/17] Akoumparouli/profiling docs (#9420) * profiling docs Signed-off-by: Alexandros Koumparoulis * fix docstring Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- docs/source/core/core.rst | 32 +++++++ nemo/core/classes/modelPT.py | 181 ++++++++++++++++++----------------- 2 files changed, 127 insertions(+), 86 deletions(-) diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst index 1c9325cf0a96c..3c1a496993bd4 100644 --- a/docs/source/core/core.rst +++ b/docs/source/core/core.rst @@ -741,3 +741,35 @@ To register a child model, use the ``register_nemo_submodule`` method of the par else: self.child_model = None + + +Profiling +--------- + +NeMo offers users two options for profiling: Nsys & CUDA memory profiling. These two options allow users +to debug performance issues as well as memory issues such as memory leaks. + +To enable Nsys profiling, add the following options to the model config: +nsys_profile: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + +Finally, the model training script with: + +nsys profile -s none -o -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/... +See more options at `nsight user guide `_. 
+ + + +To enable CUDA memory profiling, add the following options to the model config: + +memory_profile: + enabled: True + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + rank: 0 # Global rank ID to profile + output_path: None # Path to store the profile output file + +And invoke your NeMo script without any changes in the invocation command. diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 0a9054c23da83..f5d61a8edb157 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -220,37 +220,40 @@ def on_fit_start(self) -> None: return super().on_fit_start() def register_artifact( - self, config_path: str, src: str, verify_src_exists: bool = True, + self, + config_path: str, + src: str, + verify_src_exists: bool = True, ): - """ Register model artifacts with this function. These artifacts (files) will be included inside .nemo file - when model.save_to("mymodel.nemo") is called. + """Register model artifacts with this function. These artifacts (files) will be included inside .nemo file + when model.save_to("mymodel.nemo") is called. - How it works: + How it works: - 1. It always returns existing absolute path which can be used during Model constructor call - EXCEPTION: src is None or "" in which case nothing will be done and src will be returned - 2. It will add (config_path, model_utils.ArtifactItem()) pair to self.artifacts + 1. It always returns existing absolute path which can be used during Model constructor call + EXCEPTION: src is None or "" in which case nothing will be done and src will be returned + 2. It will add (config_path, model_utils.ArtifactItem()) pair to self.artifacts - .. code-block:: + .. code-block:: - If "src" is local existing path: - then it will be returned in absolute path form. 
- elif "src" starts with "nemo_file:unique_artifact_name": - .nemo will be untarred to a temporary folder location and an actual existing path will be returned - else: - an error will be raised. + If "src" is local existing path: + then it will be returned in absolute path form. + elif "src" starts with "nemo_file:unique_artifact_name": + .nemo will be untarred to a temporary folder location and an actual existing path will be returned + else: + an error will be raised. - WARNING: use .register_artifact calls in your models' constructors. - The returned path is not guaranteed to exist after you have exited your model's constructor. + WARNING: use .register_artifact calls in your models' constructors. + The returned path is not guaranteed to exist after you have exited your model's constructor. - Args: - config_path (str): Artifact key. Usually corresponds to the model config. - src (str): Path to artifact. - verify_src_exists (bool): If set to False, then the artifact is optional and register_artifact will return None even if - src is not found. Defaults to True. + Args: + config_path (str): Artifact key. Usually corresponds to the model config. + src (str): Path to artifact. + verify_src_exists (bool): If set to False, then the artifact is optional and register_artifact will return None even if + src is not found. Defaults to True. 
- Returns: - str: If src is not None or empty it always returns absolute path which is guaranteed to exist during model instance life + Returns: + str: If src is not None or empty it always returns absolute path which is guaranteed to exist during model instance life """ if src is None or src == "": @@ -610,7 +613,9 @@ def setup_megatron_optimization(self, optim_config: Union[Dict[str, Any], DictCo return megatron_optim_config def setup_optimization( - self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None, + self, + optim_config: Optional[Union[DictConfig, Dict]] = None, + optim_kwargs: Optional[Dict[str, Any]] = None, ): """Prepares an optimizer from a string name and its optional config parameters. @@ -760,7 +765,10 @@ def setup_optimization( if optimizer_name == 'mcore_distributed_optim': # setup megatron_optim_config and get Mcore based optimizer with the wrapper megatron_optim_config = self.setup_megatron_optimization(optimizer_args) - _megatron_optimizer = get_megatron_optimizer(megatron_optim_config, self.model,) + _megatron_optimizer = get_megatron_optimizer( + megatron_optim_config, + self.model, + ) optimizer = McoreDistributedOptimizer(_megatron_optimizer) else: @@ -781,30 +789,30 @@ def setup_optimization( def setup_optimizer_param_groups(self): """ - Used to create param groups for the optimizer. - As an example, this can be used to specify per-layer learning rates: - - optim.SGD([ - {'params': model.base.parameters()}, - {'params': model.classifier.parameters(), 'lr': 1e-3} - ], lr=1e-2, momentum=0.9) - - See https://pytorch.org/docs/stable/optim.html for more information. - By default, ModelPT will use self.parameters(). - Override this method to add custom param groups. 
- In the config file, add 'optim_param_groups' to support different LRs - for different components (unspecified params will use the default LR): - - model: - optim_param_groups: - encoder: - lr: 1e-4 - momentum: 0.8 - decoder: - lr: 1e-3 - optim: - lr: 3e-3 - momentum: 0.9 + Used to create param groups for the optimizer. + As an example, this can be used to specify per-layer learning rates: + + optim.SGD([ + {'params': model.base.parameters()}, + {'params': model.classifier.parameters(), 'lr': 1e-3} + ], lr=1e-2, momentum=0.9) + + See https://pytorch.org/docs/stable/optim.html for more information. + By default, ModelPT will use self.parameters(). + Override this method to add custom param groups. + In the config file, add 'optim_param_groups' to support different LRs + for different components (unspecified params will use the default LR): + + model: + optim_param_groups: + encoder: + lr: 1e-4 + momentum: 0.8 + decoder: + lr: 1e-3 + optim: + lr: 3e-3 + momentum: 0.9 """ if not hasattr(self, "parameters"): self._optimizer_param_groups = None @@ -1710,26 +1718,27 @@ def update_save_restore_connector(cls, save_restore_connector): setattr(cls, '_save_restore_connector', save_restore_connector) def _setup_profiling(self): - """ Enables nsys profiling - To use, add the following optoins to the model config: - ## Nsys profiling options - nsys_profile: False - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - ranks: [0] # Global rank IDs to profile - gen_shape: False # Generate model and kernel details including input shapes - And then wrap the model training script with: - nsys profile -s none -o -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/... 
- See more options at: https://docs.nvidia.com/nsight-systems/UserGuide/index.html#cli-profiling - - Enables CUDA memory profiling - To use, add the following optoins to the model config: - ## CUDA memory profiling options - memory_profile: False - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - rank: 0 # Global rank ID to profile - output_path: None # Path to store the profile output file + """Enables nsys profiling + To use, add the following options to the model config: + ## Nsys profiling options + nsys_profile: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + And then wrap the model training script with: + nsys profile -s none -o -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/... + See more options at: https://docs.nvidia.com/nsight-systems/UserGuide/index.html#cli-profiling + + Enables CUDA memory profiling + To use, add the following options to the model config: + ## CUDA memory profiling options + memory_profile: + enabled: True + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + rank: 0 # Global rank ID to profile + output_path: None # Path to store the profile output file """ if self.cfg.get('nsys_profile', None) is not None: if self.cfg.nsys_profile.get('enabled', False): @@ -1791,9 +1800,9 @@ def _setup_profiling(self): ) def on_train_start(self): - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-start - We use it here to copy the relevant config for dynamic freezing. 
+ """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-start + We use it here to copy the relevant config for dynamic freezing. """ # dynamic freezing @@ -1810,9 +1819,9 @@ def on_train_start(self): setattr(self, '_freeze_cfg', None) def on_train_batch_start(self, batch: Any, batch_idx: int, unused: int = 0) -> Optional[int]: - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-start - We use it here to enable nsys profiling and dynamic freezing. + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-start + We use it here to enable nsys profiling and dynamic freezing. """ # nsys profiling @@ -1856,9 +1865,9 @@ def on_train_batch_start(self, batch: Any, batch_idx: int, unused: int = 0) -> O self._freeze_cfg['is_frozen'][ml] = False def on_train_batch_end(self, outputs, batch: Any, batch_idx: int, unused: int = 0) -> None: - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-end - We use it here to enable nsys profiling. + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-end + We use it here to enable nsys profiling. """ if self.device.type == 'cuda': @@ -1893,30 +1902,30 @@ def _cleanup_on_execution_end(self): self._test_step_outputs = None def on_train_end(self): - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end - We use it here to cleanup the dynamic freezing config. + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end + We use it here to cleanup the dynamic freezing config. 
""" self._cleanup_on_execution_end() def on_test_end(self): - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end """ self._cleanup_on_execution_end() def on_predict_end(self): - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end """ self._cleanup_on_execution_end() # TODO: Remove in PTL 1.7.2 def cuda(self, device=None): - """ PTL is overriding this method and changing the pytorch behavior of a module. + """PTL is overriding this method and changing the pytorch behavior of a module. The PTL LightingModule override will move the module to device 0 if device is None. See the PTL method here: https://github.com/Lightning-AI/lightning/blob/master/src/pytorch_lightning/core/mixins/device_dtype_mixin.py#L113 From c51cdbb5d2ab8e99cb48d621cc33706931b13a7f Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Tue, 11 Jun 2024 15:55:01 -0400 Subject: [PATCH 06/17] LoRA for MoE Layer (#9396) * initial moe lora impl Signed-off-by: Chen Cui * Apply isort and black reformatting Signed-off-by: cuichenx * fix dangling adapter Signed-off-by: Chen Cui * update to newest mcore code Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Signed-off-by: cuichenx Co-authored-by: cuichenx --- .../common/megatron/adapters/mcore_mixins.py | 73 ++++++++++++--- .../megatron/adapters/parallel_adapters.py | 88 +++++++++++++++++-- nemo/collections/nlp/parts/peft_config.py | 40 +++++++-- 3 files changed, 173 insertions(+), 28 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index 
a85c155cc0a85..bcfe07f702a0d 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -14,19 +14,16 @@ import torch import torch.nn.functional as F -from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.tensor_parallel import ColumnParallelLinear from megatron.core.transformer.attention import SelfAttention -from megatron.core.transformer.custom_layers.transformer_engine import ( - SplitAlongDim, - TEColumnParallelLinear, - TELayerNormColumnParallelLinear, -) +from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim from megatron.core.transformer.mlp import MLP +from megatron.core.transformer.moe.experts import SequentialMLP from megatron.core.transformer.transformer_layer import TransformerLayer from megatron.core.utils import make_viewless_tensor @@ -37,6 +34,8 @@ LoraDenseAttentionAdapterConfig, LoraHto4HAdapterConfig, LoraKQVAdapterConfig, + LoraMoe4HtoHAdapterConfig, + LoraMoeHto4HAdapterConfig, LoraUnfusedHto4HAdapterConfig, LoraUnfusedKQVAdapterConfig, MLPInfusedAdapterConfig, @@ -281,13 +280,15 @@ def forward( class MCoreMLPMixin(MLP, MCoreAdapterModuleMixin): def mcore_register_adapters(self): """ - Setup NeMo IA3 adapter to this MCore layer. + Setup NeMo IA3 and LoRA adapter to this MCore layer. 
""" self.set_accepted_adapter_types( [ LoraUnfusedHto4HAdapterConfig._target_, LoraHto4HAdapterConfig._target_, Lora4HtoHAdapterConfig._target_, + LoraMoeHto4HAdapterConfig._target_, + LoraMoe4HtoHAdapterConfig._target_, MLPInfusedAdapterConfig._target_, ] ) # only self attn (packed qkv) for now @@ -302,9 +303,12 @@ def mcore_register_adapters(self): # overlap is used. self.linear_fc1.return_layernorm_output_gathered = True - def forward(self, hidden_states): + def forward(self, hidden_states, expert_idx=None): # [s, b, 4 * h/p] - if self.linear_fc1.te_return_bias: + if isinstance(self.linear_fc1, ColumnParallelLinear): + layernorm_output = hidden_states + intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) + elif self.linear_fc1.te_return_bias: intermediate_parallel, bias_parallel, layernorm_output = self.linear_fc1(hidden_states) else: # bias_parallel is None @@ -315,15 +319,19 @@ def forward(self, hidden_states): lora_adapter = None lora_fc1_adapter = self.get_adapter_module(AdapterName.LORA_Hto4H_ADAPTER) lora_unfused_fc1_adapter = self.get_adapter_module(AdapterName.LORA_UNFUSED_Hto4H_ADAPTER) + lora_moe_fc1_adapter = self.get_adapter_module(AdapterName.LORA_MOE_Hto4H_ADAPTER) if lora_fc1_adapter and self.adapter_cfg[AdapterName.LORA_Hto4H_ADAPTER]['enabled']: lora_adapter = lora_fc1_adapter if lora_unfused_fc1_adapter and self.adapter_cfg[AdapterName.LORA_UNFUSED_Hto4H_ADAPTER]['enabled']: assert lora_adapter is None, "Expected only one of LORA_Hto4H_ADAPTER or LORA_UNFUSED_Hto4H_ADAPTER" lora_adapter = lora_unfused_fc1_adapter + lora_output = 0 if lora_adapter: lora_output = lora_adapter(layernorm_output) - intermediate_parallel = intermediate_parallel + lora_output + elif lora_moe_fc1_adapter and self.adapter_cfg[AdapterName.LORA_MOE_Hto4H_ADAPTER]['enabled']: + lora_output = lora_moe_fc1_adapter(layernorm_output, expert_idx) + intermediate_parallel = intermediate_parallel + lora_output if self.config.bias_activation_fusion: if 
self.activation_func == F.gelu: @@ -363,14 +371,51 @@ def glu(x): # LoRA logic if self.is_adapter_available(): - lora_linear_fc2_adapter = self.get_adapter_module(AdapterName.LORA_4HtoH_ADAPTER) - if lora_linear_fc2_adapter and self.adapter_cfg[AdapterName.LORA_4HtoH_ADAPTER]['enabled']: - lora_output = lora_linear_fc2_adapter(intermediate_parallel) - output = output + lora_output + lora_fc2_adapter = self.get_adapter_module(AdapterName.LORA_4HtoH_ADAPTER) + lora_moe_fc2_adapter = self.get_adapter_module(AdapterName.LORA_MOE_4HtoH_ADAPTER) + + lora_output = 0 + if lora_fc2_adapter and self.adapter_cfg[AdapterName.LORA_4HtoH_ADAPTER]['enabled']: + lora_output = lora_fc2_adapter(intermediate_parallel) + elif lora_moe_fc2_adapter and self.adapter_cfg[AdapterName.LORA_MOE_4HtoH_ADAPTER]['enabled']: + lora_output = lora_moe_fc2_adapter(intermediate_parallel, expert_idx) + + output = output + lora_output return output, output_bias +class MCoreSequentialMLPMixin(SequentialMLP, MCoreAdapterModuleMixin): + def mcore_register_adapters(self): + """ + We don't want the SequentialMLP layer to take any adapters. 
We only want to override the forward() behavior + """ + pass + + def forward(self, permuted_local_hidden_states, tokens_per_expert): + output_local = torch.zeros_like(permuted_local_hidden_states) + output_bias_local = None + if self.add_bias: + output_bias_local = torch.zeros_like(permuted_local_hidden_states) + + cumsum_num_tokens = torch.cumsum(tokens_per_expert, dim=0) + # Insert zero at the beginning for offset index's convenience + zero_tensor = torch.zeros(1, dtype=torch.long, device=cumsum_num_tokens.device) + cumsum_num_tokens = torch.cat((zero_tensor, cumsum_num_tokens)) + for expert_num, expert in enumerate(self.local_experts): + start = cumsum_num_tokens[expert_num] + end = cumsum_num_tokens[expert_num + 1] + hidden = permuted_local_hidden_states[start:end] + output, output_bias = expert(hidden, expert_num) # expert: MLP + + output_local[start:end] = output + if self.add_bias: + output_bias = output_bias.expand_as(output) + output_bias_local[start:end, :] = output_bias + + return output_local, output_bias_local + + class MCoreGPTEmbeddingMixin(LanguageModelEmbedding, MCoreAdapterModuleMixin): def mcore_register_adapters(self): """ diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 61903e6b36735..21dace0088776 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -83,6 +83,8 @@ class AdapterName(str, enum.Enum): LORA_Hto4H_ADAPTER = "lora_hto4h_adapter" LORA_UNFUSED_Hto4H_ADAPTER = "lora_unfused_hto4h_adapter" LORA_4HtoH_ADAPTER = "lora_4htoh_adapter" + LORA_MOE_Hto4H_ADAPTER = "lora_moe_hto4h_adapter" + LORA_MOE_4HtoH_ADAPTER = "lora_moe_4htoh_adapter" MULTIMODAL_PROJECTOR_ADAPTER = "mm_projector_adapter" PARALLEL_LINEAR_ADAPTER = "parallel_linear_adapter" @@ -611,6 +613,80 @@ class 
LoraUnfusedKQVAdapterConfig(AdapterConfig): _target_: str = "{0}.{1}".format(LoraUnfusedKQVAdapter.__module__, LoraUnfusedKQVAdapter.__name__) +class LoraMoeAdapter(nn.Module, AdapterModuleUtil): + def __init__( + self, + num_moe_experts: int, + in_features: int, + out_features: int, + dim: int, + activation: str = 'identity', + norm_position: Optional[str] = None, + norm_type: Optional[str] = None, + column_init_method: str = 'xavier', + row_init_method: str = 'zero', + gather_output: bool = False, + input_is_parallel: bool = False, + dropout: float = 0.0, + model_parallel_config: Optional[ModelParallelConfig] = None, + alpha: float | None = None, + dropout_position: str = 'post', + a2a_experimental: bool = False, + **kwargs, + ): + super().__init__() + + self.num_moe_experts = num_moe_experts + adapter_args = { + "in_features": in_features, + "out_features": out_features, + "dim": dim, + "activation": activation, + "norm_position": norm_position, + "norm_type": norm_type, + "column_init_method": column_init_method, + "row_init_method": row_init_method, + "gather_output": gather_output, + "input_is_parallel": input_is_parallel, + "dropout": dropout, + "model_parallel_config": model_parallel_config, + "alpha": alpha, + "dropout_position": dropout_position, + "a2a_experimental": a2a_experimental, + } + self.expert_adapters = nn.ModuleList() + for i in range(num_moe_experts): + self.expert_adapters.append(ParallelLinearAdapter(**adapter_args)) + + def forward(self, x, expert_idx): + return self.expert_adapters[expert_idx](x) + + +@dataclass +class LoraMoeHto4HAdapterConfig(AdapterConfig): + num_moe_experts: int + in_features: int + out_features: int + dim: int + activation: str = 'identity' + norm_position: Optional[str] = None + norm_type: Optional[str] = None + column_init_method: str = 'xavier' + row_init_method: str = 'zero' + gather_output: bool = False + input_is_parallel: bool = False + dropout: float = 0.0 + dropout_position: str = 'post' + alpha: float | 
None = None + a2a_experimental: bool = False + _target_: str = "{0}.{1}".format(LoraMoeAdapter.__module__, LoraMoeAdapter.__name__) + + +@dataclass +class LoraMoe4HtoHAdapterConfig(LoraMoeHto4HAdapterConfig): + input_is_parallel: bool = True + + class PromptEncoderAdapter(nn.Module, AdapterModuleUtil): """ The Tensor Parallel MLP prompt encoder network that is used to generate the virtual @@ -690,20 +766,14 @@ def set_inference_table(self, prompt_representation: torch.Tensor): self.is_inference_ready = True return True - def clear_inference_table( - self, - ): + def clear_inference_table(self): self.inference_table.fill_(0.0) self.is_inference_ready = False - def get_inference_table( - self, - ): + def get_inference_table(self): return self.inference_table.data - def inner_forward( - self, - ): + def inner_forward(self): input_embeds = self.embedding(self.indices).unsqueeze(0) intermediate_parallel, bias_parallel = self.first(input_embeds) intermediate_parallel = fused_bias_gelu(intermediate_parallel, bias_parallel) diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index 4d558ce001147..50c97e3498855 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -23,6 +23,7 @@ MCoreGPTEmbeddingMixin, MCoreMLPMixin, MCoreSelfAttentionMixin, + MCoreSequentialMLPMixin, MCoreTransformerLayerMixin, ) except (ImportError, ModuleNotFoundError): @@ -36,6 +37,8 @@ LoraHto4HAdapterConfig, LoraKQVAdapterConfig, LoraKQVAdapterWeightTyingConfig, + LoraMoe4HtoHAdapterConfig, + LoraMoeHto4HAdapterConfig, LoraUnfusedHto4HAdapterConfig, LoraUnfusedKQVAdapterConfig, MLPInfusedAdapterConfig, @@ -176,7 +179,10 @@ def __init__(self, cfg): elif module == PEFT_MODULE_MAP["hto4h_module"]: hto4h_projection_size = cfg.ffn_hidden_size * 2 if fast_glu_activation else cfg.ffn_hidden_size - if lora_cfg.get("variant", "nemo") == "canonical": + if cfg.get('num_moe_experts', None): + _adapter_name = 
AdapterName.LORA_MOE_Hto4H_ADAPTER + _adapter_cfg_cls = LoraMoeHto4HAdapterConfig + elif lora_cfg.get("variant", "nemo") == "canonical": _adapter_name = AdapterName.LORA_UNFUSED_Hto4H_ADAPTER _adapter_cfg_cls = LoraUnfusedHto4HAdapterConfig else: @@ -187,13 +193,35 @@ def __init__(self, cfg): cfg, lora_cfg, cfg.hidden_size, hto4h_projection_size, _adapter_cfg_cls ) name_key_to_cfg[_adapter_name] = adapter_cfg - name_key_to_mcore_mixins[_adapter_name] = [("mlp", MCoreMLPMixin)] + if _adapter_name == AdapterName.LORA_MOE_Hto4H_ADAPTER: + name_key_to_mcore_mixins[_adapter_name] = [("mlp.experts", MCoreSequentialMLPMixin)] + for i in range(int(cfg.num_moe_experts)): + name_key_to_mcore_mixins[_adapter_name].append( + (f"mlp.experts.local_experts.{i}", MCoreMLPMixin) + ) + else: + name_key_to_mcore_mixins[_adapter_name] = [("mlp", MCoreMLPMixin)] + elif module == PEFT_MODULE_MAP["4htoh_module"]: + if cfg.get('num_moe_experts', None): + _adapter_name = AdapterName.LORA_MOE_4HtoH_ADAPTER + _adapter_cfg_cls = LoraMoe4HtoHAdapterConfig + else: + _adapter_name = AdapterName.LORA_4HtoH_ADAPTER + _adapter_cfg_cls = Lora4HtoHAdapterConfig + adapter_cfg = self._create_lora_config( - cfg, lora_cfg, cfg.ffn_hidden_size, cfg.hidden_size, Lora4HtoHAdapterConfig + cfg, lora_cfg, cfg.ffn_hidden_size, cfg.hidden_size, _adapter_cfg_cls ) - name_key_to_cfg[AdapterName.LORA_4HtoH_ADAPTER] = adapter_cfg - name_key_to_mcore_mixins[AdapterName.LORA_4HtoH_ADAPTER] = [("mlp", MCoreMLPMixin)] + name_key_to_cfg[_adapter_name] = adapter_cfg + if _adapter_name == AdapterName.LORA_MOE_4HtoH_ADAPTER: + name_key_to_mcore_mixins[_adapter_name] = [("mlp.experts", MCoreSequentialMLPMixin)] + for i in range(int(cfg.num_moe_experts)): + name_key_to_mcore_mixins[_adapter_name].append( + (f"mlp.experts.local_experts.{i}", MCoreMLPMixin) + ) + else: + name_key_to_mcore_mixins[_adapter_name] = [("mlp", MCoreMLPMixin)] else: logging.error( f"Unrecognized target_module string: {module}.\n" @@ -228,6 +256,8 @@ 
def _create_lora_config( assert kv_channels is not None, "kv_channels must be provided for canonical Lora" config_args.update({"num_query_groups": num_query_groups, "kv_channels": kv_channels}) config_args.pop("out_features") + elif adapter_cfg_cls in (LoraMoeHto4HAdapterConfig, LoraMoe4HtoHAdapterConfig): + config_args.update({'num_moe_experts': cfg.num_moe_experts}) if lora_cfg.weight_tying: position_embedding_strategy = lora_cfg.get("position_embedding_strategy", None) From bbdcd20c5753a4995957493c2e0ba4c2fd12054f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 11 Jun 2024 22:16:42 +0200 Subject: [PATCH 07/17] ci: Enrich notifications (#9412) * ci: Extract step output Signed-off-by: Oliver Koenig * ci: Enrich notifications Signed-off-by: Oliver Koenig * ci(notifications): Catch case multiple failures Signed-off-by: Oliver Koenig * ci(notifications): Logs to single line Signed-off-by: Oliver Koenig * ci(notifications): Infer job_url Signed-off-by: Oliver Koenig * ci(notifications): Make author and url clickable Signed-off-by: Oliver Koenig * ci(notifications): Extract the last 2K chars Signed-off-by: Oliver Koenig * ci(notifications): Update docs Signed-off-by: Oliver Koenig * ci(notifications): Disable b64 wrapping Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/scripts/slackHelper.sh | 23 ---------- .github/workflows/_test_template.yml | 39 +++++++++++++++- .github/workflows/cicd-main.yml | 66 +++++++++++++++++++++++++--- 3 files changed, 98 insertions(+), 30 deletions(-) delete mode 100644 .github/scripts/slackHelper.sh diff --git a/.github/scripts/slackHelper.sh b/.github/scripts/slackHelper.sh deleted file mode 100644 index 4696cebcf13b0..0000000000000 --- a/.github/scripts/slackHelper.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -function sendSlackMessage() { - - WEBHOOK_URL="$1" - PIPELINE_URL="$2" - - curl -X POST -H "Content-type: application/json" --data "{ - \"blocks\": [ - { - \"type\": 
\"section\", - \"text\": { - \"type\": \"mrkdwn\", - \"text\": \"\ -🚨 *CI/CD failure at <$PIPELINE_URL|NeMo CI>*: - -\" - } - } - ] - }" $WEBHOOK_URL - -} diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 31e9452d0fe51..065af34408ccb 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -30,13 +30,16 @@ on: conclusion: description: Conclusion of main test step value: ${{ jobs.main.outputs.conclusion }} - + log: + description: Last 2000 characters of the test step's log + value: ${{ jobs.main.outputs.log }} jobs: main: runs-on: ${{ inputs.RUNNER }} timeout-minutes: ${{ inputs.TIMEOUT }} outputs: conclusion: ${{ steps.main.conclusion }} + log: ${{ steps.main.outputs.log }} container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -50,7 +53,39 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - id: main - run: ${{ inputs.SCRIPT }} + name: Run main script + run: | + set +e + ( + set -e + + ${{ inputs.SCRIPT }} + ) 2> >(tee err.log) + + EXIT_CODE=$? + # Slack only allows 3000 chars per block. + # Since a block contains information about other + # metdata than the log, we prune the log to 2000 + # chars. 
+ min() { + if (( $1 > $2 )); then + echo $2 + else + echo $1 + fi + } + + log=$(cat err.log) + + MAX_LENGTH=$(echo $log | wc -m) + MAX_LENGTH=$(min $MAX_LENGTH 2000) + MAX_LENGTH=$(( $MAX_LENGTH - 1 )) + + log=$(echo "${log: -${MAX_LENGTH}}" | base64 -w 0) + echo "log=$log" | tee -a "$GITHUB_OUTPUT" + + exit $EXIT_CODE + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: failure() && inputs.IS_OPTIONAL == false - name: after_script diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 6cf60271e0d7f..fab97d71f47a3 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4284,12 +4284,68 @@ jobs: - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }} run: | - source .github/scripts/slackHelper.sh - - WEBHOOK_URL=${{ secrets.SLACK_WEBHOOK }} + set -x + + PR_INFO=$(curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/${{ github.repository }}/pulls/${{ github.event.number }} + ) + PR_URL=$(echo -E $PR_INFO | jq '.html_url' | tr -d '"') + PR_TITLE=$(echo -E $PR_INFO | jq '.title' | tr -d '"') + PIPELINE_URL=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - - sendSlackMessage "$WEBHOOK_URL" "$PIPELINE_URL" + BASE_MESSAGE=' + { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "🚨 *CI/CD failure at <'$PIPELINE_URL'|NeMo CI>*." 
+ } + } + ] + } + ' + + JOBS_URL="https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs" + SUMMARY="[]" + while IFS= read -r JOB; do + JOB_NAME="$(echo $JOB | jq '.key' | tr -d '"') / main" + JOB_ID=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" $JOBS_URL | jq --arg job_name "$JOB_NAME" -r '.jobs[] | select(.name == $job_name) | .id') + JOB_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}/job/$JOB_ID" + + LOGS=$(echo $JOB | yq '(.value.outputs.log | @base64d)' | tr -d '"') + + SUMMARY=$(echo "$SUMMARY" | jq \ + --arg pr "<$PR_URL|$PR_TITLE>" \ + --arg job "<$JOB_URL|$JOB_NAME>" \ + --arg logs "$LOGS" \ + --arg author "" \ + --arg branch ""\ + '. += [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + "PR: " + $pr + + "\nJob: " + $job + + "\nAuthor: " + $author + + "\nBranch: " + $branch + + "\nLogs:" + + "```\n" + $logs + "\n```" + ) + } + } + ]') + done <<<$(echo '${{ toJSON(needs) }}' | jq -c 'to_entries | .[] | select(.value.outputs.conclusion == "failure")') + + MESSAGE=$(echo $BASE_MESSAGE | jq -c --argjson summary "$SUMMARY" '.blocks += $summary') + + curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_WEBHOOK }} - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} run: | From 070e63dad6d70e3c231d44d810e29b63f9422a0c Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 11 Jun 2024 13:52:47 -0700 Subject: [PATCH 08/17] apply user's precision to output checkpoint (#9222) Signed-off-by: Alexandros Koumparoulis --- .../convert_mistral_7b_nemo_to_hf.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py index 07e12f36c3d7d..99d1795aea9c6 100644 --- 
a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py @@ -211,15 +211,18 @@ def convert(in_file, precision=None, cpu_only=True) -> None: else: output_layer_base_name = 'model.language_model.output_layer.weight' state_dict[hf_output_layer_weight_name] = param_to_weights(ckpt[output_layer_base_name]) - return state_dict, nemo_config + return state_dict, nemo_config, dtype if __name__ == '__main__': args = get_args() - hf_state_dict, nemo_config = convert(args.input_name_or_path, args.precision) + hf_state_dict, nemo_config, dtype = convert(args.input_name_or_path, args.precision) config = load_config(args.hf_model_name, nemo_config) - model = AutoModelForCausalLM.from_config(config) + model = AutoModelForCausalLM.from_config( + config, + torch_dtype=dtype, + ) model.load_state_dict(hf_state_dict) model.save_pretrained(args.output_path) hf_tokenizer = AutoTokenizer.from_pretrained(args.hf_model_name) From 3c29fefe9ac442e594f1c35c0f8ecc09b5ef5015 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Tue, 11 Jun 2024 22:49:05 -0400 Subject: [PATCH 09/17] Fix failing RIR unit test with lhotse 1.24+ (#9444) --- .../common/test_lhotse_dataloading.py | 144 ++++++++++++++---- 1 file changed, 117 insertions(+), 27 deletions(-) diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index 744e2884d015e..111c00df392ac 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -158,9 +158,10 @@ def nemo_tarred_manifest_path(nemo_manifest_path: Path) -> Tuple[str, str]: root = nemo_manifest_path.parent / "nemo_tar" root.mkdir(exist_ok=True) - with TarWriter(f"{root}/audios_%01d.tar", shard_size=5) as tar_writer, SequentialJsonlWriter( - root / "tarred_audio_filepaths.jsonl" - ) as mft_writer: + with ( + TarWriter(f"{root}/audios_%01d.tar", shard_size=5) as 
tar_writer, + SequentialJsonlWriter(root / "tarred_audio_filepaths.jsonl") as mft_writer, + ): for idx, d in enumerate(load_jsonl(nemo_manifest_path)): p = d["audio_filepath"] name = Path(p).name @@ -856,7 +857,7 @@ def test_lazy_nemo_iterator_with_offset_field(tmp_path: Path): from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator # Have to generate as INT16 to avoid quantization error after saving to 16-bit WAV - INT16MAX = 2 ** 15 + INT16MAX = 2**15 expected_audio = np.random.randint(low=-INT16MAX - 1, high=INT16MAX, size=(16000,)).astype(np.float32) / INT16MAX audio_path = str(tmp_path / "dummy.wav") sf.write(audio_path, expected_audio, 16000) @@ -904,7 +905,7 @@ def test_lazy_nemo_iterator_with_relative_paths(tmp_path: Path): from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator # Have to generate as INT16 to avoid quantization error after saving to 16-bit WAV - INT16MAX = 2 ** 15 + INT16MAX = 2**15 expected_audio = np.random.randint(low=-INT16MAX - 1, high=INT16MAX, size=(16000,)).astype(np.float32) / INT16MAX audio_path = str(tmp_path / "dummy.wav") sf.write(audio_path, expected_audio, 16000) @@ -950,7 +951,13 @@ def test_lhotse_cuts_resolve_relative_paths(tmp_path: Path): CutSet([cut]).to_file(cuts_path) config = OmegaConf.create( - {"cuts_path": cuts_path, "sample_rate": 16000, "use_lhotse": True, "num_workers": 0, "batch_size": 2,} + { + "cuts_path": cuts_path, + "sample_rate": 16000, + "use_lhotse": True, + "num_workers": 0, + "batch_size": 2, + } ) dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=_Identity()) @@ -981,13 +988,21 @@ def test_extended_data_input_cfg(cutset_shar_path, nemo_tarred_manifest_path_mul "manifest_filepath": nemo_tarred_manifest_path_multi[0], "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + "tags": { + "language": "en", + "modality": 
"audio", + "dataset_name": "D1", + }, }, { "type": "lhotse_shar", "shar_path": cutset_shar_path, "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D2", + }, }, ], "sample_rate": 16000, @@ -1031,17 +1046,27 @@ def test_extended_data_input_cfg_subgroup(cutset_shar_path, nemo_tarred_manifest "manifest_filepath": nemo_tarred_manifest_path_multi[0], "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D1", + }, }, { "type": "lhotse_shar", "shar_path": cutset_shar_path, "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D2", + }, }, ], "weight": 0.2, - "tags": {"group_name": "G1",}, + "tags": { + "group_name": "G1", + }, }, { "type": "group", @@ -1052,16 +1077,26 @@ def test_extended_data_input_cfg_subgroup(cutset_shar_path, nemo_tarred_manifest "manifest_filepath": nemo_tarred_manifest_path_multi[0], "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D3",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D3", + }, }, { "type": "lhotse_shar", "shar_path": cutset_shar_path, "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D4",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D4", + }, }, ], - "tags": {"group_name": "G2",}, + "tags": { + "group_name": "G2", + }, }, ], "sample_rate": 16000, @@ -1107,13 +1142,21 @@ def test_extended_data_input_cfg_yaml_path(tmp_path, cutset_shar_path, nemo_tarr "manifest_filepath": str(nemo_tarred_manifest_path_multi[0]), "tarred_audio_filepaths": 
str(nemo_tarred_manifest_path_multi[1]), "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D1", + }, }, { "type": "lhotse_shar", "shar_path": str(cutset_shar_path), "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D2", + }, }, ] @@ -1166,7 +1209,13 @@ def txt_es_path(tmp_path_factory): def test_text_file_input(txt_en_path, txt_es_path): config = OmegaConf.create( { - "input_cfg": [{"type": "txt", "paths": txt_en_path, "language": "en",},], + "input_cfg": [ + { + "type": "txt", + "paths": txt_en_path, + "language": "en", + }, + ], "shuffle": True, "num_workers": 0, "batch_size": 4, @@ -1312,13 +1361,17 @@ def test_multimodal_text_audio_dataloading( "target_paths": es_paths, "source_language": "en", "target_language": "es", - "tags": {"modality": "text",}, + "tags": { + "modality": "text", + }, }, { "type": "nemo_tarred", "manifest_filepath": manifest_filepath, "tarred_audio_filepaths": tarred_audio_filepaths, - "tags": {"modality": "audio",}, + "tags": { + "modality": "audio", + }, }, ], "shuffle": True, @@ -1339,7 +1392,11 @@ def test_multimodal_text_audio_dataloading( ) dl = get_lhotse_dataloader_from_config( - config=config, global_rank=0, world_size=1, dataset=Identity(), tokenizer=en_es_tokenizer, + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + tokenizer=en_es_tokenizer, ) # Note: we use islice here because the dataloader will be infinite. 
@@ -1402,7 +1459,12 @@ def test_dataloader_with_noise_nemo_json(cutset_path: Path, nemo_manifest_path: "shard_seed": 0, } ) - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity(),) + dl = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + ) batch = next(iter(dl)) assert isinstance(batch, CutSet) assert len(batch) == 2 @@ -1426,7 +1488,12 @@ def test_dataloader_with_noise_lhotse_jsonl(cutset_path: Path): "shard_seed": 0, } ) - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity(),) + dl = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + ) batch = next(iter(dl)) assert isinstance(batch, CutSet) assert len(batch) == 2 @@ -1443,7 +1510,10 @@ def test_dataloader_with_noise_nemo_tar(cutset_path: Path, nemo_tarred_manifest_ config = OmegaConf.create( { "cuts_path": str(cutset_path), - "noise_path": {"manifest_filepath": noise_json, "tarred_audio_filepaths": noise_tar,}, + "noise_path": { + "manifest_filepath": noise_json, + "tarred_audio_filepaths": noise_tar, + }, "noise_mix_prob": 1.0, "noise_snr": [-5.0, 5.0], "batch_size": 2, @@ -1451,7 +1521,12 @@ def test_dataloader_with_noise_nemo_tar(cutset_path: Path, nemo_tarred_manifest_ "shard_seed": 0, } ) - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity(),) + dl = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + ) batch = next(iter(dl)) assert isinstance(batch, CutSet) assert len(batch) == 2 @@ -1464,6 +1539,8 @@ def test_dataloader_with_noise_nemo_tar(cutset_path: Path, nemo_tarred_manifest_ def test_dataloader_with_synth_rir(cutset_path: Path): + from lhotse.augmentation import ReverbWithImpulseResponse + config = OmegaConf.create( { "cuts_path": str(cutset_path), @@ -1474,7 +1551,12 @@ def 
test_dataloader_with_synth_rir(cutset_path: Path): "shard_seed": 0, } ) - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity(),) + dl = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + ) batch = next(iter(dl)) assert isinstance(batch, CutSet) assert len(batch) == 4 @@ -1487,8 +1569,16 @@ def test_dataloader_with_synth_rir(cutset_path: Path): cut = batch[2] assert isinstance(cut, MonoCut) assert isinstance(cut.recording.transforms, list) and len(cut.recording.transforms) == 1 - assert cut.recording.transforms[0]["name"] == "ReverbWithImpulseResponse" + tfnm = cut.recording.transforms[0] + if isinstance(tfnm, dict): # lhotse<=1.23.0 + assert tfnm["name"] == "ReverbWithImpulseResponse" + else: # lhotse>=1.24.0 + assert isinstance(tfnm, ReverbWithImpulseResponse) cut = batch[3] assert isinstance(cut, MonoCut) assert isinstance(cut.recording.transforms, list) and len(cut.recording.transforms) == 1 - assert cut.recording.transforms[0]["name"] == "ReverbWithImpulseResponse" + tfnm = cut.recording.transforms[0] + if isinstance(tfnm, dict): # lhotse<=1.23.0 + assert tfnm["name"] == "ReverbWithImpulseResponse" + else: # lhotse>=1.24.0 + assert isinstance(tfnm, ReverbWithImpulseResponse) From 8e7e46052d12a27bd2c601240878c3406aba58b0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 12 Jun 2024 12:50:56 +0200 Subject: [PATCH 10/17] Add option for mutex timeout in distributed optimizer backward hook (#9087) (#9091) * Tim: Add option for timeout in distopt callback mutex * Replace parent's _lock * Revert "Replace parent's _lock" This reverts commit 972d1b60432009e729bd51ac3b2d989cb4368b82. 
* Raise RuntimeError when timeout * Change RuntimeError to print --------- Signed-off-by: Jaemin Choi Co-authored-by: Jaemin Choi Co-authored-by: Jaemin Choi Co-authored-by: Michal Futrega Co-authored-by: Pablo Garay --- nemo/core/optim/distributed_adam.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/nemo/core/optim/distributed_adam.py b/nemo/core/optim/distributed_adam.py index 77d00de89232a..716c905493e05 100644 --- a/nemo/core/optim/distributed_adam.py +++ b/nemo/core/optim/distributed_adam.py @@ -13,6 +13,7 @@ # limitations under the License. import collections +import contextlib import itertools from typing import Callable, Dict, Iterable, Optional, Union @@ -108,6 +109,8 @@ class MegatronDistributedFusedAdam(DistributedFusedAdam): but requires larger memory than distributing within all ranks, especially for pure data parallel models. (default: False). + lock_timeout (float, optional): timeout for callback mutex in + seconds. **kwargs: keyword arguments to pass to Apex DistributedFusedAdam. 
@@ -118,6 +121,7 @@ def __init__( params: Union[Iterable[torch.nn.Parameter], Iterable[dict]], disable_distributed_parameters: bool = False, distribute_within_nodes: bool = False, + lock_timeout: Optional[float] = None, **kwargs, ): @@ -152,6 +156,25 @@ def __init__( # Construct distributed optimizer super().__init__(param_groups, **kwargs) + # Create mutex with timeout + self._lock_with_timeout = None + if lock_timeout is not None: + + @contextlib.contextmanager + def lock_with_timeout(): + result = self._lock.acquire(timeout=lock_timeout) + try: + yield result + finally: + if result: + # Acquired lock before timeout + self._lock.release() + else: + # Failed to acquire lock before timeout + print(f'MegatronDistributedFusedAdam: Failed to acquire lock within {lock_timeout} seconds.') + + self._lock_with_timeout = lock_with_timeout + def _broadcast_params(self) -> None: # Assume params have already been synchronized pass @@ -166,7 +189,10 @@ def hook(*unused): 'before the forward pass (e.g. by calling data_ptr) ' 'or run DistributedFusedAdam with overlap_param_sync=False.' 
) - with self._lock: + lock = self._lock + if self._lock_with_timeout is not None: + lock = self._lock_with_timeout() + with lock: need_to_initialize = 'fragments' not in self.state[param] if need_to_initialize: self._init_param_state(param, param_group_id, param_id) From 5f6ca08b91e3b249947ef1992d372304bfd7dc6f Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Wed, 12 Jun 2024 17:21:29 +0200 Subject: [PATCH 11/17] [NeMo-UX] Adding support for mcore distributed optimizer (#9435) * Fixing mcore DDP wrapping * Trying to add support for mcore * Proposal how to support mcore's distributed optimizer * Apply isort and black reformatting Signed-off-by: marcromeyn * Remove some un-used code * Remove some un-used code * Apply isort and black reformatting Signed-off-by: marcromeyn * Make design more robust * Make design more robust * Re-use getattr_proxy * Apply isort and black reformatting Signed-off-by: marcromeyn * Add all-reduces to MegatronOptim * Apply isort and black reformatting Signed-off-by: marcromeyn * Remove optimizer_fn from GPTConfig * Apply isort and black reformatting Signed-off-by: marcromeyn * Trying to fix failing megatron_parallel tests * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: marcromeyn Co-authored-by: marcromeyn Co-authored-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --- nemo/collections/llm/gpt/model/base.py | 24 ++++--- nemo/lightning/megatron_parallel.py | 77 ++++++++++++++--------- nemo/lightning/optim.py | 66 +++++++++++++++++++ nemo/lightning/pytorch/strategies.py | 34 ++++++---- tests/lightning/test_megatron_parallel.py | 3 +- 5 files changed, 152 insertions(+), 52 deletions(-) create mode 100644 nemo/lightning/optim.py diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 9bf710d989288..9f5c23493d030 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -1,15 +1,18 @@ from 
dataclasses import dataclass -from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional +from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional, Union import pytorch_lightning as L import torch import torch.distributed from megatron.core.transformer.transformer_config import TransformerConfig +from pytorch_lightning.utilities.types import OptimizerLRScheduler +from torch import nn from torch.optim import Optimizer from nemo.collections.llm import fn from nemo.lightning import get_vocab_size, io from nemo.lightning.megatron_parallel import MaskedTokenLossReduction +from nemo.lightning.optim import MegatronOptim, OptimizerConfig if TYPE_CHECKING: from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel @@ -33,8 +36,6 @@ class GPTConfig(TransformerConfig): # TODO: Move this to better places? get_attention_mask_from_fusion: bool = False - optimizer_fn: Optional[Callable[["GPTModel"], Optimizer]] = None - def configure_model(self, tokenizer) -> "MCoreGPTModel": vp_size = self.virtual_pipeline_model_parallel_size if vp_size: @@ -69,20 +70,19 @@ def __init__( self, config: GPTConfig, # TODO: Add transformer_layer_spec when we update mcore + optim: Optional[Union[MegatronOptim, Callable[[nn.Module], OptimizerLRScheduler]]] = None, tokenizer: Optional["TokenizerSpec"] = None, ): super().__init__() self.config = config self.tokenizer = tokenizer + self.optim = optim or MegatronOptim(config=OptimizerConfig(lr=1e-4, use_distributed_optimizer=True)) def configure_model(self) -> None: self.module = self.config.configure_model(self.tokenizer) - def configure_optimizers(self) -> Optimizer: - if self.config.optimizer_fn is not None: - return self.config.optimizer_fn(self) - - return gpt_default_optimizer(self) + def configure_optimizers(self, megatron_parallel=None): + return self.optim(megatron_parallel or self) def forward( self, @@ -172,9 +172,13 @@ def gpt_forward_step(model, batch) -> torch.Tensor: def gpt_default_optimizer(module) -> 
Optimizer: - from apex.optimizers import FusedAdam + # from apex.optimizers import FusedAdam + + from megatron.core.optimizer import OptimizerConfig + + return OptimizerConfig(lr=1e-4) - return FusedAdam(module.parameters(), lr=1e-4) + # return FusedAdam(module.parameters(), lr=1e-4) def get_batch_on_this_context_parallel_rank(batch): diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index d23e57941aafd..12a9da97c3426 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -3,6 +3,7 @@ import functools import inspect import queue +import types from collections import defaultdict from typing import ( Any, @@ -24,6 +25,7 @@ import torch import torch.distributed +from megatron.core.distributed import DistributedDataParallel as McoreDDP from megatron.core.distributed import DistributedDataParallelConfig from torch import Tensor, nn @@ -132,37 +134,37 @@ def __init__( _model.configure_model() _pipeline.append(_model) - if isinstance(ddp_config, DistributedDataParallelConfig): - from megatron.core.distributed import DistributedDataParallel as McoreDDP - - _pipeline = [ - McoreDDP( - model_chunk.config, - ddp_config, - model_chunk, - data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), - expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), - # Turn off bucketing for model_chunk 2 onwards, since communication for these - # model chunks is overlapped with compute anyway. 
- disable_bucketing=(model_chunk_idx > 0), - ) - for (model_chunk_idx, model_chunk) in enumerate(_pipeline) - ] + if isinstance(ddp_config, DistributedDataParallelConfig): + for model_chunk_idx, model_chunk in enumerate(_pipeline): + module = model_chunk.module + ddp = DDP( + module.config, + ddp_config, + module, + data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), + expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), + # Turn off bucketing for model_chunk 2 onwards, since communication for these + # model chunks is overlapped with compute anyway. + disable_bucketing=(model_chunk_idx > 0), + ) + model_chunk.module = ddp + model_chunk.buffers = ddp.buffers # We need to do this explicitly since this is a attr pytorch uses + model_chunk.__class__.__getattr__ = getattr_proxy # type: ignore - for i, model_module in enumerate(_pipeline): - if not cpu: - model_module.cuda(torch.cuda.current_device()) + for i, model_module in enumerate(_pipeline): + if not cpu: + model_module.cuda(torch.cuda.current_device()) - for param in model_module.parameters(): - set_defaults_if_not_set_tensor_model_parallel_attributes(param) + for param in model_module.parameters(): + set_defaults_if_not_set_tensor_model_parallel_attributes(param) - if hasattr(model_module, "configure_model"): - if not hasattr(model_module, "set_input_tensor"): - if hasattr(model_module.module, "set_input_tensor"): - model_module.set_input_tensor = model_module.module.set_input_tensor - else: - # TODO: What to do here? - pass + if hasattr(model_module, "configure_model"): + if not hasattr(model_module, "set_input_tensor"): + if hasattr(model_module.module, "set_input_tensor"): + model_module.set_input_tensor = model_module.module.set_input_tensor + else: + # TODO: What to do here? + pass # Print number of parameters. 
if parallel_state.model_parallel_is_initialized() and parallel_state.get_data_parallel_rank() == 0: @@ -536,6 +538,7 @@ def __init__(self, name: str, is_property: bool = False, includes_self: bool = F self.includes_self = includes_self def __call__(self, module: nn.Module): + attr = getattr(module, self.name) if self.is_property: @@ -554,6 +557,24 @@ def wrapped(self, *args): return attr +def getattr_proxy(self, item: Any) -> Any: + try: + return super(self.__class__, self).__getattr__(item) + except AttributeError: + try: + return getattr(self.module, item) + except AttributeError: + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{item}'") + + +class DDP(McoreDDP): + def state_dict(self, prefix='', keep_vars=False, **kwargs): + self.module.state_dict(prefix=prefix, keep_vars=keep_vars, **kwargs) + + def __getattr__(self, item: Any) -> Any: + return getattr_proxy(self, item) + + class CallbackConnector: """ A connector for managing and invoking callbacks. diff --git a/nemo/lightning/optim.py b/nemo/lightning/optim.py new file mode 100644 index 0000000000000..d706680776bcf --- /dev/null +++ b/nemo/lightning/optim.py @@ -0,0 +1,66 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Callable, Optional + +from megatron.core.distributed import finalize_model_grads +from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer +from megatron.core.utils import get_model_config +from pytorch_lightning.utilities.types import OptimizerLRScheduler +from torch.optim import Optimizer + +if TYPE_CHECKING: + from nemo.lightning.megatron_parallel import MegatronParallel + + +@dataclass +class MegatronOptim: + config: OptimizerConfig + finalize_model_grads: Callable = finalize_model_grads + + def create_optimizer( + self, + megatron_parallel: "MegatronParallel", + no_weight_decay_cond: Optional[Callable] = None, + scale_lr_cond: Optional[Callable] = None, + lr_mult: float = 1.0, + ) -> Optimizer: + from nemo.core.optim 
import McoreDistributedOptimizer + + # TODO: Where should we put this? + get_model_config(megatron_parallel[0]).finalize_model_grads = finalize_model_grads + + mcore_opt = get_megatron_optimizer( + self.config, + list(megatron_parallel), + no_weight_decay_cond=no_weight_decay_cond, + scale_lr_cond=scale_lr_cond, + lr_mult=lr_mult, + ) + + return McoreDistributedOptimizer(mcore_opt) + + def configure_optimizer(self, megatron_parallel: "MegatronParallel") -> OptimizerLRScheduler: + from nemo.core.optim.lr_scheduler import CosineAnnealing + + opt = self.create_optimizer(megatron_parallel) + + # TODO: Make this configurable through the dataclass + lr_scheduler = CosineAnnealing(opt, max_steps=10, warmup_steps=750, constant_steps=80000, min_lr=int(6e-5)) + + return { + "optimizer": opt, + # REQUIRED: The scheduler instance + "scheduler": lr_scheduler, + # The unit of the scheduler's step size, could also be 'step'. + # 'epoch' updates the scheduler on epoch end whereas 'step' + # updates it after an optimizer update. + "interval": "epoch", + # How many epochs/steps should pass between calls to + # `scheduler.step()`. 1 corresponds to updating the learning + # rate after every epoch/step. 
+ "frequency": 1, + # Metric to monitor for schedulers like `ReduceLROnPlateau` + "monitor": "val_loss", + } + + def __call__(self, megatron_parallel: "MegatronParallel") -> OptimizerLRScheduler: + return self.configure_optimizer(megatron_parallel) diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 8fa178d7df010..7daef032376b2 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -1,4 +1,5 @@ import functools +import inspect import logging import shutil from collections import OrderedDict @@ -90,7 +91,7 @@ def __init__( self.ckpt_include_optimizer = ckpt_include_optimizer if ddp == "megatron": - self.ddp_config = DistributedDataParallelConfig() + self.ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True) elif isinstance(ddp, DistributedDataParallelConfig): self.ddp_config = ddp elif ddp == "pytorch": @@ -165,18 +166,6 @@ def setup(self, trainer: pl.Trainer) -> None: trainer.fit_loop.epoch_loop.automatic_optimization = _MegatronAutomaticOptimization(trainer) - # set up optimizers after the wrapped module has been moved to the device - self.setup_optimizers(trainer) - - # TODO: Throw an execption if we have a mcore optimizer and no ddp_config - - if hasattr(self.precision_plugin, "convert_optimizer"): - _optimizers = [*self.optimizers] - _optimizers[0] = self.precision_plugin.convert_optimizer(self.optimizers[0]) - self.optimizers = _optimizers - - _optimizers_to_device(self.optimizers, self.root_device) - import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD if isinstance(self._ddp_comm_state, post_localSGD.PostLocalSGDState): @@ -223,6 +212,25 @@ def setup_megatron_parallel(self, trainer: pl.Trainer) -> None: cpu=isinstance(trainer.accelerator, CPUAccelerator), ddp_config=self.ddp_config, ) + + # check signature-def of self.model.configure_optimizers to check if there's an optional arg: megatron_parallel + sig = 
inspect.signature(self.model.configure_optimizers) + if "megatron_parallel" in sig.parameters: + self.model.configure_optimizers = functools.partial( + self.model.configure_optimizers, megatron_parallel=self.megatron_parallel + ) + + self.setup_optimizers(trainer) + + # TODO: Throw an exception if we have a mcore optimizer and no ddp_config + + if hasattr(self.precision_plugin, "convert_optimizer"): + _optimizers = [*self.optimizers] + _optimizers[0] = self.precision_plugin.convert_optimizer(self.optimizers[0]) + self.optimizers = _optimizers + + _optimizers_to_device(self.optimizers, self.root_device) + self.model = self.megatron_parallel self.model.trainer = trainer diff --git a/tests/lightning/test_megatron_parallel.py b/tests/lightning/test_megatron_parallel.py index 31d20170c0b6b..fafd25e49f5af 100644 --- a/tests/lightning/test_megatron_parallel.py +++ b/tests/lightning/test_megatron_parallel.py @@ -55,7 +55,7 @@ def test_init_with_defaults(self, mocker, mock_pipeline): mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_world_size', return_value=1) mocker.patch('megatron.core.parallel_state.model_parallel_is_initialized', return_value=False) - megatron_parallel = mp.MegatronParallel(pipeline=mock_pipeline) + megatron_parallel = mp.MegatronParallel(pipeline=mock_pipeline, cpu=True) assert megatron_parallel.pipeline == mock_pipeline assert megatron_parallel.precision_plugin is None @@ -85,6 +85,7 @@ def test_init_with_custom_parameters( data_step=mock_data_step, forward_step=mock_forward_step, loss_reduction=mock_loss_reduction, + cpu=True, ) assert megatron_parallel.pipeline == mock_pipeline From 290456fba9cc2ca2c5a12a3ec9033792010aa206 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Wed, 12 Jun 2024 17:37:44 +0200 Subject: [PATCH 12/17] Use ModelOpt build_tensorrt_llm for building engines for qnemo checkpoints (#9452) * Enable specifying alpha for SQ Signed-off-by: Jan Lasek * Enable specifying use_custom_all_reduce for export Signed-off-by: 
Jan Lasek * Use native TRT-LLM param names in export (partial) Signed-off-by: Jan Lasek * Detect TRT-LLM checkpoint programatically Signed-off-by: Jan Lasek * Pass use_custom_all_reduce in test_nemo_export.py Signed-off-by: Jan Lasek * Paramter parsing bugfix Signed-off-by: Jan Lasek * Revert "Paramter parsing bugfix" This reverts commit b0a4dd3859eec5258b3091daad27c292979a154f. Signed-off-by: Jan Lasek * Revert "Enable specifying use_custom_all_reduce for export" This reverts commit 9e419e3587a8b5c1eb8deda843ba37ee0fb1cf0d. Signed-off-by: Jan Lasek * Revert "Pass use_custom_all_reduce in test_nemo_export.py" This reverts commit be7081248b6d31a389e79438cdbe8737c51803ee. Signed-off-by: Jan Lasek * Rename checkpoint detection function Signed-off-by: Jan Lasek * Use ModelOpt build_tensorrt_llm utility for qnemo for performance alignment Signed-off-by: Jan Lasek * Import fix Signed-off-by: Jan Lasek * Apply isort and black reformatting Signed-off-by: janekl --------- Signed-off-by: Jan Lasek Signed-off-by: janekl Co-authored-by: janekl --- nemo/export/tensorrt_llm.py | 13 ++- .../trt_llm/qnemo/qnemo_to_tensorrt_llm.py | 92 +++++++++---------- nemo/export/trt_llm/qnemo/utils.py | 18 ++++ 3 files changed, 76 insertions(+), 47 deletions(-) create mode 100644 nemo/export/trt_llm/qnemo/utils.py diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index c826848e9328a..6ad9d57a2ab83 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -33,6 +33,7 @@ from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import get_tokenzier, is_nemo_file, load_nemo_model from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer +from nemo.export.trt_llm.qnemo.utils import is_qnemo_checkpoint from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load @@ -229,7 +230,7 @@ def export( 
tmp_dir = tempfile.TemporaryDirectory() nemo_export_dir = Path(tmp_dir.name) - if nemo_checkpoint_path.endswith("qnemo"): + if is_qnemo_checkpoint(nemo_checkpoint_path): if os.path.isdir(nemo_checkpoint_path): nemo_export_dir = nemo_checkpoint_path else: @@ -244,7 +245,17 @@ def export( max_output_len=max_output_len, max_batch_size=max_batch_size, max_prompt_embedding_table_size=max_prompt_embedding_table_size, + tensor_parallel_size=tensor_parallel_size, + pipeline_parallel_size=pipeline_parallel_size, + use_parallel_embedding=use_parallel_embedding, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + enable_multi_block_mode=enable_multi_block_mode, + use_lora_plugin=use_lora_plugin, lora_target_modules=lora_target_modules, + max_lora_rank=max_lora_rank, + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, ) else: model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py index b7e2f7bc29739..630330381e560 100644 --- a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py +++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import json -import os -import subprocess +import glob +import os +import warnings from typing import List, Optional -CONFIG_NAME = "config.json" +from modelopt.deploy.llm import build_tensorrt_llm + +from nemo.export.trt_llm.qnemo.utils import CONFIG_NAME, WEIGHTS_NAME def qnemo_to_tensorrt_llm( @@ -28,50 +30,48 @@ def qnemo_to_tensorrt_llm( max_output_len: int, max_batch_size: int, max_prompt_embedding_table_size: int, + tensor_parallel_size: int = None, + pipeline_parallel_size: int = None, + use_parallel_embedding: bool = False, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, + enable_multi_block_mode: bool = False, + use_lora_plugin: str = None, lora_target_modules: Optional[List[str]] = None, + max_lora_rank: int = 64, + max_num_tokens: int = None, + opt_num_tokens: int = None, ): - """Build TRT-LLM engine via trtllm-build CLI API in a subprocess.""" + """Build TensorRT-LLM engine with ModelOpt build_tensorrt_llm function.""" assert not lora_target_modules, f"LoRA is not supported for quantized checkpoints, got {lora_target_modules}" - print( - "Note that setting n_gpus, tensor_parallel_size and pipeline_parallel_size parameters" - " for quantized models is possible only on export step via nemo.export.quantize module." - " These parameters are ignored when building and running TensorRT-LLM engine below." + + warnings.warn( + "Note that setting tensor_parallel_size and pipeline_parallel_size parameters" + " for quantized models should be done on calibration step with nemo.export.quantize module." 
+ " These parameters are ignored when building and running TensorRT-LLM engine below.", + UserWarning, + stacklevel=3, ) - # Load config to explicitly pass selected parameters to trtllm-build command: - with open(os.path.join(nemo_checkpoint_path, CONFIG_NAME), "r") as f: - model_config = json.load(f) - command = [ - "trtllm-build", - "--checkpoint_dir", - nemo_checkpoint_path, - "--output_dir", - engine_dir, - "--max_batch_size", - str(max_batch_size), - "--max_input_len", - str(max_input_len), - "--max_output_len", - str(max_output_len), - "--max_prompt_embedding_table_size", - str(max_prompt_embedding_table_size), - "--gemm_plugin", - model_config["dtype"], - "--gpt_attention_plugin", - model_config["dtype"], - "--strongly_typed", - "--use_custom_all_reduce", - "disable", - "--workers", - str(model_config["mapping"]["world_size"]), - ] - command_str = " ".join(command) - print(f"Build command is:\n{command_str}") - print("Running trtllm-build, this may take a while...") - result = subprocess.run(command, capture_output=True) # TODO: consider streaming logs - if result.returncode != 0: - print(result.stdout.decode()) - print(result.stderr.decode()) - raise RuntimeError("Error encountered for trtllm-build command, please check logs.") - print("Building engine done. 
Full logs are:") - print(result.stdout.decode()) + warnings.warn( + "Also use_parallel_embedding, paged_kv_cache, remove_input_padding, enable_multi_block_mode, max_num_tokens" + " and opt_num_tokens parameters are set by ModelOpt build_tensorrt_llm function in the optimal way and are" + " ignored on engine build step.", + UserWarning, + stacklevel=3, + ) + + num_build_workers = len(glob.glob(os.path.join(nemo_checkpoint_path, WEIGHTS_NAME.format("*")))) + assert num_build_workers, f"No TensorRT-LLM weight files found in {nemo_checkpoint_path}" + + build_tensorrt_llm( + pretrained_config=os.path.join(nemo_checkpoint_path, CONFIG_NAME), + engine_dir=engine_dir, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + max_beam_width=1, + num_build_workers=num_build_workers, + enable_sparsity=False, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + ) diff --git a/nemo/export/trt_llm/qnemo/utils.py b/nemo/export/trt_llm/qnemo/utils.py new file mode 100644 index 0000000000000..58d1d308507f4 --- /dev/null +++ b/nemo/export/trt_llm/qnemo/utils.py @@ -0,0 +1,18 @@ +import os +from pathlib import Path + +from nemo.export.tarutils import TarPath + +CONFIG_NAME = "config.json" +WEIGHTS_NAME = "rank{}.safetensors" + + +def is_qnemo_checkpoint(path: str) -> bool: + """Detect if a given path is a TensorRT-LLM a.k.a. 
"qnemo" checkpoint based on config & tensor data presence.""" + if os.path.isdir(path): + path = Path(path) + else: + path = TarPath(path) + config_path = path / CONFIG_NAME + tensor_path = path / WEIGHTS_NAME.format(0) + return config_path.exists() and tensor_path.exists() From 1c0bef011eb5b58a6fae76f1ae60cc94bf9b0bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 12 Jun 2024 18:36:15 +0200 Subject: [PATCH 13/17] ci: Fix extract last 2K chars of logs (#9450) ci(notifications): Fix extract of last 2K chars Signed-off-by: Oliver Koenig --- .github/workflows/_test_template.yml | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 065af34408ccb..5956a23bdd67f 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -63,26 +63,8 @@ jobs: ) 2> >(tee err.log) EXIT_CODE=$? - # Slack only allows 3000 chars per block. - # Since a block contains information about other - # metdata than the log, we prune the log to 2000 - # chars. 
- min() { - if (( $1 > $2 )); then - echo $2 - else - echo $1 - fi - } - - log=$(cat err.log) - - MAX_LENGTH=$(echo $log | wc -m) - MAX_LENGTH=$(min $MAX_LENGTH 2000) - MAX_LENGTH=$(( $MAX_LENGTH - 1 )) - - log=$(echo "${log: -${MAX_LENGTH}}" | base64 -w 0) - echo "log=$log" | tee -a "$GITHUB_OUTPUT" + + echo "log=$(tail -c 2000 err.log | base64 -w 0)" >> "$GITHUB_OUTPUT" exit $EXIT_CODE From f8eeb794c381f479bb3b245aac81415660549a6d Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Wed, 12 Jun 2024 14:26:08 -0700 Subject: [PATCH 14/17] Add option to merge distributed optimizer buckets (#9414) * Add option to merge distopt buckets in GPT Signed-off-by: Tim Moon * Move distopt bucket merge logic to base LLM class Signed-off-by: Tim Moon * Apply isort and black reformatting Signed-off-by: timmoon10 --------- Signed-off-by: Tim Moon Signed-off-by: timmoon10 Co-authored-by: timmoon10 Co-authored-by: Sangkug Lym --- .../models/language_modeling/megatron_base_model.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index e7f2aa805a9c9..0828d88a81333 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -861,7 +861,15 @@ def configure_optimizers(self): # Initialize param buckets if explicitly provided if getattr(self, 'distributed_adam_buckets', None) is not None: - for bucket in self.distributed_adam_buckets: + buckets = self.distributed_adam_buckets + if self.cfg.get('distributed_adam_bucket_merge_size', 1) > 1: + # Merge buckets if needed + stride = self.cfg.get('distributed_adam_bucket_merge_size', 1) + buckets = [ + list(itertools.chain.from_iterable(buckets[i : i + stride])) + for i in range(0, len(buckets), stride) + ] + for bucket in buckets: 
self._optimizer.init_params_bucket(bucket) self._optimizer.init_params_bucket(self.parameters()) if hasattr(self, 'distributed_adam_buckets'): From 387f0b138d91da8996d982b8831ccf7370814ad1 Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Wed, 12 Jun 2024 17:01:33 -0600 Subject: [PATCH 15/17] Update readme with mlperf news (#9457) * update Signed-off-by: eharper * update Signed-off-by: eharper * remove link to image Signed-off-by: eharper * remove link to image Signed-off-by: eharper * fix formatting Signed-off-by: eharper --------- Signed-off-by: eharper --- README.rst | 122 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 88 insertions(+), 34 deletions(-) diff --git a/README.rst b/README.rst index c4cbf759d9759..ab3a4b6b06c96 100644 --- a/README.rst +++ b/README.rst @@ -45,58 +45,112 @@ Latest News
Large Language Models and Multimodal -
- Accelerate your generative AI journey with NVIDIA NeMo Framework on GKE (2024/03/16) +
+ + + NVIDIA sets new generative AI performance and scale records in MLPerf Training v4.0 + (2024/06/12) + + + Using NVIDIA NeMo Framework and NVIDIA Hopper GPUs NVIDIA was able to scale to 11,616 H100 GPUs and achieve near-linear performance scaling on LLM pretraining. + NVIDIA also achieved the highest LLM fine-tuning performance and raised the bar for text-to-image training. +

+
- An end-to-end walkthrough to train generative AI models on the Google Kubernetes Engine (GKE) using the NVIDIA NeMo Framework is available at https://github.com/GoogleCloudPlatform/nvidia-nemo-on-gke. The walkthrough includes detailed instructions on how to set up a Google Cloud Project and pre-train a GPT model using the NeMo Framework. +
+ + + Accelerate your generative AI journey with NVIDIA NeMo Framework on GKE + (2024/03/16) + + + An end-to-end walkthrough to train generative AI models on the Google Kubernetes Engine (GKE) using the NVIDIA NeMo Framework is available at https://github.com/GoogleCloudPlatform/nvidia-nemo-on-gke. + The walkthrough includes detailed instructions on how to set up a Google Cloud Project and pre-train a GPT model using the NeMo Framework.

- Bria Builds Responsible Generative AI for Enterprises Using NVIDIA NeMo, Picasso (2024/03/06) - - Bria, a Tel Aviv startup at the forefront of visual generative AI for enterprises now leverages the NVIDIA NeMo Framework. The Bria.ai platform uses reference implementations from the NeMo Multimodal collection, trained on NVIDIA Tensor Core GPUs, to enable high-throughput and low-latency image generation. Bria has also adopted NVIDIA Picasso, a foundry for visual generative AI models, to run inference. + + + Bria Builds Responsible Generative AI for Enterprises Using NVIDIA NeMo, Picasso + (2024/03/06) + + + Bria, a Tel Aviv startup at the forefront of visual generative AI for enterprises now leverages the NVIDIA NeMo Framework. + The Bria.ai platform uses reference implementations from the NeMo Multimodal collection, trained on NVIDIA Tensor Core GPUs, to enable high-throughput and low-latency image generation. + Bria has also adopted NVIDIA Picasso, a foundry for visual generative AI models, to run inference.

-
- -
- New NVIDIA NeMo Framework Features and NVIDIA H200 (2023/12/06) +
- NVIDIA NeMo Framework now includes several optimizations and enhancements, including: 1) Fully Sharded Data Parallelism (FSDP) to improve the efficiency of training large-scale AI models, 2) Mix of Experts (MoE)-based LLM architectures with expert parallelism for efficient LLM training at scale, 3) Reinforcement Learning from Human Feedback (RLHF) with TensorRT-LLM for inference stage acceleration, and 4) up to 4.2x speedups for Llama 2 pre-training on NVIDIA H200 Tensor Core GPUs. -

- H200-NeMo-performance -

-
- -
- NVIDIA now powers training for Amazon Titan Foundation models (2023/11/28) +
+ + + New NVIDIA NeMo Framework Features and NVIDIA H200 + (2023/12/06) + + + NVIDIA NeMo Framework now includes several optimizations and enhancements, + including: + 1) Fully Sharded Data Parallelism (FSDP) to improve the efficiency of training large-scale AI models, + 2) Mixture of Experts (MoE)-based LLM architectures with expert parallelism for efficient LLM training at scale, + 3) Reinforcement Learning from Human Feedback (RLHF) with TensorRT-LLM for inference stage acceleration, and + 4) up to 4.2x speedups for Llama 2 pre-training on NVIDIA H200 Tensor Core GPUs. +

+ + H200-NeMo-performance +

+
- NVIDIA NeMo Framework now empowers the Amazon Titan foundation models (FM) with efficient training of large language models (LLMs). The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock. The NeMo Framework provides a versatile framework for building, customizing, and running LLMs. -

-
+
+ + + NVIDIA now powers training for Amazon Titan Foundation models + (2023/11/28) + + + NVIDIA NeMo Framework now empowers the Amazon Titan foundation models (FM) with efficient training of large language models (LLMs). + The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock. + The NeMo Framework provides a versatile framework for building, customizing, and running LLMs. +

+
Speech Recognition -
- New Standard for Speech Recognition and Translation from the NVIDIA NeMo Canary Model (2024/04/18) - - The NeMo team just released Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization. Canary also provides bi-directional translation, between English and the three other supported languages. -

-
-
- Pushing the Boundaries of Speech Recognition with NVIDIA NeMo Parakeet ASR Models (2024/04/18) + + + New Standard for Speech Recognition and Translation from the NVIDIA NeMo Canary Model + (2024/04/18) + + + The NeMo team just released Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization. + Canary also provides bi-directional translation, between English and the three other supported languages. +

+
- NVIDIA NeMo, an end-to-end platform for the development of multimodal generative AI models at scale anywhere—on any cloud and on-premises—released the Parakeet family of automatic speech recognition (ASR) models. These state-of-the-art ASR models, developed in collaboration with Suno.ai, transcribe spoken English with exceptional accuracy. +
+ + + Pushing the Boundaries of Speech Recognition with NVIDIA NeMo Parakeet ASR Models + (2024/04/18) + + + NVIDIA NeMo, an end-to-end platform for the development of multimodal generative AI models at scale anywhere—on any cloud and on-premises—released the Parakeet family of automatic speech recognition (ASR) models. + These state-of-the-art ASR models, developed in collaboration with Suno.ai, transcribe spoken English with exceptional accuracy.

-
+
- Turbocharge ASR Accuracy and Speed with NVIDIA NeMo Parakeet-TDT (2024/04/18) - - NVIDIA NeMo, an end-to-end platform for developing multimodal generative AI models at scale anywhere—on any cloud and on-premises—recently released Parakeet-TDT. This new addition to the  NeMo ASR Parakeet model family boasts better accuracy and 64% greater speed over the previously best model, Parakeet-RNNT-1.1B. + + + Turbocharge ASR Accuracy and Speed with NVIDIA NeMo Parakeet-TDT + (2024/04/18) + + + NVIDIA NeMo, an end-to-end platform for developing multimodal generative AI models at scale anywhere—on any cloud and on-premises—recently released Parakeet-TDT. + This new addition to the  NeMo ASR Parakeet model family boasts better accuracy and 64% greater speed over the previously best model, Parakeet-RNNT-1.1B.

From a72a0e790703c8eced7d95afc0e57dda244b733b Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Wed, 12 Jun 2024 22:22:33 -0400 Subject: [PATCH 16/17] TRT-LLM 0.10 Update (#9402) * reorg the export code Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * replaced log with raise Signed-off-by: Onur Yilmaz * add converter and loader folders Signed-off-by: Onur Yilmaz * move nemo_ckpt_convert into the converter folder Signed-off-by: Onur Yilmaz * move nemo_file into loader folder Signed-off-by: Onur Yilmaz * reorg converter Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * continue to reorg converter Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * continue to reorg Signed-off-by: Onur Yilmaz * move nemo file back into nemo folder Signed-off-by: Onur Yilmaz * renamed nemo folder to nemo_ckpt_loader Signed-off-by: Onur Yilmaz * remove unused function Signed-off-by: Onur Yilmaz * removed nemo file Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * moved a function to tensorrt_llm_run file Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * Remove unused imports Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * import csv added Signed-off-by: Onur Yilmaz * update the APIs Signed-off-by: Onur Yilmaz * add use_embedding_sharing param Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * do not add unused inputs during MG export Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * add cpp runtime test Signed-off-by: Onur Yilmaz * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * sharing embedding * Remove manually scaling * renaming to avoid nemo github 
issue Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz Signed-off-by: oyilmaz-nvidia Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Co-authored-by: oyilmaz-nvidia Co-authored-by: Bobby Chen --- nemo/export/tensorrt_llm.py | 10 +++- .../trt_llm/converter/model_converter.py | 36 +++++++++--- .../converter/model_to_trt_llm_ckpt.py | 6 -- nemo/export/trt_llm/tensorrt_llm_build.py | 4 +- .../{test_nemo_export.py => nemo_export.py} | 38 ++++++++++++ tests/export/run.sh | 58 +++++++++---------- 6 files changed, 106 insertions(+), 46 deletions(-) rename tests/export/{test_nemo_export.py => nemo_export.py} (94%) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 6ad9d57a2ab83..7cc92f0ca588e 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -121,6 +121,7 @@ def export( n_gpus: int = 1, tensor_parallel_size: int = None, pipeline_parallel_size: int = None, + gpus_per_node: int = None, max_input_len: int = 256, max_output_len: int = 256, max_input_token: Optional[int] = None, @@ -128,6 +129,7 @@ def export( max_batch_size: int = 8, max_prompt_embedding_table_size=None, use_parallel_embedding: bool = False, + use_embedding_sharing: bool = False, paged_kv_cache: bool = True, remove_input_padding: bool = True, dtype: str = "bfloat16", @@ -150,6 +152,7 @@ def export( n_gpus (int): number of GPUs to use for inference. tensor_parallel_size (int): tensor parallelism. pipeline_parallel_size (int): pipeline parallelism. + gpus_per_node (int): number of gpus per node. max_input_len (int): max input length. max_output_len (int): max output length. max_input_token (int): max input length. Deprecated, use max_input_len instead. @@ -157,6 +160,7 @@ def export( max_batch_size (int): max batch size. max_prompt_embedding_table_size (int): max prompt embedding size. 
use_parallel_embedding (bool): whether to use parallel embedding feature of TRT-LLM or not + use_embedding_sharing (bool): whether to set share_embedding_table in the TRT-LLM config; forced to True when the NeMo config has share_embeddings_and_output_weights. paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM. remove_input_padding (bool): enables removing input padding or not. dtype (str): Floating point type for model weights (Supports BFloat16/Float16). @@ -173,7 +177,7 @@ def export( if model_type not in self.get_supported_models_list: raise Exception( "Model {0} is not currently a supported model type. " - "Supported model types are llama, gptnext, falcon, and starcoder".format(model_type) + "Supported model types are llama, gptnext, falcon, and starcoder.".format(model_type) ) if model_type == "gpt" or model_type == "starcoder": @@ -189,6 +193,8 @@ def export( tensor_parallel_size = 1 pipeline_parallel_size = n_gpus + gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node + if Path(self.model_dir).exists(): if delete_existing_files and len(os.listdir(self.model_dir)) > 0: for files in os.listdir(self.model_dir): @@ -267,7 +273,9 @@ def export( dtype=dtype, tensor_parallel_size=tensor_parallel_size, pipeline_parallel_size=pipeline_parallel_size, + gpus_per_node=gpus_per_node, use_parallel_embedding=use_parallel_embedding, + use_embedding_sharing=use_embedding_sharing, ) for weight_dict, model_config in zip(weights_dicts, model_configs): diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py index 5e522d8bbff2c..da13449160f95 100644 --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -72,9 +72,17 @@ def model_to_trtllm_ckpt( dtype: str = "bfloat16", tensor_parallel_size: int = 1, pipeline_parallel_size: int = 1, + gpus_per_node: int = None, use_parallel_embedding: bool = False, + use_embedding_sharing: bool = False, ) -> Tuple[List[Dict], List[PretrainedConfig]]: + if nemo_model_config.get("share_embeddings_and_output_weights", 
False) and not use_embedding_sharing: + LOGGER.info( + "Found share_embeddings_and_output_weights is True in NeMo config, set use_embedding_sharing = True" + ) + use_embedding_sharing = True + weights_dict = convert_model_to_trt_llm_ckpt( model=model, nemo_model_config=nemo_model_config, @@ -88,12 +96,14 @@ def model_to_trtllm_ckpt( world_size = tensor_parallel_size * pipeline_parallel_size - lm_head_weight = weights_dict["lm_head.weight"] + has_lm_head = "lm_head.weight" in weights_dict + if has_lm_head: + lm_head_weight = weights_dict["lm_head.weight"] vocab_size = weights_dict["transformer.vocab_embedding.weight"].shape[0] - vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size) + vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size) if has_lm_head else vocab_size - if vocab_size_padded != vocab_size: + if has_lm_head and vocab_size_padded != vocab_size: pad_width = vocab_size_padded - vocab_size lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0) @@ -120,7 +130,7 @@ def model_to_trtllm_ckpt( 'hidden_act': hidden_act, 'use_parallel_embedding': use_parallel_embedding, 'embedding_sharding_dim': 0, - 'share_embedding_table': False, + 'share_embedding_table': use_embedding_sharing, 'quantization': { 'quant_algo': None, 'kv_cache_quant_algo': None, @@ -160,9 +170,15 @@ def model_to_trtllm_ckpt( "transformer.ln_f.bias", } + gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node + for i in range(world_size): mapping = tensorrt_llm.Mapping( - world_size=world_size, rank=i, tp_size=tensor_parallel_size, pp_size=pipeline_parallel_size + world_size=world_size, + rank=i, + tp_size=tensor_parallel_size, + pp_size=pipeline_parallel_size, + gpus_per_node=gpus_per_node, ) layers_range = mapping.pp_layers(num_layers) @@ -174,6 +190,8 @@ def model_to_trtllm_ckpt( if new_key.endswith(".bin"): # TP split if new_key.endswith(f"{mapping.tp_rank}.bin"): new_key = 
new_key.replace(f".{mapping.tp_rank}.bin", "") + else: + continue if "layers" in new_key: # PP layer_num = int(new_key.split(".")[2]) if layer_num in layers_range: @@ -202,15 +220,17 @@ def model_to_trtllm_ckpt( weights_dict_local["transformer.position_embedding.weight"] = pos_embedding_weight if mapping.is_last_pp_rank(): - weights_dict_local["lm_head.weight"] = np.ascontiguousarray( - split(lm_head_weight, mapping.tp_size, mapping.tp_rank) - ) + if has_lm_head: + weights_dict_local["lm_head.weight"] = np.ascontiguousarray( + split(lm_head_weight, mapping.tp_size, mapping.tp_rank) + ) weights_dict_local["transformer.ln_f.weight"] = weights_dict["transformer.ln_f.weight"] ln_f_bias = weights_dict.get("transformer.ln_f.bias") if ln_f_bias is not None: weights_dict_local["transformer.ln_f.bias"] = ln_f_bias + config["gpus_per_node"] = gpus_per_node model_config = PretrainedConfig(**config) model_config.mapping = mapping model_configs.append(model_config) diff --git a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py index df7e43548a444..c29edc87353e7 100644 --- a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py +++ b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py @@ -158,8 +158,6 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): model_level_weights["transformer.position_embedding.weight"].append(val) if pp_idx == 0: val = model.get("state_dict", model)[get_layer_name("word_embedding", prefix)] - if embedding_scaling: - val = val * float(math.sqrt(hidden_size)) vocab_size = val.shape[0] if use_parallel_embedding: @@ -171,10 +169,6 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): val = torch_to_numpy(val.to(storage_type).cpu()) model_level_weights["transformer.vocab_embedding.weight"].append(val) - if share_embeddings_and_output: - val = model.get("state_dict", model)[get_layer_name("word_embedding", prefix)] - val = 
torch_to_numpy(val.to(storage_type).cpu()) - model_level_weights["lm_head.weight"].append(val) if has_lm_head and pp_idx == training_pp_size - 1: val = model.get("state_dict", model)[get_layer_name("output_layer", prefix)] val = torch_to_numpy(val.to(storage_type).cpu()) diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index bbafec319fd5d..ef9a14c1d582a 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -19,7 +19,7 @@ from tensorrt_llm.builder import BuildConfig, Builder from tensorrt_llm.commands.build import build as build_trtllm from tensorrt_llm.logger import logger -from tensorrt_llm.lora_manager import LoraBuildConfig +from tensorrt_llm.lora_manager import LoraConfig from tensorrt_llm.models.modeling_utils import add_lora, optimize_model, preprocess_weights from tensorrt_llm.plugin import PluginConfig @@ -94,7 +94,7 @@ def build_and_save_engine( if use_lora_plugin is not None: build_config.plugin_config.set_lora_plugin(use_lora_plugin) - lora_config = LoraBuildConfig( + lora_config = LoraConfig( lora_dir=lora_ckpt_list, lora_ckpt_source='nemo', max_lora_rank=max_lora_rank, diff --git a/tests/export/test_nemo_export.py b/tests/export/nemo_export.py similarity index 94% rename from tests/export/test_nemo_export.py rename to tests/export/nemo_export.py index bac592c90cc29..5541cc0f8673b 100644 --- a/tests/export/test_nemo_export.py +++ b/tests/export/nemo_export.py @@ -128,6 +128,7 @@ def run_trt_llm_inference( trt_llm_model_dir, n_gpu=1, max_batch_size=8, + use_embedding_sharing=False, max_input_len=128, max_output_len=128, ptuning=False, @@ -216,6 +217,7 @@ def run_trt_llm_inference( lora_target_modules=lora_target_modules, max_num_tokens=int(max_input_len * max_batch_size * 0.2), opt_num_tokens=60, + use_embedding_sharing=use_embedding_sharing, save_nemo_model_config=True, ) @@ -237,6 +239,14 @@ def run_trt_llm_inference( stop_words_list=stop_words_list, ) 
+ if not use_lora_plugin and not ptuning: + test_cpp_runtime( + engine_path=trt_llm_model_dir, + prompt=prompt, + max_output_len=max_output_len, + debug=True, + ) + nq = None nm = None output_deployed = "" @@ -290,6 +300,27 @@ def run_trt_llm_inference( raise Exception("Checkpoint {0} could not be found.".format(checkpoint_path)) +def test_cpp_runtime( + engine_path, + prompt, + max_output_len, + debug, +): + trt_llm_exporter = TensorRTLLM(engine_path, load_model=True) + output = trt_llm_exporter.forward( + input_texts=prompt, + max_output_len=max_output_len, + top_k=1, + top_p=0.0, + temperature=1.0, + ) + + if debug: + print("") + print("--- Output deployed with cpp runtime: ", output) + print("") + + def run_existing_checkpoints( model_name, n_gpus, @@ -332,6 +363,12 @@ def run_existing_checkpoints( else: raise Exception("There is not lora checkpoint path defined.") + if model_info["model_type"] == "gemma": + print("*********************") + use_embedding_sharing = True + else: + use_embedding_sharing = False + return run_trt_llm_inference( model_name=model_name, model_type=model_info["model_type"], @@ -340,6 +377,7 @@ def run_existing_checkpoints( trt_llm_model_dir=model_info["trt_llm_model_dir"], n_gpu=n_gpus, max_batch_size=model_info["max_batch_size"], + use_embedding_sharing=use_embedding_sharing, max_input_len=512, max_output_len=model_info["max_output_len"], ptuning=ptuning, diff --git a/tests/export/run.sh b/tests/export/run.sh index 0071b13511136..b3badd25a8f91 100644 --- a/tests/export/run.sh +++ b/tests/export/run.sh @@ -20,32 +20,32 @@ for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done set +x -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --streaming -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 2 
--tp_size 1 --pp_size 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 4 --tp_size 2 --pp_size 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 8 --tp_size 1 --pp_size 8 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --lora --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-code --existing_test_models --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-fp8 --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-int4 --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-int8 --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base-fp8 --existing_test_models --min_gpus 2 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base-int4 --existing_test_models --min_gpus 2 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base --existing_test_models --min_gpus 2 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base-fp8 --existing_test_models --min_gpus 8 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base-int4 --existing_test_models --min_gpus 8 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Base-4k --existing_test_models --min_gpus 1 
--max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-QA-4k --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-SFT --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-RLHF --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-SteerLM --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name GPT-43B-Base --existing_test_models --min_gpus 2 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name FALCON-7B-base --existing_test_models --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name FALCON-40B-base --existing_test_models --min_gpus 2 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name FALCON-180B-base --existing_test_models --min_gpus 8 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name STARCODER1-15B-base --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name GEMMA-base --existing_test_models --min_gpus 1 --max_gpus 1 \ No newline at end of file +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --streaming +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 2 --tp_size 1 --pp_size 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 4 --tp_size 2 --pp_size 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 8 --tp_size 1 --pp_size 8 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2 +python 
tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --lora --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-code --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base-fp8 --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base-int4 --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base-int8 --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-13B-base-fp8 --existing_test_models --min_gpus 2 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-13B-base-int4 --existing_test_models --min_gpus 2 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-70B-base --existing_test_models --min_gpus 2 --max_gpus 8 +python tests/export/nemo_export.py --model_name LLAMA2-70B-base-fp8 --existing_test_models --min_gpus 8 --max_gpus 8 +python tests/export/nemo_export.py --model_name LLAMA2-70B-base-int4 --existing_test_models --min_gpus 8 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-Base-4k --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-QA-4k --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-SFT --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-RLHF --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-SteerLM --existing_test_models --min_gpus 1 --max_gpus 8 
+python tests/export/nemo_export.py --model_name GPT-43B-Base --existing_test_models --min_gpus 2 --max_gpus 8 +python tests/export/nemo_export.py --model_name FALCON-7B-base --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name FALCON-40B-base --existing_test_models --min_gpus 2 --max_gpus 8 +python tests/export/nemo_export.py --model_name FALCON-180B-base --existing_test_models --min_gpus 8 --max_gpus 8 +python tests/export/nemo_export.py --model_name STARCODER1-15B-base --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/nemo_export.py --model_name GEMMA-base --existing_test_models --min_gpus 1 --max_gpus 1 \ No newline at end of file From a01fa6d5f569d18ddf79bcb8cbe64193ac52b634 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Wed, 12 Jun 2024 22:22:54 -0400 Subject: [PATCH 17/17] In-framework deployment (#9438) * initial MegatronGPTDeployable class * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * delete old comment * first draft of MegatronGPTDeployable test script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * small cleanup of test_triton_deployable.py * move MegatronGPTDeployable into nlp folder since it is language specific * update test_triton_deployable for new MegatronGPTDeployable location * renaming NemoQueryLLM classes * MegatronGPTDeployable should programatically generate input/output fields from the relevant internal classes instead of hard-coding whenever possible * add NemoTritonQueryLLMPyTorch class and example * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * MegatronGPTModel should always load on creation, also allow number of gpus to be controlled via argument * got logprobs working, but can only process one prompt at a time * [pre-commit.ci] auto fixes from 
pre-commit.com hooks for more information, see https://pre-commit.ci * add nemo deployable to deploy_triton.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * multigpu working, with manual torch.distributed calls * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rename MegatronGPTDeployable to MegatronLLMDeployable * MegatronGPTDeployable->MegatronLLMDeployable rename for filenames * move torch.distributed calls inside MegatronLLMDeployable * add constructor for existing model class, tested working with Mistral7B and Nemotron3-22B * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rename test_triton_deployable.py to tests_pytriton_deploy.py * cleanup, comments, and style guide fixes * add warning for multigpu cases where users will need to be aware of pytorch lightning DDP behavior * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixing formatting of logprob outputs * fix single gpu behavior, and add padding to outputs to allow for multi-prompt logprob calculation * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * fixing codeQL issues * Apply isort and black reformatting Signed-off-by: jukim-nv * Apply isort and black reformatting Signed-off-by: oyilmaz-nvidia * removed min_length definition in previous commit but forgot to remove its use * update comments and arguments in deploy/nlp/query_llm.py * Apply isort and black reformatting Signed-off-by: jukim-nv * delete unused arguments from test_pytriton_deploy.py * remove some debug prints from megatronllm_deployable * rename test file due to pytest issue Signed-off-by: Onur Yilmaz --------- Signed-off-by: oyilmaz-nvidia Signed-off-by: jukim-nv Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Signed-off-by: Onur Yilmaz Co-authored-by: Justin Kim 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: oyilmaz-nvidia Co-authored-by: jukim-nv Co-authored-by: Pablo Garay --- nemo/deploy/nlp/__init__.py | 4 +- nemo/deploy/nlp/megatronllm_deployable.py | 316 ++++++++++++++++++++++ scripts/deploy/nlp/deploy_triton.py | 75 ++--- tests/deploy/pytriton_deploy.py | 136 ++++++++++ 4 files changed, 498 insertions(+), 33 deletions(-) create mode 100644 nemo/deploy/nlp/megatronllm_deployable.py create mode 100644 tests/deploy/pytriton_deploy.py diff --git a/nemo/deploy/nlp/__init__.py b/nemo/deploy/nlp/__init__.py index 21e2ca2751f82..52d5b3dbff3eb 100644 --- a/nemo/deploy/nlp/__init__.py +++ b/nemo/deploy/nlp/__init__.py @@ -15,6 +15,8 @@ use_query_llm = True try: - from nemo.deploy.nlp.query_llm import NemoQueryLLM + from nemo.deploy.nlp.query_llm import NemoTritonQueryLLMTensorRT except Exception: use_query_llm = False + +from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable diff --git a/nemo/deploy/nlp/megatronllm_deployable.py b/nemo/deploy/nlp/megatronllm_deployable.py new file mode 100644 index 0000000000000..c27bbbd0102b0 --- /dev/null +++ b/nemo/deploy/nlp/megatronllm_deployable.py @@ -0,0 +1,316 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from enum import IntEnum, auto +from pathlib import Path + +import numpy as np +import torch +import wrapt +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.text_generation_utils import ( + OutputType, + get_default_length_params, + get_default_sampling_params, +) +from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.deploy import ITritonDeployable +from nemo.deploy.utils import cast_output, str_ndarray2list + + +@wrapt.decorator +def noop_decorator(func): + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + return wrapper + + +use_pytriton = True +batch = noop_decorator +try: + from pytriton.decorators import batch + from pytriton.model_config import Tensor +except Exception: + use_pytriton = False + +LOGGER = logging.getLogger("NeMo") + + +def GetTensorShape(pyvalue): + """ + utility function to get Triton Tensor shape from a python value + assume that lists are shape -1 and all others are scalars with shape 1 + """ + return (-1 if type(pyvalue) == list else 1,) + + +def GetNumpyDtype(pyvalue): + """ + utility function to get numpy dtype of a python value + e.g. bool -> np.bool_ + """ + ''' + manually defining the mapping of python type -> numpy type for now + is there a better way to do it? 
tried np.array(pyvalue).dtype, but that doesn't seem to work + ''' + py_to_numpy_mapping = {str: bytes, bool: np.bool_, float: np.single, int: np.int_} + python_type = type(pyvalue) + # for lists, return the type of the internal elements + if python_type == list: + python_type = type(pyvalue[0]) + numpy_type = py_to_numpy_mapping[python_type] + return numpy_type + + +class ServerSync(IntEnum): + """Enum for synchronization messages using torch.distributed""" + + WAIT = auto() + SIGNAL = auto() + + def to_long_tensor(self): + return torch.tensor([self], dtype=torch.long, device='cuda') + + +class MegatronLLMDeployable(ITritonDeployable): + """Triton inference server compatible deploy class for a .nemo model file""" + + def __init__( + self, + nemo_checkpoint_filepath: str = None, + num_devices: int = 1, + num_nodes: int = 1, + existing_model: MegatronGPTModel = None, + ): + if nemo_checkpoint_filepath is None and existing_model is None: + raise ValueError( + "MegatronLLMDeployable requires either a .nemo checkpoint filepath or an existing MegatronGPTModel, but both provided were None" + ) + if num_devices > 1: + LOGGER.warning( + "Creating a MegatronLLMDeployable with num_devices>1 will assume running with a PyTorch Lightning DDP-variant strategy, which will run the main script once per device. Make sure any user code is compatible with multiple executions!" 
+ ) + + # if both existing_model and nemo_checkpoint_filepath are provided, existing_model will take precedence + if existing_model is not None: + self.model = existing_model + else: + self._load_from_nemo_checkpoint(nemo_checkpoint_filepath, num_devices, num_nodes) + + self.model.eval() + # helper threads spawned by torch.multiprocessing should loop inside this helper function + self._helper_thread_evaluation_loop() + + def _load_from_nemo_checkpoint(self, nemo_checkpoint_filepath: str, num_devices: int, num_nodes: int): + if Path(nemo_checkpoint_filepath).exists(): + trainer = Trainer( + strategy=NLPDDPStrategy(), + devices=num_devices, + num_nodes=num_nodes, + ) + + custom_config = MegatronGPTModel.restore_from( + nemo_checkpoint_filepath, trainer=trainer, return_config=True + ) + # transformer_engine should always be true according to EricH, but GPT-2B model will fail if it is enabled + custom_config.transformer_engine = True + # using multi-gpu for tensor parallelism directly for now, could do pipeline parallel instead or a combination + custom_config.tensor_model_parallel_size = num_devices + # had to override these to make Nemotron3-22B work, see sample_sequence_batch() in text_generation_utils.py + custom_config.activations_checkpoint_granularity = None + custom_config.activations_checkpoint_method = None + + self.model = MegatronGPTModel.restore_from( + nemo_checkpoint_filepath, trainer=trainer, override_config_path=custom_config + ) + + def _helper_thread_evaluation_loop(self): + # only deploy the server on main thread, other threads enter this evaluation loop + if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0: + while True: + wait_value = ServerSync.WAIT.to_long_tensor() + torch.distributed.broadcast(wait_value, 0) + if wait_value.item() == ServerSync.SIGNAL: + self.model.generate(inputs=[""], length_params=None) + + _INPUT_PARAMETER_FIELDS = { + "prompts": (-1, bytes, False), + } + + ''' + there is no get_default equivalent 
for OutputType like there is for SamplingParameters and LengthParameters + but we still want to generate output using a real OutputType TypedDict for static type checking + ''' + _BLANK_OUTPUTTYPE: OutputType = { + 'sentences': [""], + 'tokens': [[""]], + 'logprob': [[0.0]], + 'full_logprob': [[0.0]], + 'token_ids': [[0]], + 'offsets': [[0]], + } + + @property + def get_triton_input(self): + input_parameters = tuple( + Tensor(name=name, shape=(shape,), dtype=dtype, optional=optional) + for name, (shape, dtype, optional) in self._INPUT_PARAMETER_FIELDS.items() + ) + ''' + in theory, would like to use typedict2tensor() function to generate Tensors, but it purposely ignores 1D arrays + asked JakubK why on 2024-04-26, but he doesn't know who owns the code + sampling_parameters = typedict2tensor(SamplingParam) + length_parameters = typedict2tensor(LengthParam) + ''' + default_sampling_params: SamplingParam = get_default_sampling_params() + sampling_parameters = tuple( + Tensor( + name=parameter_name, + shape=GetTensorShape(parameter_value), + dtype=GetNumpyDtype(parameter_value), + optional=True, + ) + for parameter_name, parameter_value in default_sampling_params.items() + ) + default_length_params: LengthParam = get_default_length_params() + length_parameters = tuple( + Tensor( + name=parameter_name, + shape=GetTensorShape(parameter_value), + dtype=GetNumpyDtype(parameter_value), + optional=True, + ) + for parameter_name, parameter_value in default_length_params.items() + ) + + inputs = input_parameters + sampling_parameters + length_parameters + return inputs + + @property + def get_triton_output(self): + # outputs are defined by the fields of OutputType + outputs = [ + Tensor( + name=parameter_name, + shape=GetTensorShape(parameter_value), + dtype=GetNumpyDtype(parameter_value[0]), + ) + for parameter_name, parameter_value in MegatronLLMDeployable._BLANK_OUTPUTTYPE.items() + ] + return outputs + + @staticmethod + def _sampling_params_from_triton_inputs(**inputs: 
np.ndarray): + """Extract SamplingParam fields from triton input dict""" + sampling_params: SamplingParam = get_default_sampling_params() + for sampling_param_field in sampling_params.keys(): + if sampling_param_field in inputs: + sampling_params[sampling_param_field] = inputs.pop(sampling_param_field)[0][0] + return sampling_params + + @staticmethod + def _length_params_from_triton_inputs(**inputs: np.ndarray): + """Extract LengthParam fields from triton input dict""" + length_params: LengthParam = get_default_length_params() + for length_param_field in length_params.keys(): + if length_param_field in inputs: + length_params[length_param_field] = inputs.pop(length_param_field)[0][0] + return length_params + + @batch + def triton_infer_fn(self, **inputs: np.ndarray): + """Triton server inference function that actually runs the model""" + if torch.distributed.is_initialized(): + distributed_rank = torch.distributed.get_rank() + if distributed_rank != 0: + raise ValueError( + f"Triton inference function should not be called on a thread with torch.distributed rank != 0, but this thread is rank {distributed_rank}" + ) + signal_value = ServerSync.SIGNAL.to_long_tensor() + torch.distributed.broadcast(signal_value, 0) + + input_strings = str_ndarray2list(inputs.pop("prompts")) + sampling_params = self._sampling_params_from_triton_inputs(**inputs) + length_params = self._length_params_from_triton_inputs(**inputs) + + model_output = self.model.generate( + inputs=input_strings, length_params=length_params, sampling_params=sampling_params + ) + ''' + model_output['sentences'] will be a list of strings (one per prompt) + other fields will either be a list of lists (tokens, for example) + or a list of pytorch Tensor + ''' + + triton_output = {} + _OUTPUT_FILLER_VALUES = { + 'tokens': "", + 'logprob': 0.0, + 'full_logprob': 0.0, + 'token_ids': -1, + 'offsets': -1, + } + for model_output_field, value in model_output.items(): + + if model_output_field != 'sentences' and value is 
not None: + # find length of longest non-sentence output item + field_longest_output_item = 0 + for item in value: + field_longest_output_item = max(field_longest_output_item, len(item)) + # then pad shorter items to match this length + for index, item in enumerate(value): + num_pad_values = field_longest_output_item - len(item) + if num_pad_values > 0: + pad_value = _OUTPUT_FILLER_VALUES[model_output_field] + if isinstance(item, torch.Tensor): + pad_tensor = torch.full( + (num_pad_values, item.size(1)) if item.dim() > 1 else (num_pad_values,), + pad_value, + dtype=item.dtype, + device='cuda', + ) + padded_item = torch.cat((item, pad_tensor)) + value[index] = padded_item + else: + pad_list = [pad_value] * num_pad_values + padded_item = item + pad_list + value[index] = padded_item + + field_dtype = GetNumpyDtype(MegatronLLMDeployable._BLANK_OUTPUTTYPE[model_output_field][0]) + if value is None: + # triton does not allow for optional output parameters, so need to populate them if they don't exist + triton_output[model_output_field] = np.full( + # 'sentences' should always have a valid value, so use that for the output shape + np.shape(model_output['sentences']), + MegatronLLMDeployable._BLANK_OUTPUTTYPE[model_output_field][0], + dtype=field_dtype, + ) + elif field_dtype == bytes: + # strings are cast to bytes + triton_output[model_output_field] = cast_output(value, field_dtype) + elif isinstance(value[0], torch.Tensor): + if value[0].dtype == torch.bfloat16: + # numpy currently does not support bfloat16, so need to manually convert it + triton_output[model_output_field] = np.array([tensor.cpu().float().numpy() for tensor in value]) + else: + triton_output[model_output_field] = np.array([tensor.cpu().numpy() for tensor in value]) + else: + # non-strings are output as-is (in numpy format) + triton_output[model_output_field] = np.array(value) + return triton_output diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 
0f7866e57cda4..835ff46dd5fe9 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -19,9 +19,9 @@ from pathlib import Path from nemo.deploy import DeployPyTriton +from nemo.deploy.nlp import MegatronLLMDeployable from nemo.export import TensorRTLLM - LOGGER = logging.getLogger("NeMo") @@ -31,6 +31,13 @@ def get_args(argv): description=f"Deploy nemo models to Triton", ) parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file") + parser.add_argument( + "-dsn", + "--direct_serve_nemo", + default=False, + action='store_true', + help="Serve the nemo model directly instead of exporting to TRTLLM first. Will ignore other TRTLLM-specific arguments.", + ) parser.add_argument( "-ptnc", "--ptuning_nemo_checkpoint", @@ -146,18 +153,7 @@ def get_args(argv): return args -def nemo_deploy(argv): - args = get_args(argv) - - if args.debug_mode: - loglevel = logging.DEBUG - else: - loglevel = logging.INFO - - LOGGER.setLevel(loglevel) - LOGGER.info("Logging level set to {}".format(loglevel)) - LOGGER.info(args) - +def get_trtllm_deployable(args): if args.triton_model_repository is None: trt_llm_path = "/tmp/trt_llm_model_dir/" LOGGER.info( @@ -170,28 +166,24 @@ def nemo_deploy(argv): trt_llm_path = args.triton_model_repository if args.nemo_checkpoint is None and args.triton_model_repository is None: - LOGGER.error( + raise ValueError( "The provided model repository is not a valid TensorRT-LLM model " "directory. Please provide a --nemo_checkpoint." ) - return if args.nemo_checkpoint is None and not os.path.isdir(args.triton_model_repository): - LOGGER.error( + raise ValueError( "The provided model repository is not a valid TensorRT-LLM model " "directory. Please provide a --nemo_checkpoint." 
) - return if args.nemo_checkpoint is not None and args.model_type is None: - LOGGER.error("Model type is required to be defined if a nemo checkpoint is provided.") - return + raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") ptuning_tables_files = [] if not args.ptuning_nemo_checkpoint is None: if args.max_prompt_embedding_table_size is None: - LOGGER.error("max_prompt_embedding_table_size parameter is needed for the prompt tuning table(s).") - return + raise ValueError("max_prompt_embedding_table_size parameter is needed for the prompt tuning table(s).") for pt_checkpoint in args.ptuning_nemo_checkpoint: ptuning_nemo_checkpoint_path = Path(pt_checkpoint) @@ -199,19 +191,16 @@ def nemo_deploy(argv): if ptuning_nemo_checkpoint_path.is_file(): ptuning_tables_files.append(pt_checkpoint) else: - LOGGER.error("Could not read the prompt tuning tables from {0}".format(pt_checkpoint)) - return + raise IsADirectoryError("Could not read the prompt tuning tables from {0}".format(pt_checkpoint)) else: - LOGGER.error("File or directory {0} does not exist.".format(pt_checkpoint)) - return + raise FileNotFoundError("File or directory {0} does not exist.".format(pt_checkpoint)) if args.task_ids is not None: if len(ptuning_tables_files) != len(args.task_ids): - LOGGER.error( + raise RuntimeError( "Number of task ids and prompt embedding tables have to match. " "There are {0} tables and {1} task ids.".format(len(ptuning_tables_files), len(args.task_ids)) ) - return trt_llm_exporter = TensorRTLLM( model_dir=trt_llm_path, @@ -245,8 +234,7 @@ def nemo_deploy(argv): save_nemo_model_config=True, ) except Exception as error: - LOGGER.error("An error has occurred during the model export. Error message: " + str(error)) - return + raise RuntimeError("An error has occurred during the model export. 
Error message: " + str(error)) try: for i, prompt_embeddings_checkpoint_path in enumerate(ptuning_tables_files): @@ -265,12 +253,35 @@ def nemo_deploy(argv): prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, ) except Exception as error: - LOGGER.error("An error has occurred during adding the prompt embedding table(s). Error message: " + str(error)) - return + raise RuntimeError( + "An error has occurred during adding the prompt embedding table(s). Error message: " + str(error) + ) + return trt_llm_exporter + + +def get_nemo_deployable(args): + if args.nemo_checkpoint is None: + raise ValueError("Direct serve requires a .nemo checkpoint") + return MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus) + + +def nemo_deploy(argv): + args = get_args(argv) + + if args.debug_mode: + loglevel = logging.DEBUG + else: + loglevel = logging.INFO + + LOGGER.setLevel(loglevel) + LOGGER.info("Logging level set to {}".format(loglevel)) + LOGGER.info(args) + + triton_deployable = get_nemo_deployable(args) if args.direct_serve_nemo else get_trtllm_deployable(args) try: nm = DeployPyTriton( - model=trt_llm_exporter, + model=triton_deployable, triton_model_name=args.triton_model_name, triton_model_version=args.triton_model_version, max_batch_size=args.max_batch_size, diff --git a/tests/deploy/pytriton_deploy.py b/tests/deploy/pytriton_deploy.py new file mode 100644 index 0000000000000..3b722d2d7fec2 --- /dev/null +++ b/tests/deploy/pytriton_deploy.py @@ -0,0 +1,136 @@ +import argparse + +import numpy as np +from pytriton.client import ModelClient + +from nemo.deploy.deploy_pytriton import DeployPyTriton +from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable +from nemo.deploy.nlp.query_llm import NemoTritonQueryLLMPyTorch + + +def test_triton_deployable(args): + megatron_deployable = MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus) + + prompts = ["What is the biggest planet in the solar system?", "What is the fastest steam locomotive 
in history?"] + url = "localhost:8000" + model_name = args.model_name + init_timeout = 600.0 + + nm = DeployPyTriton( + model=megatron_deployable, + triton_model_name=model_name, + triton_model_version=1, + max_batch_size=8, + port=8000, + address="0.0.0.0", + streaming=False, + ) + nm.deploy() + nm.run() + + # run once with NemoTritonQueryLLMPyTorch + nemo_triton_query = NemoTritonQueryLLMPyTorch(url, model_name) + + result_dict = nemo_triton_query.query_llm( + prompts, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + max_length=args.max_output_token, + init_timeout=init_timeout, + ) + print("NemoTritonQueryLLMPyTriton result:") + print(result_dict) + + # run once with ModelClient, the results should be identical + str_ndarray = np.array(prompts)[..., np.newaxis] + prompts = np.char.encode(str_ndarray, "utf-8") + max_output_token = np.full(prompts.shape, args.max_output_token, dtype=np.int_) + top_k = np.full(prompts.shape, args.top_k, dtype=np.int_) + top_p = np.full(prompts.shape, args.top_p, dtype=np.single) + temperature = np.full(prompts.shape, args.temperature, dtype=np.single) + + with ModelClient(url, model_name, init_timeout_s=init_timeout) as client: + result_dict = client.infer_batch( + prompts=prompts, + max_length=max_output_token, + top_k=top_k, + top_p=top_p, + temperature=temperature, + ) + print("ModelClient result:") + print(result_dict) + + # test logprobs generation + # right now we don't support batches where output data is inconsistent in size, so submitting each prompt individually + all_probs = np.full(prompts.shape, True, dtype=np.bool_) + compute_logprob = np.full(prompts.shape, True, dtype=np.bool_) + with ModelClient(url, model_name, init_timeout_s=init_timeout) as client: + logprob_results = client.infer_batch( + prompts=prompts, + max_length=max_output_token, + top_k=top_k, + top_p=top_p, + temperature=temperature, + all_probs=all_probs, + compute_logprob=compute_logprob, + ) + print("Logprob results:") + 
print(logprob_results) + + nm.stop() + + +def get_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f"Deploy nemo models to Triton and benchmark the models", + ) + + parser.add_argument( + "--model_name", + type=str, + required=True, + ) + parser.add_argument( + "--num_gpus", + type=int, + default=1, + ) + parser.add_argument( + "--nemo_checkpoint", + type=str, + required=True, + ) + parser.add_argument( + "--max_batch_size", + type=int, + default=8, + ) + parser.add_argument( + "--max_output_token", + type=int, + default=128, + ) + parser.add_argument( + "--top_k", + type=int, + default=1, + ) + parser.add_argument( + "--top_p", + type=float, + default=0.0, + ) + parser.add_argument( + "--temperature", + type=float, + default=1.0, + ) + + return parser.parse_args() + + +if __name__ == '__main__': + args = get_args() + test_triton_deployable(args)