diff --git a/.github/ISSUE_TEMPLATE/dev_container_bug_report.md b/.github/ISSUE_TEMPLATE/dev_container_bug_report.md new file mode 100644 index 000000000000..fe81ec6252d8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/dev_container_bug_report.md @@ -0,0 +1,35 @@ +--- +container pulled on date: mm/dd/yyyy +name: Dev container - Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: '' + +--- + +**Describe the bug** + +A clear and concise description of what the bug is. + +**Steps/Code to reproduce bug** + +Please list *minimal* steps or code snippet for us to be able to reproduce the bug. + +A helpful guide on how to craft a minimal bug report: http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports. + + +**Expected behavior** + +A clear and concise description of what you expected to happen. + +**Environment overview (please complete the following information)** + + - Environment location: Docker + - Method of install: Please specify exact commands you used to install. + - If method of install is [Docker], provide `docker pull` & `docker run` commands used + +**Additional context** + +Add any other context about the problem here. 
+Example: GPU model diff --git a/.github/scripts/slackHelper.sh b/.github/scripts/slackHelper.sh deleted file mode 100644 index 4696cebcf13b..000000000000 --- a/.github/scripts/slackHelper.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -function sendSlackMessage() { - - WEBHOOK_URL="$1" - PIPELINE_URL="$2" - - curl -X POST -H "Content-type: application/json" --data "{ - \"blocks\": [ - { - \"type\": \"section\", - \"text\": { - \"type\": \"mrkdwn\", - \"text\": \"\ -🚨 *CI/CD failure at <$PIPELINE_URL|NeMo CI>*: - -\" - } - } - ] - }" $WEBHOOK_URL - -} diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 31e9452d0fe5..5956a23bdd67 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -30,13 +30,16 @@ on: conclusion: description: Conclusion of main test step value: ${{ jobs.main.outputs.conclusion }} - + log: + description: Last 2000 characters of the test step's log + value: ${{ jobs.main.outputs.log }} jobs: main: runs-on: ${{ inputs.RUNNER }} timeout-minutes: ${{ inputs.TIMEOUT }} outputs: conclusion: ${{ steps.main.conclusion }} + log: ${{ steps.main.outputs.log }} container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -50,7 +53,21 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - id: main - run: ${{ inputs.SCRIPT }} + name: Run main script + run: | + set +e + ( + set -e + + ${{ inputs.SCRIPT }} + ) 2> >(tee err.log) + + EXIT_CODE=$? 
+ + echo "log=$(tail -c 2000 err.log | base64 -w 0)" >> "$GITHUB_OUTPUT" + + exit $EXIT_CODE + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: failure() && inputs.IS_OPTIONAL == false - name: after_script diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 12b8cdcb8eed..abac79310fdf 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -871,318 +871,6 @@ jobs: pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ output_manifest=preds.json - # L2: Dialogue Classification - - # TODO: pleasefixme - # L2_Dialogue_Classification_Dialogue_Intent_and_slot_classification_using_GPT: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure-gpus-1 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # cd examples/nlp/dialogue && \ - # python dialogue.py \ - # model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin\ - # model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - # model.dataset.dialogues_example_dir=sgd_gen_outputs \ - # model.dataset.task_name=debug_sample \ - # trainer.max_steps=1 \ - # trainer.max_epochs=1 \ - # model.train_ds.batch_size=2 \ - # model.validation_ds.batch_size=2 \ - # model.test_ds.batch_size=2 \ - # model.nemo_path=null \ - # trainer.val_check_interval=0.0 \ - # trainer.devices=1 \ - # model.dataset.use_cache=false \ - # model.tokenizer.special_tokens={pad_token:"endoftext"} \ - # model.tokenizer.tokenizer_name=gpt2 \ - # model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - # 
model.language_model.pretrained_model_name=/home/TestData/nlp/gpt2 \ - # trainer.accelerator=gpu \ - # exp_manager=null && \ - # rm -rf sgd_gen_outputs - - L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ - model.dataset.task_name=debug_sample \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.dataset.num_tasks=6 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-cased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_outputs - - L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ - model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ - model.dataset.task=assistant \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_intent_classification_outputs - - 
L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ - model.dataset.task=zero_shot \ - model.dataset.prompt_template="This example is" \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_zero_shot_intent_classification_outputs - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=megatron \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ 
- model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_outputs - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_bart_outputs - - L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_dialogue_nearest_neighbour_classification_outputs - - # L2: Dialogue 
Generation - L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender_s2s \ - model.dataset.task=ms_marco \ - model.library=huggingface \ - model.dataset.debug_mode=True \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender_s2s - - L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ - model.dataset.task_name=debug_sample \ - model.dataset.task=sgd_generation \ - model.dataset.input_field=utterance+system_actions \ - model.dataset.output_field=system_utterance \ - model.dataset.use_cache=false \ - model.dataset.system_utterance=next_turn \ - model.dataset.debug_mode=True \ - model.dataset.prompt_template=slots_values \ - model.library=huggingface \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - 
model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null - AFTER_SCRIPT: | - rm -rf sgd_answer_extender_s2s - -# - name: L2: Dialogue Generation Part 2 -# when { -# anyOf { -# branch main -# changeRequest target: main -# } -# } -# failFast true -# parallel { -# - name: Dialogue: Answer Extender using DialogueGPTGenerationModel -# - run: | -# cd examples/nlp/dialogue && \ -# python dialogue.py \ -# do_training=False \ -# model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ -# model.dataset.dialogues_example_dir=answer_extender \ -# model.library=huggingface \ -# model.dataset.task=ms_marco \ -# model.dataset.debug_mode=True \ -# trainer.val_check_interval=0.0 \ -# trainer.devices=1 \ -# model.dataset.use_cache=false \ -# model.language_model.pretrained_model_name=gpt2 \ -# trainer.accelerator=gpu \ -# exp_manager=null && \ -# rm -rf answer_extender -# } -# } -# } -# } - - # L2: COPY - L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender \ - model.library=huggingface \ - model.dataset.task=ms_marco \ - model.dataset.debug_mode=True \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender - # L2: Duplex Text Normalization L2_Duplex_Text_Normalization_with_Tarred_dataset: needs: [cicd-test-container-setup] @@ -1212,216 +900,6 @@ jobs: data.test_ds.use_cache=false \ data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv -# Runs out of memory on the 12G TITAN V (GPU 0 on main CI) -# TODO: add when 
megatron bert is supported again in NeMo -# - name: L2: MegaBERT Token Classification -# when { -# anyOf { -# branch main -# changeRequest target: main -# } -# } -# failFast true -# - run: | -# cd examples/nlp/token_classification && \ -# python token_classification_train.py \ -# model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ -# model.language_model.pretrained_model_name=megatron-bert-345m-uncased \ -# model.train_ds.batch_size=10 \ -# model.dataset.max_seq_length=50 \ -# model.dataset.use_cache=false \ -# trainer.accelerator=gpu \ -# trainer.strategy=ddp \ -# trainer.precision=16 \ -# trainer.devices=1 \ -# trainer.accelerator="gpu" \ -# +trainer.fast_dev_run=true \ -# exp_manager=null -# } -# } - - # L2: BERT Text Classification - L2_BERT_Text_Classification_with_BERT_Test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/text_classification && \ - python text_classification_with_bert.py \ - model.dataset.num_classes=6 \ - model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.train_ds.batch_size=10 \ - model.dataset.max_seq_length=50 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null - - # L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0 - L2_Parallel_BERT_Question-Answering_SQUAD_v1_1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - 
model.dataset.use_cache=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - L2_Parallel_BERT_Question-Answering_SQUAD_v2_0: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - # L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0 - L2_Parallel_BART_Question-Answering_SQUAD_v1_1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - 
model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - L2_Parallel_BART_Question-Answering_SQUAD_v2_0: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - # L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0 - L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - 
model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null # L2: Intent and Slot Classification Tasks L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: @@ -1653,241 +1131,7 @@ jobs: pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; rm -rf "${data_dir}" - - - L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data: - needs: [cicd-test-container-setup] - uses: 
./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/token_classification && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir}" \ - model.validation_ds.ds_item="${tmp_data_dir}" \ - model.test_ds.ds_item="${tmp_data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=true && \ - tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ - mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ - rm -rf "${tmp_data_dir}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir_2}" \ - model.validation_ds.ds_item="${tmp_data_dir_2}" \ - model.test_ds.ds_item="${tmp_data_dir_2}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null; - - rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ - "${tmp_data_dir_2}" \ - "${output_dir}" - - # Punctuation & Capitalization tarred dataset: - Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - 
data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ - /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ - "${data_dir}"/ && \ - usual_data=${data_dir}/wmt_wiki_10000 && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tarred_data=${output_dir}/train_tarred && \ - tokens_in_batch=2000 && \ - max_seq_length=512 && \ - lm_model=distilbert-base-uncased && \ - python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ - --text ${usual_data}/input.txt \ - --labels ${usual_data}/labels.txt \ - --output_dir ${tarred_data} \ - --tokens_in_batch ${tokens_in_batch} \ - --max_seq_length 512 \ - --lines_per_dataset_fragment 2000 \ - --num_batches_per_tarfile 5 \ - --tar_file_prefix punctuation_capitalization \ - --tokenizer_name ${lm_model} \ - --use_fast_tokenizer \ - --pad_label O \ - --n_jobs 3 && \ - echo "Number of tarred files in dataset:" && \ - ls ${tarred_data}/*.tar | wc -l && \ - echo "Label id files in dataset:" && \ - ls ${tarred_data}/*.csv && \ - metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.train_ds.ds_item=${tarred_data} \ - model.language_model.pretrained_model_name=${lm_model} \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.tar_metadata_file=${metadata_file} \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir=${output_dir}/output; - rm -rf "${output_dir}" "${data_dir}" - - # Punctuation_Capitalization_Different_ways_of_passing_labels_to_model - 
Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - label_vocab_dir="${work_dir}/labels" && \ - mkdir -p ${label_vocab_dir} && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ - capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ - printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ - printf "O\nU\n" > "${capit_label_vocab}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ - model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ - model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - 
+model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}" - - # TODO: pleasefixme - # Punctuation_Capitalization_Using_model-common_datasets_parameters-punct-capit-_label_ids: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # cd examples/nlp/token_classification && \ - # work_dir="$(mktemp -d -p "$(pwd)")" && \ - # output_dir="${work_dir}/output" && \ - # mkdir -p "${output_dir}" && \ - # data_dir="${work_dir}/data" && \ - # mkdir -p "${data_dir}" && \ - # cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - # conf_name=punctuation_capitalization_config_with_ids && \ - # cp conf/punctuation_capitalization_config.yaml "${work_dir}/${conf_name}.yaml" && \ - # sed -i $\'s/punct_label_ids: null/punct_label_ids: {O: 0, \\\',\\\': 1, .: 2, \\\'?\\\': 3}/\' \ - # "${work_dir}/${conf_name}.yaml" && \ - # sed -i $\'s/capit_label_ids: null/capit_label_ids: {O: 0, U: 1}/\' \ - # "${work_dir}/${conf_name}.yaml" && \ - # python punctuation_capitalization_train_evaluate.py \ - # --config-path "${work_dir}" \ - # --config-name "${conf_name}" \ - # model.train_ds.use_tarred_dataset=false \ - # model.train_ds.ds_item="${data_dir}" \ - # model.validation_ds.ds_item="${data_dir}" \ - # model.test_ds.ds_item="${data_dir}" \ - # model.language_model.pretrained_model_name=distilbert-base-uncased \ - # +model.train_ds.use_cache=false \ - # +model.validation_ds.use_cache=false \ - # +model.test_ds.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.strategy=ddp \ - # 
trainer.max_epochs=1 \ - # +exp_manager.explicit_log_dir="${output_dir}" \ - # +do_testing=false && \ - # python punctuation_capitalization_train_evaluate.py \ - # +do_training=false \ - # +do_testing=true \ - # ~model.train_ds \ - # ~model.validation_ds \ - # model.test_ds.ds_item="${data_dir}" \ - # pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - # +model.train_ds.use_cache=false \ - # +model.validation_ds.use_cache=false \ - # +model.test_ds.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.strategy=ddp \ - # trainer.max_epochs=1 \ - # exp_manager=null && \ - # rm -rf "${work_dir}" - - # Punctuation & Capitalization inference - Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - output_dir="$(mktemp -d -p "$(pwd)")" && \ - python examples/nlp/token_classification/punctuate_capitalize_infer.py \ - --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ - --output_text "${output_dir}/iwslt_inference_result.txt" \ - --max_seq_length 92 \ - --step 8 \ - --margin 16 \ - --pretrained_name punctuation_en_bert \ - --batch_size 32; - rm -rf "${output_dir}" # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed L2_Pretraining_BERT_pretraining_from_Text: @@ -1947,23 +1191,6 @@ jobs: #rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed - # L2: Entity Linking - L2_Entity_Linking_Self_Alignment_Pretraining_BERT: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/entity_linking && \ - python self_alignment_pretraining.py \ - project_dir=. 
\ - trainer.val_check_interval=3 \ - model.raw_data=None \ - model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \ - model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \ - model.train_ds.batch_size=8 \ - model.validation_ds.batch_size=8 \ - exp_manager.exp_dir=null # TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 # is in the release container @@ -2581,211 +1808,250 @@ jobs: L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python 
examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + 
trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + 
model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings L2_Megatron_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - 
model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python 
examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + 
model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - 
model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + 
container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + 
trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_RETRO_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] @@ -3086,168 +2352,191 @@ jobs: L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - 
model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - 
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - 
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # trainer.devices=2 \ - # trainer.accelerator=gpu \ - # trainer.log_every_n_steps=1 \ - # trainer.val_check_interval=2 \ - # trainer.limit_val_batches=1 \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=6 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # exp_manager.resume_if_exists=True \ - # model.tensor_model_parallel_size=2 \ - # model.optim.name=fused_adam \ - # model.optim.lr=2e-4 \ - # model.optim.sched.warmup_steps=2 \ - # model.optim.sched.constant_steps=2 \ - # model.optim.sched.min_lr=8e-5 \ - # model.max_position_embeddings=128 \ - # model.encoder_seq_length=128 \ - # model.data.seq_length=128 \ - # model.position_embedding_type=rope \ - # model.rotary_percentage=0.5 \ - # model.normalization=rmsnorm \ - # model.bias=False \ - # model.bias_activation_fusion=False \ - # model.bias_dropout_add_fusion=False \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # model.num_layers=8 \ - # model.hidden_size=256 \ - # model.num_attention_heads=8 \ - # 
model.activations_checkpoint_method=block \ - # model.activations_checkpoint_granularity=full \ - # model.activations_checkpoint_num_layers=1 \ - # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + 
model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.validation_drop_last=False \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + 
rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + + L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=rope \ + model.rotary_percentage=0.5 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + # trainer.devices=2 \ + # trainer.accelerator=gpu \ + # trainer.log_every_n_steps=1 \ + # trainer.val_check_interval=2 \ + # trainer.limit_val_batches=1 \ + # trainer.accumulate_grad_batches=1 \ + # trainer.max_steps=6 \ + # trainer.gradient_clip_val=1.0 \ + # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + # exp_manager.resume_if_exists=True \ + # model.tensor_model_parallel_size=2 \ + # model.optim.name=fused_adam \ + # model.optim.lr=2e-4 \ + # model.optim.sched.warmup_steps=2 \ + # model.optim.sched.constant_steps=2 \ + # model.optim.sched.min_lr=8e-5 \ + # model.max_position_embeddings=128 \ + # model.encoder_seq_length=128 \ + # model.data.seq_length=128 \ + # model.position_embedding_type=rope \ + # model.rotary_percentage=0.5 \ + # model.normalization=rmsnorm \ + # model.bias=False \ + # model.bias_activation_fusion=False \ + # model.bias_dropout_add_fusion=False \ + # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + # model.num_layers=8 \ + # model.hidden_size=256 \ + # model.num_attention_heads=8 \ + # model.activations_checkpoint_method=block \ + # model.activations_checkpoint_granularity=full \ + # model.activations_checkpoint_num_layers=1 \ + # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + # 
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" # This test requires Ampere but some of the test GPUs are Volta # Need to add a check for compute capability before uncommenting this test @@ -3343,169 +2632,192 @@ jobs: L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # not testing resume functionality to save time on ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=alibi \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf 
examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=alibi \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + 
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # not testing resume functionality to save time on ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=alibi \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" 
L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - 
#trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=kerple \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python 
examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=kerple \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.precision=16 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + 
#exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=kerple \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] @@ -3663,36 +2975,50 @@ jobs: L2_Megatron_GPT_Finetuning_StarCoder_PP1: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.precision=32 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - 
+trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.peft.peft_scheme=none \ - model.optim.name=distributed_fused_adam \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0] - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_sft_results - + runs-on: self-hosted-azure-gpus-1 + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.precision=bf16 \ + trainer.max_steps=4 \ + trainer.val_check_interval=4 \ + trainer.enable_checkpointing=False \ + +trainer.limit_val_batches=2 \ + +trainer.limit_test_batches=2 \ + exp_manager.checkpoint_callback_params.save_best_model=False \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.peft.peft_scheme=none \ + model.optim.name=distributed_fused_adam \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ + 
model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.num_workers=0 \ + model.data.train_ds.concat_sampling_probabilities=[1.0] + + rm -rf examples/nlp/language_modeling/gpt_sft_results + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4545,75 +3871,7 @@ jobs: AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/bart_pretrain_results - # L2: Megatron T5 GLUE/XNLI Finetuning - # TODO(Oktai15): update it in 1.8.0 version - L2_Megatron_T5_GLUE_RTE: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - 
model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=rte \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_glue_results - - L2_Megatron_T5_GLUE_XNLI: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - -cn megatron_t5_config_finetune_glue_xnli \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.test_ds.global_batch_size=2 \ - model.data.test_ds.micro_batch_size=2 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=xnli \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ - model.data.test_ds.task_name=xnli \ - model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_xnli_results - + L2_Megatron_T5_PEFT_Lora_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4941,23 +4199,7 @@ jobs: - 
L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference - - L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA - - L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel - - L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier - - L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel - - L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel - - L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel - - L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel - L2_Duplex_Text_Normalization_with_Tarred_dataset - - L2_BERT_Text_Classification_with_BERT_Test - - L2_Parallel_BERT_Question-Answering_SQUAD_v1_1 - - L2_Parallel_BERT_Question-Answering_SQUAD_v2_0 - - L2_Parallel_BART_Question-Answering_SQUAD_v1_1 - - L2_Parallel_BART_Question-Answering_SQUAD_v2_0 - - L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1 - - L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0 - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test @@ -4965,13 +4207,8 @@ jobs: - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1 - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation - - L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data - - 
Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset - - Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir - - Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text - L2_Pretraining_BERT_pretraining_from_Text - L2_Pretraining_BERT_from_Preprocessed - - L2_Entity_Linking_Self_Alignment_Pretraining_BERT - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN - L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation @@ -5013,8 +4250,6 @@ jobs: - L2_Megatron_T5_Eval - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2 - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2 - - L2_Megatron_T5_GLUE_RTE - - L2_Megatron_T5_GLUE_XNLI - L2_Megatron_T5_PEFT_Lora_TP2 - L2_Megatron_Mock_Data_Generation_MockGPTDataset - L2_Megatron_Mock_Data_Generation_MockT5Dataset @@ -5049,12 +4284,69 @@ jobs: - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }} run: | - source .github/scripts/slackHelper.sh - - WEBHOOK_URL=${{ secrets.SLACK_WEBHOOK }} + set -x + + PR_INFO=$(curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/${{ github.repository }}/pulls/${{ github.event.number }} + ) + PR_URL=$(echo -E $PR_INFO | jq '.html_url' | tr -d '"') + PR_TITLE=$(echo -E $PR_INFO | jq '.title' | tr -d '"') + PIPELINE_URL=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - - sendSlackMessage "$WEBHOOK_URL" "$PIPELINE_URL" + BASE_MESSAGE=' + { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "🚨 *CI/CD failure at <'$PIPELINE_URL'|NeMo CI>*." 
+ } + } + ] + } + ' + + # We are close to reaching 100 jobs: Once we break that barrier, we have to iterate pages + JOBS_URL="https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" + SUMMARY="[]" + while IFS= read -r JOB; do + JOB_NAME="$(echo $JOB | jq '.key' | tr -d '"') / main" + JOB_ID=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" $JOBS_URL | jq --arg job_name "$JOB_NAME" -r '.jobs[] | select(.name == $job_name) | .id') + JOB_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}/job/$JOB_ID" + + LOGS=$(echo $JOB | yq '(.value.outputs.log | @base64d)' | tr -d '"') + + SUMMARY=$(echo "$SUMMARY" | jq \ + --arg pr "<$PR_URL|$PR_TITLE>" \ + --arg job "<$JOB_URL|$JOB_NAME>" \ + --arg logs "$LOGS" \ + --arg author "" \ + --arg branch ""\ + '. += [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + "PR: " + $pr + + "\nJob: " + $job + + "\nAuthor: " + $author + + "\nBranch: " + $branch + + "\nLogs:" + + "```\n" + $logs + "\n```" + ) + } + } + ]') + done <<<$(echo '${{ toJSON(needs) }}' | jq -c 'to_entries | .[] | select(.value.outputs.conclusion == "failure")') + + MESSAGE=$(echo $BASE_MESSAGE | jq -c --argjson summary "$SUMMARY" '.blocks += $summary') + + curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_WEBHOOK }} - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} run: | diff --git a/README.rst b/README.rst index c4cbf759d975..ab3a4b6b06c9 100644 --- a/README.rst +++ b/README.rst @@ -45,58 +45,112 @@ Latest News
Large Language Models and Multimodal -
- Accelerate your generative AI journey with NVIDIA NeMo Framework on GKE (2024/03/16) +
+ + + NVIDIA sets new generative AI performance and scale records in MLPerf Training v4.0 + (2024/06/12) + + + Using NVIDIA NeMo Framework and NVIDIA Hopper GPUs, NVIDIA was able to scale to 11,616 H100 GPUs and achieve near-linear performance scaling on LLM pretraining. + NVIDIA also achieved the highest LLM fine-tuning performance and raised the bar for text-to-image training. +

+
- An end-to-end walkthrough to train generative AI models on the Google Kubernetes Engine (GKE) using the NVIDIA NeMo Framework is available at https://github.com/GoogleCloudPlatform/nvidia-nemo-on-gke. The walkthrough includes detailed instructions on how to set up a Google Cloud Project and pre-train a GPT model using the NeMo Framework. +
+ + + Accelerate your generative AI journey with NVIDIA NeMo Framework on GKE + (2024/03/16) + + + An end-to-end walkthrough to train generative AI models on the Google Kubernetes Engine (GKE) using the NVIDIA NeMo Framework is available at https://github.com/GoogleCloudPlatform/nvidia-nemo-on-gke. + The walkthrough includes detailed instructions on how to set up a Google Cloud Project and pre-train a GPT model using the NeMo Framework.

- Bria Builds Responsible Generative AI for Enterprises Using NVIDIA NeMo, Picasso (2024/03/06) - - Bria, a Tel Aviv startup at the forefront of visual generative AI for enterprises now leverages the NVIDIA NeMo Framework. The Bria.ai platform uses reference implementations from the NeMo Multimodal collection, trained on NVIDIA Tensor Core GPUs, to enable high-throughput and low-latency image generation. Bria has also adopted NVIDIA Picasso, a foundry for visual generative AI models, to run inference. + + + Bria Builds Responsible Generative AI for Enterprises Using NVIDIA NeMo, Picasso + (2024/03/06) + + + Bria, a Tel Aviv startup at the forefront of visual generative AI for enterprises now leverages the NVIDIA NeMo Framework. + The Bria.ai platform uses reference implementations from the NeMo Multimodal collection, trained on NVIDIA Tensor Core GPUs, to enable high-throughput and low-latency image generation. + Bria has also adopted NVIDIA Picasso, a foundry for visual generative AI models, to run inference.

-
- -
- New NVIDIA NeMo Framework Features and NVIDIA H200 (2023/12/06) +
- NVIDIA NeMo Framework now includes several optimizations and enhancements, including: 1) Fully Sharded Data Parallelism (FSDP) to improve the efficiency of training large-scale AI models, 2) Mix of Experts (MoE)-based LLM architectures with expert parallelism for efficient LLM training at scale, 3) Reinforcement Learning from Human Feedback (RLHF) with TensorRT-LLM for inference stage acceleration, and 4) up to 4.2x speedups for Llama 2 pre-training on NVIDIA H200 Tensor Core GPUs. -

- H200-NeMo-performance -

-
- -
- NVIDIA now powers training for Amazon Titan Foundation models (2023/11/28) +
+ + + New NVIDIA NeMo Framework Features and NVIDIA H200 + (2023/12/06) + + + NVIDIA NeMo Framework now includes several optimizations and enhancements, + including: + 1) Fully Sharded Data Parallelism (FSDP) to improve the efficiency of training large-scale AI models, + 2) Mix of Experts (MoE)-based LLM architectures with expert parallelism for efficient LLM training at scale, + 3) Reinforcement Learning from Human Feedback (RLHF) with TensorRT-LLM for inference stage acceleration, and + 4) up to 4.2x speedups for Llama 2 pre-training on NVIDIA H200 Tensor Core GPUs. +

+ + H200-NeMo-performance +

+
- NVIDIA NeMo Framework now empowers the Amazon Titan foundation models (FM) with efficient training of large language models (LLMs). The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock. The NeMo Framework provides a versatile framework for building, customizing, and running LLMs. -

-
+
+ + + NVIDIA now powers training for Amazon Titan Foundation models + (2023/11/28) + + + NVIDIA NeMo Framework now empowers the Amazon Titan foundation models (FM) with efficient training of large language models (LLMs). + The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock. + The NeMo Framework provides a versatile framework for building, customizing, and running LLMs. +

+
Speech Recognition -
- New Standard for Speech Recognition and Translation from the NVIDIA NeMo Canary Model (2024/04/18) - - The NeMo team just released Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization. Canary also provides bi-directional translation, between English and the three other supported languages. -

-
-
- Pushing the Boundaries of Speech Recognition with NVIDIA NeMo Parakeet ASR Models (2024/04/18) + + + New Standard for Speech Recognition and Translation from the NVIDIA NeMo Canary Model + (2024/04/18) + + + The NeMo team just released Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization. + Canary also provides bi-directional translation, between English and the three other supported languages. +

+
- NVIDIA NeMo, an end-to-end platform for the development of multimodal generative AI models at scale anywhereβ€”on any cloud and on-premisesβ€”released the Parakeet family of automatic speech recognition (ASR) models. These state-of-the-art ASR models, developed in collaboration with Suno.ai, transcribe spoken English with exceptional accuracy. +
+ + + Pushing the Boundaries of Speech Recognition with NVIDIA NeMo Parakeet ASR Models + (2024/04/18) + + + NVIDIA NeMo, an end-to-end platform for the development of multimodal generative AI models at scale anywhere—on any cloud and on-premises—released the Parakeet family of automatic speech recognition (ASR) models. + These state-of-the-art ASR models, developed in collaboration with Suno.ai, transcribe spoken English with exceptional accuracy.

-
+
- Turbocharge ASR Accuracy and Speed with NVIDIA NeMo Parakeet-TDT (2024/04/18) - - NVIDIA NeMo, an end-to-end platform for developing multimodal generative AI models at scale anywhereβ€”on any cloud and on-premisesβ€”recently released Parakeet-TDT. This new addition to the β€―NeMo ASR Parakeet model family boasts better accuracy and 64% greater speed over the previously best model, Parakeet-RNNT-1.1B. + + + Turbocharge ASR Accuracy and Speed with NVIDIA NeMo Parakeet-TDT + (2024/04/18) + + + NVIDIA NeMo, an end-to-end platform for developing multimodal generative AI models at scale anywhere—on any cloud and on-premises—recently released Parakeet-TDT. + This new addition to the NeMo ASR Parakeet model family boasts better accuracy and 64% greater speed over the previously best model, Parakeet-RNNT-1.1B.

diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst index 1c9325cf0a96..3c1a496993bd 100644 --- a/docs/source/core/core.rst +++ b/docs/source/core/core.rst @@ -741,3 +741,35 @@ To register a child model, use the ``register_nemo_submodule`` method of the par else: self.child_model = None + + +Profiling +--------- + +NeMo offers users two options for profiling: Nsys & CUDA memory profiling. These two options allow users +to debug performance issues as well as memory issues such as memory leaks. + +To enable Nsys profiling, add the following options to the model config: +nsys_profile: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + +Finally, run the model training script with: + +nsys profile -s none -o <profile filepath> -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/... +See more options at `nsight user guide <https://docs.nvidia.com/nsight-systems/UserGuide/index.html>`_. + + + +To enable CUDA memory profiling, add the following options to the model config: + +memory_profile: + enabled: True + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + rank: 0 # Global rank ID to profile + output_path: None # Path to store the profile output file + +And invoke your NeMo script without any changes in the invocation command. 
diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index bc66ae717ebb..4eef38e715d4 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -5,7 +5,7 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -41,7 +41,7 @@ exp_manager: model: # model parallelism - mcore_bert: False + mcore_bert: True micro_batch_size: 4 global_batch_size: 8 tensor_model_parallel_size: 1 @@ -85,7 +85,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 grad_div_ar_fusion: False @@ -158,4 +158,4 @@ model: name: CosineAnnealing warmup_steps: 500 constant_steps: 50000 - min_lr: 2e-5 \ No newline at end of file + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ca0c3f74e4c8..1f63f7742ea0 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,7 +9,7 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: False + mcore_gpt: True # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on 
data_parallel_size @@ -121,7 +121,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config_hyena.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config_hyena.yaml new file mode 100644 index 000000000000..30e0beb0d5e5 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config_hyena.yaml @@ -0,0 +1,277 @@ +defaults: + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: + +name: megatron_gpt_hyena +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
+ max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_gpt_hyena + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + +model: + # use GPTModel from megatron.core + mcore_gpt: True + + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 16 # limited by GPU memory + global_batch_size: 256 # will use more micro batches to reach global batch size + rampup_batch_size: null # Should be a list of 3 values: [, , ] + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline + + # model architecture + encoder_seq_length: 2048 + max_position_embeddings: ${.encoder_seq_length} + num_layers: 18 + 
hidden_size: 864 + ffn_hidden_size: 1728 + num_attention_heads: 1 + init_method_std: 0.023 # Standard deviation of the zero mean normal distribution used for weight initialization. + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 # Dropout probability for attention + ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. + normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: True # Whether to use bias terms in all weight matrices. + activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of each self-attention head. + transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] + openai_gelu: False # Use OpenAI's GELU instead of the default GeLU + normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True. + position_embedding_type: 'learned_absolute' # Position embedding type. 
Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. + attention_type: 'multihead' # Attention type. Options ['multihead'] + share_embeddings_and_output_weights: True # Share embedding and output layer weights. + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + + name: te_gpt_hyena # key for selecting the correct ModuleSpec + + hyena: + # HyenaOperator parameters + max_seq_length: ${model.encoder_seq_length} # Maximum input sequence length. 
+ order: 2 # Depth of the Hyena recurrence + num_heads: 1 # Number of heads (this is separate from model.num_attention_heads) + dropout: 0.0 + short_filter_order: 3 # Length of the explicit input convolutional filter + activation: "identity" # type of act between kernel output and output projection + + # HyenaConv parameters + precision: ${trainer.precision} # Training precision (required for FlashFFTConv initialization) + bias: true # Whether to apply a bias term following long convolution + + # HyenaFilter parameters + emb_dim: 33 # dimension of the filter's internal positional encoding + learn_pos_emb_z: true # whether the positional embeddings are learned + mlp_width: 64 # Width of the MLP parametrizing the implicit filter + sine_freq: 14 # frequency of periodic activations + num_inner_mlps: 2 # number of inner linear layers inside filter MLP + normalized: False # whether to apply normalization after modulation + + # ExponentialModulation parameters + modulate: True # Whether to apply exponential decay modulation + learn_modulation: False # Whether decay rates are learned + fast_decay_pct: 0.3 + slow_decay_pct: 1.5 + target: 1e-2 + shift: 0.0 + + tokenizer: + library: 'megatron' + type: 'GPT2BPETokenizer' + model: null + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. 
+ + # Mixed precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism. + gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask. + get_attention_mask_from_fusion: True # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + + + # Miscellaneous + seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. 
Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. 
The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Transformer Engine + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + ub_tp_comm_overlap: False + # Use userbuffer backend to overlap tensor-parallel communications with computes. + # This feature is only available with Transformer Engine and sequence parallelism enabled and, currently, supports only GPT models. + ub_tp_comm_overlap_cfg: null + # A yaml file with userbuffer communicator configurations. This file should provide `method`, `dtype`, `num_sm`, `num_splits`, + # `cga_size`, `num_splits`, `set_sm_margin`, and `aggregate` for the communicators to use custom settings. + # If the configuration file is not provided a default setting is used for all communicators. + + ## Flash Attention + use_flash_attention: False # Use flash attention in self-attention module, this config does nothing when transformer_engine=True + + data: + # Path to data must be specified by the user. + # Supports List, String and Dictionary + # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} + # Or see example below: + # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" + data_prefix: ??? 
+ index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: mmap + splits_string: "99990,8,2" + seq_length: ${model.encoder_seq_length} + skip_warmup: True + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token + pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size + shuffle_documents: True # Set to False to disable documents shuffling. Sample index will still be shuffled + exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: distributed_fused_adam + overlap_grad_sync: True + overlap_param_sync: False + contiguous_grad_buffer: True + lr: 6e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 636 + constant_steps: 100000 + min_lr: 2e-5 + + gc_interval: 0 + # Interval of the host memory garbage collection. When it is zero, collection relies on the automatic garbage collector. + # If an integer value larger than zero is set, collection is done manually by the batch step interval of `gc_interval`. 
diff --git a/examples/nlp/language_modeling/conf/megatron_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_quantization.yaml index 88d10ae0a66c..52454f5c8906 100644 --- a/examples/nlp/language_modeling/conf/megatron_quantization.yaml +++ b/examples/nlp/language_modeling/conf/megatron_quantization.yaml @@ -26,6 +26,7 @@ quantization: calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset num_calib_size: 512 # number of samples used for calibration awq_block_size: 128 # block size for scaling factors in AWQ algorithm + alpha: 1.0 # alpha parameter in SmoothQuant algorithm export: decoder_type: llama # gptnext, gpt2, llama diff --git a/nemo/collections/asr/modules/audio_preprocessing.py b/nemo/collections/asr/modules/audio_preprocessing.py index 2dca468fab35..33143364ede1 100644 --- a/nemo/collections/asr/modules/audio_preprocessing.py +++ b/nemo/collections/asr/modules/audio_preprocessing.py @@ -100,7 +100,7 @@ def __init__(self, win_length, hop_length): @torch.no_grad() def forward(self, input_signal, length): if input_signal.dtype != torch.float32: - logging.warn( + logging.warning( f"AudioPreprocessor received an input signal of dtype {input_signal.dtype}, rather than torch.float32. In sweeps across multiple datasets, we have found that the preprocessor is not robust to low precision mathematics. As such, it runs in float32. Your input will be cast to float32, but this is not necessarily enough to recovery full accuracy. For example, simply casting input_signal from torch.float32 to torch.bfloat16, then back to torch.float32 before running AudioPreprocessor causes drops in absolute WER of up to 0.1%. 
torch.bfloat16 simply does not have enough mantissa bits to represent enough values in the range [-1.0,+1.0] correctly.", mode=logging_mode.ONCE, ) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 824d84ffb461..fdcfbda047c8 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -4,7 +4,7 @@ import pytorch_lightning as pl from nemo.collections.llm.utils import task -from nemo.lightning import MegatronStrategy, Trainer, io, teardown +from nemo.lightning import MegatronStrategy, OptimizerModule, Trainer, io, teardown @task(namespace="llm") @@ -12,6 +12,7 @@ def train( model: pl.LightningModule, data: pl.LightningDataModule, trainer: Trainer, + opt: Optional[OptimizerModule] = None, tokenizer: Optional[str] = None, source: Optional[str] = None, export: Optional[str] = None, @@ -23,6 +24,8 @@ def train( model (pl.LightningModule): The model to be trained. data (pl.LightningDataModule): The data module containing training data. trainer (Trainer): The trainer instance configured with a MegatronStrategy. + opt (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default optimizer + from the model will be used. tokenizer (Optional[str]): Tokenizer setting to be applied. Can be 'data' or 'model'. source (Optional[str]): Path to a checkpoint from which to continue training. export (Optional[str]): Filename to save the exported checkpoint after training. 
@@ -58,6 +61,9 @@ def train( if source: _add_ckpt_path(source, model, fit_kwargs) + if opt: + opt.connect(model) + trainer.fit(model, data, **fit_kwargs) print(f"Saving checkpoint to: {export_dir}") diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 9bf710d98928..e577ddb63d26 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -1,15 +1,16 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional +from typing import TYPE_CHECKING, Dict, Literal, Optional import pytorch_lightning as L import torch import torch.distributed +from megatron.core.optimizer import OptimizerConfig from megatron.core.transformer.transformer_config import TransformerConfig -from torch.optim import Optimizer from nemo.collections.llm import fn from nemo.lightning import get_vocab_size, io from nemo.lightning.megatron_parallel import MaskedTokenLossReduction +from nemo.lightning.pytorch.opt import MegatronOptimizerModule, OptimizerModule if TYPE_CHECKING: from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel @@ -33,8 +34,6 @@ class GPTConfig(TransformerConfig): # TODO: Move this to better places? 
get_attention_mask_from_fusion: bool = False - optimizer_fn: Optional[Callable[["GPTModel"], Optimizer]] = None - def configure_model(self, tokenizer) -> "MCoreGPTModel": vp_size = self.virtual_pipeline_model_parallel_size if vp_size: @@ -69,21 +68,18 @@ def __init__( self, config: GPTConfig, # TODO: Add transformer_layer_spec when we update mcore + optim: Optional[OptimizerModule] = None, tokenizer: Optional["TokenizerSpec"] = None, ): super().__init__() self.config = config self.tokenizer = tokenizer + self.optim = optim or MegatronOptimizerModule(config=OptimizerConfig(lr=1e-4, use_distributed_optimizer=True)) + self.optim.connect(self) # This will bind the `configure_optimizers` method def configure_model(self) -> None: self.module = self.config.configure_model(self.tokenizer) - def configure_optimizers(self) -> Optimizer: - if self.config.optimizer_fn is not None: - return self.config.optimizer_fn(self) - - return gpt_default_optimizer(self) - def forward( self, input_ids: torch.Tensor, @@ -171,12 +167,6 @@ def gpt_forward_step(model, batch) -> torch.Tensor: return model(**forward_args) -def gpt_default_optimizer(module) -> Optimizer: - from apex.optimizers import FusedAdam - - return FusedAdam(module.parameters(), lr=1e-4) - - def get_batch_on_this_context_parallel_rank(batch): from megatron.core import parallel_state @@ -229,4 +219,4 @@ def get_packed_seq_params(batch): ) -__all__ = ["GPTModel", "GPTConfig", "gpt_data_step", "gpt_forward_step", "gpt_default_optimizer"] +__all__ = ["GPTModel", "GPTConfig", "gpt_data_step", "gpt_forward_step"] diff --git a/nemo/collections/multimodal/speech_llm/data/build_dataset.py b/nemo/collections/multimodal/speech_llm/data/build_dataset.py index b042386cea3b..698a01836169 100644 --- a/nemo/collections/multimodal/speech_llm/data/build_dataset.py +++ b/nemo/collections/multimodal/speech_llm/data/build_dataset.py @@ -207,6 +207,11 @@ def build_speechllm_dataloader(dataset, data_cfg, consumed_samples=0, is_predict ) return 
dataloader + pad_to_global_batch = not data_cfg.drop_last + if is_eval: + # don't pad to global batch if in eval mode, unless explicitly set by user (e.g., eval with DDP) + pad_to_global_batch = (not data_cfg.drop_last) and data_cfg.get("pad_samples_to_global_batch_size", False) + batch_sampler = MegatronPretrainingBatchSampler( total_samples=len(dataset), consumed_samples=consumed_samples, @@ -215,7 +220,7 @@ def build_speechllm_dataloader(dataset, data_cfg, consumed_samples=0, is_predict data_parallel_rank=parallel_state.get_data_parallel_rank(), data_parallel_size=parallel_state.get_data_parallel_world_size(), drop_last=data_cfg.drop_last, - pad_samples_to_global_batch_size=not data_cfg.drop_last, + pad_samples_to_global_batch_size=pad_to_global_batch, ) dataloader = torch.utils.data.DataLoader( diff --git a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py index 98d24802189e..92c56a4c20df 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py @@ -17,6 +17,7 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueAssistantDataProcessor'] @@ -31,6 +32,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg): data_dir: path to data directory tokenizer: tokenizer object """ + # deprecation warning + deprecated_warning("DialogueAssistantDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -69,16 +73,15 @@ def open_file(self, filename): @staticmethod def get_continuous_slots(slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids): - """ Extract continuous spans of slot_ids - To 
accomodate slots with distinct labels for B-label1 and I-label1, + To accomodate slots with distinct labels for B-label1 and I-label1, slot_id = self.bio_slot_ids_to_unified_slot_ids[slot_id] is called to map them both to label1 - + Args: Slot: list of int representing slot of each word token - For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 + For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 Corresponds to "please set an alarm clock for my next meeting with the team at three pm next friday" Except for the empty_slot_id (54 in this case), we hope to extract the continuous spans of tokens, each containing a start position and an exclusive end position @@ -124,7 +127,7 @@ def map_bio_format_slots_to_unified_slots(slots): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the assistant dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. 
@@ -177,7 +180,11 @@ def get_dialog_examples(self, dataset_split: str): "labels": {"service": intent.split('_')[0], "intent": intent, "slots": slot_to_words}, "label_positions": { "slots": { - slot: {"start": position[0], "exclusive_end": position[1], "slot": slot,} + slot: { + "start": position[0], + "exclusive_end": position[1], + "slot": slot, + } for slot, position in slot_to_start_and_exclusive_end.items() } }, diff --git a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py index 2a4b21c70535..c41c1f5e04ca 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py @@ -17,6 +17,7 @@ import random from nemo.collections.nlp.data.data_utils.data_preprocessing import DataProcessor +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueDataProcessor'] @@ -40,6 +41,9 @@ class DialogueDataProcessor(DataProcessor): """ def __init__(self): + # deprecation warning + deprecated_warning("DialogueDataProcessor") + raise NotImplementedError() def get_train_examples(self): @@ -58,8 +62,8 @@ def get_test_examples(self): def get_relevant_idxs(dataset_split, n_samples, dev_proportion): """ Obtain indexes for each dataset_split, when train and dev sets are not in separate files - - Args: + + Args: dataset_split: train, dev or test n_samples: total number of samples dev_proportion: value from 1 to 99 that represent proportion of data in dev set diff --git a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py index 5e58919b7652..56e99c4bcfe9 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py @@ -19,6 +19,7 @@ from 
nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueDesignDataProcessor'] @@ -34,6 +35,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueDesignDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -50,7 +54,7 @@ def open_csv(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set Test set contains the whole dataset (Dev + Train) as this dataset is small (~100) and primarily used in a zero shot setting diff --git a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py index 58814a8eee90..67d58ff5d21e 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py @@ -19,13 +19,13 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueMellonQADataProcessor'] class DialogueMellonQADataProcessor(DialogueDataProcessor): - """Data Processor for Mellon QA dialogues. 
- """ + """Data Processor for Mellon QA dialogues.""" def __init__(self, data_dir: str, tokenizer: object, cfg=None): """ @@ -35,6 +35,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueMellonQADataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -51,7 +54,7 @@ def open_csv(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the Mellon QA dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. @@ -82,7 +85,11 @@ def get_dialog_examples(self, dataset_split: str): input_example = { "utterance": utterance, "example_id": i, - "labels": {"response": answer, "fluent_response": well_formed_answer, "passage": passage,}, + "labels": { + "response": answer, + "fluent_response": well_formed_answer, + "passage": passage, + }, } example = DialogueInputExample(input_example) examples.append(example) diff --git a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py index 78f434c1d5dd..d09960a35d69 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py @@ -19,15 +19,16 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueMSMarcoDataProcessor'] class DialogueMSMarcoDataProcessor(DialogueDataProcessor): """Data Processor for MS Marco dialogues. 
(https://github.com/microsoft/MSMARCO-Question-Answering) - Please agree to the Terms of Use before downloading data at - https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz - https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz + Please agree to the Terms of Use before downloading data at + https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz + https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz """ def __init__(self, data_dir: str, tokenizer: object, cfg=None): @@ -39,6 +40,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): debug_mode: reduce number of samples to load in order to increase speed of processing cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueMSMarcoDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -55,7 +59,7 @@ def open_json(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the MS Marco dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. 
diff --git a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py index a78e1973e55f..1d37c26f1c45 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py @@ -28,6 +28,7 @@ from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample from nemo.collections.nlp.data.dialogue.sgd.schema import Schema from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning from nemo.utils.get_rank import is_global_rank_zero __all__ = ['DialogueSGDDataProcessor'] @@ -51,7 +52,7 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): # git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git ***Data format*** - SGD data comes with a JSON schema file and dialogue files for each dataset split. + SGD data comes with a JSON schema file and dialogue files for each dataset split. In the following we will show an example for a service entry in the schema file. * service_name @@ -70,7 +71,7 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): * result_slots (not used) - In the following we will show an example for a dialogue. + In the following we will show an example for a dialogue. 
* dialogue_id * services * turns @@ -87,14 +88,18 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): * state * active_intent * requeste_slots - * slot_values + * slot_values * speaker - [USER, SYSTEM] * utterance """ def __init__( - self, data_dir: str, dialogues_example_dir: str, tokenizer: object, cfg=None, + self, + data_dir: str, + dialogues_example_dir: str, + tokenizer: object, + cfg=None, ): """ Constructs DialogueSGDDataProcessor @@ -104,6 +109,9 @@ def __init__( tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueSGDDataProcessor") + self.data_dir = data_dir self.cfg = cfg @@ -213,7 +221,7 @@ def get_labels(self): def get_dialog_examples(self, dataset_split: str) -> List[object]: """ - Loads preprocessed dialogue examples from disk. + Loads preprocessed dialogue examples from disk. Args: dataset_split: dataset split Returns: @@ -260,7 +268,7 @@ def _generate_dialog_examples(self, dataset_split: str, schemas: object, subsamp Returns a list of `InputExample`s of the data splits' dialogues. Args: dataset_split: data split, can be "train", "dev", or "test". - schemas: schema for all services of all datasets + schemas: schema for all services of all datasets subsample: whether to balance postive and negative samples in the dataset Returns: examples: a list of `InputExample`s. 
@@ -447,9 +455,9 @@ def _create_examples_from_turn( "example_id_num": example_id_num, "utterance": user_utterance, "system_utterance": system_utterance, - "system_slots": {slot["slot"]: slot for slot in system_frame["slots"]} - if system_frame is not None - else None, + "system_slots": ( + {slot["slot"]: slot for slot in system_frame["slots"]} if system_frame is not None else None + ), "system_actions": system_frame["actions"] if system_frame is not None else None, "labels": { "service": service, @@ -464,9 +472,11 @@ def _create_examples_from_turn( for intent in schemas.get_service_schema(service).intents ], "slots": { - slot: schemas.get_service_schema(service).get_categorical_slot_values(slot) - if slot in categorical_slots - else [] + slot: ( + schemas.get_service_schema(service).get_categorical_slot_values(slot) + if slot in categorical_slots + else [] + ) for slot in all_possible_slots }, }, diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py index 0931fe383f94..33d46c308e81 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py @@ -21,12 +21,12 @@ from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueBERTDataset', 'DialogueIntentSlotInferenceDataset'] class DialogueBERTDataset(DialogueDataset): - """ Creates a dataset to use for the task of joint intent and slot classification with pretrained model. @@ -37,8 +37,7 @@ class DialogueBERTDataset(DialogueDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- """ + """Returns definitions of module output ports.""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), 'segment_ids': NeuralType(('B', 'T'), ChannelType()), @@ -57,6 +56,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: config container for dataset """ + # deprecation warning + deprecated_warning("DialogueBERTDataset") + self.cfg = cfg self.all_possible_labels = dialogues_processor.intents self.label_to_label_id = {self.all_possible_labels[i]: i for i in range(len(self.all_possible_labels))} @@ -183,7 +185,7 @@ def get_features( ignore_start_end=False, ): """ - Convert queries (utterance, intent label and slot labels) to BERT input format + Convert queries (utterance, intent label and slot labels) to BERT input format """ all_subtokens = [] @@ -297,7 +299,7 @@ class DialogueIntentSlotInferenceDataset(DialogueBERTDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: """ - Returns definitions of module output ports. + Returns definitions of module output ports. 
""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), @@ -308,6 +310,9 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: } def __init__(self, queries, max_seq_length, tokenizer, do_lower_case): + # deprecation warning + deprecated_warning("DialogueIntentSlotInferenceDataset") + if do_lower_case: queries = [query.lower() for query in queries] diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py index 1ac04a856a89..f89a5013c2ae 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py @@ -21,27 +21,31 @@ from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class DialogueGPTClassificationDataset(DialogueDataset): ''' Designed for classification tasks such as intent/domain classification as well as slot tagging - Dataset Class + Dataset Class 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) 2. This can allow the same model preprocessing for multiple datasources - 3. Users can configurate which labels to use for modelling + 3. Users can configurate which labels to use for modelling (e.g. 
intent classification, slot filling or both together etc) ''' def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor + """Constructor Args: dataset_split: dataset split dialogues_processor: Data generator for SGD dialogues tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueGPTClassificationDataset") + self.cfg = cfg if self.cfg.target_template == "with_slots" and self.cfg.eval_mode != "generation": @@ -229,19 +233,18 @@ def collate_fn(self, batch): return all_items def __getitem__(self, idx: int): - ''' State how the input and output samples look like This template can be changed - Training example: + Training example: e.g. service: restaurant e.g. service: restaurant e.g. \nintent: set alarm\nslots: (), () Generation example: - e.g. service: + e.g. service: ''' ex = self.features[idx].data diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py index 7de02d75c574..8ddbc2e3925e 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py @@ -18,12 +18,13 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning class DialogueGPTGenerationDataset(DialogueDataset): def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Designed for free form generation tasks such as Dialogue Response Generation + """Constructor + Designed for free form generation tasks such as Dialogue Response Generation Args: dataset_split: dataset split @@ -31,6 +32,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation 
warning + deprecated_warning("DialogueGPTGenerationDataset") + self.cfg = cfg self.input_label_type = self.cfg.input_field self.output_label_type = self.cfg.output_field @@ -80,7 +84,7 @@ def format_prompt(self, ex): ''' Formats training prompt based on self.input_field_type - Training example: + Training example: e.g. response: # input_label_type = response e.g. utterance: # input_label_type = utterance e.g. passage: utterance: # input_label_type = passage+utterance @@ -91,7 +95,6 @@ def format_prompt(self, ex): return input_sentence def __getitem__(self, idx: int): - ''' For each example, this function determines the format of input and output sequences based on user-specified conguration. This is controlled by model.dataset.input_field and model.dataset.output_field @@ -99,9 +102,9 @@ def __getitem__(self, idx: int): If model.dataset.input_field == response and model.dataset.output_field == fluent_response: Input = "response: " and output = "response: fluent_response: " (with loss calculated from only) If model.dataset.input_field == utterance and model.dataset.output_field == response: - Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) + Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) If model.dataset.input_field == passage+utterance and model.dataset.output_field == response: - Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) + Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) ''' ex = self.features[idx].data diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py index 8618f2f8c7b4..dc123ca0e3d7 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py +++ 
b/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py @@ -17,6 +17,7 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueNearestNeighbourDataset'] @@ -33,6 +34,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c dialogues_processor: Data generator for dialogues tokenizer: tokenizer to split text into sub-word tokens """ + # deprecation warning + deprecated_warning("DialogueNearestNeighbourDataset") + self.cfg = cfg self.tokenizer = tokenizer self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py index 78fda55edd2e..df522b74e861 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py @@ -16,12 +16,13 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning class DialogueS2SGenerationDataset(DialogueDataset): def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Designed for free form generation tasks such as Dialogue Response Generation + """Constructor + Designed for free form generation tasks such as Dialogue Response Generation Args: dataset_split: dataset split @@ -29,6 +30,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueS2SGenerationDataset") + self.cfg = cfg self.input_label_type = self.cfg.input_field self.output_label_type = self.cfg.output_field @@ -45,7 +49,7 @@ def 
__init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c @staticmethod def format_actions(prompt_template, actions): """ - Formats actions based on prompt_template + Formats actions based on prompt_template Args: prompt_template: determines whether acts, slot-names, slot-values are necessary in formatted actions @@ -118,7 +122,7 @@ def format_prompt(self, ex): ''' Formats training prompt based on self.input_field_type - Training example: + Training example: e.g. response: # input_label_type = response e.g. utterance: # input_label_type = utterance e.g. passage: utterance: # input_label_type = passage+utterance @@ -128,13 +132,12 @@ def format_prompt(self, ex): return input_sentence def __getitem__(self, idx: int): - ''' State how the input and output samples look like This template can be changed - Training example: + Training example: e.g. INPUT - "response: " OUTPUT - "" # input_label_type = response, output_label_type = fluent_response e.g. INPUT - "utterance: " OUTPUT - "" # input_label_type = utterance, output_label_type = response e.g. 
INPUT - "passage: utterance: " OUTPUT - "" # input_label_type = passage+utterance, output_label_type = response diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py index f2a0f58bcfac..c1308238bea1 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py @@ -23,6 +23,7 @@ from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueZeroShotIntentDataset'] @@ -36,8 +37,7 @@ class DialogueZeroShotIntentDataset(GLUEDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. 
- """ + """Returns definitions of module output ports.""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), 'segment_ids': NeuralType(('B', 'T'), ChannelType()), @@ -55,6 +55,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c num_classes: number of classes in the data (should be either 2 or 3, corresponding to labels ['entailment', 'not_entailment'] or ["contradiction", "entailment", "neutral"]) """ + # deprecation warning + deprecated_warning("DialogueZeroShotIntentDataset") + self.cfg = cfg self.tokenizer = tokenizer if self.cfg.num_classes not in [2, 3]: @@ -69,9 +72,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c 'eos_token': tokenizer.eos_token, 'pad_token': tokenizer.pad_token, 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token - if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() - else None, + 'sep_token_extra': ( + tokenizer.eos_token if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() else None + ), } self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) @@ -128,9 +131,9 @@ def convert_examples_to_features( * True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] The `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - + The convention in BERT is: - + a. For sequence pairs: * tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] * type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 @@ -148,9 +151,9 @@ def convert_examples_to_features( For classification tasks, the first vector (corresponding to [CLS]) is used as as the "sentence vector". Note that this only makes sense because the entire model is fine-tuned. - + The convention for NMT is: - + a. For sequence pairs: * tokens: is this jack ##ville ? no it is not . 
* type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 diff --git a/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py index 5d985466ff6c..bbd14f47a651 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py @@ -17,6 +17,7 @@ from nemo.collections.nlp.modules.common import VirtualPromptSource from nemo.core import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['BasePromptLearningDataset'] @@ -41,6 +42,9 @@ def __init__( add_eos: bool = True, for_train: bool = True, ): + # deprecation warning + deprecated_warning("BasePromptLearningDataset") + self.tokenizer = tokenizer self.virtual_prompt_source = virtual_prompt_source self.task_templates = task_templates @@ -72,7 +76,7 @@ def __init__( raise ValueError("Datasets must be a list of dicts or a list of filepath strings") def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): - """ Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers """ + """Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers""" total_inserted_tokens = 0 for idx in range(len(virtual_token_splits)): @@ -85,7 +89,7 @@ def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits return input_example def _truncate_input(self, truncation_field, input_ids, taskname, doc, total_virtual_tokens=0): - """ Try to truncate input text to fit into the max sequence length """ + """Try to truncate input text to fit into the max sequence length""" logging.info( f"Input greater than max sequence length. 
Attempting to truncate: '{truncation_field}' in task: '{taskname}'" ) @@ -115,7 +119,7 @@ def _truncate_input(self, truncation_field, input_ids, taskname, doc, total_virt return input_ids def _add_leading_space(self, taskname, field_name, field_text): - """ Add leading space to text if there is a space before it in the template """ + """Add leading space to text if there is a space before it in the template""" prompt_template = self.task_templates[taskname]["prompt_template"] field_text_start = prompt_template.find("{" + field_name + "}") if field_text_start != 0 and prompt_template[field_text_start - 1] == " ": @@ -187,11 +191,11 @@ def pad_taskname_ids(self, taskname_ids): def find_subsequence_location(sequence, subsequence): - """ Finds the start and end index of the first occurance - of a given subsequence within a larger list. Returns - the two indices corresponding to the postition of - the first and last token of the subseqeunce. - Assumes subsequence is known to be in sequence. + """Finds the start and end index of the first occurance + of a given subsequence within a larger list. Returns + the two indices corresponding to the postition of + the first and last token of the subseqeunce. + Assumes subsequence is known to be in sequence. """ assert len(sequence) >= len(subsequence), "subsequence too long" diff --git a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py index 6818f99d0e4f..4a8b989a7b6d 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py @@ -91,8 +91,7 @@ def __len__(self): return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 @abc.abstractmethod - def __iter__(self): - ... + def __iter__(self): ... 
class MegatronPretrainingSampler(BaseMegatronSampler): @@ -107,7 +106,7 @@ def __iter__(self): indices = range(self.consumed_samples, self.total_samples) if (not self.drop_last) and self.pad_samples_to_global_batch_size: pad_samples_num = -len(indices) % self.global_batch_size - pad_indices = range(-1, -pad_samples_num - 1, -1) + pad_indices = [None] * pad_samples_num indices = chain(indices, pad_indices) for idx in indices: diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py index 4b1b4f61d439..11795bd150f1 100755 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py @@ -23,6 +23,7 @@ from nemo.collections.nlp.modules.common.megatron.utils import build_position_ids from nemo.core import Dataset from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning __all__ = ['GPTPromptLearningDataset'] @@ -30,7 +31,7 @@ class GPTPromptLearningDataset(Dataset): """ The dataset class for prompt-tuning or p-tuning pretrained GPT models. - + Args: data (list[strings], list[dicts]): (1) paths to .jsonl or .json files, (2) dict objects corresponding to each input example tokenizer (tokenizer): Tokenizer from frozen language model @@ -39,7 +40,7 @@ class GPTPromptLearningDataset(Dataset): pseudo_tokens (list[strings]): A list of virtual prompt token placeholders e.g [, , ...] up to max num virtual tokens pad_token_id (int): ID of pad token from tokenizer max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. 
+ min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. add_bos (bool): Whether to add a beginning of sentence token to each data example add_eos (bool): Whether to add an end of sentence token to each data example for_train (bool): Whether you're creating a dataset for training or inference @@ -63,6 +64,9 @@ def __init__( cache_data_path: str = None, # the cache file load_cache: bool = True, # whether to load from the cache if it is available ): + # deprecation warning + deprecated_warning("GPTPromptLearningDataset") + self.tokenizer = tokenizer self.virtual_prompt_source = virtual_prompt_source self.task_templates = task_templates @@ -112,9 +116,9 @@ def __init__( def load_data(self, dataset): """ Loads a dataset by filling in the task templates specified in the config file - with the information from each training/inference example. Converts all input - text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in - the task templates with the actual virtual prompt token ids. + with the information from each training/inference example. Converts all input + text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in + the task templates with the actual virtual prompt token ids. 
params: dataset: A list of json objects or a dictionary objects each @@ -241,7 +245,7 @@ def _input_sanity_checks( assert prompt_template[placeholder_start:] == answer_placeholder, "Answer field must be at prompt end" def _insert_text_in_template(self, input_example, prompt_template_fields, doc): - """ Format the input example according to the template """ + """Format the input example according to the template""" for field in prompt_template_fields: if field in doc.keys(): field_text = doc[field] @@ -255,7 +259,7 @@ def _insert_text_in_template(self, input_example, prompt_template_fields, doc): return input_example.strip(" ") def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): - """ Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers """ + """Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers""" total_inserted_tokens = 0 for idx in range(len(virtual_token_splits)): @@ -270,7 +274,7 @@ def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits def _truncate_input( self, truncation_field, input_ids, taskname, doc, prompt_template, prompt_template_fields, virtual_token_splits ): - """ Try to truncate input text to fit into the max sequence length """ + """Try to truncate input text to fit into the max sequence length""" logging.info( f"Input greater than max sequence length. Attempting to truncate: '{truncation_field}' in task: '{taskname}'" ) @@ -297,8 +301,8 @@ def _truncate_input( return input_ids def _find_answer_start(self, taskname, input_ids, answer_field, doc): - """ Find the token ids corresponding to the answer start, for loss masking purposes. - Assumes the answer is always at the end of the prompt. + """Find the token ids corresponding to the answer start, for loss masking purposes. + Assumes the answer is always at the end of the prompt. 
""" answer_text = doc[answer_field] answer_text = self._add_leading_space(taskname, answer_field, answer_text) @@ -313,7 +317,7 @@ def _find_answer_start(self, taskname, input_ids, answer_field, doc): return answer_start_idx def _add_leading_space(self, taskname, field_name, field_text): - """ Add leading space to text if there is a space before it in the template """ + """Add leading space to text if there is a space before it in the template""" prompt_template = self.task_templates[taskname]["prompt_template"] field_text_start = prompt_template.find("{" + field_name + "}") if field_text_start != 0 and prompt_template[field_text_start - 1] == " ": @@ -331,7 +335,7 @@ def _ceil_to_nearest(self, n, m): return (n + m - 1) // m * m def collate_fn(self, batch, tp_workers=0): - """ Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch """ + """Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch""" taskname_ids, input_ids, answer_starts = zip(*batch) # Pad taskname_ids to be the same length for the prompt encoder @@ -380,7 +384,7 @@ def collate_fn(self, batch, tp_workers=0): return input_ids, labels, loss_mask, position_ids, attention_mask, taskname_ids def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts): - """ Pad input_ids in batch to max batch length while building loss mask """ + """Pad input_ids in batch to max batch length while building loss mask""" batch_loss_masks = [] padded_input_ids = [] for ids, answer_start_idx in zip(input_ids, answer_starts): @@ -410,7 +414,7 @@ def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts): def inference_collate_fn(self, batch): """ - Used for loading inference data. + Used for loading inference data. 
""" task_id_nums, input_ids, answer_starts = zip(*batch) input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids]) diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py index 4070098b5e67..87174b69ffc2 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py @@ -22,10 +22,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_bert_input_example import BERTQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BERTQADataset(QADataset): - """ Creates a Dataset for BERT architecture based Exractive QA """ + """Creates a Dataset for BERT architecture based Exractive QA""" def __init__( self, @@ -41,6 +42,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("BERTQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -92,7 +96,7 @@ def __init__( self.features[i] = BERTQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -110,7 +114,7 @@ def _set_cached_features_filename(self): ) def _convert_examples_to_features(self): - """ Converts loaded examples to features """ + """Converts loaded examples to features""" logging.info(f"Preprocessing data into features.") @@ -161,7 +165,7 @@ def _convert_examples_to_features(self): example.doc_tokens = doc_tokens # the text to tokens step is the slowest step - for (i, token) in 
enumerate(doc_tokens): + for i, token in enumerate(doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) if token not in text_to_tokens_dict: text_to_tokens_dict[token] = self.tokenizer.text_to_tokens(token) @@ -199,7 +203,7 @@ def _convert_examples_to_features(self): # make compatible for hashing doc_spans = tuple(doc_spans) - for (doc_span_index, doc_span) in enumerate(doc_spans): + for doc_span_index, doc_span in enumerate(doc_spans): tokens = [self.tokenizer.cls_token] + query_tokens + [self.tokenizer.sep_token] segment_ids = [0 for i in range(len(tokens))] diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py index 783b2dd33f31..553f5984952c 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py @@ -28,14 +28,24 @@ ) from nemo.core.classes import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class QADataset(Dataset): - ''' Abstract base class for QA Datasets with common utility methods ''' + '''Abstract base class for QA Datasets with common utility methods''' def __init__( - self, data_file: str, processor: object, tokenizer: object, mode: str, num_samples: int, **kwargs, + self, + data_file: str, + processor: object, + tokenizer: object, + mode: str, + num_samples: int, + **kwargs, ): + # deprecation warning + deprecated_warning("QADataset") + self.mode = mode self.data_file = data_file self.processor = processor @@ -100,7 +110,7 @@ def get_best_span_index(doc_spans, position): best_score = None best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): + for span_index, doc_span in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue @@ -150,7 +160,7 @@ def get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride): all_doc_tokens: list of all 
tokens in document max_tokens_for_doc: maximum number of tokens in each doc span doc_stride: stride size which sliding window moves with - + Returns: doc_spans: all possible doc_spans from document """ @@ -179,7 +189,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ doc_span tok_start_position: start position of answer in document tok_end_position: end position of answer in document - + Returns: average distance of doc_span to answer """ @@ -193,7 +203,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ @staticmethod def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode): """ - Filters out doc_spans, which might not be relevant to answering question, + Filters out doc_spans, which might not be relevant to answering question, which can be helpful when document is extremely long leading to many doc_spans with no answers Args: @@ -204,7 +214,7 @@ def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode all: do not filter only_positive: only keep doc_spans containing the answer limited_negative: only keep 10 doc_spans that are nearest to answer - + Returns: doc_spans: doc_spans after filtering """ @@ -282,9 +292,13 @@ def get_doc_tokens_and_offset_from_context_id( @staticmethod def improve_answer_span( - doc_tokens: List[str], input_start: int, input_end: int, tokenizer: object, orig_answer_text: str, + doc_tokens: List[str], + input_start: int, + input_end: int, + tokenizer: object, + orig_answer_text: str, ): - """ Returns tokenized answer spans that better match the annotated answer """ + """Returns tokenized answer spans that better match the annotated answer""" tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text)) diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py index d6484b33e202..1eeb312a62a9 100644 --- 
a/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py @@ -24,10 +24,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_gpt_input_example import GPTQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class GPTQADataset(QADataset): - """ Creates a Dataset for GPT architecture based Generative QA """ + """Creates a Dataset for GPT architecture based Generative QA""" def __init__( self, @@ -44,6 +45,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("GPTQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -76,7 +80,7 @@ def __init__( self.features[i] = GPTQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -120,7 +124,11 @@ def _convert_examples_to_features(self): formatted_query, query_tokens_length = self._prep_query(query_prefix, example) formatted_answer, answer_tokens_length = self._prep_answer(example) context_tokens, context_spans = self._prep_context( - example, query_tokens_length, answer_tokens_length, context_prefix_tokens, answer_prefix_tokens, + example, + query_tokens_length, + answer_tokens_length, + context_prefix_tokens, + answer_prefix_tokens, ) unique_id = self._encode_all_context_spans( @@ -170,7 +178,12 @@ def _prep_answer(self, example): return self._get_truncated_sentence_and_len(target, self.max_answer_length) def _prep_context( - self, example, query_tokens_length, answer_tokens_length, context_prefix_tokens, 
answer_prefix_tokens, + self, + example, + query_tokens_length, + answer_tokens_length, + context_prefix_tokens, + answer_prefix_tokens, ): """ Calculates the maximum possible length for a given context given a question diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py index 1f9a8ef615a9..c65c8a43c440 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py @@ -23,10 +23,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_s2s_input_example import S2SQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class S2SQADataset(QADataset): - """ Creates a Dataset for T5/BART architecture based Generative QA """ + """Creates a Dataset for T5/BART architecture based Generative QA""" def __init__( self, @@ -43,6 +44,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("S2SQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -75,7 +79,7 @@ def __init__( self.features[i] = S2SQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -117,7 +121,12 @@ def _convert_examples_to_features(self): context_tokens, context_spans = self._prep_context(example, query_tokens, context_prefix_tokens) unique_id = self._encode_all_context_spans( - unique_id, context_spans, context_tokens, formatted_query, example, example_index, + unique_id, + context_spans, + 
context_tokens, + formatted_query, + example, + example_index, ) # delete self.examples during training mode to save memory @@ -155,7 +164,13 @@ def _prep_context(self, example, query_tokens, context_prefix_tokens): return context_tokens, context_spans def _encode_all_context_spans( - self, unique_id, context_spans, context_tokens, formatted_query, example, example_index, + self, + unique_id, + context_spans, + context_tokens, + formatted_query, + example, + example_index, ): """ Fromats all spans extracted from a single context as: @@ -173,7 +188,11 @@ def _encode_all_context_spans( # encode input encoded_input_dict = self.tokenizer.tokenizer( - source, truncation=True, max_length=self.max_seq_length, padding="max_length", return_tensors="pt", + source, + truncation=True, + max_length=self.max_seq_length, + padding="max_length", + return_tensors="pt", ) input_ids = torch.squeeze(encoded_input_dict["input_ids"]) input_attn_mask = torch.squeeze(encoded_input_dict["attention_mask"]) @@ -223,7 +242,11 @@ def _encode_answer(self, example, context_span_text): target = example.answer_text encoded_output_dict = self.tokenizer.tokenizer( - target, truncation=True, max_length=self.max_answer_length, padding="max_length", return_tensors="pt", + target, + truncation=True, + max_length=self.max_answer_length, + padding="max_length", + return_tensors="pt", ) labels = torch.squeeze(encoded_output_dict["input_ids"]) labels[labels == self.tokenizer.tokenizer.pad_token_id] = -100 diff --git a/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py b/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py index ee1a0957dbbb..2abe9b7c0aaa 100644 --- a/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py +++ b/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py @@ -46,6 +46,7 @@ ) from nemo.core.classes import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['SquadDataset', 
'InputFeatures', '_check_is_max_context'] @@ -114,7 +115,7 @@ def get_best_span_index(doc_spans, position): """ best_score = None best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): + for span_index, doc_span in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue @@ -165,6 +166,9 @@ def __init__( mode: str, use_cache: bool, ): + # deprecation warning + deprecated_warning("SquadDataset") + self.tokenizer = tokenizer self.version_2_with_negative = version_2_with_negative self.processor = SquadProcessor(data_file=data_file, mode=mode) @@ -337,7 +341,7 @@ def get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride): all_doc_tokens: list of all tokens in document max_tokens_for_doc: maximum number of tokens in each doc span doc_stride: stride size which sliding window moves with - + Returns: doc_spans: all possible doc_spans from document """ @@ -375,7 +379,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ doc_span tok_start_position: start position of answer in document tok_end_position: end position of answer in document - + Returns: average distance of doc_span to answer """ @@ -387,7 +391,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ @staticmethod def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode): """ - Filters out doc_spans, which might not be relevant to answering question, + Filters out doc_spans, which might not be relevant to answering question, which can be helpful when document is extremely long leading to many doc_spans with no answers Args: @@ -398,7 +402,7 @@ def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode all: do not filter only_positive: only keep doc_spans containing the answer limited_negative: only keep 10 doc_spans that are nearest to answer - + Returns: doc_spans: doc_spans after filtering """ @@ -481,7 +485,7 @@ def 
convert_examples_to_features( if self.mode != TRAINING_MODE: example.doc_tokens = doc_tokens # the text to tokens step is the slowest step - for (i, token) in enumerate(doc_tokens): + for i, token in enumerate(doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) if token not in text_to_tokens_dict: text_to_tokens_dict[token] = tokenizer.text_to_tokens(token) @@ -521,7 +525,7 @@ def convert_examples_to_features( # make compatible for hashing doc_spans = tuple(doc_spans) - for (doc_span_index, doc_span) in enumerate(doc_spans): + for doc_span_index, doc_span in enumerate(doc_spans): tokens = [tokenizer.cls_token] + query_tokens + [tokenizer.sep_token] segment_ids = [0 for i in range(len(tokens))] @@ -681,7 +685,7 @@ def get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(self.examples): + for example_index, example in enumerate(self.examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -706,7 +710,7 @@ def get_predictions( null_start_logit = 0 # end logit at the slice with min null score null_end_logit = 0 - for (feature_index, feature) in enumerate(features): + for feature_index, feature in enumerate(features): pos = unique_id_to_pos[feature.unique_id] start_indexes = get_best_indexes(start_logits[pos], n_best_size) end_indexes = get_best_indexes(end_logits[pos], n_best_size) @@ -825,7 +829,7 @@ def get_predictions( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["question"] = example.question_text output["text"] = entry.text diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py index 803d0eaf8aed..c98abb300c64 100644 --- 
a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py +++ b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py @@ -20,6 +20,8 @@ from transformers import PreTrainedTokenizerBase +from nemo.utils.decorators import deprecated_warning + """Build BERT Examples from asr hypothesis, customization candidates, target labels, span info. """ @@ -52,7 +54,7 @@ def __init__( input_ids: indices of single characters (treated as subwords) input_mask: list of bools with 0s in place of input_ids to be masked segment_ids: list of ints from 0 to 10 to denote the text segment type ( - 0 - for tokens of ASR hypothesis, + 0 - for tokens of ASR hypothesis, 1 - for tokens of the first candidate ... 10 - for tokens of the tenth candidate @@ -60,7 +62,7 @@ def __init__( input_ids_for_subwords: indices of real subwords (as tokenized by bert tokenizer) input_mask_for_subwords: list of bools with 0s in place of input_ids_for_subwords to be masked segment_ids_for_subwords: same as segment_ids but for input_ids_for_subwords - character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) + character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) fragment_indices: list of tuples (start_position, end_position, candidate_id), end is exclusive, candidate_id can be -1 if not set labels_mask: bool tensor with 0s in place of label tokens to be masked labels: indices of semiotic classes which should be predicted from each of the @@ -68,6 +70,9 @@ def __init__( spans: list of tuples (class_id, start_position, end_position), end is exclusive, class is always 1(CUSTOM) default_label: The default label """ + # deprecation warning + deprecated_warning("BertExample") + input_len = len(input_ids) if not ( input_len == len(input_mask) @@ -123,6 +128,9 @@ def __init__( tokenizer: Tokenizer object. 
max_seq_length: Maximum sequence length. """ + # deprecation warning + deprecated_warning("BertExampleBuilder") + self._label_map = label_map self._semiotic_classes = semiotic_classes self._tokenizer = tokenizer @@ -183,9 +191,15 @@ def build_bert_example( tags[start:end] = [t for i in range(end - start)] # get input features for characters - (input_ids, input_mask, segment_ids, labels_mask, labels, _, _,) = self._get_input_features( - hyp=hyp, ref=ref, tags=tags - ) + ( + input_ids, + input_mask, + segment_ids, + labels_mask, + labels, + _, + _, + ) = self._get_input_features(hyp=hyp, ref=ref, tags=tags) # get input features for words hyp_with_words = hyp.replace(" ", "").replace("_", " ") @@ -243,11 +257,11 @@ def build_bert_example( return example def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: - """ Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of starting and ending(exclusive) tokens in input_ids of BertExample - - Example: - span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] - result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] + """Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of starting and ending(exclusive) tokens in input_ids of BertExample + + Example: + span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] + result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] """ result_spans = [] @@ -267,26 +281,26 @@ def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: def _get_fragment_indices( self, hyp: str, targets: List[int], span_info_parts: List[str] ) -> Tuple[List[Tuple[int, int, int]]]: - """ Build fragment indices for real candidates. - This is used only at inference. - After external candidate retrieval we know approximately, where the candidate is located in the text (from the positions of matched n-grams). 
- In this function we - 1) adjust start/end positions to match word borders (possibly in multiple ways). - 2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment). - - Args: - hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore). - targets: list of candidate ids (only for real candidates, not dummy) - span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text. - Returns: - List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id. - Note that returned fragments can be unsorted and can overlap, it's ok. - Example: - hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - targets: [1 2 3 4 6 7 9] - span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this functuion. - fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)] - """ + """Build fragment indices for real candidates. + This is used only at inference. + After external candidate retrieval we know approximately, where the candidate is located in the text (from the positions of matched n-grams). + In this function we + 1) adjust start/end positions to match word borders (possibly in multiple ways). + 2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment). + + Args: + hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore). + targets: list of candidate ids (only for real candidates, not dummy) + span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text. 
+ Returns: + List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id. + Note that returned fragments can be unsorted and can overlap, it's ok. + Example: + hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" + targets: [1 2 3 4 6 7 9] + span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this functuion. + fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)] + """ fragment_indices = [] @@ -337,18 +351,18 @@ def _get_fragment_indices( return fragment_indices def _map_characters_to_subwords(self, input_ids: List[int], input_ids_for_subwords: List[int]) -> List[int]: - """ Maps each single character to the position of its corresponding subword. - - Args: - input_ids: List of character token ids. - input_ids_for_subwords: List of subword token ids. - Returns: - List of subword positions in input_ids_for_subwords. Its length is equal to len(input_ids) - - Example: - input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102] - input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102] - result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... , 45, 46, 46, 46, 46, 46, 47] + """Maps each single character to the position of its corresponding subword. + + Args: + input_ids: List of character token ids. + input_ids_for_subwords: List of subword token ids. + Returns: + List of subword positions in input_ids_for_subwords. 
Its length is equal to len(input_ids) + + Example: + input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102] + input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102] + result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... , 45, 46, 46, 46, 46, 46, 47] """ character_pos_to_subword_pos = [0 for _ in input_ids] @@ -453,7 +467,7 @@ def _get_input_features( ref: "didier saumon;astronomie;tristan guillot;tristesse;monade;christian;astronomer;solomon;dididididi;mercy" tags: None (not used for word-based case) - resulting token sequence: + resulting token sequence: '[CLS]', 'astronomers', 'did', '##ie', 'so', '##mon', 'and', 'tri', '##sti', '##an', 'g', '##llo', '[SEP]', 'did', '##ier', 'sa', '##um', '##on', '[SEP]', 'astro', '##no', '##mie', '[SEP]', 'tristan', 'gui', '##llo', '##t', '[SEP]', ..., '[SEP]', 'mercy', '[SEP]'] """ @@ -542,9 +556,9 @@ def read_input_file( infer: If true, input examples do not contain target info. Returns: - examples: List of converted examples (BertExample). + examples: List of converted examples (BertExample). 
or - (examples, hyps_refs): If infer==true, returns h + (examples, hyps_refs): If infer==true, returns h """ if not path.exists(input_filename): diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py index 7737bfa67f00..07ca790866c7 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py @@ -45,14 +45,19 @@ from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueGPTClassificationModel'] class DialogueGPTClassificationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueGPTClassificationModel") self.cfg = cfg self.eval_mode = cfg.dataset.eval_mode @@ -101,14 +106,14 @@ def __init__( def setup_optimizer_param_groups(self): """ - ModelPT override for prompt learning. - Optimizer will get self._optimizer_param_groups. + ModelPT override for prompt learning. + Optimizer will get self._optimizer_param_groups. Makes two optimizer param groups, one for the frozen model params - and one for the prompt-table/prompt-encoder params. The learning + and one for the prompt-table/prompt-encoder params. The learning rate for the frozen model's params will always be zero effectively freezing the model's params but still allowing for the needed gradients - to be passed around in pipeline parallel models. The prompt-encoder - and/or prompt table will use the learning rate set by the user. + to be passed around in pipeline parallel models. The prompt-encoder + and/or prompt table will use the learning rate set by the user. 
""" if not self.prompt_learning: super().setup_optimizer_param_groups() @@ -328,7 +333,10 @@ def forward(self, input_ids, attention_mask, labels, inference=True): len(self.language_model.pseudo_token_ids) if hasattr(self.language_model, 'pseudo_token_ids') else 0 ) position_ids = torch.arange( - start=0, end=num_prompt_tokens + input_ids.size(1), dtype=torch.long, device=input_ids.device, + start=0, + end=num_prompt_tokens + input_ids.size(1), + dtype=torch.long, + device=input_ids.device, ) prompt_ids = self.get_virtual_prompt_ids_for_megatron_gpt(input_ids) @@ -708,7 +716,9 @@ def prepare_data(self): ) elif self._cfg.dataset.task == 'design': self.dialogues_processor = DialogueDesignDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset, + data_dir=self._cfg.dataset.data_dir, + tokenizer=self.tokenizer, + cfg=self._cfg.dataset, ) else: raise ValueError("Only sgd, assistant, zero_shot, design supported for Dialogue GPT Classification Model") diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py index 602c15a50c76..116605b65d52 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py @@ -35,6 +35,7 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueGPTGenerationModel'] @@ -43,8 +44,12 @@ class DialogueGPTGenerationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueGPTGenerationModel") self.cfg = cfg self.data_prepared = False @@ -108,7 +113,10 @@ def eval_epoch_end(self, outputs, mode='val'): ) 
DialogueGenerationMetrics.save_predictions( - filename, generated_field, ground_truth_field, inputs, + filename, + generated_field, + ground_truth_field, + inputs, ) label_acc = np.mean([int(generated_field[i] == ground_truth_field[i]) for i in range(len(generated_field))]) @@ -155,7 +163,10 @@ def forward(self, input_ids, attention_mask, labels, inference=True): ) position_ids = torch.arange( - start=0, end=num_prompt_tokens + input_ids.size(1), dtype=torch.long, device=input_ids.device, + start=0, + end=num_prompt_tokens + input_ids.size(1), + dtype=torch.long, + device=input_ids.device, ) position_ids = position_ids.unsqueeze(0).repeat(input_ids.size(0), 1) @@ -228,7 +239,7 @@ def setup(self, stage=None): def prepare_megatron_generation(self, labels, input_ids, template_length): """ - # adapted from MegatronGPTModel._bucketize_gpt_inference + # adapted from MegatronGPTModel._bucketize_gpt_inference """ batch_size = labels.size(0) prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_learning else None diff --git a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py index 455b0fa17a85..29e2627fa038 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py @@ -34,14 +34,18 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueNearestNeighbourModel'] class DialogueNearestNeighbourModel(NLPModel): - """Dialogue Nearest Neighbour Model identifies the intent of an utterance using the cosine similarity between sentence embeddings of the utterance and various label descriptions """ + """Dialogue Nearest Neighbour Model identifies the intent of an utterance using the cosine similarity between 
sentence embeddings of the utterance and various label descriptions""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("DialogueNearestNeighbourModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer) if self.cfg.library == "huggingface": @@ -155,7 +159,10 @@ def on_validation_epoch_end(self): filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") DialogueGenerationMetrics.save_predictions( - filename, predicted_labels, ground_truth_labels, decoded_inputs, + filename, + predicted_labels, + ground_truth_labels, + decoded_inputs, ) label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} diff --git a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py index 9655fbea2722..73f09f62b1d5 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py @@ -32,6 +32,7 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator @@ -46,8 +47,12 @@ class DialogueS2SGenerationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueS2SGenerationModel") self.cfg = cfg self.data_prepared = False @@ -120,7 +125,10 @@ def eval_epoch_end(self, outputs, mode='val'): ) DialogueGenerationMetrics.save_predictions( - filename, generated_field, ground_truth_field, inputs, + filename, + generated_field, + ground_truth_field, + inputs, ) label_acc = np.mean([int(generated_field[i] == 
ground_truth_field[i]) for i in range(len(generated_field))]) @@ -172,7 +180,7 @@ def forward(self, input_ids, attention_masks, labels): def prepare_megatron_generation(self, labels, input_ids, template_length): """ - # adapted from MegatronGPTModel._bucketize_gpt_inference + # adapted from MegatronGPTModel._bucketize_gpt_inference """ batch_size = labels.size(0) prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_tags else None diff --git a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py index 0e007a7bcdd1..5298c060df08 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py @@ -36,6 +36,7 @@ from nemo.collections.nlp.models import TextClassificationModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueZeroShotIntentModel'] @@ -44,6 +45,9 @@ class DialogueZeroShotIntentModel(TextClassificationModel): """TextClassificationModel to be trained on two- or three-class textual entailment data, to be used for zero shot intent recognition.""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("DialogueZeroShotIntentModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer) @@ -275,7 +279,10 @@ def on_validation_epoch_end(self, split="val"): filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") DialogueGenerationMetrics.save_predictions( - filename, predicted_labels, ground_truth_labels, utterances, + filename, + predicted_labels, + ground_truth_labels, + utterances, ) label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} @@ -316,7 +323,6 @@ def predict( entailment_idx=1, contradiction_idx=0, ) -> 
List[Dict]: - """ Given a list of queries and a list of candidate labels, return a ranked list of labels and scores for each query. diff --git a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py index a34afa64674d..777d468084e2 100644 --- a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py @@ -35,12 +35,15 @@ from nemo.core.classes import typecheck from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class IntentSlotClassificationModel(NLPModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """ Initializes BERT Joint Intent and Slot model. - """ + """Initializes BERT Joint Intent and Slot model.""" + # deprecation warning + deprecated_warning("IntentSlotClassificationModel") + self.max_seq_length = cfg.dataset.max_seq_length self.cfg = cfg # Check the presence of data_dir. @@ -78,7 +81,7 @@ def _set_defaults_data_desc(self, cfg): OmegaConf.set_struct(cfg, True) def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): - """ Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc. """ + """Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc.""" # Save data from data desc to config - so it can be reused later, e.g. in inference. 
data_desc = IntentSlotDataDesc(data_dir=data_dir, modes=[train_ds.prefix, validation_ds.prefix]) OmegaConf.set_struct(cfg, False) @@ -112,7 +115,7 @@ def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): OmegaConf.set_struct(cfg, True) def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: - """ Saves label ids map to a file """ + """Saves label ids map to a file""" with open(filename, 'w') as out: labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) out.write('\n'.join(labels)) @@ -120,7 +123,7 @@ def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: logging.info(f'Labels mapping saved to : {out.name}') def _reconfigure_classifier(self): - """ Method reconfigures the classifier depending on the settings of model cfg.data_desc """ + """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" self.classifier = SequenceTokenClassifier( hidden_size=self.hidden_size, @@ -310,7 +313,7 @@ def get_utterance_tokens(self, token_ids, token_masks): Args: token_ids: IntTensor of size (max_seq_len, ) token_masks: BoolTensor of size (max_seq_len, ) - + Returns token_list: List of Str (list of tokens with len <= max_seq_len) """ diff --git a/nemo/collections/nlp/models/dialogue/sgdqa_model.py b/nemo/collections/nlp/models/dialogue/sgdqa_model.py index b350fd01fa09..3b30dfccd9ce 100644 --- a/nemo/collections/nlp/models/dialogue/sgdqa_model.py +++ b/nemo/collections/nlp/models/dialogue/sgdqa_model.py @@ -35,6 +35,7 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['SGDQAModel'] @@ -44,7 +45,7 @@ class SGDQAModel(NLPModel): Dialogue State Tracking Model SGD-QA (https://arxiv.org/abs/2105.08049) The SGD-QA model is a fast multi-pass schema-guided state-tracking model, that is trained on the Google 
schema-guided state tracking dataset (https://arxiv.org/abs/1909.05855). - The model takes dialogue as input and outputs the dialogue state, which includes slot-value pairs. + The model takes dialogue as input and outputs the dialogue state, which includes slot-value pairs. The model consists of two components: a neural natural language understanding model (NLU), and a rule-based state tracker. The NLU takes in a dialogue turn and different schema (entity) information options and outputs their match score. The state tracker takes the highest rated entities and composes the dialogue state across turns. @@ -55,6 +56,9 @@ def output_module(self): return self.decoder def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("SGDQAModel") + self.data_prepared = False super().__init__(cfg=cfg, trainer=trainer) self.encoder = SGDEncoder(hidden_size=self.bert_model.config.hidden_size, dropout=self._cfg.encoder.dropout) @@ -146,7 +150,7 @@ def validation_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_ Called at every validation step to aggregate and postprocess outputs on each GPU Args: batch: input batch at validation step - batch_idx: batch index + batch_idx: batch index dataloader_idx: dataloader index """ loss, tensors = self.eval_step_helper(batch=batch) @@ -163,7 +167,7 @@ def test_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_idx: i Called at every test step to aggregate and postprocess outputs on each GPU Args: batch: input batch at test step - batch_idx: batch index + batch_idx: batch index dataloader_idx: dataloader index """ loss, tensors = self.eval_step_helper(batch=batch) @@ -318,8 +322,8 @@ def eval_step_helper(self, batch: List[torch.Tensor]): torch.zeros(total_scores.size(), device=total_scores.get_device(), dtype=total_scores.dtype), total_scores, ) - max_span_index = torch.argmax(total_scores.view(-1, max_num_tokens ** 2), axis=-1) - max_span_p = 
torch.max(total_scores.view(-1, max_num_tokens ** 2), axis=-1)[0] + max_span_index = torch.argmax(total_scores.view(-1, max_num_tokens**2), axis=-1) + max_span_p = torch.max(total_scores.view(-1, max_num_tokens**2), axis=-1)[0] span_start_index = torch.floor_divide(max_span_index, max_num_tokens) span_end_index = torch.fmod(max_span_index, max_num_tokens) @@ -415,7 +419,7 @@ def format_turn_id(ex_id_num): def combine_predictions_in_example(predictions: dict, batch_size: int): ''' - Combines predicted values to a single example. + Combines predicted values to a single example. Args: predictions: predictions ordered by keys then batch batch_size: batch size diff --git a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py index f3ef3ccb87f9..4afae81e3893 100644 --- a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py +++ b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py @@ -26,6 +26,7 @@ from nemo.core.classes.exportable import Exportable from nemo.core.neural_types import LogitsType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['EntityLinkingModel'] @@ -44,6 +45,9 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: def __init__(self, cfg: DictConfig, trainer: Trainer = None): """Initializes the SAP-BERT model for entity linking.""" + # deprecation warning + deprecated_warning("EntityLinkingModel") + # tokenizer needed before super().__init__() so dataset and loader can process data self._setup_tokenizer(cfg.tokenizer) @@ -123,7 +127,7 @@ def on_validation_epoch_end(self): Args: outputs: list of individual outputs of each validation step. 
Returns: - + """ if self.validation_step_outputs: avg_loss = torch.stack( diff --git a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py index 4a073e2ada1c..4447ebb89386 100644 --- a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py +++ b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py @@ -31,6 +31,7 @@ from nemo.core.classes import typecheck from nemo.core.neural_types import NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['GLUEModel'] @@ -78,6 +79,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): """ Initializes model to use BERT model for GLUE tasks. """ + # deprecation warning + deprecated_warning("GLUEModel") if cfg.task_name not in cfg.supported_tasks: raise ValueError(f'{cfg.task_name} not in supported task. Choose from {cfg.supported_tasks}') diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py index e7ae529fe4e2..67a4802d83f6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py @@ -14,7 +14,6 @@ """BERT model.""" -import warnings from dataclasses import dataclass import torch @@ -33,6 +32,7 @@ parallel_lm_logits, scaled_init_method_normal, ) +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.enums import AttnMaskType @@ -142,7 +142,13 @@ def forward(self, hidden_states, word_embeddings_weight): def post_language_model_processing( - lm_output, pooled_output, lm_head, binary_head, lm_labels, logit_weights, fp16_lm_cross_entropy, + lm_output, + pooled_output, + lm_head, + binary_head, + lm_labels, + logit_weights, + fp16_lm_cross_entropy, ): # lm_logits: [s, b, vocab_size] lm_logits = lm_head(lm_output, 
logit_weights) @@ -348,7 +354,10 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw if self.post_process: # TODO: Make sure you are passing in the mpu_vocab_size properly - self.lm_head = MCoreBertLMHead(self.config.hidden_size, self.config,) + self.lm_head = MCoreBertLMHead( + self.config.hidden_size, + self.config, + ) self.output_layer = tensor_parallel.ColumnParallelLinear( self.config.hidden_size, @@ -476,10 +485,9 @@ def __init__( sequence_parallel=False, position_embedding_type='learned_absolute', ): - warnings.warn( - "NeMoBertModel will be deprecated mid 2024. Use MCoreBertModelWrapperWithPostLNSupport instead.", - DeprecationWarning, - ) + # deprecation warning + deprecated_warning("NeMoBertModel", "MCoreBertModelWrapperWithPostLNSupport") + super(NeMoBertModel, self).__init__(config=config) self.fp16_lm_cross_entropy = fp16_lm_cross_entropy self.add_binary_head = add_binary_head diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py index 19fafb796fd7..c572d94acd11 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py @@ -24,6 +24,7 @@ parallel_lm_logits, scaled_init_method_normal, ) +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.enums import AttnMaskType @@ -167,6 +168,9 @@ def __init__( seq_len_interpolation_factor=None, rotary_base=10000, ): + # deprecation warning + deprecated_warning("GPTModel", "McoreGPTModel") + super(GPTModel, self).__init__(config=config, share_token_embeddings=share_embeddings_and_output_weights) self.parallel_output = parallel_output @@ -250,7 +254,9 @@ def __init__( if self.share_embeddings_and_output_weights: self.initialize_word_embeddings( - init_method=init_method_normal(init_method_std), vocab_size=vocab_size, hidden_size=hidden_size, + 
init_method=init_method_normal(init_method_std), + vocab_size=vocab_size, + hidden_size=hidden_size, ) def set_input_tensor(self, input_tensor): @@ -299,9 +305,11 @@ def forward( post_process_result = post_language_model_processing( loss_lm_output, loss_labels, - self.language_model.output_layer.weight - if not self.share_embeddings_and_output_weights - else self.word_embeddings_weight(), + ( + self.language_model.output_layer.weight + if not self.share_embeddings_and_output_weights + else self.word_embeddings_weight() + ), get_key_value, self.parallel_output, forward_method_parallel_output, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index e7f2aa805a9c..0828d88a8133 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -861,7 +861,15 @@ def configure_optimizers(self): # Initialize param buckets if explicitly provided if getattr(self, 'distributed_adam_buckets', None) is not None: - for bucket in self.distributed_adam_buckets: + buckets = self.distributed_adam_buckets + if self.cfg.get('distributed_adam_bucket_merge_size', 1) > 1: + # Merge buckets if needed + stride = self.cfg.get('distributed_adam_bucket_merge_size', 1) + buckets = [ + list(itertools.chain.from_iterable(buckets[i : i + stride])) + for i in range(0, len(buckets), stride) + ] + for bucket in buckets: self._optimizer.init_params_bucket(bucket) self._optimizer.init_params_bucket(self.parameters()) if hasattr(self, 'distributed_adam_buckets'): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py index d151925635ab..f6ee4b20183c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py +++ 
b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py @@ -37,6 +37,7 @@ from nemo.collections.nlp.modules.common.transformer.text_generation import TextGeneration from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator @@ -82,6 +83,9 @@ class MegatronBasePromptLearningModel(MegatronBaseModel, TextGeneration): """ def __init__(self, cfg: DictConfig, trainer: Trainer): + # deprecation warning + deprecated_warning("MegatronBasePromptLearningModel") + super().__init__(cfg, trainer) self.init_model(cfg, trainer) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 718991dc203d..eb7d7b694e2f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -44,6 +44,7 @@ from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec import get_gpt_layer_modelopt_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel +from nemo.collections.nlp.modules.common.hyena.hyena_spec import get_gpt_layer_with_te_and_hyena_spec from nemo.collections.nlp.modules.common.megatron.build_model import build_model from nemo.collections.nlp.modules.common.megatron.module import Float16Module from nemo.collections.nlp.modules.common.megatron.utils import ( @@ -143,7 +144,7 @@ def mcore_supports_moe() -> bool: return False -def get_specs(spec_name, num_experts=None, moe_grouped_gemm=False, use_te=True): +def get_specs(spec_name, num_experts=None, moe_grouped_gemm=False, use_te=True, hyena_cfg: Dict = None): if num_experts is 
not None: assert mcore_supports_moe(), "Megatron-core >= v0.5.0 is required for MoE" @@ -155,6 +156,7 @@ def get_specs(spec_name, num_experts=None, moe_grouped_gemm=False, use_te=True): "megatron_falcon_gpt": get_falcon_layer_spec(), "megatron_gpt_full_te_layer_autocast": get_gpt_full_te_layer_autocast_spec(), "modelopt": get_gpt_layer_modelopt_spec(), + "te_gpt_hyena": get_gpt_layer_with_te_and_hyena_spec(hyena_cfg), } if spec_name not in name_spec_dict: raise ValueError(f"Spec name '{spec_name}' is not recognized.") @@ -417,6 +419,7 @@ def model_provider_func(self, pre_process, post_process): self.transformer_config.num_moe_experts, self.transformer_config.moe_grouped_gemm, self.transformer_engine, + self.cfg.get('hyena', None), ), vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size), max_sequence_length=self.cfg.get('encoder_seq_length', 512), @@ -1472,8 +1475,7 @@ def build_train_valid_test_datasets(self): # Where N_d is the total number of samples in a dataset (files), and N is the requested number of samples (provided for every split in the list below). 
# Setting N = 1 we force E to be 1 as well if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): - train_valid_test_num_samples[1] = 1 - + train_valid_test_num_samples[1] = None # Add extra FIM tokens to tokenizer if self.cfg.data.get('add_fim', False) and self.cfg.tokenizer.library == 'megatron': fim_tokens = self.cfg.data.fim.extra_tokens @@ -1498,6 +1500,7 @@ def build_train_valid_test_datasets(self): is_dataset_built_on_rank = lambda: True mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False + add_extra_token = not self.cfg.data.get("no_seqlen_plus_one_input_tokens", False) kwargs = { "random_seed": self.cfg.seed, "sequence_length": self.cfg.data.seq_length, @@ -1508,6 +1511,8 @@ def build_train_valid_test_datasets(self): "eod_mask_loss": self.eod_mask_loss, "create_attention_mask": not self.get_attention_mask_from_fusion, "mmap_bin_files": self.cfg.data.get("mmap_bin_files", True), + "drop_last_partial_validation_sequence": self.cfg.data.get("validation_drop_last", True), + "add_extra_token_to_sequence": add_extra_token, } data_prefix = self.cfg.data.data_prefix diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 5ee7a3fcf480..acfc22439a7d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -44,6 +44,7 @@ from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPSaveRestoreConnector from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches @@ -72,25 +73,28 @@ class 
MegatronGPTPromptLearningModel(MegatronBasePromptLearningModel): """ - Model class for prompt-tuning or p-tuning a pretrained Megatron GPT model. + Model class for prompt-tuning or p-tuning a pretrained Megatron GPT model. Prompt Tuning initalizes virtual prompt embeddings directly from a copy of certain token embeddings from the the pretrained GPT model's vocabulary - and directly tunes these embedding weights. The token embeddings used in - initalization are specified by the user in the config file. The model can - be prompt-tuned for multiple tasks at once. virtual prompts are stored in a - prompt table and can be added or deleted without disrupting virtual prompts - for other tasks. + and directly tunes these embedding weights. The token embeddings used in + initalization are specified by the user in the config file. The model can + be prompt-tuned for multiple tasks at once. virtual prompts are stored in a + prompt table and can be added or deleted without disrupting virtual prompts + for other tasks. P-tuning initializes an LSTM encoder model that generates virtual prompt embeddings for every task. Each task shares the same encoder. After ptuning is compelete, the learned virtual prompts can be saved to the prompt table - using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a - new virtual prompt via p-tuning, they do not need to retrain on all previous + using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a + new virtual prompt via p-tuning, they do not need to retrain on all previous tasks. This gives p-tuning the same task flexiblity as prompt-tuning. 
""" def __init__(self, cfg: DictConfig, trainer: Trainer): + # deprecation warning + deprecated_warning("MegatronGPTPromptLearningModel") + super().__init__(cfg, trainer) self.inference_params = None @@ -305,8 +309,8 @@ def forward( def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): """ - Dataloader produces a global batch which is turned into an iterator of microbatches. - The iterator of microbatches is then piped through the pipeline using Core's fwd/bwd functions. + Dataloader produces a global batch which is turned into an iterator of microbatches. + The iterator of microbatches is then piped through the pipeline using Core's fwd/bwd functions. """ # Get seq length of batch batch, _, _ = next(dataloader_iter) @@ -361,15 +365,15 @@ def training_step(self, dataloader_iter): return loss_mean def backward(self, *args, **kwargs): - """ LightningModule hook to do backward. - We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. - No need to call it here. + """LightningModule hook to do backward. + We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. + No need to call it here. """ return def optimizer_zero_grad(self, *args, **kwargs): - """ LightningModule hook to zero grad. - We want this to do nothing as we are zeroing grads during the training_step. + """LightningModule hook to zero grad. + We want this to do nothing as we are zeroing grads during the training_step. 
""" return @@ -415,11 +419,19 @@ def validation_step(self, dataloader_iter): labels_text.append(label) if mode == 'val': self.validation_step_outputs.append( - {'loss': loss_mean, 'preds': preds_text, 'labels': labels_text,} + { + 'loss': loss_mean, + 'preds': preds_text, + 'labels': labels_text, + } ) else: self.test_step_outputs.append( - {'loss': loss_mean, 'preds': preds_text, 'labels': labels_text,} + { + 'loss': loss_mean, + 'preds': preds_text, + 'labels': labels_text, + } ) return { 'loss': loss_mean, @@ -427,8 +439,10 @@ def validation_step(self, dataloader_iter): 'labels': labels_text, } - self.validation_step_outputs.append({'loss': loss_mean}) if mode == 'val' else self.test_step_outputs.append( - {'loss': loss_mean} + ( + self.validation_step_outputs.append({'loss': loss_mean}) + if mode == 'val' + else self.test_step_outputs.append({'loss': loss_mean}) ) return {'loss': loss_mean} @@ -481,7 +495,8 @@ def on_validation_epoch_end(self): gather_results_dedup = list(set(itertools.chain(*gather_results))) val_metric_dict = self.validation_metric.get_score( - [i[1] for i in gather_results_dedup], [i[0] for i in gather_results_dedup], + [i[1] for i in gather_results_dedup], + [i[0] for i in gather_results_dedup], ) for metric, val in val_metric_dict.items(): @@ -638,9 +653,9 @@ def build_virtual_prompt_dataset( drop_last=drop_last, num_workers=num_workers, pin_memory=pin_memory, - persistent_workers=True - if num_workers > 0 - else False, # (@adithyare and @eharper) We need this to make spawn=True to work. + persistent_workers=( + True if num_workers > 0 else False + ), # (@adithyare and @eharper) We need this to make spawn=True to work. ) return dataset, dataloader @@ -815,7 +830,7 @@ def list_available_models(cls): def get_pseudo_tokens(num_virtual_tokens): """ Takes in an integer and returns a list of strings where each string - is a numbered virtual token placeholder. If + is a numbered virtual token placeholder. 
If num_virtual_tokens = 3, then this function returns: ["", "", ""] @@ -823,7 +838,7 @@ def get_pseudo_tokens(num_virtual_tokens): Args: num_virtual_tokens: (int) Number of virtual token strings you want to make - returns a list of string. + returns a list of string. """ pseudo_tokens = [ diff --git a/nemo/collections/nlp/models/question_answering/qa_base_model.py b/nemo/collections/nlp/models/question_answering/qa_base_model.py index bfb45f51b6ac..7ca78f2e136e 100644 --- a/nemo/collections/nlp/models/question_answering/qa_base_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_base_model.py @@ -25,10 +25,14 @@ ) from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BaseQAModel(NLPModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None, no_lm_init=True): + # deprecation warning + deprecated_warning("BaseQAModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer, no_lm_init=no_lm_init) @@ -82,10 +86,13 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): @torch.no_grad() def _get_per_sample_perplexity(self, logits, labels): - """ Returns average perplexity for each sample in the batch """ + """Returns average perplexity for each sample in the batch""" loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none') - unreduced_loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1),) + unreduced_loss = loss_fct( + logits.view(-1, logits.size(-1)), + labels.view(-1), + ) unreduced_loss = unreduced_loss.reshape(labels.shape) mask_0 = unreduced_loss != 0 per_sample_perplexity = torch.exp((unreduced_loss * mask_0).sum(axis=1) / mask_0.sum(axis=1)) diff --git a/nemo/collections/nlp/models/question_answering/qa_bert_model.py b/nemo/collections/nlp/models/question_answering/qa_bert_model.py index 196fab4e3a04..d4bdef6d871d 100644 --- a/nemo/collections/nlp/models/question_answering/qa_bert_model.py +++ 
b/nemo/collections/nlp/models/question_answering/qa_bert_model.py @@ -31,12 +31,15 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BERTQAModel(BaseQAModel): - """ BERT model with a QA (token classification) head """ + """BERT model with a QA (token classification) head""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("BERTQAModel") super().__init__(cfg=cfg, trainer=trainer, no_lm_init=False) self.classifier = TokenClassifier( @@ -190,7 +193,7 @@ def inference( num_samples: number of samples to use of inference data. Default: -1 if all data should be used. output_nbest_file: optional output file for writing out nbest list output_prediction_file: optional output file for writing out predictions - + Returns: model predictions, model nbest list """ @@ -209,7 +212,10 @@ def inference( logging.set_verbosity(logging.WARNING) infer_datalayer = self.setup_inference_data( - file, batch_size=batch_size, num_samples=num_samples, num_workers=2, + file, + batch_size=batch_size, + num_samples=num_samples, + num_workers=2, ) all_logits = [] @@ -244,7 +250,9 @@ def inference( if output_prediction_file: QAMetrics.dump_predicted_answers_to_file( - output_prediction_file, infer_datalayer.dataset.examples, all_predictions, + output_prediction_file, + infer_datalayer.dataset.examples, + all_predictions, ) if output_nbest_file: @@ -324,7 +332,7 @@ def get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -349,7 +357,7 @@ def get_predictions( null_start_logit = 
0 # end logit at the slice with min null score null_end_logit = 0 - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] start_indexes = self._get_best_indexes(start_logits[pos], n_best_size) end_indexes = self._get_best_indexes(end_logits[pos], n_best_size) @@ -468,7 +476,7 @@ def get_predictions( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["question"] = example.question_text output["text"] = entry.text @@ -531,7 +539,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_best_indexes(self, logits, n_best_size): - """ Get the n-best logits from a list """ + """Get the n-best logits from a list""" best_indices = np.argsort(logits)[::-1] @@ -570,7 +578,7 @@ def _get_final_text(self, pred_text: str, orig_text: str, do_lower_case: bool, v def _strip_spaces(text): ns_chars = [] ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): + for i, c in enumerate(text): if c == " ": continue ns_to_s_map[len(ns_chars)] = i @@ -599,14 +607,16 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: logging.warning( - "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text, + "Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, + tok_ns_text, ) return orig_text # We then project the characters in `pred_text` back to `orig_text` using # the character-to-character alignment. 
tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): + for i, tok_index in tok_ns_to_s_map.items(): tok_s_to_ns_map[tok_index] = i orig_start_position = None diff --git a/nemo/collections/nlp/models/question_answering/qa_gpt_model.py b/nemo/collections/nlp/models/question_answering/qa_gpt_model.py index 405b9a1e05ad..059cf5625f15 100644 --- a/nemo/collections/nlp/models/question_answering/qa_gpt_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_gpt_model.py @@ -27,10 +27,14 @@ from nemo.collections.nlp.models.question_answering.qa_base_model import BaseQAModel from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class GPTQAModel(BaseQAModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("GPTQAModel") + self.cfg = cfg self.setup_tokenizer(cfg.tokenizer) @@ -102,7 +106,11 @@ def on_validation_epoch_end(self): eval_dataset = self._test_dl.dataset if self.trainer.testing else self._validation_dl.dataset eval_results, _, _ = self.evaluate( - eval_dataset.features, eval_dataset.examples, unique_ids, per_sample_perplexity, generated_answers, + eval_dataset.features, + eval_dataset.examples, + unique_ids, + per_sample_perplexity, + generated_answers, ) self.log(f'{prefix}_loss', avg_loss) @@ -185,10 +193,19 @@ def inference( return all_predictions, all_nbest_perdictions def evaluate( - self, features, examples, unique_ids, per_sample_perplexity, generated_texts, + self, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ): all_predictions, all_nbest_predictions = self._get_predictions( - features, examples, unique_ids, per_sample_perplexity, generated_texts, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ) eval_results = QAMetrics.evaluate_predictions(examples, all_predictions) @@ -226,7 +243,12 @@ def 
_setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_predictions( - self, features, examples: List, unique_ids: List[int], per_sample_perplexity: List, generated_texts: List, + self, + features, + examples: List, + unique_ids: List[int], + per_sample_perplexity: List, + generated_texts: List, ): unique_id_to_pos = {} for index, unique_id in enumerate(unique_ids): @@ -242,7 +264,7 @@ def _get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -250,7 +272,7 @@ def _get_predictions( curr_features = example_index_to_features[example_index] prelim_predictions = [] - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] curr_perplexity = per_sample_perplexity[pos] curr_generated_text = generated_texts[pos] diff --git a/nemo/collections/nlp/models/question_answering/qa_model.py b/nemo/collections/nlp/models/question_answering/qa_model.py index 6fb2054a2237..2147d7d6a5bf 100644 --- a/nemo/collections/nlp/models/question_answering/qa_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_model.py @@ -32,6 +32,7 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['QAModel'] @@ -42,6 +43,9 @@ class QAModel(NLPModel): """ def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("QAModel") + super().__init__(cfg=cfg, trainer=trainer) self.classifier = TokenClassifier( hidden_size=self.hidden_size, @@ -186,7 +190,7 @@ def inference( num_samples: number of 
samples to use of inference data. Default: -1 if all data should be used. output_nbest_file: optional output file for writing out nbest list output_prediction_file: optional output file for writing out predictions - + Returns: model predictions, model nbest list """ diff --git a/nemo/collections/nlp/models/question_answering/qa_s2s_model.py b/nemo/collections/nlp/models/question_answering/qa_s2s_model.py index 81001fb66da7..5ad959fd1b6f 100644 --- a/nemo/collections/nlp/models/question_answering/qa_s2s_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_s2s_model.py @@ -28,10 +28,13 @@ from nemo.collections.nlp.models.question_answering.qa_base_model import BaseQAModel from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class S2SQAModel(BaseQAModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("S2SQAModel") self.cfg = cfg @@ -120,7 +123,11 @@ def on_validation_epoch_end(self): eval_dataset = self._test_dl.dataset if self.trainer.testing else self._validation_dl.dataset eval_results, _, _ = self.evaluate( - eval_dataset.features, eval_dataset.examples, unique_ids, per_sample_perplexity, generated_answers, + eval_dataset.features, + eval_dataset.examples, + unique_ids, + per_sample_perplexity, + generated_answers, ) self.log(f'{prefix}_loss', avg_loss) @@ -145,7 +152,11 @@ def forward(self, input_ids, input_attn_mask, labels): labels = torch.where(labels != -100, labels, torch.zeros_like(labels)) output_attn_masks = torch.where(labels > 0, torch.ones_like(labels), torch.zeros_like(labels)) unmasked_unreduced_loss = self.language_model( - input_ids, labels[:, :-1], input_attn_mask, output_attn_masks[:, :-1], lm_labels=labels[:, 1:], + input_ids, + labels[:, :-1], + input_attn_mask, + output_attn_masks[:, :-1], + lm_labels=labels[:, 1:], ) loss = 
self.language_model.loss_func(output_attn_masks[:, 1:], unmasked_unreduced_loss) per_sample_perplexity = torch.exp(unmasked_unreduced_loss) @@ -210,10 +221,19 @@ def inference( return all_predictions, all_nbest_predictions def evaluate( - self, features, examples, unique_ids, per_sample_perplexity, generated_texts, + self, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ): all_predictions, all_nbest_json = self._get_predictions( - features, examples, unique_ids, per_sample_perplexity, generated_texts, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ) eval_results = QAMetrics.evaluate_predictions(examples, all_predictions) @@ -251,7 +271,12 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_predictions( - self, features, examples: List, unique_ids: List[int], per_sample_perplexity: List, generated_texts: List, + self, + features, + examples: List, + unique_ids: List[int], + per_sample_perplexity: List, + generated_texts: List, ): unique_id_to_pos = {} @@ -268,7 +293,7 @@ def _get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -276,7 +301,7 @@ def _get_predictions( curr_features = example_index_to_features[example_index] prelim_predictions = [] - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] curr_perplexity = per_sample_perplexity[pos] curr_generated_text = generated_texts[pos] @@ -339,7 +364,10 @@ def _generate_candidates(self, input_ids, input_attn_mask): "max_length": num_tokens_to_generate, } generated_tokens = self.language_model.generate(**param_dict) - generated_answers = 
self.tokenizer.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True,) + generated_answers = self.tokenizer.tokenizer.batch_decode( + generated_tokens, + skip_special_tokens=True, + ) generated_answers = [ans.strip() for ans in generated_answers] elif self.cfg.library == 'megatron': diff --git a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py index eed94f2e1e31..d9e08f6764fc 100644 --- a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py +++ b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py @@ -35,7 +35,7 @@ from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.core.neural_types import LogitsType, NeuralType from nemo.utils import logging -from nemo.utils.decorators import experimental +from nemo.utils.decorators import deprecated_warning, experimental __all__ = ["SpellcheckingAsrCustomizationModel"] @@ -48,7 +48,7 @@ class SpellcheckingAsrCustomizationModel(NLPModel): It takes as input ASR hypothesis and candidate customization entries. It labels the hypothesis with correct entry index or 0. Example input: [CLS] a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o [SEP] d i d i e r _ s a u m o n [SEP] a s t r o n o m i e [SEP] t r i s t a n _ g u i l l o t [SEP] ... - Input segments: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 + Input segments: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 Example output: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 0 ... 
""" @@ -67,6 +67,9 @@ def output_module(self): return self def __init__(self, cfg: DictConfig, trainer: Trainer = None) -> None: + # deprecation warning + deprecated_warning("SpellcheckingAsrCustomizationModel") + super().__init__(cfg=cfg, trainer=trainer) # Label map contains 11 labels: 0 for nothing, 1..10 for target candidate ids @@ -321,7 +324,7 @@ def on_test_epoch_end(self): @torch.no_grad() def infer(self, dataloader_cfg: DictConfig, input_name: str, output_name: str) -> None: - """ Main function for Inference + """Main function for Inference Args: dataloader_cfg: config for dataloader @@ -517,7 +520,7 @@ def _setup_infer_dataloader(self, cfg: DictConfig, input_name: str) -> 'torch.ut Setup function for a infer data loader. Args: cfg: config dictionary containing data loader params like batch_size, num_workers and pin_memory - input_name: path to input file. + input_name: path to input file. Returns: A pytorch DataLoader. """ diff --git a/nemo/collections/nlp/modules/common/hyena/README.md b/nemo/collections/nlp/modules/common/hyena/README.md new file mode 100644 index 000000000000..a5e7b32cc590 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/README.md @@ -0,0 +1,26 @@ +## Required Dependencies for Hyena + +We depend on 3rd-party libraries for FFT convolutions implementation. Each library supports different use-cases: + +| Library | Supported Sequence Length | Single/Multi-Head Support | +|:----------------:|:-------------------------:|:-------------------------:| +| Safari `fftconv` | Up to 8192 | 1 or 8 heads | +| FlashFFTConv | Up to 4M | Single-head only | + +Note the overlapping support for single-head with sequence length up to 8192. By default, in this case we default to Safari `fftconv` as it is faster (and fallback to FlashFFTConv). The user may force the FFT convolution implementation used by setting the configuration key `model.hyena.fftconv_type` to either `safari` or `flash`. 
+ +### Installation + +#### Safari `fftconv` + +Install from the [Safari repository](https://github.com/HazyResearch/safari/tree/main/csrc/fftconv). Run the following in a terminal: + +```bash +git clone https://github.com/HazyResearch/safari.git +cd safari/csrc/fftconv +pip install . +``` + +#### FlashFFTConv + +Follow the [installation instructions](https://github.com/HazyResearch/flash-fft-conv?tab=readme-ov-file#installation) in the FlashFFTConv repository. diff --git a/nemo/collections/nlp/modules/common/hyena/__init__.py b/nemo/collections/nlp/modules/common/hyena/__init__.py new file mode 100644 index 000000000000..f976e8f9d9c6 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/__init__.py @@ -0,0 +1 @@ +from nemo.collections.nlp.modules.common.hyena.hyena import HyenaOperator diff --git a/nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py b/nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py new file mode 100644 index 000000000000..ca9a44489697 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py @@ -0,0 +1,129 @@ +import math + +import torch +from einops import rearrange +from fftconv import fftconv_bwd, fftconv_fwd + +# Code taken from: +# https://github.com/HazyResearch/safari/blob/main/src/ops/fftconv.py + + +class FFTConvFunc(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + u, + k, + D, + dropout_mask=None, + gelu=True, + force_fp16_output=False, + output_hbl_layout=False, + v=None, + head_dim=1, + q=None, + fftfp16=False, + k_rev=None, + ): + seqlen = u.shape[-1] + fft_size = max(2 * 2 ** int(math.ceil(math.log2(seqlen))), 16) + k_f = torch.fft.rfft(k, n=fft_size) + if k_rev is not None: + k_f = k_f + torch.fft.rfft(k_rev, n=fft_size).conj() + if u.stride(-1) != 1: + u = u.contiguous() + k_f = k_f.contiguous() + D = D.contiguous() + if v is not None and v.stride(-1) != 1: + v = v.contiguous() + if q is not None and q.stride(-1) != 1: + q = q.contiguous() + if dropout_mask is 
not None: + dropout_mask = dropout_mask.contiguous() + ctx.save_for_backward(u, k_f, D, dropout_mask, v, q) + ctx.output_hbl_layout = output_hbl_layout + ctx.head_dim = head_dim + ctx.gelu = gelu + ctx.fftfp16 = fftfp16 + ctx.has_k_rev = k_rev is not None + out = fftconv_fwd( + u, + k_f, + D, + v, + head_dim, + q, + dropout_mask, + gelu, + False, + False, + fft_size, + force_fp16_output, + output_hbl_layout, + fftfp16, + ) + return out + + @staticmethod + def backward(ctx, dout): + if ctx.output_hbl_layout: + dout = rearrange(rearrange(dout, 'b h l -> h b l').contiguous(), 'h b l -> b h l') + else: + dout = dout.contiguous() + u, k_f, D, dropout_mask, v, q = ctx.saved_tensors + seqlen = u.shape[-1] + fft_size = max(2 * 2 ** int(math.ceil(math.log2(seqlen))), 16) + du, dk_f, dD, dv, dq = fftconv_bwd( + dout, + u, + k_f, + D, + v, + ctx.head_dim, + q, + dropout_mask, + ctx.gelu, + False, + False, + fft_size, + ctx.output_hbl_layout, + ctx.fftfp16, + ) + dk = torch.fft.irfft(dk_f, n=fft_size, norm='forward')[..., :seqlen] + dk_rev = None if not ctx.has_k_rev else torch.fft.irfft(dk_f.conj(), n=fft_size, norm='forward')[..., :seqlen] + if v is not None: + dv = dv.to(dtype=v.dtype) # We do atomicAdd in fp32 so might need to convert to fp16 + return ( + du, + dk, + dD, + None, + None, + None, + None, + dv, + None, + dq, + None, + dk_rev, + ) + + +def fftconv_func( + u, + k, + D, + dropout_mask=None, + gelu=True, + force_fp16_output=False, + output_hbl_layout=False, + v=None, + head_dim=1, + q=None, + fftfp16=False, + k_rev=None, +): + return FFTConvFunc.apply( + u, k, D, dropout_mask, gelu, force_fp16_output, output_hbl_layout, v, head_dim, q, fftfp16, k_rev + ) diff --git a/nemo/collections/nlp/modules/common/hyena/hyena.py b/nemo/collections/nlp/modules/common/hyena/hyena.py new file mode 100644 index 000000000000..f087a3d7a244 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/hyena.py @@ -0,0 +1,381 @@ +# Implementation of Hyena operator +# +# Michael Poli 
and Stefano Massaroli and Eric Nguyen and Daniel Y Fu and Tri Dao and Stephen Baccus and +# Yoshua Bengio and Stefano Ermon and Christopher Re, +# Hyena Hierarchy: Towards Larger Convolutional Language Models +# 2023, https://arxiv.org/abs/2302.10866 +# +# Multi-head variant introduced in: +# +# Stefano Massaroli and Michael Poli and Daniel Y Fu and Hermann Kumbong and Rom Nishijima Parnichkun and +# David W. Romero and Aman Timalsina and Quinn McIntyre and Beidi Chen and Atri Rudra and Ce Zhang and +# Christopher Re and Stefano Ermon and Yoshua Bengio, +# Laughing Hyena Distillery: Extracting Compact Recurrences From Convolutions +# NeurIPS 2023, https://arxiv.org/abs/2310.18780 +# +# Code is heavily based on the reference implementations from: +# https://github.com/HazyResearch/safari/blob/flashfftconv/src/models/sequence/hyena.py +# https://github.com/athms/mad-lab/blob/main/mad/model/layers/hyena.py + +from dataclasses import dataclass +from typing import Union + +import torch +import torch.nn as nn +from einops import rearrange +from megatron.core.transformer.custom_layers.transformer_engine import ( + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp +from megatron.core.transformer.spec_utils import ModuleSpec, build_module +from megatron.core.transformer.transformer_config import TransformerConfig + +from nemo.collections.common.parts.utils import activation_registry +from nemo.collections.nlp.modules.common.hyena.hyena_filter import HyenaFilter, HyenaFilterSubmodules +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision +from nemo.utils.metaclasses import Singleton + +try: + from nemo.collections.nlp.modules.common.hyena.fftconv_wrapper import fftconv_func as safari_fftconv_fn + + HAVE_SAFARI_FFTCONV = True +except ImportError: + HAVE_SAFARI_FFTCONV = False + +try: + from flashfftconv import FlashFFTConv as FlashFFTConvImpl + + HAVE_FLASHFFTCONV = 
True + + class FlashFFTConv(metaclass=Singleton): + # Recommendation is to create single instance per model + # https://github.com/HazyResearch/flash-fft-conv?tab=readme-ov-file#example-model + def __init__(self, seqlen, dtype): + self.flashfftconv = FlashFFTConvImpl(seqlen, dtype) + +except ImportError: + HAVE_FLASHFFTCONV = False + +try: + from causal_conv1d import causal_conv1d_fn + + HAVE_CAUSAL_CONV1D = True +except ImportError: + HAVE_CAUSAL_CONV1D = False + + +@dataclass +class HyenaOperatorSubmodules: + in_proj: Union[ModuleSpec, type] = IdentityOp + short_filter: Union[ModuleSpec, type] = IdentityFuncOp + implicit_filter: Union[ModuleSpec, type] = IdentityOp + out_proj: Union[ModuleSpec, type] = IdentityOp + + +def auto_assign_attrs(cls, **kwargs): + for k, v in kwargs.items(): + setattr(cls, k, v) + + +class CausalDepthWiseConv1d(nn.Module): + def __init__(self, channels, width, bias=True): + if not HAVE_CAUSAL_CONV1D: + raise ImportError("Missing causal-conv1d library, please run 'pip install causal-conv1d'") + + super().__init__() + self.channels = channels + self.width = width + self._conv_1d = nn.Conv1d( + in_channels=channels, + out_channels=channels, + kernel_size=width, + padding=width - 1, + groups=channels, + bias=bias, + ) + + def forward(self, x): + return causal_conv1d_fn(x, self._conv_1d.weight.squeeze(1), self._conv_1d.bias) + + +class HyenaConv(nn.Module): + def __init__( + self, + d_model: int, + max_seq_length: int, + order: int, + bias: bool = True, + filter_cls: Union[ModuleSpec, type] = HyenaFilter, + filter_submodules: HyenaFilterSubmodules = None, + **filter_kwargs, + ): + super().__init__() + self.d_model = d_model + self.order = order + self.max_seq_length = max_seq_length + self.use_bias = bias + bias_shape = self.d_model * (self.order - 1) + if self.use_bias: + self.bias = nn.Parameter(torch.randn(bias_shape)) + else: + self.register_buffer('bias', torch.zeros(bias_shape), persistent=False) # buffer follows .to()/.cuda(); kept out of state_dict + + self.filter = build_module( + filter_cls, + self.d_model * (self.order
- 1), + submodules=filter_submodules, + seq_len=max_seq_length, + **filter_kwargs, + ) + + +class SingleHeadHyenaConv(HyenaConv): + def __init__( + self, + d_model: int, + max_seq_length: int, + order: int, + bias: bool = True, + filter_cls: Union[ModuleSpec, type] = HyenaFilter, + filter_submodules: HyenaFilterSubmodules = None, + fftconv_type: str = None, + precision: str = 'bf16', + **filter_kwargs, + ): + super().__init__( + d_model, + max_seq_length, + order, + bias=bias, + filter_cls=filter_cls, + filter_submodules=filter_submodules, + **filter_kwargs, + ) + + if fftconv_type is None: + if max_seq_length <= 8192 and HAVE_SAFARI_FFTCONV: + # safari-fftconv supports seq-len <= 8192 and is a bit faster vs. flashfftconv + fftconv_type = 'safari' + else: + fftconv_type = 'flash' + + if fftconv_type not in ['safari', 'flash']: + raise ValueError("fftconv_type must be one of ['safari', 'flash']") + if fftconv_type == 'safari' and max_seq_length > 8192: + raise ValueError('Safari-fftconv only supports sequence length up to 8192') + if fftconv_type == 'safari' and not HAVE_SAFARI_FFTCONV: + raise ImportError('Safari-fftconv library not found. Please see README at for instructions.') + if fftconv_type == 'flash' and not HAVE_FLASHFFTCONV: + raise ImportError('flashfftconv library not found. 
Please see README at for instructions.') + + if fftconv_type == 'safari': + self.fftconv_fn = self._safari_fft + else: # fftconv_type == 'flash' + self.flashfftconv = FlashFFTConv( + 2 * self.max_seq_length, torch_dtype_from_precision(precision) + ).flashfftconv + self.fftconv_fn = self._flash_fft + + def _safari_fft(self, x, k, bias): + bias = bias.to(dtype=torch.float32) + return safari_fftconv_fn(x, k, bias, gelu=False) + + def _flash_fft(self, x, k, bias): + x = x.contiguous() + y = self.flashfftconv(x, k) + x * bias.unsqueeze(dim=1) + return y + + def forward(self, x, k, recurrence_idx): + bias = rearrange(self.bias, '(v o) -> o v', v=self.d_model, o=self.order - 1)[recurrence_idx] + y = self.fftconv_fn(x, k, bias) + return y + + +class MultiHeadHyenaConv(HyenaConv): + def __init__( + self, + d_model: int, + max_seq_length: int, + order: int, + num_heads: int, + bias: bool = True, + filter_cls: Union[ModuleSpec, type] = HyenaFilter, + filter_submodules: HyenaFilterSubmodules = None, + fftconv_type: str = None, + precision: str = 'bf16', + **filter_kwargs, + ): + if num_heads == 1: + raise ValueError('Expecting num_heads > 1') + if order != 2: + raise ValueError(f'Multi-head supported only with order == 2 (got order {order})') # self.order is unset until super().__init__() + if not HAVE_SAFARI_FFTCONV: + raise ImportError('Safari-fftconv library not found.
Please see README at for instructions.') + + super().__init__( + d_model, + max_seq_length, + order, + bias=bias, + filter_cls=filter_cls, + filter_submodules=filter_submodules, + **filter_kwargs, + ) + self.num_heads = num_heads + + def forward(self, v, k, x1, x2): + bias = self.bias.to(dtype=torch.float32) + y = safari_fftconv_fn(v, k, bias, gelu=False, output_hbl_layout=True, v=x2, head_dim=self.num_heads, q=x1) + return y + + +class HyenaOperator(nn.Module): + def __init__( + self, + config: TransformerConfig, + max_seq_length: int, + order: int = 2, + num_heads: int = 1, + dropout: float = 0.0, + short_filter_order: int = 3, + activation: str = "identity", + submodules: HyenaOperatorSubmodules = None, + layer_number=None, + **long_conv_kwargs, + ): + r""" + Hyena operator described in the paper https://arxiv.org/pdf/2302.10866.pdf + + Args: + max_seq_length: (int): Maximum input sequence length. + order: (int): Depth of the Hyena recurrence. Defaults to 2 + num_heads: (int): Number of heads. Defaults to 1 + dropout: (float): Dropout probability. Defaults to 0.0 + short_filter_order: (int): Length of the explicit input convolutional filter. 
Defaults to 3 + activation: (str): type of act between kernel output and output projection (default identity) + """ + super().__init__() + + if submodules is None: + submodules = HyenaOperatorSubmodules( + in_proj=TELayerNormColumnParallelLinear, + short_filter=CausalDepthWiseConv1d, + implicit_filter=HyenaFilter, + out_proj=TERowParallelLinear, + ) + + if order < 2: + raise ValueError(f'Order must be at least 2, (got {order})') # self.order is only assigned by auto_assign_attrs() below + + d_model = config.hidden_size + if d_model % num_heads != 0: + raise ValueError(f'Model dimension {d_model} must be divisible by num heads {num_heads}') + head_dim = d_model // num_heads + + auto_assign_attrs( + self, + d_model=d_model, + order=order, + max_seq_length=max_seq_length, + num_heads=num_heads, + head_dim=head_dim, + short_filter_order=short_filter_order, + activation=activation, + mcore_config=config, + ) + self.activation = activation_registry[activation]() + self.dropout = nn.Dropout(dropout) + + # Setup input and output projections (over the width dimension) + self.in_proj = build_module( + submodules.in_proj, + self.d_model, + (self.order + 1) * self.d_model, + config=self.mcore_config, + init_method=self.mcore_config.init_method, + gather_output=False, + bias=True, + skip_bias_add=False, + is_expert=False, + tp_comm_buffer_name='in_proj', + ) + + self.out_proj = build_module( + submodules.out_proj, + self.d_model, + self.d_model, + config=self.mcore_config, + init_method=self.mcore_config.output_layer_init_method, + bias=True, + input_is_parallel=True, + skip_bias_add=True, + is_expert=False, + tp_comm_buffer_name='out_proj', + ) + + # Setup short filter + total_width = self.d_model * (self.order + 1) + self.short_filter = build_module(submodules.short_filter, total_width, self.short_filter_order) + + # Setup long convolution with implicit filter + long_conv_args = [self.head_dim, self.max_seq_length, self.order] + long_conv_kwargs['filter_cls'] = submodules.implicit_filter + long_conv_kwargs['filter_submodules'] =
getattr(submodules.implicit_filter, 'submodules', None) # a bare class (the default HyenaFilter) has no .submodules + if self.num_heads == 1: + self.long_conv = SingleHeadHyenaConv(*long_conv_args, **long_conv_kwargs) + self.conv_fwd_fn = self.conv_single_head + else: + long_conv_args.append(self.num_heads) + self.long_conv = MultiHeadHyenaConv(*long_conv_args, **long_conv_kwargs) + self.conv_fwd_fn = self.conv_multi_head + + def forward(self, u, *args, **kwargs): + l = u.size(0) + l_filter = min(l, self.max_seq_length) + u = self.in_proj(u) + u = u[0] if isinstance(u, tuple) else u + u = rearrange(u, 'l b d -> b d l') # In MCore the leading dimension is the sequence dimension + + k = self.long_conv.filter(l_filter) + # `c` is always 1 by default + k = rearrange(k, 'c l v -> c v l', v=self.head_dim)[0] + + uc = self.short_filter(u)[..., :l_filter] + + k = k.to(dtype=torch.float32) + y = self.conv_fwd_fn(uc, k) + + y = rearrange(y, 'b d l -> b l d') + y = self.activation(y) + y = self.out_proj(y) + if isinstance(y, tuple): + y, bias = y + else: + bias = None + + # Convert back to sequence-first for MCore + y = rearrange(y, 'b l d -> l b d') + + # MCore TransformerLayer expects tuple where 2nd element represents the bias, it can be None + return y, bias + + def conv_single_head(self, uc, k): + k = rearrange(k, '(o v) l -> o v l', v=self.head_dim, o=self.order - 1) + + *x, v = uc.split(self.d_model, dim=1) + for o, x_i in enumerate(reversed(x[1:])): + v = self.dropout(v * x_i) + v = self.long_conv(v, k=k[o], recurrence_idx=o) + + y = v * x[0] + return y + + def conv_multi_head(self, uc, k): + x1, x2, v = uc.split(self.d_model, dim=1) + x1 = x1.contiguous() + x2 = x2.contiguous() + v = v.contiguous() + + y = self.long_conv(v, k, x1, x2) + return y diff --git a/nemo/collections/nlp/modules/common/hyena/hyena_filter.py b/nemo/collections/nlp/modules/common/hyena/hyena_filter.py new file mode 100644 index 000000000000..bf6752102480 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/hyena_filter.py @@ -0,0 +1,173 @@ +import math +from
dataclasses import dataclass +from typing import Union + +import torch +import torch.nn as nn + +from megatron.core.transformer.identity_op import IdentityOp +from megatron.core.transformer.spec_utils import ModuleSpec, build_module + +# Code mostly taken from: +# https://github.com/HazyResearch/safari/blob/flashfftconv/src/models/sequence/hyena.py + + +@dataclass +class HyenaFilterSubmodules: + positional_embedding: Union[ModuleSpec, type] = IdentityOp + linear: Union[ModuleSpec, type] = IdentityOp + activation: Union[ModuleSpec, type] = IdentityOp + modulation: Union[ModuleSpec, type] = IdentityOp + + +def register(module: nn.Module, name: str, tensor: torch.Tensor, learnable: bool): + if learnable: + module.register_parameter(name, nn.Parameter(tensor)) + else: + module.register_buffer(name, tensor) + + +class Sin(nn.Module): + def __init__(self, dim: int, freq: float = 10, train_freq: bool = True): + """ + Sinusoidal activation function with (optionally learned) per-channel frequency + """ + super().__init__() + self.freq = nn.Parameter(freq * torch.ones(1, dim)) if train_freq else freq * torch.ones(1, dim) + + def forward(self, x): + return torch.sin(self.freq * x) + + +class PositionalEmbedding(nn.Module): + def __init__( + self, + emb_dim: int, + seq_len: int, + learn_pos_emb_z: bool = True, + ): + """Complex exponential positional embeddings for Hyena filters.""" + super().__init__() + + self.seq_len = seq_len + # The time embedding fed to the filters is normalized so that t_f = 1 + t = torch.linspace(0, 1, self.seq_len)[None, :, None] # 1, L, 1 + + if emb_dim > 1: + bands = (emb_dim - 1) // 2 + # To compute the right embeddings we use the "proper" linspace + t_rescaled = torch.linspace(0, seq_len - 1, seq_len)[None, :, None] + w = 2 * math.pi * t_rescaled / seq_len # 1, L, 1 + + f = torch.linspace(1e-4, bands - 1, bands)[None, None] + z = torch.exp(-1j * f * w) + z = torch.cat([t, z.real, z.imag], dim=-1) + register(self, "z", z, learnable=learn_pos_emb_z) 
+ register(self, "t", t, learnable=False) + + def forward(self, L): + return self.z[:, :L], self.t[:, :L] + + +class ExponentialModulation(nn.Module): + def __init__( + self, + d_model: int, + modulate: bool = True, + learn_modulation: bool = False, + fast_decay_pct: float = 0.3, + slow_decay_pct: float = 1.5, + target: float = 1e-2, + shift: float = 0.0, + ): + """ + Exponential decay modulation with (optionally learned) per-channel decay rate + """ + super().__init__() + self.modulate = modulate + self.shift = shift + max_decay = math.log(target) / fast_decay_pct + min_decay = math.log(target) / slow_decay_pct + deltas = torch.linspace(min_decay, max_decay, d_model)[None, None] + register(self, "deltas", deltas, learnable=learn_modulation) + + def forward(self, t, x): + if self.modulate: + decay = torch.exp(-t * self.deltas.abs()) + x = x * (decay + self.shift) + return x + + +class HyenaFilter(nn.Module): + def __init__( + self, + d_model: int, + seq_len: int = 1024, + emb_dim: int = 3, + learn_pos_emb_z: bool = True, + mlp_width: int = 64, + sine_freq: int = 1, + num_inner_mlps: int = 2, + normalized: bool = False, + submodules: HyenaFilterSubmodules = None, + **modulation_kwargs, + ): + """ + Implicit long filter with modulation. + + Args: + d_model (int): number of channels in the input + emb_dim (int): dimension of the positional encoding (`emb_dim` - 1) // 2 is the number of bands + mlp_width (int): Width of the MLP parametrizing the implicit filter. 
Defaults to 64 + seq_len (int): length of input sequence + learn_pos_emb_z (bool): whether the positional embeddings are learned + sine_freq (int): frequency of periodic activations + num_inner_mlps (int): number of inner linear layers inside filter MLP + normalized (bool): whether to apply normalization after modulation + """ + super().__init__() + + if submodules is None: + submodules = HyenaFilterSubmodules( + positional_embedding=PositionalEmbedding, + linear=nn.Linear, + activation=Sin, + modulation=ExponentialModulation, + ) + + self.d_model = d_model + self.mlp_width = mlp_width + + act = build_module(submodules.activation, dim=mlp_width, freq=sine_freq) + self.emb_dim = emb_dim + if emb_dim % 2 == 0 or emb_dim < 3: + raise ValueError("emb_dim must be odd and greater or equal to 3 (time, sine and cosine)") + self.seq_len = seq_len + + self.pos_emb = build_module(submodules.positional_embedding, emb_dim, seq_len, learn_pos_emb_z) + + # uses a variable number of inner linear layers + self.mlp = nn.Sequential( + build_module(submodules.linear, emb_dim, mlp_width), + act, + ) + for i in range(num_inner_mlps): + self.mlp.append(build_module(submodules.linear, mlp_width, mlp_width)) + self.mlp.append(act) + # final linear layer + self.mlp.append(build_module(submodules.linear, mlp_width, d_model, bias=False)) + + self.modulation = build_module(submodules.modulation, d_model, **modulation_kwargs) + + self.normalized = normalized + + def forward(self, L): + z, t = self.pos_emb(L) + h = self.mlp(z) + + h = self.modulation(t, h) + + if self.normalized: + h = h / torch.norm(h, dim=-1, p=1, keepdim=True) + + return h diff --git a/nemo/collections/nlp/modules/common/hyena/hyena_spec.py b/nemo/collections/nlp/modules/common/hyena/hyena_spec.py new file mode 100644 index 000000000000..cd9fd66f4e75 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/hyena_spec.py @@ -0,0 +1,47 @@ +import torch.nn as nn +from megatron.core.models.gpt.gpt_layer_specs import 
get_gpt_layer_with_transformer_engine_spec +from megatron.core.transformer.custom_layers.transformer_engine import ( + TELayerNormColumnParallelLinear, + TERowParallelLinear, +) +from megatron.core.transformer.spec_utils import ModuleSpec + +from nemo.collections.nlp.modules.common.hyena.hyena import ( + CausalDepthWiseConv1d, + HyenaOperator, + HyenaOperatorSubmodules, +) +from nemo.collections.nlp.modules.common.hyena.hyena_filter import ( + ExponentialModulation, + HyenaFilter, + HyenaFilterSubmodules, + PositionalEmbedding, + Sin, +) + + +def get_hyena_layer_with_transformer_engine_spec(hyena_cfg): + return ModuleSpec( + module=HyenaOperator, + params=hyena_cfg, + submodules=HyenaOperatorSubmodules( + in_proj=TELayerNormColumnParallelLinear, + short_filter=CausalDepthWiseConv1d, + implicit_filter=ModuleSpec( + module=HyenaFilter, + submodules=HyenaFilterSubmodules( + positional_embedding=PositionalEmbedding, + linear=nn.Linear, + activation=Sin, + modulation=ExponentialModulation, + ), + ), + out_proj=TERowParallelLinear, + ), + ) + + +def get_gpt_layer_with_te_and_hyena_spec(hyena_cfg): + spec = get_gpt_layer_with_transformer_engine_spec() + spec.submodules.self_attention = get_hyena_layer_with_transformer_engine_spec(hyena_cfg) + return spec diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index a85c155cc0a8..bcfe07f702a0 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -14,19 +14,16 @@ import torch import torch.nn.functional as F -from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl from 
megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding from megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb +from megatron.core.tensor_parallel import ColumnParallelLinear from megatron.core.transformer.attention import SelfAttention -from megatron.core.transformer.custom_layers.transformer_engine import ( - SplitAlongDim, - TEColumnParallelLinear, - TELayerNormColumnParallelLinear, -) +from megatron.core.transformer.custom_layers.transformer_engine import SplitAlongDim from megatron.core.transformer.mlp import MLP +from megatron.core.transformer.moe.experts import SequentialMLP from megatron.core.transformer.transformer_layer import TransformerLayer from megatron.core.utils import make_viewless_tensor @@ -37,6 +34,8 @@ LoraDenseAttentionAdapterConfig, LoraHto4HAdapterConfig, LoraKQVAdapterConfig, + LoraMoe4HtoHAdapterConfig, + LoraMoeHto4HAdapterConfig, LoraUnfusedHto4HAdapterConfig, LoraUnfusedKQVAdapterConfig, MLPInfusedAdapterConfig, @@ -281,13 +280,15 @@ def forward( class MCoreMLPMixin(MLP, MCoreAdapterModuleMixin): def mcore_register_adapters(self): """ - Setup NeMo IA3 adapter to this MCore layer. + Setup NeMo IA3 and LoRA adapter to this MCore layer. """ self.set_accepted_adapter_types( [ LoraUnfusedHto4HAdapterConfig._target_, LoraHto4HAdapterConfig._target_, Lora4HtoHAdapterConfig._target_, + LoraMoeHto4HAdapterConfig._target_, + LoraMoe4HtoHAdapterConfig._target_, MLPInfusedAdapterConfig._target_, ] ) # only self attn (packed qkv) for now @@ -302,9 +303,12 @@ def mcore_register_adapters(self): # overlap is used. 
self.linear_fc1.return_layernorm_output_gathered = True - def forward(self, hidden_states): + def forward(self, hidden_states, expert_idx=None): # [s, b, 4 * h/p] - if self.linear_fc1.te_return_bias: + if isinstance(self.linear_fc1, ColumnParallelLinear): + layernorm_output = hidden_states + intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) + elif self.linear_fc1.te_return_bias: intermediate_parallel, bias_parallel, layernorm_output = self.linear_fc1(hidden_states) else: # bias_parallel is None @@ -315,15 +319,19 @@ def forward(self, hidden_states): lora_adapter = None lora_fc1_adapter = self.get_adapter_module(AdapterName.LORA_Hto4H_ADAPTER) lora_unfused_fc1_adapter = self.get_adapter_module(AdapterName.LORA_UNFUSED_Hto4H_ADAPTER) + lora_moe_fc1_adapter = self.get_adapter_module(AdapterName.LORA_MOE_Hto4H_ADAPTER) if lora_fc1_adapter and self.adapter_cfg[AdapterName.LORA_Hto4H_ADAPTER]['enabled']: lora_adapter = lora_fc1_adapter if lora_unfused_fc1_adapter and self.adapter_cfg[AdapterName.LORA_UNFUSED_Hto4H_ADAPTER]['enabled']: assert lora_adapter is None, "Expected only one of LORA_Hto4H_ADAPTER or LORA_UNFUSED_Hto4H_ADAPTER" lora_adapter = lora_unfused_fc1_adapter + lora_output = 0 if lora_adapter: lora_output = lora_adapter(layernorm_output) - intermediate_parallel = intermediate_parallel + lora_output + elif lora_moe_fc1_adapter and self.adapter_cfg[AdapterName.LORA_MOE_Hto4H_ADAPTER]['enabled']: + lora_output = lora_moe_fc1_adapter(layernorm_output, expert_idx) + intermediate_parallel = intermediate_parallel + lora_output if self.config.bias_activation_fusion: if self.activation_func == F.gelu: @@ -363,14 +371,51 @@ def glu(x): # LoRA logic if self.is_adapter_available(): - lora_linear_fc2_adapter = self.get_adapter_module(AdapterName.LORA_4HtoH_ADAPTER) - if lora_linear_fc2_adapter and self.adapter_cfg[AdapterName.LORA_4HtoH_ADAPTER]['enabled']: - lora_output = lora_linear_fc2_adapter(intermediate_parallel) - output = output + lora_output 
+ lora_fc2_adapter = self.get_adapter_module(AdapterName.LORA_4HtoH_ADAPTER) + lora_moe_fc2_adapter = self.get_adapter_module(AdapterName.LORA_MOE_4HtoH_ADAPTER) + + lora_output = 0 + if lora_fc2_adapter and self.adapter_cfg[AdapterName.LORA_4HtoH_ADAPTER]['enabled']: + lora_output = lora_fc2_adapter(intermediate_parallel) + elif lora_moe_fc2_adapter and self.adapter_cfg[AdapterName.LORA_MOE_4HtoH_ADAPTER]['enabled']: + lora_output = lora_moe_fc2_adapter(intermediate_parallel, expert_idx) + + output = output + lora_output return output, output_bias +class MCoreSequentialMLPMixin(SequentialMLP, MCoreAdapterModuleMixin): + def mcore_register_adapters(self): + """ + We don't want the SequentialMLP layer to take any adapters. We only want to override the forward() behavior + """ + pass + + def forward(self, permuted_local_hidden_states, tokens_per_expert): + output_local = torch.zeros_like(permuted_local_hidden_states) + output_bias_local = None + if self.add_bias: + output_bias_local = torch.zeros_like(permuted_local_hidden_states) + + cumsum_num_tokens = torch.cumsum(tokens_per_expert, dim=0) + # Insert zero at the begining for offset index's convenience + zero_tensor = torch.zeros(1, dtype=torch.long, device=cumsum_num_tokens.device) + cumsum_num_tokens = torch.cat((zero_tensor, cumsum_num_tokens)) + for expert_num, expert in enumerate(self.local_experts): + start = cumsum_num_tokens[expert_num] + end = cumsum_num_tokens[expert_num + 1] + hidden = permuted_local_hidden_states[start:end] + output, output_bias = expert(hidden, expert_num) # expert: MLP + + output_local[start:end] = output + if self.add_bias: + output_bias = output_bias.expand_as(output) + output_bias_local[start:end, :] = output_bias + + return output_local, output_bias_local + + class MCoreGPTEmbeddingMixin(LanguageModelEmbedding, MCoreAdapterModuleMixin): def mcore_register_adapters(self): """ diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py 
b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 61903e6b3673..21dace008877 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -83,6 +83,8 @@ class AdapterName(str, enum.Enum): LORA_Hto4H_ADAPTER = "lora_hto4h_adapter" LORA_UNFUSED_Hto4H_ADAPTER = "lora_unfused_hto4h_adapter" LORA_4HtoH_ADAPTER = "lora_4htoh_adapter" + LORA_MOE_Hto4H_ADAPTER = "lora_moe_hto4h_adapter" + LORA_MOE_4HtoH_ADAPTER = "lora_moe_4htoh_adapter" MULTIMODAL_PROJECTOR_ADAPTER = "mm_projector_adapter" PARALLEL_LINEAR_ADAPTER = "parallel_linear_adapter" @@ -611,6 +613,80 @@ class LoraUnfusedKQVAdapterConfig(AdapterConfig): _target_: str = "{0}.{1}".format(LoraUnfusedKQVAdapter.__module__, LoraUnfusedKQVAdapter.__name__) +class LoraMoeAdapter(nn.Module, AdapterModuleUtil): + def __init__( + self, + num_moe_experts: int, + in_features: int, + out_features: int, + dim: int, + activation: str = 'identity', + norm_position: Optional[str] = None, + norm_type: Optional[str] = None, + column_init_method: str = 'xavier', + row_init_method: str = 'zero', + gather_output: bool = False, + input_is_parallel: bool = False, + dropout: float = 0.0, + model_parallel_config: Optional[ModelParallelConfig] = None, + alpha: float | None = None, + dropout_position: str = 'post', + a2a_experimental: bool = False, + **kwargs, + ): + super().__init__() + + self.num_moe_experts = num_moe_experts + adapter_args = { + "in_features": in_features, + "out_features": out_features, + "dim": dim, + "activation": activation, + "norm_position": norm_position, + "norm_type": norm_type, + "column_init_method": column_init_method, + "row_init_method": row_init_method, + "gather_output": gather_output, + "input_is_parallel": input_is_parallel, + "dropout": dropout, + "model_parallel_config": model_parallel_config, + "alpha": alpha, + "dropout_position": dropout_position, + 
"a2a_experimental": a2a_experimental, + } + self.expert_adapters = nn.ModuleList() + for i in range(num_moe_experts): + self.expert_adapters.append(ParallelLinearAdapter(**adapter_args)) + + def forward(self, x, expert_idx): + return self.expert_adapters[expert_idx](x) + + +@dataclass +class LoraMoeHto4HAdapterConfig(AdapterConfig): + num_moe_experts: int + in_features: int + out_features: int + dim: int + activation: str = 'identity' + norm_position: Optional[str] = None + norm_type: Optional[str] = None + column_init_method: str = 'xavier' + row_init_method: str = 'zero' + gather_output: bool = False + input_is_parallel: bool = False + dropout: float = 0.0 + dropout_position: str = 'post' + alpha: float | None = None + a2a_experimental: bool = False + _target_: str = "{0}.{1}".format(LoraMoeAdapter.__module__, LoraMoeAdapter.__name__) + + +@dataclass +class LoraMoe4HtoHAdapterConfig(LoraMoeHto4HAdapterConfig): + input_is_parallel: bool = True + + class PromptEncoderAdapter(nn.Module, AdapterModuleUtil): """ The Tensor Parallel MLP prompt encoder network that is used to generate the virtual @@ -690,20 +766,14 @@ def set_inference_table(self, prompt_representation: torch.Tensor): self.is_inference_ready = True return True - def clear_inference_table( - self, - ): + def clear_inference_table(self): self.inference_table.fill_(0.0) self.is_inference_ready = False - def get_inference_table( - self, - ): + def get_inference_table(self): return self.inference_table.data - def inner_forward( - self, - ): + def inner_forward(self): input_embeds = self.embedding(self.indices).unsqueeze(0) intermediate_parallel, bias_parallel = self.first(input_embeds) intermediate_parallel = fused_bias_gelu(intermediate_parallel, bias_parallel) diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index 4d558ce00114..50c97e349885 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -23,6 +23,7 
@@ MCoreGPTEmbeddingMixin, MCoreMLPMixin, MCoreSelfAttentionMixin, + MCoreSequentialMLPMixin, MCoreTransformerLayerMixin, ) except (ImportError, ModuleNotFoundError): @@ -36,6 +37,8 @@ LoraHto4HAdapterConfig, LoraKQVAdapterConfig, LoraKQVAdapterWeightTyingConfig, + LoraMoe4HtoHAdapterConfig, + LoraMoeHto4HAdapterConfig, LoraUnfusedHto4HAdapterConfig, LoraUnfusedKQVAdapterConfig, MLPInfusedAdapterConfig, @@ -176,7 +179,10 @@ def __init__(self, cfg): elif module == PEFT_MODULE_MAP["hto4h_module"]: hto4h_projection_size = cfg.ffn_hidden_size * 2 if fast_glu_activation else cfg.ffn_hidden_size - if lora_cfg.get("variant", "nemo") == "canonical": + if cfg.get('num_moe_experts', None): + _adapter_name = AdapterName.LORA_MOE_Hto4H_ADAPTER + _adapter_cfg_cls = LoraMoeHto4HAdapterConfig + elif lora_cfg.get("variant", "nemo") == "canonical": _adapter_name = AdapterName.LORA_UNFUSED_Hto4H_ADAPTER _adapter_cfg_cls = LoraUnfusedHto4HAdapterConfig else: @@ -187,13 +193,35 @@ def __init__(self, cfg): cfg, lora_cfg, cfg.hidden_size, hto4h_projection_size, _adapter_cfg_cls ) name_key_to_cfg[_adapter_name] = adapter_cfg - name_key_to_mcore_mixins[_adapter_name] = [("mlp", MCoreMLPMixin)] + if _adapter_name == AdapterName.LORA_MOE_Hto4H_ADAPTER: + name_key_to_mcore_mixins[_adapter_name] = [("mlp.experts", MCoreSequentialMLPMixin)] + for i in range(int(cfg.num_moe_experts)): + name_key_to_mcore_mixins[_adapter_name].append( + (f"mlp.experts.local_experts.{i}", MCoreMLPMixin) + ) + else: + name_key_to_mcore_mixins[_adapter_name] = [("mlp", MCoreMLPMixin)] + elif module == PEFT_MODULE_MAP["4htoh_module"]: + if cfg.get('num_moe_experts', None): + _adapter_name = AdapterName.LORA_MOE_4HtoH_ADAPTER + _adapter_cfg_cls = LoraMoe4HtoHAdapterConfig + else: + _adapter_name = AdapterName.LORA_4HtoH_ADAPTER + _adapter_cfg_cls = Lora4HtoHAdapterConfig + adapter_cfg = self._create_lora_config( - cfg, lora_cfg, cfg.ffn_hidden_size, cfg.hidden_size, Lora4HtoHAdapterConfig + cfg, lora_cfg, 
cfg.ffn_hidden_size, cfg.hidden_size, _adapter_cfg_cls ) - name_key_to_cfg[AdapterName.LORA_4HtoH_ADAPTER] = adapter_cfg - name_key_to_mcore_mixins[AdapterName.LORA_4HtoH_ADAPTER] = [("mlp", MCoreMLPMixin)] + name_key_to_cfg[_adapter_name] = adapter_cfg + if _adapter_name == AdapterName.LORA_MOE_4HtoH_ADAPTER: + name_key_to_mcore_mixins[_adapter_name] = [("mlp.experts", MCoreSequentialMLPMixin)] + for i in range(int(cfg.num_moe_experts)): + name_key_to_mcore_mixins[_adapter_name].append( + (f"mlp.experts.local_experts.{i}", MCoreMLPMixin) + ) + else: + name_key_to_mcore_mixins[_adapter_name] = [("mlp", MCoreMLPMixin)] else: logging.error( f"Unrecognized target_module string: {module}.\n" @@ -228,6 +256,8 @@ def _create_lora_config( assert kv_channels is not None, "kv_channels must be provided for canonical Lora" config_args.update({"num_query_groups": num_query_groups, "kv_channels": kv_channels}) config_args.pop("out_features") + elif adapter_cfg_cls in (LoraMoeHto4HAdapterConfig, LoraMoe4HtoHAdapterConfig): + config_args.update({'num_moe_experts': cfg.num_moe_experts}) if lora_cfg.weight_tying: position_embedding_strategy = lora_cfg.get("position_embedding_strategy", None) diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 0a9054c23da8..f5d61a8edb15 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -220,37 +220,40 @@ def on_fit_start(self) -> None: return super().on_fit_start() def register_artifact( - self, config_path: str, src: str, verify_src_exists: bool = True, + self, + config_path: str, + src: str, + verify_src_exists: bool = True, ): - """ Register model artifacts with this function. These artifacts (files) will be included inside .nemo file - when model.save_to("mymodel.nemo") is called. + """Register model artifacts with this function. These artifacts (files) will be included inside .nemo file + when model.save_to("mymodel.nemo") is called. - How it works: + How it works: - 1. 
It always returns existing absolute path which can be used during Model constructor call - EXCEPTION: src is None or "" in which case nothing will be done and src will be returned - 2. It will add (config_path, model_utils.ArtifactItem()) pair to self.artifacts + 1. It always returns existing absolute path which can be used during Model constructor call + EXCEPTION: src is None or "" in which case nothing will be done and src will be returned + 2. It will add (config_path, model_utils.ArtifactItem()) pair to self.artifacts - .. code-block:: + .. code-block:: - If "src" is local existing path: - then it will be returned in absolute path form. - elif "src" starts with "nemo_file:unique_artifact_name": - .nemo will be untarred to a temporary folder location and an actual existing path will be returned - else: - an error will be raised. + If "src" is local existing path: + then it will be returned in absolute path form. + elif "src" starts with "nemo_file:unique_artifact_name": + .nemo will be untarred to a temporary folder location and an actual existing path will be returned + else: + an error will be raised. - WARNING: use .register_artifact calls in your models' constructors. - The returned path is not guaranteed to exist after you have exited your model's constructor. + WARNING: use .register_artifact calls in your models' constructors. + The returned path is not guaranteed to exist after you have exited your model's constructor. - Args: - config_path (str): Artifact key. Usually corresponds to the model config. - src (str): Path to artifact. - verify_src_exists (bool): If set to False, then the artifact is optional and register_artifact will return None even if - src is not found. Defaults to True. + Args: + config_path (str): Artifact key. Usually corresponds to the model config. + src (str): Path to artifact. + verify_src_exists (bool): If set to False, then the artifact is optional and register_artifact will return None even if + src is not found. 
Defaults to True. - Returns: - str: If src is not None or empty it always returns absolute path which is guaranteed to exist during model instance life + Returns: + str: If src is not None or empty it always returns absolute path which is guaranteed to exist during model instance life """ if src is None or src == "": @@ -610,7 +613,9 @@ def setup_megatron_optimization(self, optim_config: Union[Dict[str, Any], DictCo return megatron_optim_config def setup_optimization( - self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None, + self, + optim_config: Optional[Union[DictConfig, Dict]] = None, + optim_kwargs: Optional[Dict[str, Any]] = None, ): """Prepares an optimizer from a string name and its optional config parameters. @@ -760,7 +765,10 @@ def setup_optimization( if optimizer_name == 'mcore_distributed_optim': # setup megatron_optim_config and get Mcore based optimizer with the wrapper megatron_optim_config = self.setup_megatron_optimization(optimizer_args) - _megatron_optimizer = get_megatron_optimizer(megatron_optim_config, self.model,) + _megatron_optimizer = get_megatron_optimizer( + megatron_optim_config, + self.model, + ) optimizer = McoreDistributedOptimizer(_megatron_optimizer) else: @@ -781,30 +789,30 @@ def setup_optimization( def setup_optimizer_param_groups(self): """ - Used to create param groups for the optimizer. - As an example, this can be used to specify per-layer learning rates: - - optim.SGD([ - {'params': model.base.parameters()}, - {'params': model.classifier.parameters(), 'lr': 1e-3} - ], lr=1e-2, momentum=0.9) - - See https://pytorch.org/docs/stable/optim.html for more information. - By default, ModelPT will use self.parameters(). - Override this method to add custom param groups. 
- In the config file, add 'optim_param_groups' to support different LRs - for different components (unspecified params will use the default LR): - - model: - optim_param_groups: - encoder: - lr: 1e-4 - momentum: 0.8 - decoder: - lr: 1e-3 - optim: - lr: 3e-3 - momentum: 0.9 + Used to create param groups for the optimizer. + As an example, this can be used to specify per-layer learning rates: + + optim.SGD([ + {'params': model.base.parameters()}, + {'params': model.classifier.parameters(), 'lr': 1e-3} + ], lr=1e-2, momentum=0.9) + + See https://pytorch.org/docs/stable/optim.html for more information. + By default, ModelPT will use self.parameters(). + Override this method to add custom param groups. + In the config file, add 'optim_param_groups' to support different LRs + for different components (unspecified params will use the default LR): + + model: + optim_param_groups: + encoder: + lr: 1e-4 + momentum: 0.8 + decoder: + lr: 1e-3 + optim: + lr: 3e-3 + momentum: 0.9 """ if not hasattr(self, "parameters"): self._optimizer_param_groups = None @@ -1710,26 +1718,27 @@ def update_save_restore_connector(cls, save_restore_connector): setattr(cls, '_save_restore_connector', save_restore_connector) def _setup_profiling(self): - """ Enables nsys profiling - To use, add the following optoins to the model config: - ## Nsys profiling options - nsys_profile: False - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - ranks: [0] # Global rank IDs to profile - gen_shape: False # Generate model and kernel details including input shapes - And then wrap the model training script with: - nsys profile -s none -o -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/... 
- See more options at: https://docs.nvidia.com/nsight-systems/UserGuide/index.html#cli-profiling - - Enables CUDA memory profiling - To use, add the following optoins to the model config: - ## CUDA memory profiling options - memory_profile: False - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - rank: 0 # Global rank ID to profile - output_path: None # Path to store the profile output file + """Enables nsys profiling + To use, add the following optoins to the model config: + ## Nsys profiling options + nsys_profile: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + And then wrap the model training script with: + nsys profile -s none -o -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/... + See more options at: https://docs.nvidia.com/nsight-systems/UserGuide/index.html#cli-profiling + + Enables CUDA memory profiling + To use, add the following options to the model config: + ## CUDA memory profiling options + memory_profile: + enabled: True + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + rank: 0 # Global rank ID to profile + output_path: None # Path to store the profile output file """ if self.cfg.get('nsys_profile', None) is not None: if self.cfg.nsys_profile.get('enabled', False): @@ -1791,9 +1800,9 @@ def _setup_profiling(self): ) def on_train_start(self): - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-start - We use it here to copy the relevant config for dynamic freezing. 
+ """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-start + We use it here to copy the relevant config for dynamic freezing. """ # dynamic freezing @@ -1810,9 +1819,9 @@ def on_train_start(self): setattr(self, '_freeze_cfg', None) def on_train_batch_start(self, batch: Any, batch_idx: int, unused: int = 0) -> Optional[int]: - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-start - We use it here to enable nsys profiling and dynamic freezing. + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-start + We use it here to enable nsys profiling and dynamic freezing. """ # nsys profiling @@ -1856,9 +1865,9 @@ def on_train_batch_start(self, batch: Any, batch_idx: int, unused: int = 0) -> O self._freeze_cfg['is_frozen'][ml] = False def on_train_batch_end(self, outputs, batch: Any, batch_idx: int, unused: int = 0) -> None: - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-end - We use it here to enable nsys profiling. + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-end + We use it here to enable nsys profiling. """ if self.device.type == 'cuda': @@ -1893,30 +1902,30 @@ def _cleanup_on_execution_end(self): self._test_step_outputs = None def on_train_end(self): - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end - We use it here to cleanup the dynamic freezing config. + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end + We use it here to cleanup the dynamic freezing config. 
""" self._cleanup_on_execution_end() def on_test_end(self): - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end """ self._cleanup_on_execution_end() def on_predict_end(self): - """ PyTorch Lightning hook: - https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end """ self._cleanup_on_execution_end() # TODO: Remove in PTL 1.7.2 def cuda(self, device=None): - """ PTL is overriding this method and changing the pytorch behavior of a module. + """PTL is overriding this method and changing the pytorch behavior of a module. The PTL LightingModule override will move the module to device 0 if device is None. See the PTL method here: https://github.com/Lightning-AI/lightning/blob/master/src/pytorch_lightning/core/mixins/device_dtype_mixin.py#L113 diff --git a/nemo/core/optim/distributed_adam.py b/nemo/core/optim/distributed_adam.py index 77d00de89232..716c905493e0 100644 --- a/nemo/core/optim/distributed_adam.py +++ b/nemo/core/optim/distributed_adam.py @@ -13,6 +13,7 @@ # limitations under the License. import collections +import contextlib import itertools from typing import Callable, Dict, Iterable, Optional, Union @@ -108,6 +109,8 @@ class MegatronDistributedFusedAdam(DistributedFusedAdam): but requires larger memory than distributing within all ranks, especially for pure data parallel models. (default: False). + lock_timeout (float, optional): timeout for callback mutex in + seconds. **kwargs: keyword arguments to pass to Apex DistributedFusedAdam. 
@@ -118,6 +121,7 @@ def __init__( params: Union[Iterable[torch.nn.Parameter], Iterable[dict]], disable_distributed_parameters: bool = False, distribute_within_nodes: bool = False, + lock_timeout: Optional[float] = None, **kwargs, ): @@ -152,6 +156,25 @@ def __init__( # Construct distributed optimizer super().__init__(param_groups, **kwargs) + # Create mutex with timeout + self._lock_with_timeout = None + if lock_timeout is not None: + + @contextlib.contextmanager + def lock_with_timeout(): + result = self._lock.acquire(timeout=lock_timeout) + try: + yield result + finally: + if result: + # Acquired lock before timeout + self._lock.release() + else: + # Failed to acquire lock before timeout + print(f'MegatronDistributedFusedAdam: Failed to acquire lock within {lock_timeout} seconds.') + + self._lock_with_timeout = lock_with_timeout + def _broadcast_params(self) -> None: # Assume params have already been synchronized pass @@ -166,7 +189,10 @@ def hook(*unused): 'before the forward pass (e.g. by calling data_ptr) ' 'or run DistributedFusedAdam with overlap_param_sync=False.' ) - with self._lock: + lock = self._lock + if self._lock_with_timeout is not None: + lock = self._lock_with_timeout() + with lock: need_to_initialize = 'fragments' not in self.state[param] if need_to_initialize: self._init_param_state(param, param_group_id, param_id) diff --git a/nemo/deploy/nlp/__init__.py b/nemo/deploy/nlp/__init__.py index 21e2ca2751f8..ae4db1ce6f2a 100644 --- a/nemo/deploy/nlp/__init__.py +++ b/nemo/deploy/nlp/__init__.py @@ -18,3 +18,5 @@ from nemo.deploy.nlp.query_llm import NemoQueryLLM except Exception: use_query_llm = False + +from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable diff --git a/nemo/deploy/nlp/megatronllm_deployable.py b/nemo/deploy/nlp/megatronllm_deployable.py new file mode 100644 index 000000000000..c27bbbd0102b --- /dev/null +++ b/nemo/deploy/nlp/megatronllm_deployable.py @@ -0,0 +1,316 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from enum import IntEnum, auto +from pathlib import Path + +import numpy as np +import torch +import wrapt +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.text_generation_utils import ( + OutputType, + get_default_length_params, + get_default_sampling_params, +) +from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.deploy import ITritonDeployable +from nemo.deploy.utils import cast_output, str_ndarray2list + + +@wrapt.decorator +def noop_decorator(func): + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + return wrapper + + +use_pytriton = True +batch = noop_decorator +try: + from pytriton.decorators import batch + from pytriton.model_config import Tensor +except Exception: + use_pytriton = False + +LOGGER = logging.getLogger("NeMo") + + +def GetTensorShape(pyvalue): + """ + utility function to get Triton Tensor shape from a python value + assume that lists are shape -1 and all others are scalars with shape 1 + """ + return (-1 if type(pyvalue) == list else 1,) + + +def GetNumpyDtype(pyvalue): + """ + utility function to get numpy dtype of a python value + e.g. 
bool -> np.bool_ + """ + ''' + manually defining the mapping of python type -> numpy type for now + is there a better way to do it? tried np.array(pyvalue).dtype, but that doesn't seem to work + ''' + py_to_numpy_mapping = {str: bytes, bool: np.bool_, float: np.single, int: np.int_} + python_type = type(pyvalue) + # for lists, return the type of the internal elements + if python_type == list: + python_type = type(pyvalue[0]) + numpy_type = py_to_numpy_mapping[python_type] + return numpy_type + + +class ServerSync(IntEnum): + """Enum for synchronization messages using torch.distributed""" + + WAIT = auto() + SIGNAL = auto() + + def to_long_tensor(self): + return torch.tensor([self], dtype=torch.long, device='cuda') + + +class MegatronLLMDeployable(ITritonDeployable): + """Triton inference server compatible deploy class for a .nemo model file""" + + def __init__( + self, + nemo_checkpoint_filepath: str = None, + num_devices: int = 1, + num_nodes: int = 1, + existing_model: MegatronGPTModel = None, + ): + if nemo_checkpoint_filepath is None and existing_model is None: + raise ValueError( + "MegatronLLMDeployable requires either a .nemo checkpoint filepath or an existing MegatronGPTModel, but both provided were None" + ) + if num_devices > 1: + LOGGER.warning( + "Creating a MegatronLLMDeployable with num_devices>1 will assume running with a PyTorch Lightning DDP-variant strategy, which will run the main script once per device. Make sure any user code is compatible with multiple executions!" 
+ ) + + # if both existing_model and nemo_checkpoint_filepath are provided, existing_model will take precedence + if existing_model is not None: + self.model = existing_model + else: + self._load_from_nemo_checkpoint(nemo_checkpoint_filepath, num_devices, num_nodes) + + self.model.eval() + # helper threads spawned by torch.multiprocessing should loop inside this helper function + self._helper_thread_evaluation_loop() + + def _load_from_nemo_checkpoint(self, nemo_checkpoint_filepath: str, num_devices: int, num_nodes: int): + if Path(nemo_checkpoint_filepath).exists(): + trainer = Trainer( + strategy=NLPDDPStrategy(), + devices=num_devices, + num_nodes=num_nodes, + ) + + custom_config = MegatronGPTModel.restore_from( + nemo_checkpoint_filepath, trainer=trainer, return_config=True + ) + # transformer_engine should always be true according to EricH, but GPT-2B model will fail if it is enabled + custom_config.transformer_engine = True + # using multi-gpu for tensor parallelism directly for now, could do pipeline parallel instead or a combination + custom_config.tensor_model_parallel_size = num_devices + # had to override these to make Nemotron3-22B work, see sample_sequence_batch() in text_generation_utils.py + custom_config.activations_checkpoint_granularity = None + custom_config.activations_checkpoint_method = None + + self.model = MegatronGPTModel.restore_from( + nemo_checkpoint_filepath, trainer=trainer, override_config_path=custom_config + ) + + def _helper_thread_evaluation_loop(self): + # only deploy the server on main thread, other threads enter this evaluation loop + if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0: + while True: + wait_value = ServerSync.WAIT.to_long_tensor() + torch.distributed.broadcast(wait_value, 0) + if wait_value.item() == ServerSync.SIGNAL: + self.model.generate(inputs=[""], length_params=None) + + _INPUT_PARAMETER_FIELDS = { + "prompts": (-1, bytes, False), + } + + ''' + there is no get_default equivalent 
for OutputType like there is for SamplingParameters and LengthParameters + but we still want to generate output using a real OutputType TypedDict for static type checking + ''' + _BLANK_OUTPUTTYPE: OutputType = { + 'sentences': [""], + 'tokens': [[""]], + 'logprob': [[0.0]], + 'full_logprob': [[0.0]], + 'token_ids': [[0]], + 'offsets': [[0]], + } + + @property + def get_triton_input(self): + input_parameters = tuple( + Tensor(name=name, shape=(shape,), dtype=dtype, optional=optional) + for name, (shape, dtype, optional) in self._INPUT_PARAMETER_FIELDS.items() + ) + ''' + in theory, would like to use typedict2tensor() function to generate Tensors, but it purposely ignores 1D arrays + asked JakubK why on 2024-04-26, but he doesn't know who owns the code + sampling_parameters = typedict2tensor(SamplingParam) + length_parameters = typedict2tensor(LengthParam) + ''' + default_sampling_params: SamplingParam = get_default_sampling_params() + sampling_parameters = tuple( + Tensor( + name=parameter_name, + shape=GetTensorShape(parameter_value), + dtype=GetNumpyDtype(parameter_value), + optional=True, + ) + for parameter_name, parameter_value in default_sampling_params.items() + ) + default_length_params: LengthParam = get_default_length_params() + length_parameters = tuple( + Tensor( + name=parameter_name, + shape=GetTensorShape(parameter_value), + dtype=GetNumpyDtype(parameter_value), + optional=True, + ) + for parameter_name, parameter_value in default_length_params.items() + ) + + inputs = input_parameters + sampling_parameters + length_parameters + return inputs + + @property + def get_triton_output(self): + # outputs are defined by the fields of OutputType + outputs = [ + Tensor( + name=parameter_name, + shape=GetTensorShape(parameter_value), + dtype=GetNumpyDtype(parameter_value[0]), + ) + for parameter_name, parameter_value in MegatronLLMDeployable._BLANK_OUTPUTTYPE.items() + ] + return outputs + + @staticmethod + def _sampling_params_from_triton_inputs(**inputs: 
np.ndarray): + """Extract SamplingParam fields from triton input dict""" + sampling_params: SamplingParam = get_default_sampling_params() + for sampling_param_field in sampling_params.keys(): + if sampling_param_field in inputs: + sampling_params[sampling_param_field] = inputs.pop(sampling_param_field)[0][0] + return sampling_params + + @staticmethod + def _length_params_from_triton_inputs(**inputs: np.ndarray): + """Extract LengthParam fields from triton input dict""" + length_params: LengthParam = get_default_length_params() + for length_param_field in length_params.keys(): + if length_param_field in inputs: + length_params[length_param_field] = inputs.pop(length_param_field)[0][0] + return length_params + + @batch + def triton_infer_fn(self, **inputs: np.ndarray): + """Triton server inference function that actually runs the model""" + if torch.distributed.is_initialized(): + distributed_rank = torch.distributed.get_rank() + if distributed_rank != 0: + raise ValueError( + f"Triton inference function should not be called on a thread with torch.distributed rank != 0, but this thread is rank {distributed_rank}" + ) + signal_value = ServerSync.SIGNAL.to_long_tensor() + torch.distributed.broadcast(signal_value, 0) + + input_strings = str_ndarray2list(inputs.pop("prompts")) + sampling_params = self._sampling_params_from_triton_inputs(**inputs) + length_params = self._length_params_from_triton_inputs(**inputs) + + model_output = self.model.generate( + inputs=input_strings, length_params=length_params, sampling_params=sampling_params + ) + ''' + model_output['sentences'] will be a list of strings (one per prompt) + other fields will either be a list of lists (tokens, for example) + or a list of pytorch Tensor + ''' + + triton_output = {} + _OUTPUT_FILLER_VALUES = { + 'tokens': "", + 'logprob': 0.0, + 'full_logprob': 0.0, + 'token_ids': -1, + 'offsets': -1, + } + for model_output_field, value in model_output.items(): + + if model_output_field != 'sentences' and value is 
not None: + # find length of longest non-sentence output item + field_longest_output_item = 0 + for item in value: + field_longest_output_item = max(field_longest_output_item, len(item)) + # then pad shorter items to match this length + for index, item in enumerate(value): + num_pad_values = field_longest_output_item - len(item) + if num_pad_values > 0: + pad_value = _OUTPUT_FILLER_VALUES[model_output_field] + if isinstance(item, torch.Tensor): + pad_tensor = torch.full( + (num_pad_values, item.size(1)) if item.dim() > 1 else (num_pad_values,), + pad_value, + dtype=item.dtype, + device='cuda', + ) + padded_item = torch.cat((item, pad_tensor)) + value[index] = padded_item + else: + pad_list = [pad_value] * num_pad_values + padded_item = item + pad_list + value[index] = padded_item + + field_dtype = GetNumpyDtype(MegatronLLMDeployable._BLANK_OUTPUTTYPE[model_output_field][0]) + if value is None: + # triton does not allow for optional output parameters, so need to populate them if they don't exist + triton_output[model_output_field] = np.full( + # 'sentences' should always have a valid value, so use that for the output shape + np.shape(model_output['sentences']), + MegatronLLMDeployable._BLANK_OUTPUTTYPE[model_output_field][0], + dtype=field_dtype, + ) + elif field_dtype == bytes: + # strings are cast to bytes + triton_output[model_output_field] = cast_output(value, field_dtype) + elif isinstance(value[0], torch.Tensor): + if value[0].dtype == torch.bfloat16: + # numpy currently does not support bfloat16, so need to manually convert it + triton_output[model_output_field] = np.array([tensor.cpu().float().numpy() for tensor in value]) + else: + triton_output[model_output_field] = np.array([tensor.cpu().numpy() for tensor in value]) + else: + # non-strings are output as-is (in numpy format) + triton_output[model_output_field] = np.array(value) + return triton_output diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 
4748f4957a52..e25d529ec62c 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -116,6 +116,9 @@ def __init__( "axis": None, "enable": enable_quant_kv_cache, } + if quantization_config.algorithm == "int8_sq": + logging.info(f"Using int8_sq alpha = {quantization_config.alpha}") + quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": quantization_config.alpha} self.quant_cfg = quant_cfg else: diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index c826848e9328..7cc92f0ca588 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -33,6 +33,7 @@ from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import get_tokenzier, is_nemo_file, load_nemo_model from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer +from nemo.export.trt_llm.qnemo.utils import is_qnemo_checkpoint from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load @@ -120,6 +121,7 @@ def export( n_gpus: int = 1, tensor_parallel_size: int = None, pipeline_parallel_size: int = None, + gpus_per_node: int = None, max_input_len: int = 256, max_output_len: int = 256, max_input_token: Optional[int] = None, @@ -127,6 +129,7 @@ def export( max_batch_size: int = 8, max_prompt_embedding_table_size=None, use_parallel_embedding: bool = False, + use_embedding_sharing: bool = False, paged_kv_cache: bool = True, remove_input_padding: bool = True, dtype: str = "bfloat16", @@ -149,6 +152,7 @@ def export( n_gpus (int): number of GPUs to use for inference. tensor_parallel_size (int): tensor parallelism. pipeline_parallel_size (int): pipeline parallelism. + gpus_per_node (int): number of gpus per node. max_input_len (int): max input length. max_output_len (int): max output length. max_input_token (int): max input length. Deprecated, use max_input_len instead. 
@@ -156,6 +160,7 @@ def export( max_batch_size (int): max batch size. max_prompt_embedding_table_size (int): max prompt embedding size. use_parallel_embedding (bool): whether to use parallel embedding feature of TRT-LLM or not + use_embedding_sharing (bool): paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM. remove_input_padding (bool): enables removing input padding or not. dtype (str): Floating point type for model weights (Supports BFloat16/Float16). @@ -172,7 +177,7 @@ def export( if model_type not in self.get_supported_models_list: raise Exception( "Model {0} is not currently a supported model type. " - "Supported model types are llama, gptnext, falcon, and starcoder".format(model_type) + "Supported model types are llama, gptnext, falcon, and starcoder.".format(model_type) ) if model_type == "gpt" or model_type == "starcoder": @@ -188,6 +193,8 @@ def export( tensor_parallel_size = 1 pipeline_parallel_size = n_gpus + gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node + if Path(self.model_dir).exists(): if delete_existing_files and len(os.listdir(self.model_dir)) > 0: for files in os.listdir(self.model_dir): @@ -229,7 +236,7 @@ def export( tmp_dir = tempfile.TemporaryDirectory() nemo_export_dir = Path(tmp_dir.name) - if nemo_checkpoint_path.endswith("qnemo"): + if is_qnemo_checkpoint(nemo_checkpoint_path): if os.path.isdir(nemo_checkpoint_path): nemo_export_dir = nemo_checkpoint_path else: @@ -244,7 +251,17 @@ def export( max_output_len=max_output_len, max_batch_size=max_batch_size, max_prompt_embedding_table_size=max_prompt_embedding_table_size, + tensor_parallel_size=tensor_parallel_size, + pipeline_parallel_size=pipeline_parallel_size, + use_parallel_embedding=use_parallel_embedding, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + enable_multi_block_mode=enable_multi_block_mode, + use_lora_plugin=use_lora_plugin, lora_target_modules=lora_target_modules, + 
max_lora_rank=max_lora_rank, + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, ) else: model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) @@ -256,7 +273,9 @@ def export( dtype=dtype, tensor_parallel_size=tensor_parallel_size, pipeline_parallel_size=pipeline_parallel_size, + gpus_per_node=gpus_per_node, use_parallel_embedding=use_parallel_embedding, + use_embedding_sharing=use_embedding_sharing, ) for weight_dict, model_config in zip(weights_dicts, model_configs): diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py index 5e522d8bbff2..da13449160f9 100644 --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -72,9 +72,17 @@ def model_to_trtllm_ckpt( dtype: str = "bfloat16", tensor_parallel_size: int = 1, pipeline_parallel_size: int = 1, + gpus_per_node: int = None, use_parallel_embedding: bool = False, + use_embedding_sharing: bool = False, ) -> Tuple[List[Dict], List[PretrainedConfig]]: + if nemo_model_config.get("share_embeddings_and_output_weights", False) and not use_embedding_sharing: + LOGGER.info( + "Found share_embeddings_and_output_weights is True in NeMo config, set use_embedding_sharing = True" + ) + use_embedding_sharing = True + weights_dict = convert_model_to_trt_llm_ckpt( model=model, nemo_model_config=nemo_model_config, @@ -88,12 +96,14 @@ def model_to_trtllm_ckpt( world_size = tensor_parallel_size * pipeline_parallel_size - lm_head_weight = weights_dict["lm_head.weight"] + has_lm_head = "lm_head.weight" in weights_dict + if has_lm_head: + lm_head_weight = weights_dict["lm_head.weight"] vocab_size = weights_dict["transformer.vocab_embedding.weight"].shape[0] - vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size) + vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size) if has_lm_head else vocab_size - if vocab_size_padded != vocab_size: + if 
has_lm_head and vocab_size_padded != vocab_size: pad_width = vocab_size_padded - vocab_size lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0) @@ -120,7 +130,7 @@ def model_to_trtllm_ckpt( 'hidden_act': hidden_act, 'use_parallel_embedding': use_parallel_embedding, 'embedding_sharding_dim': 0, - 'share_embedding_table': False, + 'share_embedding_table': use_embedding_sharing, 'quantization': { 'quant_algo': None, 'kv_cache_quant_algo': None, @@ -160,9 +170,15 @@ def model_to_trtllm_ckpt( "transformer.ln_f.bias", } + gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node + for i in range(world_size): mapping = tensorrt_llm.Mapping( - world_size=world_size, rank=i, tp_size=tensor_parallel_size, pp_size=pipeline_parallel_size + world_size=world_size, + rank=i, + tp_size=tensor_parallel_size, + pp_size=pipeline_parallel_size, + gpus_per_node=gpus_per_node, ) layers_range = mapping.pp_layers(num_layers) @@ -174,6 +190,8 @@ def model_to_trtllm_ckpt( if new_key.endswith(".bin"): # TP split if new_key.endswith(f"{mapping.tp_rank}.bin"): new_key = new_key.replace(f".{mapping.tp_rank}.bin", "") + else: + continue if "layers" in new_key: # PP layer_num = int(new_key.split(".")[2]) if layer_num in layers_range: @@ -202,15 +220,17 @@ def model_to_trtllm_ckpt( weights_dict_local["transformer.position_embedding.weight"] = pos_embedding_weight if mapping.is_last_pp_rank(): - weights_dict_local["lm_head.weight"] = np.ascontiguousarray( - split(lm_head_weight, mapping.tp_size, mapping.tp_rank) - ) + if has_lm_head: + weights_dict_local["lm_head.weight"] = np.ascontiguousarray( + split(lm_head_weight, mapping.tp_size, mapping.tp_rank) + ) weights_dict_local["transformer.ln_f.weight"] = weights_dict["transformer.ln_f.weight"] ln_f_bias = weights_dict.get("transformer.ln_f.bias") if ln_f_bias is not None: weights_dict_local["transformer.ln_f.bias"] = ln_f_bias + config["gpus_per_node"] = gpus_per_node model_config = 
PretrainedConfig(**config) model_config.mapping = mapping model_configs.append(model_config) diff --git a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py index df7e43548a44..c29edc87353e 100644 --- a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py +++ b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py @@ -158,8 +158,6 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): model_level_weights["transformer.position_embedding.weight"].append(val) if pp_idx == 0: val = model.get("state_dict", model)[get_layer_name("word_embedding", prefix)] - if embedding_scaling: - val = val * float(math.sqrt(hidden_size)) vocab_size = val.shape[0] if use_parallel_embedding: @@ -171,10 +169,6 @@ def handle_model_level_weights(model, tp_idx: int, pp_idx: int): val = torch_to_numpy(val.to(storage_type).cpu()) model_level_weights["transformer.vocab_embedding.weight"].append(val) - if share_embeddings_and_output: - val = model.get("state_dict", model)[get_layer_name("word_embedding", prefix)] - val = torch_to_numpy(val.to(storage_type).cpu()) - model_level_weights["lm_head.weight"].append(val) if has_lm_head and pp_idx == training_pp_size - 1: val = model.get("state_dict", model)[get_layer_name("output_layer", prefix)] val = torch_to_numpy(val.to(storage_type).cpu()) diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py index b7e2f7bc2973..630330381e56 100644 --- a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py +++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import json -import os -import subprocess +import glob +import os +import warnings from typing import List, Optional -CONFIG_NAME = "config.json" +from modelopt.deploy.llm import build_tensorrt_llm + +from nemo.export.trt_llm.qnemo.utils import CONFIG_NAME, WEIGHTS_NAME def qnemo_to_tensorrt_llm( @@ -28,50 +30,48 @@ def qnemo_to_tensorrt_llm( max_output_len: int, max_batch_size: int, max_prompt_embedding_table_size: int, + tensor_parallel_size: int = None, + pipeline_parallel_size: int = None, + use_parallel_embedding: bool = False, + paged_kv_cache: bool = True, + remove_input_padding: bool = True, + enable_multi_block_mode: bool = False, + use_lora_plugin: str = None, lora_target_modules: Optional[List[str]] = None, + max_lora_rank: int = 64, + max_num_tokens: int = None, + opt_num_tokens: int = None, ): - """Build TRT-LLM engine via trtllm-build CLI API in a subprocess.""" + """Build TensorRT-LLM engine with ModelOpt build_tensorrt_llm function.""" assert not lora_target_modules, f"LoRA is not supported for quantized checkpoints, got {lora_target_modules}" - print( - "Note that setting n_gpus, tensor_parallel_size and pipeline_parallel_size parameters" - " for quantized models is possible only on export step via nemo.export.quantize module." - " These parameters are ignored when building and running TensorRT-LLM engine below." + + warnings.warn( + "Note that setting tensor_parallel_size and pipeline_parallel_size parameters" + " for quantized models should be done on calibration step with nemo.export.quantize module." 
+ " These parameters are ignored when building and running TensorRT-LLM engine below.", + UserWarning, + stacklevel=3, ) - # Load config to explicitly pass selected parameters to trtllm-build command: - with open(os.path.join(nemo_checkpoint_path, CONFIG_NAME), "r") as f: - model_config = json.load(f) - command = [ - "trtllm-build", - "--checkpoint_dir", - nemo_checkpoint_path, - "--output_dir", - engine_dir, - "--max_batch_size", - str(max_batch_size), - "--max_input_len", - str(max_input_len), - "--max_output_len", - str(max_output_len), - "--max_prompt_embedding_table_size", - str(max_prompt_embedding_table_size), - "--gemm_plugin", - model_config["dtype"], - "--gpt_attention_plugin", - model_config["dtype"], - "--strongly_typed", - "--use_custom_all_reduce", - "disable", - "--workers", - str(model_config["mapping"]["world_size"]), - ] - command_str = " ".join(command) - print(f"Build command is:\n{command_str}") - print("Running trtllm-build, this may take a while...") - result = subprocess.run(command, capture_output=True) # TODO: consider streaming logs - if result.returncode != 0: - print(result.stdout.decode()) - print(result.stderr.decode()) - raise RuntimeError("Error encountered for trtllm-build command, please check logs.") - print("Building engine done. 
Full logs are:") - print(result.stdout.decode()) + warnings.warn( + "Also use_parallel_embedding, paged_kv_cache, remove_input_padding, enable_multi_block_mode, max_num_tokens" + " and opt_num_tokens parameters are set by ModelOpt build_tensorrt_llm function in the optimal way and are" + " ignored on engine build step.", + UserWarning, + stacklevel=3, + ) + + num_build_workers = len(glob.glob(os.path.join(nemo_checkpoint_path, WEIGHTS_NAME.format("*")))) + assert num_build_workers, f"No TensorRT-LLM weight files found in {nemo_checkpoint_path}" + + build_tensorrt_llm( + pretrained_config=os.path.join(nemo_checkpoint_path, CONFIG_NAME), + engine_dir=engine_dir, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + max_beam_width=1, + num_build_workers=num_build_workers, + enable_sparsity=False, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + ) diff --git a/nemo/export/trt_llm/qnemo/utils.py b/nemo/export/trt_llm/qnemo/utils.py new file mode 100644 index 000000000000..58d1d308507f --- /dev/null +++ b/nemo/export/trt_llm/qnemo/utils.py @@ -0,0 +1,18 @@ +import os +from pathlib import Path + +from nemo.export.tarutils import TarPath + +CONFIG_NAME = "config.json" +WEIGHTS_NAME = "rank{}.safetensors" + + +def is_qnemo_checkpoint(path: str) -> bool: + """Detect if a given path is a TensorRT-LLM a.k.a. 
"qnemo" checkpoint based on config & tensor data presence.""" + if os.path.isdir(path): + path = Path(path) + else: + path = TarPath(path) + config_path = path / CONFIG_NAME + tensor_path = path / WEIGHTS_NAME.format(0) + return config_path.exists() and tensor_path.exists() diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index bbafec319fd5..ef9a14c1d582 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -19,7 +19,7 @@ from tensorrt_llm.builder import BuildConfig, Builder from tensorrt_llm.commands.build import build as build_trtllm from tensorrt_llm.logger import logger -from tensorrt_llm.lora_manager import LoraBuildConfig +from tensorrt_llm.lora_manager import LoraConfig from tensorrt_llm.models.modeling_utils import add_lora, optimize_model, preprocess_weights from tensorrt_llm.plugin import PluginConfig @@ -94,7 +94,7 @@ def build_and_save_engine( if use_lora_plugin is not None: build_config.plugin_config.set_lora_plugin(use_lora_plugin) - lora_config = LoraBuildConfig( + lora_config = LoraConfig( lora_dir=lora_ckpt_list, lora_ckpt_source='nemo', max_lora_rank=max_lora_rank, diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index e54f223f91cc..31559ad9a81a 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -10,6 +10,7 @@ pass from nemo.lightning.base import get_vocab_size, teardown +from nemo.lightning.pytorch.opt import LRSchedulerModule, MegatronOptimizerModule, OptimizerModule from nemo.lightning.pytorch.plugins import MegatronDataSampler, MegatronMixedPrecision from nemo.lightning.pytorch.plugins import data_sampler as _data_sampler from nemo.lightning.pytorch.strategies import MegatronStrategy @@ -29,9 +30,12 @@ def _is_slurm_interactive_mode(): __all__ = [ + "LRSchedulerModule", "MegatronStrategy", "MegatronDataSampler", "MegatronMixedPrecision", + "MegatronOptimizerModule", + "OptimizerModule", "Trainer", 
"get_vocab_size", "teardown", diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py index 88e2f3436699..adfc0aa14d29 100644 --- a/nemo/lightning/data.py +++ b/nemo/lightning/data.py @@ -103,7 +103,6 @@ def add_megatron_sampler( ) elif dataloader_type == 'cyclic': batch_sampler = MegatronPretrainingRandomSampler( - dataloader.dataset, total_samples=len(dataloader.dataset), consumed_samples=consumed_samples, micro_batch_size=micro_batch_size, @@ -259,8 +258,9 @@ def __iter__(self): assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 # data sharding and random sampling + data_parallel_size = self.micro_batch_times_data_parallel_size // self.micro_batch_size bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) * self.micro_batch_size - bucket_offset = current_epoch_samples // self.data_parallel_size + bucket_offset = current_epoch_samples // data_parallel_size start_idx = self.data_parallel_rank * bucket_size g = torch.Generator() diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index d23e57941aaf..3172d242e681 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -24,6 +24,7 @@ import torch import torch.distributed +from megatron.core.distributed import DistributedDataParallel as McoreDDP from megatron.core.distributed import DistributedDataParallelConfig from torch import Tensor, nn @@ -132,37 +133,37 @@ def __init__( _model.configure_model() _pipeline.append(_model) - if isinstance(ddp_config, DistributedDataParallelConfig): - from megatron.core.distributed import DistributedDataParallel as McoreDDP - - _pipeline = [ - McoreDDP( - model_chunk.config, - ddp_config, - model_chunk, - data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), - expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), - # Turn off bucketing for model_chunk 2 onwards, since communication for these - 
# model chunks is overlapped with compute anyway. - disable_bucketing=(model_chunk_idx > 0), - ) - for (model_chunk_idx, model_chunk) in enumerate(_pipeline) - ] + if isinstance(ddp_config, DistributedDataParallelConfig): + for model_chunk_idx, model_chunk in enumerate(_pipeline): + module = model_chunk.module + ddp = DDP( + module.config, + ddp_config, + module, + data_parallel_group=parallel_state.get_data_parallel_group(with_context_parallel=True), + expert_data_parallel_group=parallel_state.get_data_modulo_expert_parallel_group(), + # Turn off bucketing for model_chunk 2 onwards, since communication for these + # model chunks is overlapped with compute anyway. + disable_bucketing=(model_chunk_idx > 0), + ) + model_chunk.module = ddp + model_chunk.buffers = ddp.buffers # We need to do this explicitly since this is a attr pytorch uses + model_chunk.__class__.__getattr__ = getattr_proxy # type: ignore - for i, model_module in enumerate(_pipeline): - if not cpu: - model_module.cuda(torch.cuda.current_device()) + for i, model_module in enumerate(_pipeline): + if not cpu: + model_module.cuda(torch.cuda.current_device()) - for param in model_module.parameters(): - set_defaults_if_not_set_tensor_model_parallel_attributes(param) + for param in model_module.parameters(): + set_defaults_if_not_set_tensor_model_parallel_attributes(param) - if hasattr(model_module, "configure_model"): - if not hasattr(model_module, "set_input_tensor"): - if hasattr(model_module.module, "set_input_tensor"): - model_module.set_input_tensor = model_module.module.set_input_tensor - else: - # TODO: What to do here? - pass + if hasattr(model_module, "configure_model"): + if not hasattr(model_module, "set_input_tensor"): + if hasattr(model_module.module, "set_input_tensor"): + model_module.set_input_tensor = model_module.module.set_input_tensor + else: + # TODO: What to do here? + pass # Print number of parameters. 
if parallel_state.model_parallel_is_initialized() and parallel_state.get_data_parallel_rank() == 0: @@ -536,6 +537,7 @@ def __init__(self, name: str, is_property: bool = False, includes_self: bool = F self.includes_self = includes_self def __call__(self, module: nn.Module): + attr = getattr(module, self.name) if self.is_property: @@ -554,6 +556,24 @@ def wrapped(self, *args): return attr +def getattr_proxy(self, item: Any) -> Any: + try: + return super(self.__class__, self).__getattr__(item) + except AttributeError: + try: + return getattr(self.module, item) + except AttributeError: + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{item}'") + + +class DDP(McoreDDP): + def state_dict(self, prefix='', keep_vars=False, **kwargs): + self.module.state_dict(prefix=prefix, keep_vars=keep_vars, **kwargs) + + def __getattr__(self, item: Any) -> Any: + return getattr_proxy(self, item) + + class CallbackConnector: """ A connector for managing and invoking callbacks. diff --git a/nemo/lightning/pytorch/callbacks/nsys.py b/nemo/lightning/pytorch/callbacks/nsys.py new file mode 100644 index 000000000000..f50fe0481e9d --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/nsys.py @@ -0,0 +1,69 @@ +from typing import Any, List, Optional + +import torch +from pytorch_lightning.callbacks.callback import Callback + +from nemo.utils import logging +from nemo.utils.get_rank import get_rank + + +class NsysCallback(Callback): + + def __init__( + self, + start_step: int, + end_step: int, + ranks: List[int] = [0], + gen_shape: bool = False, + ): + """ + Args: + start_step (int): Global batch to start profiling + end_step (int): Global batch to end profiling + ranks (List[int]): Global rank IDs to profile + gen_shape (bool): Generate model and kernel details including input shapes + """ + assert type(start_step) == int, f'Nsys start_step must be of type int. 
Found: {type(start_step)}' + self._nsys_profile_start_step = start_step + + assert type(end_step) == int, f'Nsys end_step must be of type int. Found: {type(start_step)}' + self._nsys_profile_end_step = end_step + + assert ( + self._nsys_profile_end_step >= self._nsys_profile_start_step + ), f'Nsys end_step must be greater than or equal to nsys start_step' + + self._nsys_profile_ranks = ranks + self._nsys_profile_gen_shape = gen_shape + + logging.info( + f'Nsys profiling setup with start_step: {self._nsys_profile_start_step},' + f'and end_step: {self._nsys_profile_end_step}' + ) + + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx: int) -> Optional[int]: + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-start + We use it here to enable nsys profiling. + """ + + device = trainer.strategy.root_device + if device.type == 'cuda': + if batch_idx == self._nsys_profile_start_step and get_rank() in self._nsys_profile_ranks: + logging.info("====== Start nsys profiling ======") + torch.cuda.cudart().cudaProfilerStart() + if self._nsys_profile_gen_shape: + torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__() + + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int) -> None: + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-end + We use it here to enable nsys profiling. 
+ """ + + device = trainer.strategy.root_device + if device.type == 'cuda': + print(f'batch idx: {batch_idx}') + if batch_idx == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks: + logging.info("====== End nsys profiling ======") + torch.cuda.cudart().cudaProfilerStop() diff --git a/nemo/lightning/pytorch/opt/__init__.py b/nemo/lightning/pytorch/opt/__init__.py new file mode 100644 index 000000000000..988f40f5ca30 --- /dev/null +++ b/nemo/lightning/pytorch/opt/__init__.py @@ -0,0 +1,32 @@ +from nemo.lightning.pytorch.opt.base import LRSchedulerModule, OptimizerModule +from nemo.lightning.pytorch.opt.lr_scheduler import ( + InverseSquareRootAnnealingScheduler, + NoamAnnealingScheduler, + NoamHoldAnnealingScheduler, + PolynomialDecayAnnealingScheduler, + PolynomialHoldDecayAnnealingScheduler, + SquareAnnealingScheduler, + SquareRootAnnealingScheduler, + T5InverseSquareRootAnnealingScheduler, + WarmupAnnealingScheduler, + WarmupHoldPolicyScheduler, + WarmupPolicyScheduler, +) +from nemo.lightning.pytorch.opt.megatron import MegatronOptimizerModule + +__all__ = [ + "OptimizerModule", + "LRSchedulerModule", + "MegatronOptimizerModule", + "WarmupPolicyScheduler", + "WarmupHoldPolicyScheduler", + "SquareAnnealingScheduler", + "SquareRootAnnealingScheduler", + "NoamAnnealingScheduler", + "NoamHoldAnnealingScheduler", + "WarmupAnnealingScheduler", + "InverseSquareRootAnnealingScheduler", + "T5InverseSquareRootAnnealingScheduler", + "PolynomialDecayAnnealingScheduler", + "PolynomialHoldDecayAnnealingScheduler", +] diff --git a/nemo/lightning/pytorch/opt/base.py b/nemo/lightning/pytorch/opt/base.py new file mode 100644 index 000000000000..3e51cf451671 --- /dev/null +++ b/nemo/lightning/pytorch/opt/base.py @@ -0,0 +1,179 @@ +import types +from abc import ABC, abstractmethod +from typing import List, Optional + +import pytorch_lightning as L +from pytorch_lightning.utilities.types import OptimizerLRScheduler +from torch.optim import Optimizer + +from 
nemo.lightning.megatron_parallel import CallbackMethods + + +class LRSchedulerModule(L.Callback, CallbackMethods, ABC): + """A module to standardize the learning rate scheduler setup and configuration. + + This class decouples the learning rate scheduler from the model, similar to how the LightningDataModule + decouples data handling. It also acts as a Callback to hook into the training loop, which can be useful + for adding custom all-reduces, logging, early stopping, etc. Next to that standard Lightning callback-event, + this also supports hooking into the Megatron forward-backward function at a granular level. + + Example:: + + class MyLRSchedulerModule(LRSchedulerModule): + def setup(self, model, optimizer): + # Custom setup logic + ... + + def scheduler(self, model, optimizers): + # Define and return the learning rate scheduler + ... + + Methods: + setup(model, optimizer): Sets up the learning rate scheduler. + scheduler(model, optimizers): Abstract method to define the learning rate scheduler. + __call__(model, optimizers): Calls the setup and scheduler methods. + """ + + def setup(self, model, optimizer) -> None: + """Sets up the learning rate scheduler. + + Args: + model: The model for which the scheduler is being set up. + optimizer: The optimizer for which the scheduler is being set up. + """ + ... + + @abstractmethod + def scheduler(self, model, optimizers) -> OptimizerLRScheduler: + """Abstract method to define the learning rate scheduler. + + Args: + model: The model for which the scheduler is being defined. + optimizers: The optimizers for which the scheduler is being defined. + + Returns: + OptimizerLRScheduler: The learning rate scheduler. + """ + raise NotImplementedError("The scheduler method should be implemented by subclasses.") + + def __call__(self, model, optimizers): + """Calls the setup and scheduler methods. + + Args: + model: The model for which the scheduler is being called. 
+ optimizers: The optimizers for which the scheduler is being called. + + Returns: + OptimizerLRScheduler: The learning rate scheduler. + """ + + self.setup(model, optimizers) + + self._scheduler = self.scheduler(model, optimizers) + + if not isinstance(self._scheduler, (dict, tuple)): + return optimizers, self._scheduler + + return self._scheduler + + +class OptimizerModule(L.Callback, CallbackMethods, ABC): + """A module to standardize the optimizer setup and configuration. + + This class decouples the optimizer from the model, similar to how the LightningDataModule + decouples data handling. It also acts as a Callback to hook into the training loop, which can be useful + for adding custom all-reduces, logging, early stopping, etc. Next to that standard Lightning callback-event, + this also supports hooking into the Megatron forward-backward function at a granular level. + + Attributes: + lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module. + + Example:: + + class MyOptimizerModule(OptimizerModule): + def __init__(self, lr_scheduler=None): + super().__init__(lr_scheduler) + + def setup(self, model): + # Custom setup logic + ... + + def optimizers(self, model): + # Define and return the optimizers + ... + + Methods: + connect(model, trainer): Connects the optimizer module to the model and trainer. + setup(model): Sets up the optimizer. + optimizers(model): Abstract method to define the optimizers. + __call__(model, megatron_parallel): Calls the setup and optimizers methods. + """ + + def __init__(self, lr_scheduler: Optional[LRSchedulerModule]): + """Initializes the OptimizerModule. + + Args: + lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module. + """ + self.lr_scheduler = lr_scheduler + + def connect(self, model: L.LightningModule) -> None: + """Connects the optimizer module to the model and trainer. + + Args: + model (L.LightningModule): The model to which the optimizer module is being connected. 
+ """ + + def custom_configure_optimizers(lightning_module_self, megatron_parallel=None): + opt = self(lightning_module_self, megatron_parallel=megatron_parallel) + return opt + + model.configure_optimizers = types.MethodType(custom_configure_optimizers, model) + + def setup(self, model) -> None: + """Sets up the optimizer. + + Args: + model: The model for which the optimizer is being set up. + """ + ... + + @abstractmethod + def optimizers(self, model) -> List[Optimizer]: + """Abstract method to define the optimizers. + + Args: + model: The model for which the optimizers are being defined. + + Returns: + List[Optimizer]: The list of optimizers. + """ + raise NotImplementedError("The optimizers method should be implemented by subclasses.") + + def __call__(self, model: L.LightningModule, megatron_parallel=None) -> OptimizerLRScheduler: + """Calls the setup and optimizers methods. + + Args: + model (L.LightningModule): The model for which the optimizers are being called. + megatron_parallel: Optional parallel model. + + Returns: + OptimizerLRScheduler: The optimizers and optionally the learning rate scheduler. 
+ """ + _model = model if megatron_parallel is None else megatron_parallel + callbacks = _model.trainer.callbacks + if self not in callbacks: + callbacks.append(self) + if self.lr_scheduler is not None and self.lr_scheduler not in callbacks: + callbacks.append(self.lr_scheduler) + + self.setup(_model) + self._optimizers = self.optimizers(_model) + + if self.lr_scheduler is not None: + self.lr_scheduler.setup(_model, self._optimizers) + with_scheduler = self.lr_scheduler(_model, self._optimizers) + + return with_scheduler + + return self._optimizers diff --git a/nemo/lightning/pytorch/opt/lr_scheduler.py b/nemo/lightning/pytorch/opt/lr_scheduler.py new file mode 100644 index 000000000000..1ce8dcf0d815 --- /dev/null +++ b/nemo/lightning/pytorch/opt/lr_scheduler.py @@ -0,0 +1,390 @@ +from typing import Optional + +from nemo.core.optim.lr_scheduler import ( + InverseSquareRootAnnealing, + NoamAnnealing, + NoamHoldAnnealing, + PolynomialDecayAnnealing, + PolynomialHoldDecayAnnealing, + SquareAnnealing, + SquareRootAnnealing, + T5InverseSquareRootAnnealing, + WarmupAnnealing, + WarmupHoldPolicy, + WarmupPolicy, +) +from nemo.lightning.pytorch.opt.base import LRSchedulerModule + + +class WarmupPolicyScheduler(LRSchedulerModule): + """Warmup Policy Learning Rate Scheduler.""" + + def __init__( + self, + warmup_steps: int = 750, + warmup_ratio: Optional[float] = None, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.warmup_steps = warmup_steps + self.warmup_ratio = warmup_ratio + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = WarmupPolicy( + optimizer, + warmup_steps=self.warmup_steps, + warmup_ratio=self.warmup_ratio, + max_steps=self.max_steps, + min_lr=self.min_lr, + ) + return { + "optimizer": optimizer, + "scheduler": 
lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class WarmupHoldPolicyScheduler(LRSchedulerModule): + """Warmup Hold Policy Learning Rate Scheduler.""" + + def __init__( + self, + warmup_steps: int = 750, + warmup_ratio: Optional[float] = None, + hold_steps: Optional[int] = None, + hold_ratio: Optional[float] = None, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.warmup_steps = warmup_steps + self.warmup_ratio = warmup_ratio + self.hold_steps = hold_steps + self.hold_ratio = hold_ratio + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = WarmupHoldPolicy( + optimizer, + warmup_steps=self.warmup_steps, + warmup_ratio=self.warmup_ratio, + hold_steps=self.hold_steps, + hold_ratio=self.hold_ratio, + max_steps=self.max_steps, + min_lr=self.min_lr, + ) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class SquareAnnealingScheduler(LRSchedulerModule): + """Square Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 1e-5, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = SquareAnnealing(optimizer, max_steps=self.max_steps, min_lr=self.min_lr) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class SquareRootAnnealingScheduler(LRSchedulerModule): + """Square Root 
Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = SquareRootAnnealing(optimizer, max_steps=self.max_steps, min_lr=self.min_lr) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class NoamAnnealingScheduler(LRSchedulerModule): + """Noam Annealing Learning Rate Scheduler.""" + + def __init__( + self, + d_model: int, + warmup_steps: int = 750, + warmup_ratio: Optional[float] = None, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.d_model = d_model + self.warmup_steps = warmup_steps + self.warmup_ratio = warmup_ratio + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = NoamAnnealing( + optimizer, + d_model=self.d_model, + warmup_steps=self.warmup_steps, + warmup_ratio=self.warmup_ratio, + max_steps=self.max_steps, + min_lr=self.min_lr, + ) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class NoamHoldAnnealingScheduler(LRSchedulerModule): + """Noam Hold Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + decay_rate: float = 0.5, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.decay_rate = decay_rate + self.min_lr = 
min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = NoamHoldAnnealing( + optimizer, max_steps=self.max_steps, decay_rate=self.decay_rate, min_lr=self.min_lr + ) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class WarmupAnnealingScheduler(LRSchedulerModule): + """Warmup Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = WarmupAnnealing(optimizer, max_steps=self.max_steps, min_lr=self.min_lr) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class InverseSquareRootAnnealingScheduler(LRSchedulerModule): + """Inverse Square Root Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = InverseSquareRootAnnealing(optimizer, max_steps=self.max_steps, min_lr=self.min_lr) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class T5InverseSquareRootAnnealingScheduler(LRSchedulerModule): + """T5 Inverse Square Root Annealing Learning Rate Scheduler.""" + + def __init__( + self, 
+ max_steps: int = 10, + min_lr: float = 0.0, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = T5InverseSquareRootAnnealing(optimizer, max_steps=self.max_steps, min_lr=self.min_lr) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class PolynomialDecayAnnealingScheduler(LRSchedulerModule): + """Polynomial Decay Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 0.0, + power: float = 1.0, + cycle: bool = False, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.power = power + self.cycle = cycle + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): + lr_scheduler = PolynomialDecayAnnealing( + optimizer, max_steps=self.max_steps, min_lr=self.min_lr, power=self.power, cycle=self.cycle + ) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } + + +class PolynomialHoldDecayAnnealingScheduler(LRSchedulerModule): + """Polynomial Hold Decay Annealing Learning Rate Scheduler.""" + + def __init__( + self, + max_steps: int = 10, + min_lr: float = 0.0, + power: float = 1.0, + cycle: bool = False, + interval: str = "epoch", + frequency: int = 1, + monitor: str = "val_loss", + ): + super().__init__() + self.max_steps = max_steps + self.min_lr = min_lr + self.power = power + self.cycle = cycle + self.interval = interval + self.frequency = frequency + self.monitor = monitor + + def scheduler(self, optimizer): 
+ lr_scheduler = PolynomialHoldDecayAnnealing( + optimizer, max_steps=self.max_steps, min_lr=self.min_lr, power=self.power, cycle=self.cycle + ) + return { + "optimizer": optimizer, + "scheduler": lr_scheduler, + "interval": self.interval, + "frequency": self.frequency, + "monitor": self.monitor, + } diff --git a/nemo/lightning/pytorch/opt/megatron.py b/nemo/lightning/pytorch/opt/megatron.py new file mode 100644 index 000000000000..dff08d7a07df --- /dev/null +++ b/nemo/lightning/pytorch/opt/megatron.py @@ -0,0 +1,97 @@ +from typing import Callable, List, Optional + +from megatron.core.distributed import finalize_model_grads +from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer +from megatron.core.utils import get_model_config +from torch.optim import Optimizer + +from nemo.lightning.megatron_parallel import MegatronParallel +from nemo.lightning.pytorch.opt.base import LRSchedulerModule, OptimizerModule + + +class MegatronOptimizerModule(OptimizerModule): + """A OptimizerModule for the megatron optimizers. + + Attributes: + config (OptimizerConfig): Configuration for the optimizer. + no_weight_decay_cond (Optional[Callable]): Condition for no weight decay. + scale_lr_cond (Optional[Callable]): Condition for scaling learning rate. + lr_mult (float): Learning rate multiplier. + + Example:: + + config = OptimizerConfig(...) + lr_scheduler = MyLRSchedulerModule(...) + optimizer_module = MegatronOptimizerModule(config, lr_scheduler) + + Methods: + setup(model): Sets up the optimizer. + optimizers(model): Defines the optimizers. + """ + + def __init__( + self, + config: OptimizerConfig, + lr_scheduler: Optional[LRSchedulerModule] = None, + no_weight_decay_cond: Optional[Callable] = None, + scale_lr_cond: Optional[Callable] = None, + lr_mult: float = 1.0, + ): + """Initializes the MegatronOptimizerModule. + + Args: + config (OptimizerConfig): Configuration for the optimizer. 
+ lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module. + no_weight_decay_cond (Optional[Callable]): Condition for no weight decay. + scale_lr_cond (Optional[Callable]): Condition for scaling learning rate. + lr_mult (float): Learning rate multiplier. + """ + + super().__init__(lr_scheduler=lr_scheduler) + self.config = config + self.no_weight_decay_cond = no_weight_decay_cond + self.scale_lr_cond = scale_lr_cond + self.lr_mult = lr_mult + + def setup(self, model): + """We will add the finalize_model_grads function to the model config. + + Args: + model: The model for which the optimizer is being set up. + """ + + def finalize_model_grads_func(*args, **kwargs): + return self.finalize_model_grads(*args, **kwargs) + + get_model_config(model[0]).finalize_model_grads_func = finalize_model_grads_func + + def optimizers(self, model: MegatronParallel) -> List[Optimizer]: + """Defines the optimizers. + + Args: + model (MegatronParallel): The model for which the optimizers are being defined. + + Returns: + List[Optimizer]: The list of optimizers. + + Raises: + ValueError: If the model is not an instance of MegatronParallel. 
+ """ + + if not isinstance(model, MegatronParallel): + raise ValueError("Model must be an instance of MegatronParallel") + + from nemo.core.optim import McoreDistributedOptimizer + + mcore_opt = get_megatron_optimizer( + self.config, + list(model), + no_weight_decay_cond=self.no_weight_decay_cond, + scale_lr_cond=self.scale_lr_cond, + lr_mult=self.lr_mult, + ) + + return [McoreDistributedOptimizer(mcore_opt)] + + def finalize_model_grads(self, *args, **kwargs): + return finalize_model_grads(*args, **kwargs) diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 8fa178d7df01..7aceda64de43 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -1,4 +1,5 @@ import functools +import inspect import logging import shutil from collections import OrderedDict @@ -90,7 +91,7 @@ def __init__( self.ckpt_include_optimizer = ckpt_include_optimizer if ddp == "megatron": - self.ddp_config = DistributedDataParallelConfig() + self.ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True) elif isinstance(ddp, DistributedDataParallelConfig): self.ddp_config = ddp elif ddp == "pytorch": @@ -165,18 +166,6 @@ def setup(self, trainer: pl.Trainer) -> None: trainer.fit_loop.epoch_loop.automatic_optimization = _MegatronAutomaticOptimization(trainer) - # set up optimizers after the wrapped module has been moved to the device - self.setup_optimizers(trainer) - - # TODO: Throw an execption if we have a mcore optimizer and no ddp_config - - if hasattr(self.precision_plugin, "convert_optimizer"): - _optimizers = [*self.optimizers] - _optimizers[0] = self.precision_plugin.convert_optimizer(self.optimizers[0]) - self.optimizers = _optimizers - - _optimizers_to_device(self.optimizers, self.root_device) - import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD if isinstance(self._ddp_comm_state, post_localSGD.PostLocalSGDState): @@ -223,17 +212,32 @@ def 
setup_megatron_parallel(self, trainer: pl.Trainer) -> None: cpu=isinstance(trainer.accelerator, CPUAccelerator), ddp_config=self.ddp_config, ) + self.megatron_parallel.trainer = trainer + + # check signature-def of self.model.configure_optimizers to check if there's an optional arg: megatron_parallel + sig = inspect.signature(self.model.configure_optimizers) + if "megatron_parallel" in sig.parameters: + self.model.configure_optimizers = functools.partial( + self.model.configure_optimizers, megatron_parallel=self.megatron_parallel + ) + + self.setup_optimizers(trainer) + + # TODO: Throw an execption if we have a mcore optimizer and no ddp_config + + if hasattr(self.precision_plugin, "convert_optimizer"): + _optimizers = [*self.optimizers] + _optimizers[0] = self.precision_plugin.convert_optimizer(self.optimizers[0]) + self.optimizers = _optimizers + + _optimizers_to_device(self.optimizers, self.root_device) + self.model = self.megatron_parallel - self.model.trainer = trainer if hasattr(self.precision_plugin, "convert_module"): self.model = self.precision_plugin.convert_module(self.model) self.model.callbacks.add(getattr(trainer, "callbacks")) - if hasattr(self, "optimizers") and self.optimizers: - for optimizer in self.optimizers: - self.model.callbacks.add(optimizer) - if self.data_sampler: self.model.callbacks.add(self.data_sampler) diff --git a/nemo/utils/decorators/__init__.py b/nemo/utils/decorators/__init__.py index 4468a3bc09b5..2cfec9e40d64 100644 --- a/nemo/utils/decorators/__init__.py +++ b/nemo/utils/decorators/__init__.py @@ -13,6 +13,6 @@ # limitations under the License. 
-from nemo.utils.decorators.deprecated import deprecated +from nemo.utils.decorators.deprecated import deprecated, deprecated_warning from nemo.utils.decorators.experimental import experimental from nemo.utils.decorators.port_docs import add_port_docs diff --git a/nemo/utils/decorators/deprecated.py b/nemo/utils/decorators/deprecated.py index 65f92e62563e..40957bb343d4 100644 --- a/nemo/utils/decorators/deprecated.py +++ b/nemo/utils/decorators/deprecated.py @@ -30,14 +30,14 @@ def deprecated(wrapped=None, version=None, explanation=None, wait_seconds=0): """ - Decorator which can be used for indicating that a function/class is deprecated and going to be removed. - Tracks down which function/class printed the warning and will print it only once per call. - - Args: - version: Version in which the function/class will be removed (optional). - explanation: Additional explanation, e.g. "Please, ``use another_function`` instead." (optional). - wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned - with subsequent logging messages. + Decorator which can be used for indicating that a function/class is deprecated and going to be removed. + Tracks down which function/class printed the warning and will print it only once per call. + + Args: + version: Version in which the function/class will be removed (optional). + explanation: Additional explanation, e.g. "Please, ``use another_function`` instead." (optional). + wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned + with subsequent logging messages. """ if wrapped is None: @@ -71,3 +71,26 @@ def wrapper(wrapped, instance, args, kwargs): return wrapped(*args, **kwargs) return wrapper(wrapped) + + +def deprecated_warning(old_method=None, new_method=None, wait_seconds=2): + """ + Function which can be used for indicating that a function/class is deprecated and going to be removed. 
+ + Args: + old_method: Name of deprecated class/function. + new_method: Name of new class/function to use. + wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned + with subsequent logging messages. + """ + + # Create a banner + if new_method is not None: + msg = f"***** {old_method} is deprecated. Please, use {new_method} instead. *****" + else: + msg = f"***** {old_method} is deprecated and will be removed soon. *****" + banner = '\n'.join(['*' * len(msg)] * 2 + [msg] + ['*' * len(msg)] * 2) + + logging.warning(f"\n\n{banner}\n") + logging.warning(f"Waiting for {wait_seconds} seconds before this message disappears.") + time.sleep(wait_seconds) diff --git a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py index 07e12f36c3d7..99d1795aea9c 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_nemo_to_hf.py @@ -211,15 +211,18 @@ def convert(in_file, precision=None, cpu_only=True) -> None: else: output_layer_base_name = 'model.language_model.output_layer.weight' state_dict[hf_output_layer_weight_name] = param_to_weights(ckpt[output_layer_base_name]) - return state_dict, nemo_config + return state_dict, nemo_config, dtype if __name__ == '__main__': args = get_args() - hf_state_dict, nemo_config = convert(args.input_name_or_path, args.precision) + hf_state_dict, nemo_config, dtype = convert(args.input_name_or_path, args.precision) config = load_config(args.hf_model_name, nemo_config) - model = AutoModelForCausalLM.from_config(config) + model = AutoModelForCausalLM.from_config( + config, + torch_dtype=dtype, + ) model.load_state_dict(hf_state_dict) model.save_pretrained(args.output_path) hf_tokenizer = AutoTokenizer.from_pretrained(args.hf_model_name) diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 0f7866e57cda..835ff46dd5fe 
100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -19,9 +19,9 @@ from pathlib import Path from nemo.deploy import DeployPyTriton +from nemo.deploy.nlp import MegatronLLMDeployable from nemo.export import TensorRTLLM - LOGGER = logging.getLogger("NeMo") @@ -31,6 +31,13 @@ def get_args(argv): description=f"Deploy nemo models to Triton", ) parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file") + parser.add_argument( + "-dsn", + "--direct_serve_nemo", + default=False, + action='store_true', + help="Serve the nemo model directly instead of exporting to TRTLLM first. Will ignore other TRTLLM-specific arguments.", + ) parser.add_argument( "-ptnc", "--ptuning_nemo_checkpoint", @@ -146,18 +153,7 @@ def get_args(argv): return args -def nemo_deploy(argv): - args = get_args(argv) - - if args.debug_mode: - loglevel = logging.DEBUG - else: - loglevel = logging.INFO - - LOGGER.setLevel(loglevel) - LOGGER.info("Logging level set to {}".format(loglevel)) - LOGGER.info(args) - +def get_trtllm_deployable(args): if args.triton_model_repository is None: trt_llm_path = "/tmp/trt_llm_model_dir/" LOGGER.info( @@ -170,28 +166,24 @@ def nemo_deploy(argv): trt_llm_path = args.triton_model_repository if args.nemo_checkpoint is None and args.triton_model_repository is None: - LOGGER.error( + raise ValueError( "The provided model repository is not a valid TensorRT-LLM model " "directory. Please provide a --nemo_checkpoint." ) - return if args.nemo_checkpoint is None and not os.path.isdir(args.triton_model_repository): - LOGGER.error( + raise ValueError( "The provided model repository is not a valid TensorRT-LLM model " "directory. Please provide a --nemo_checkpoint." 
) - return if args.nemo_checkpoint is not None and args.model_type is None: - LOGGER.error("Model type is required to be defined if a nemo checkpoint is provided.") - return + raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") ptuning_tables_files = [] if not args.ptuning_nemo_checkpoint is None: if args.max_prompt_embedding_table_size is None: - LOGGER.error("max_prompt_embedding_table_size parameter is needed for the prompt tuning table(s).") - return + raise ValueError("max_prompt_embedding_table_size parameter is needed for the prompt tuning table(s).") for pt_checkpoint in args.ptuning_nemo_checkpoint: ptuning_nemo_checkpoint_path = Path(pt_checkpoint) @@ -199,19 +191,16 @@ def nemo_deploy(argv): if ptuning_nemo_checkpoint_path.is_file(): ptuning_tables_files.append(pt_checkpoint) else: - LOGGER.error("Could not read the prompt tuning tables from {0}".format(pt_checkpoint)) - return + raise IsADirectoryError("Could not read the prompt tuning tables from {0}".format(pt_checkpoint)) else: - LOGGER.error("File or directory {0} does not exist.".format(pt_checkpoint)) - return + raise FileNotFoundError("File or directory {0} does not exist.".format(pt_checkpoint)) if args.task_ids is not None: if len(ptuning_tables_files) != len(args.task_ids): - LOGGER.error( + raise RuntimeError( "Number of task ids and prompt embedding tables have to match. " "There are {0} tables and {1} task ids.".format(len(ptuning_tables_files), len(args.task_ids)) ) - return trt_llm_exporter = TensorRTLLM( model_dir=trt_llm_path, @@ -245,8 +234,7 @@ def nemo_deploy(argv): save_nemo_model_config=True, ) except Exception as error: - LOGGER.error("An error has occurred during the model export. Error message: " + str(error)) - return + raise RuntimeError("An error has occurred during the model export. 
Error message: " + str(error)) try: for i, prompt_embeddings_checkpoint_path in enumerate(ptuning_tables_files): @@ -265,12 +253,35 @@ def nemo_deploy(argv): prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, ) except Exception as error: - LOGGER.error("An error has occurred during adding the prompt embedding table(s). Error message: " + str(error)) - return + raise RuntimeError( + "An error has occurred during adding the prompt embedding table(s). Error message: " + str(error) + ) + return trt_llm_exporter + + +def get_nemo_deployable(args): + if args.nemo_checkpoint is None: + raise ValueError("Direct serve requires a .nemo checkpoint") + return MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus) + + +def nemo_deploy(argv): + args = get_args(argv) + + if args.debug_mode: + loglevel = logging.DEBUG + else: + loglevel = logging.INFO + + LOGGER.setLevel(loglevel) + LOGGER.info("Logging level set to {}".format(loglevel)) + LOGGER.info(args) + + triton_deployable = get_nemo_deployable(args) if args.direct_serve_nemo else get_trtllm_deployable(args) try: nm = DeployPyTriton( - model=trt_llm_exporter, + model=triton_deployable, triton_model_name=args.triton_model_name, triton_model_version=args.triton_model_version, max_batch_size=args.max_batch_size, diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index 744e2884d015..111c00df392a 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -158,9 +158,10 @@ def nemo_tarred_manifest_path(nemo_manifest_path: Path) -> Tuple[str, str]: root = nemo_manifest_path.parent / "nemo_tar" root.mkdir(exist_ok=True) - with TarWriter(f"{root}/audios_%01d.tar", shard_size=5) as tar_writer, SequentialJsonlWriter( - root / "tarred_audio_filepaths.jsonl" - ) as mft_writer: + with ( + TarWriter(f"{root}/audios_%01d.tar", shard_size=5) as tar_writer, + SequentialJsonlWriter(root / 
"tarred_audio_filepaths.jsonl") as mft_writer, + ): for idx, d in enumerate(load_jsonl(nemo_manifest_path)): p = d["audio_filepath"] name = Path(p).name @@ -856,7 +857,7 @@ def test_lazy_nemo_iterator_with_offset_field(tmp_path: Path): from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator # Have to generate as INT16 to avoid quantization error after saving to 16-bit WAV - INT16MAX = 2 ** 15 + INT16MAX = 2**15 expected_audio = np.random.randint(low=-INT16MAX - 1, high=INT16MAX, size=(16000,)).astype(np.float32) / INT16MAX audio_path = str(tmp_path / "dummy.wav") sf.write(audio_path, expected_audio, 16000) @@ -904,7 +905,7 @@ def test_lazy_nemo_iterator_with_relative_paths(tmp_path: Path): from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator # Have to generate as INT16 to avoid quantization error after saving to 16-bit WAV - INT16MAX = 2 ** 15 + INT16MAX = 2**15 expected_audio = np.random.randint(low=-INT16MAX - 1, high=INT16MAX, size=(16000,)).astype(np.float32) / INT16MAX audio_path = str(tmp_path / "dummy.wav") sf.write(audio_path, expected_audio, 16000) @@ -950,7 +951,13 @@ def test_lhotse_cuts_resolve_relative_paths(tmp_path: Path): CutSet([cut]).to_file(cuts_path) config = OmegaConf.create( - {"cuts_path": cuts_path, "sample_rate": 16000, "use_lhotse": True, "num_workers": 0, "batch_size": 2,} + { + "cuts_path": cuts_path, + "sample_rate": 16000, + "use_lhotse": True, + "num_workers": 0, + "batch_size": 2, + } ) dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=_Identity()) @@ -981,13 +988,21 @@ def test_extended_data_input_cfg(cutset_shar_path, nemo_tarred_manifest_path_mul "manifest_filepath": nemo_tarred_manifest_path_multi[0], "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D1", + }, }, { 
"type": "lhotse_shar", "shar_path": cutset_shar_path, "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D2", + }, }, ], "sample_rate": 16000, @@ -1031,17 +1046,27 @@ def test_extended_data_input_cfg_subgroup(cutset_shar_path, nemo_tarred_manifest "manifest_filepath": nemo_tarred_manifest_path_multi[0], "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D1", + }, }, { "type": "lhotse_shar", "shar_path": cutset_shar_path, "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D2", + }, }, ], "weight": 0.2, - "tags": {"group_name": "G1",}, + "tags": { + "group_name": "G1", + }, }, { "type": "group", @@ -1052,16 +1077,26 @@ def test_extended_data_input_cfg_subgroup(cutset_shar_path, nemo_tarred_manifest "manifest_filepath": nemo_tarred_manifest_path_multi[0], "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D3",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D3", + }, }, { "type": "lhotse_shar", "shar_path": cutset_shar_path, "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D4",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D4", + }, }, ], - "tags": {"group_name": "G2",}, + "tags": { + "group_name": "G2", + }, }, ], "sample_rate": 16000, @@ -1107,13 +1142,21 @@ def test_extended_data_input_cfg_yaml_path(tmp_path, cutset_shar_path, nemo_tarr "manifest_filepath": str(nemo_tarred_manifest_path_multi[0]), "tarred_audio_filepaths": str(nemo_tarred_manifest_path_multi[1]), "weight": 0.5, - "tags": 
{"language": "en", "modality": "audio", "dataset_name": "D1",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D1", + }, }, { "type": "lhotse_shar", "shar_path": str(cutset_shar_path), "weight": 0.5, - "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + "tags": { + "language": "en", + "modality": "audio", + "dataset_name": "D2", + }, }, ] @@ -1166,7 +1209,13 @@ def txt_es_path(tmp_path_factory): def test_text_file_input(txt_en_path, txt_es_path): config = OmegaConf.create( { - "input_cfg": [{"type": "txt", "paths": txt_en_path, "language": "en",},], + "input_cfg": [ + { + "type": "txt", + "paths": txt_en_path, + "language": "en", + }, + ], "shuffle": True, "num_workers": 0, "batch_size": 4, @@ -1312,13 +1361,17 @@ def test_multimodal_text_audio_dataloading( "target_paths": es_paths, "source_language": "en", "target_language": "es", - "tags": {"modality": "text",}, + "tags": { + "modality": "text", + }, }, { "type": "nemo_tarred", "manifest_filepath": manifest_filepath, "tarred_audio_filepaths": tarred_audio_filepaths, - "tags": {"modality": "audio",}, + "tags": { + "modality": "audio", + }, }, ], "shuffle": True, @@ -1339,7 +1392,11 @@ def test_multimodal_text_audio_dataloading( ) dl = get_lhotse_dataloader_from_config( - config=config, global_rank=0, world_size=1, dataset=Identity(), tokenizer=en_es_tokenizer, + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + tokenizer=en_es_tokenizer, ) # Note: we use islice here because the dataloader will be infinite. 
@@ -1402,7 +1459,12 @@ def test_dataloader_with_noise_nemo_json(cutset_path: Path, nemo_manifest_path: "shard_seed": 0, } ) - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity(),) + dl = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + ) batch = next(iter(dl)) assert isinstance(batch, CutSet) assert len(batch) == 2 @@ -1426,7 +1488,12 @@ def test_dataloader_with_noise_lhotse_jsonl(cutset_path: Path): "shard_seed": 0, } ) - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity(),) + dl = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + ) batch = next(iter(dl)) assert isinstance(batch, CutSet) assert len(batch) == 2 @@ -1443,7 +1510,10 @@ def test_dataloader_with_noise_nemo_tar(cutset_path: Path, nemo_tarred_manifest_ config = OmegaConf.create( { "cuts_path": str(cutset_path), - "noise_path": {"manifest_filepath": noise_json, "tarred_audio_filepaths": noise_tar,}, + "noise_path": { + "manifest_filepath": noise_json, + "tarred_audio_filepaths": noise_tar, + }, "noise_mix_prob": 1.0, "noise_snr": [-5.0, 5.0], "batch_size": 2, @@ -1451,7 +1521,12 @@ def test_dataloader_with_noise_nemo_tar(cutset_path: Path, nemo_tarred_manifest_ "shard_seed": 0, } ) - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity(),) + dl = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + ) batch = next(iter(dl)) assert isinstance(batch, CutSet) assert len(batch) == 2 @@ -1464,6 +1539,8 @@ def test_dataloader_with_noise_nemo_tar(cutset_path: Path, nemo_tarred_manifest_ def test_dataloader_with_synth_rir(cutset_path: Path): + from lhotse.augmentation import ReverbWithImpulseResponse + config = OmegaConf.create( { "cuts_path": str(cutset_path), @@ -1474,7 +1551,12 @@ def 
test_dataloader_with_synth_rir(cutset_path: Path): "shard_seed": 0, } ) - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity(),) + dl = get_lhotse_dataloader_from_config( + config=config, + global_rank=0, + world_size=1, + dataset=Identity(), + ) batch = next(iter(dl)) assert isinstance(batch, CutSet) assert len(batch) == 4 @@ -1487,8 +1569,16 @@ def test_dataloader_with_synth_rir(cutset_path: Path): cut = batch[2] assert isinstance(cut, MonoCut) assert isinstance(cut.recording.transforms, list) and len(cut.recording.transforms) == 1 - assert cut.recording.transforms[0]["name"] == "ReverbWithImpulseResponse" + tfnm = cut.recording.transforms[0] + if isinstance(tfnm, dict): # lhotse<=1.23.0 + assert tfnm["name"] == "ReverbWithImpulseResponse" + else: # lhotse>=1.24.0 + assert isinstance(tfnm, ReverbWithImpulseResponse) cut = batch[3] assert isinstance(cut, MonoCut) assert isinstance(cut.recording.transforms, list) and len(cut.recording.transforms) == 1 - assert cut.recording.transforms[0]["name"] == "ReverbWithImpulseResponse" + tfnm = cut.recording.transforms[0] + if isinstance(tfnm, dict): # lhotse<=1.23.0 + assert tfnm["name"] == "ReverbWithImpulseResponse" + else: # lhotse>=1.24.0 + assert isinstance(tfnm, ReverbWithImpulseResponse) diff --git a/tests/collections/nlp/test_dialogue.py b/tests/collections/nlp/test_dialogue.py deleted file mode 100644 index 9c227f737d98..000000000000 --- a/tests/collections/nlp/test_dialogue.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -import torch - -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.sgd_data_processor import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_gpt_classification_dataset import ( - DialogueGPTClassificationDataset, -) -from nemo.collections.nlp.data.dialogue.dataset.dialogue_s2s_generation_dataset import DialogueS2SGenerationDataset -from nemo.collections.nlp.data.dialogue.dataset.dialogue_sgd_bert_dataset import DialogueSGDBERTDataset -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueClassificationMetrics, DialogueGenerationMetrics -from nemo.collections.nlp.models.dialogue.dialogue_nearest_neighbour_model import DialogueNearestNeighbourModel - - -@pytest.mark.unit -def test_dialogue_metric_generation_f1(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - precision, recall, f1 = DialogueGenerationMetrics._get_one_f1(generated_field, ground_truth_field) - assert precision == 75 - assert recall == 75 - assert f1 == 75 - - -@pytest.mark.unit -def test_dialogue_metric_split_label_and_slots(): - fields = ["reserve_restaurant\nslots: time_of_day(7pm), number_of_people(3)", "time_of_day(7pm)"] - labels, slots_list = DialogueClassificationMetrics.split_label_and_slots(fields, with_slots=True) - assert labels == ["reserve_restaurant", 
'none'] - assert slots_list == [["time_of_day(7pm)", "number_of_people(3)"], ["time_of_day(7pm)"]] - - -@pytest.mark.unit -def test_dialogue_metric_slot_filling_metrics(): - generated_slots = [["time_of_day(7pm)", "number_of_people(3)"], ["time_of_day(7pm)"]] - ground_truth_slots = [["time_of_day(7pm)"], ["time_of_day(7pm)", "number_of_people(3)"]] - - ( - avg_precision, - avg_recall, - avg_f1, - avg_joint_goal_accuracy, - ) = DialogueClassificationMetrics.get_slot_filling_metrics(generated_slots, ground_truth_slots) - - assert avg_precision == 75 - assert avg_recall == 75 - assert avg_f1 == 75 - assert avg_joint_goal_accuracy == 0 - - -@pytest.mark.unit -def test_dialogue_assistant_data_processor_normalize_zero_shot_intent(): - label0 = 'food_ordering.contextual_query' - normalized_label0 = 'contextual query' - - label1 = 'food_ordering.nomatch' - normalized_label1 = 'no match' - - label2 = 'food_ordering.no' - normalized_label2 = 'no' - - assert normalized_label0 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label0) - assert normalized_label1 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label1) - assert normalized_label2 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label2) - - -@pytest.mark.unit -def test_dialogue_assistant_data_processor_get_continuous_slots(): - slot_ids = [54, 54, 54, 19, 19, 18, 54, 54, 54] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 19, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {19: [3, 5], 18: [5, 6]} - - # here 18 and 19 maps to the same slot (originally variants of B-slot and I-slot) - slot_ids = [54, 54, 54, 19, 19, 18, 54, 54, 54] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 18, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, 
bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {18: [3, 6]} - - # test if function works when non-empty slots are at boundary - slot_ids = [18, 54, 54, 19, 19] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 19, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {18: [0, 1], 19: [3, 5]} - - -@pytest.mark.unit -def test_dialogue_assistant_map_bio_format_slots_to_unified_slots(): - - slots = ['B-time', 'I-time', 'B-alarm', 'I-alarm', 'O'] - gt_bio_slot_ids_to_unified_slot_ids = {'0': '0', '1': '0', '2': '1', '3': '1', '4': '2'} - gt_unified_slots = ['time', 'alarm', 'O'] - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(slots) - assert gt_bio_slot_ids_to_unified_slot_ids == bio_slot_ids_to_unified_slot_ids - assert gt_unified_slots == unified_slots - - # case in which BIOS scheme was not used in annotation - slots = ['time', 'alarm', 'O'] - gt_bio_slot_ids_to_unified_slot_ids = {'0': '0', '1': '1', '2': '2'} - gt_unified_slots = ['time', 'alarm', 'O'] - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(slots) - - assert gt_bio_slot_ids_to_unified_slot_ids == bio_slot_ids_to_unified_slot_ids - assert gt_unified_slots == unified_slots - - -@pytest.mark.unit -def test_dialogue_data_processor_get_relevant_idxs(): - - dataset_split = 'train' - dev_proportion = 10 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 900 - assert idxs != list(range(900)) - - dataset_split = 'dev' - dev_proportion = 40 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 400 - assert idxs != list(range(400)) - 
- dataset_split = 'test' - dev_proportion = 40 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 1000 - assert idxs == list(range(1000)) - - -@pytest.mark.unit -def test_dialogue_sgd_data_processor_convert_camelcase_to_lower(): - label = 'none' - gt_converted_label = 'none' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - label = 'ReserveRestaurant' - gt_converted_label = 'reserve restaurant' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - label = 'Alarm' - gt_converted_label = 'alarm' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - -@pytest.mark.unit -def test_dialogue_gpt_classification_dataset_linearize_slots(): - - slots = [] - linearized_slots = 'None' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - slots = {'time': '7pm', 'place': 'field'} - linearized_slots = 'time(7pm), place(field)' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - slots = {'time': ['7pm', '1900'], 'place': 'field'} - linearized_slots = 'time(7pm), place(field)' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - -@pytest.mark.unit -def test_dialogue_gpt_classification_dataset_linearize_slots(): - - actions = [ - {'act': 'inform', 'slot': 'time', 'values': ['7pm', '1900']}, - {'act': 'confirm', 'slot': 'place', 'values': ['hall']}, - ] - - prompt_template = 'values' - formatted_actions = '7pm hall' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - prompt_template = 'slots_values' - formatted_actions = 'time (7pm) place (hall)' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - prompt_template = 'acts_slots_values' - formatted_actions = 'inform 
time (7pm) confirm place (hall)' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - -@pytest.mark.unit -def test_dialogue_sgd_dataset_naive_tokenize(): - - utterance = 'I am feeling hungry so I would like to find a place to eat.' - tokens = [ - 'I', - ' ', - 'am', - ' ', - 'feeling', - ' ', - 'hungry', - ' ', - 'so', - ' ', - 'I', - ' ', - 'would', - ' ', - 'like', - ' ', - 'to', - ' ', - 'find', - ' ', - 'a', - ' ', - 'place', - ' ', - 'to', - ' ', - 'eat', - '.', - ] - assert tokens == DialogueSGDBERTDataset._naive_tokenize(utterance) - - -@pytest.mark.unit -def test_dialogue_nearest_neighbour_mean_pooling(): - - model_output = [torch.ones(8, 512, 768)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.ones(8, 768).float(), DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) - - model_output = [torch.zeros(8, 512, 768)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.zeros(8, 768).float(), DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) - - model_output = [torch.cat([torch.zeros(8, 256, 768), torch.ones(8, 256, 768)], axis=1)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.ones(8, 768).float() * 0.5, DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) diff --git a/tests/collections/nlp/test_entity_linking_model.py b/tests/collections/nlp/test_entity_linking_model.py deleted file mode 100644 index 16b768184296..000000000000 --- a/tests/collections/nlp/test_entity_linking_model.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import shutil -import tempfile - -import pytest -import wget -from omegaconf import OmegaConf - -from nemo.collections.nlp.models import EntityLinkingModel - - -def get_cfg(): - - language_model = OmegaConf.create( - {"pretrained_model_name": "bert-base-uncased", "config_file": None, "config": None, "lm_checkpoint": None} - ) - - tokenizer = OmegaConf.create( - {"tokenizer_name": "bert-base-uncased", "vocab_file": None, "tokenizer_model": None, "do_lower_case": True} - ) - - model = OmegaConf.create( - { - "nemo_path": "sap_entity_linking.nemo", - "max_seq_length": 128, - "language_model": language_model, - "tokenizer": tokenizer, - "train_ds": None, - "validation_ds": None, - } - ) - - cfg = OmegaConf.create({"model": model}) - - return cfg - - -class TestEntityLinkingModel: - @pytest.mark.with_downloads() - @pytest.mark.unit - def test_creation_saving_restoring(self): - # Create a new temporary directory - with tempfile.TemporaryDirectory() as restore_dir: - with tempfile.TemporaryDirectory() as save_dir: - model = EntityLinkingModel(cfg=get_cfg().model) - assert isinstance(model, EntityLinkingModel) - - save_dir_path = save_dir - - # Where model will be saved - model_save_path = os.path.join(save_dir, f"{model.__class__.__name__}.nemo") - model.save_to(save_path=model_save_path) - - # Where model will be restored from - model_restore_path = os.path.join(restore_dir, f"{model.__class__.__name__}.nemo") - shutil.copy(model_save_path, model_restore_path) - - # at this point save_dir should not exist - assert save_dir_path is not None and not 
os.path.exists(save_dir_path) - assert not os.path.exists(model_save_path) - assert os.path.exists(model_restore_path) - - # attempt to restore - model_copy = model.__class__.restore_from(restore_path=model_restore_path) - assert model.num_weights == model_copy.num_weights - - -if __name__ == "__main__": - t = TestEntityLinkingModel() - t.test_creation_saving_restoring() diff --git a/tests/collections/nlp/test_hyena_operator.py b/tests/collections/nlp/test_hyena_operator.py new file mode 100644 index 000000000000..d6ebaa2f335d --- /dev/null +++ b/tests/collections/nlp/test_hyena_operator.py @@ -0,0 +1,179 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import torch.nn +from megatron.core.transformer.transformer_config import TransformerConfig + +from nemo.collections.nlp.modules.common.hyena.hyena import HyenaOperator, MultiHeadHyenaConv, SingleHeadHyenaConv +from nemo.collections.nlp.modules.common.hyena.hyena_spec import get_hyena_layer_with_transformer_engine_spec +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision + +try: + import fftconv + + HAVE_FFTCONV = True +except ImportError: + HAVE_FFTCONV = False + +try: + import flashfftconv + + HAVE_FLASHFFTCONV = True +except ImportError: + HAVE_FLASHFFTCONV = False + +try: + import causal_conv1d + + HAVE_CAUSAL_CONV1D = True +except ImportError: + HAVE_CAUSAL_CONV1D = False + + +@pytest.fixture() +def transformer_config(): + cfg = TransformerConfig(num_layers=2, hidden_size=864, num_attention_heads=1) + return cfg + + +@pytest.fixture() +def hyena_config(): + cfg = { + # HyenaOperator parameters + 'max_seq_length': 1024, + 'order': 2, + 'num_heads': 1, + 'dropout': 0.0, + 'short_filter_order': 3, + 'activation': "identity", + # HyenaConv parameters + 'precision': 'bf16', + 'bias': True, + 'fftconv_type': None, + # HyenaFilter parameters + 'emb_dim': 33, + 'learn_pos_emb_z': True, + 'mlp_width': 64, + 'sine_freq': 1, + 'num_inner_mlps': 2, + 'normalized': False, + # ExponentialModulation parameters + 'modulate': True, + 'learn_modulation': False, + 'fast_decay_pct': 0.3, + 'slow_decay_pct': 1.5, + 'target': 1e-2, + 'shift': 0.0, + } + return cfg + + +@pytest.fixture() +def submodules(hyena_config): + return get_hyena_layer_with_transformer_engine_spec(hyena_config).submodules + + +@pytest.mark.run_only_on('GPU') +@pytest.mark.skipif(not HAVE_CAUSAL_CONV1D, reason='causal-conv-1d not installed') +class TestHyenaOperator: + @pytest.mark.skipif(not HAVE_FFTCONV, reason='Safari fftconv not installed') + @pytest.mark.parametrize( + "optionals_enabled, num_heads, expected_num_weights", + [(False, 1, 3068256), (True, 1, 
3102912), (True, 8, 3053016)], + ) + def test_parameters( + self, optionals_enabled, num_heads, expected_num_weights, transformer_config, hyena_config, submodules + ): + # Expected num weights calculation: + # + # Denote: inner_width = d_model * (order + 1) + # head_dim = d_model / num_heads + # + # in_proj (layer_norm) --> d_model * 2 + # in_proj (linear) --> d_model * inner_width + inner_width + # out_proj (linear) --> d_model * d_model + d_model + # short_filter (depthwise-separable 1d conv) --> inner_width * short_filter_order + inner_width + # long_conv bias --> head_dim + # filter: + # pos_emb.z --> max_seq_len * emb_dim + # sin activation freqs --> mlp_width + # mlp: + # input layer --> emb_dim * mlp_width + mlp_width + # inner layers --> num_inner_mlps * (mlp_width ^ 2 + mlp_width) + # output_layer (no bias) --> mlp_width * head_dim + # modulation: head_dim + + hyena_config['fftconv_type'] = 'safari' + + hyena_config['learn_pos_emb_z'] = optionals_enabled + hyena_config['learn_modulation'] = optionals_enabled + hyena_config['num_heads'] = num_heads + hyena_module = HyenaOperator(transformer_config, submodules=submodules, **hyena_config) + + assert hyena_module.d_model == transformer_config.hidden_size + assert isinstance(hyena_module.long_conv.filter.pos_emb.z, torch.nn.Parameter) == optionals_enabled + assert isinstance(hyena_module.long_conv.filter.modulation.deltas, torch.nn.Parameter) == optionals_enabled + + num_weights = sum([p.numel() for p in hyena_module.parameters()]) + assert num_weights == expected_num_weights + + @staticmethod + def check_gpu_forward(hyena_module, transformer_config, hyena_config): + dtype = torch_dtype_from_precision(hyena_config['precision']) + hyena_module = hyena_module.to(device='cuda', dtype=dtype) + + bs = 4 + seq_len = hyena_config['max_seq_length'] + d_model = transformer_config.hidden_size + + x = torch.randn(seq_len, bs, d_model) + x = x.to(device='cuda', dtype=dtype) + + y, _ = hyena_module(x) + assert y.shape[0] == 
seq_len + assert y.shape[1] == bs + assert y.shape[2] == d_model + + @pytest.mark.skipif(not HAVE_FFTCONV, reason='Safari fftconv not installed') + def test_single_head_safari(self, transformer_config, hyena_config, submodules): + hyena_config['fftconv_type'] = 'safari' + hyena_config['num_heads'] = 1 + hyena_module = HyenaOperator(transformer_config, submodules=submodules, **hyena_config) + + assert isinstance(hyena_module.long_conv, SingleHeadHyenaConv) + assert hyena_module.long_conv.fftconv_fn == hyena_module.long_conv._safari_fft + + self.check_gpu_forward(hyena_module, transformer_config, hyena_config) + + @pytest.mark.skipif(not HAVE_FLASHFFTCONV, reason='Safari fftconv not installed') + def test_single_head_flash(self, transformer_config, hyena_config, submodules): + hyena_config['fftconv_type'] = 'flash' + hyena_config['num_heads'] = 1 + hyena_module = HyenaOperator(transformer_config, submodules=submodules, **hyena_config) + + assert isinstance(hyena_module.long_conv, SingleHeadHyenaConv) + assert hyena_module.long_conv.fftconv_fn == hyena_module.long_conv._flash_fft + + self.check_gpu_forward(hyena_module, transformer_config, hyena_config) + + @pytest.mark.skipif(not HAVE_FFTCONV, reason='Safari fftconv not installed') + def test_multi_head(self, transformer_config, hyena_config, submodules): + hyena_config['fftconv_type'] = 'safari' + hyena_config['num_heads'] = 8 + hyena_module = HyenaOperator(transformer_config, submodules=submodules, **hyena_config) + + assert isinstance(hyena_module.long_conv, MultiHeadHyenaConv) + + self.check_gpu_forward(hyena_module, transformer_config, hyena_config) diff --git a/tests/collections/nlp/test_megatron.py b/tests/collections/nlp/test_megatron.py deleted file mode 100644 index 8206457ec6ee..000000000000 --- a/tests/collections/nlp/test_megatron.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -try: - import apex - - apex_available = True -except Exception: - apex_available = False - -import os -import tempfile - -import onnx -import pytest -import torch -from omegaconf import OmegaConf - -import nemo.collections.nlp as nemo_nlp -from nemo.core.classes import typecheck - - -def get_pretrained_bert_345m_uncased_model(): - model_name = "megatron-bert-345m-uncased" - config = {"language_model": {"pretrained_model_name": model_name}, "tokenizer": {}} - omega_conf = OmegaConf.create(config) - model = nemo_nlp.modules.get_lm_model(cfg=omega_conf) - if torch.cuda.is_available(): - model = model.cuda() - return model - - -class TestMegatron: - @pytest.mark.skip("This test was written for megatron-lm") - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_list_pretrained_models(self): - pretrained_lm_models = nemo_nlp.modules.get_pretrained_lm_models_list() - assert len(pretrained_lm_models) > 0 - - @pytest.mark.with_downloads() - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - @pytest.mark.skip("Only one Megatron model is allowed") - def test_get_model(self): - model = get_pretrained_bert_345m_uncased_model() - assert isinstance(model, nemo_nlp.modules.MegatronBertEncoder) - - typecheck.set_typecheck_enabled(enabled=False) - inp = model.input_example() - out = model.forward(*inp) - typecheck.set_typecheck_enabled(enabled=True) - - @pytest.mark.skipif(not os.path.exists('/home/TestData'), reason='Not 
a Jenkins machine') - @pytest.mark.with_downloads() - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - @pytest.mark.skip("Megatron-LM BERT support deprecated. Supported in NeMo < 1.5") - def test_onnx_export(self): - model = get_pretrained_bert_345m_uncased_model() - assert model - with tempfile.TemporaryDirectory() as tmpdir: - # Generate filename in the temporary directory. - # Test export. - model.export(os.path.join(".", "megatron.onnx")) - - -if __name__ == "__main__": - t = TestMegatron() - t.test_onnx_export() diff --git a/tests/collections/nlp/test_mem_map_dataset.py b/tests/collections/nlp/test_mem_map_dataset.py deleted file mode 100644 index 20932b6c4e0d..000000000000 --- a/tests/collections/nlp/test_mem_map_dataset.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import csv -import json -import os - -import pytest - -from nemo.collections.nlp.data.language_modeling import text_memmap_dataset - - -@pytest.fixture -def jsonl_file(tmp_path): - # Create a temporary file path - file_path = tmp_path / "data.jsonl" - - # Generate data to write to the JSONL file - data = [ - {"name": "John", "age": 30}, - {"name": "Jane", "age": 25}, - {"name": "Bob", "age": 35}, - ] - - # Write data to the JSONL file - with open(file_path, mode="w") as file: - for item in data: - json.dump(item, file) - file.write("\n") - - # Provide the file path to the test function - yield str(file_path) - - # Optional: Clean up the temporary file after the test - file_path.unlink() - - -@pytest.fixture -def csv_file(tmp_path): - # Create a temporary file path - file_path = tmp_path / "data.csv" - - # Generate data to write to the CSV file - data = [["ID", "Name"], [1, "John"], [2, "Jane"], [3, "Bob"]] - - # Write data to the CSV file - with open(file_path, mode="w", newline="") as file: - writer = csv.writer(file) - writer.writerows(data) - - # Provide the file path to the test function - yield str(file_path) - - # Optional: Clean up the temporary file after the test - file_path.unlink() - - -def test_jsonl_mem_map_dataset(jsonl_file): - """Test for JSONL memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.JSONLMemMapDataset(dataset_paths=[jsonl_file], header_lines=0) - assert indexed_dataset[0] == {"name": "John", "age": 30} - assert indexed_dataset[1] == {"name": "Jane", "age": 25} - assert indexed_dataset[2] == {"name": "Bob", "age": 35} - - -def test_csv_mem_map_dataset(csv_file): - """Test for CSV memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.CSVMemMapDataset(dataset_paths=[csv_file], data_col=1, header_lines=1) - assert indexed_dataset[0].strip() == "John" - assert indexed_dataset[1].strip() == "Jane" - assert indexed_dataset[2].strip() == "Bob" - - -def test_csv_fields_mem_map_dataset(csv_file): - """Test for 
CSV memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.CSVFieldsMemmapDataset( - dataset_paths=[csv_file], data_fields={"ID": 0, "Name": 1}, header_lines=1 - ) - assert isinstance(indexed_dataset[0], dict) - assert sorted(indexed_dataset[0].keys()) == ["ID", "Name"] - assert indexed_dataset[0]["ID"] == "1" and indexed_dataset[1]["ID"] == "2" and indexed_dataset[2]["ID"] == "3" - assert ( - indexed_dataset[0]["Name"].strip() == "John" - and indexed_dataset[1]["Name"].strip() == "Jane" - and indexed_dataset[2]["Name"].strip() == "Bob" - ) - - -@pytest.mark.parametrize( - "dataset_class", [text_memmap_dataset.JSONLMemMapDataset, text_memmap_dataset.CSVMemMapDataset], -) -@pytest.mark.parametrize("use_alternative_index_mapping_dir", [True, False]) -@pytest.mark.parametrize("relative_index_fn", [True, False]) -def test_mem_map_dataset_index_mapping_dir( - tmp_path, dataset_class, jsonl_file, use_alternative_index_mapping_dir, relative_index_fn, -): - """Test for index_mapping_dir.""" - if relative_index_fn: - jsonl_file = os.path.relpath(jsonl_file) - else: - jsonl_file = os.path.abspath(jsonl_file) - - if use_alternative_index_mapping_dir: - index_mapping_dir = tmp_path / "subdir" - dataset_class(dataset_paths=[jsonl_file], header_lines=0, index_mapping_dir=str(index_mapping_dir)) - # Index files should not be created in default location. - assert not os.path.isfile(f"{jsonl_file}.idx.npy") - assert not os.path.isfile(f"{jsonl_file}.idx.info") - if relative_index_fn: - # Remove leading ".." sequences. 
- while jsonl_file.startswith(("../")): - jsonl_file = jsonl_file.lstrip("../") - idx_fn = f"{str(index_mapping_dir)}/{jsonl_file}.idx" - assert os.path.isfile(f"{idx_fn}.npy") - assert os.path.isfile(f"{idx_fn}.info") - else: - text_memmap_dataset.JSONLMemMapDataset(dataset_paths=[jsonl_file], header_lines=0) - assert os.path.isfile(f"{jsonl_file}.idx.npy") - assert os.path.isfile(f"{jsonl_file}.idx.info") diff --git a/tests/collections/nlp/test_prompt_learning.py b/tests/collections/nlp/test_prompt_learning.py deleted file mode 100644 index 4597fe9ecef0..000000000000 --- a/tests/collections/nlp/test_prompt_learning.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import os - -import pytest -import torch - -from nemo.collections.nlp.data.language_modeling.megatron.gpt_prompt_learning_dataset import GPTPromptLearningDataset -from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import get_pseudo_tokens -from nemo.collections.nlp.modules.common import VirtualPromptSource -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.core import Dataset - - -def get_prompt_tuning_dataset( - dataset_path, tokenizer, virtual_prompt_source, task_templates, pseudo_tokens, -): - dataset = GPTPromptLearningDataset( - data=[dataset_path], - tokenizer=tokenizer, - virtual_prompt_source=virtual_prompt_source, - task_templates=task_templates, - pseudo_tokens=pseudo_tokens, - pad_token_id=tokenizer.unk_id, - max_seq_length=512, - min_seq_length=1, - ) - - return dataset - - -def create_temp_dataset(): - example_dataset_a = [ - {'taskname': 'task name A', 'text': 'Test sentence one, Answer: ', 'answer': 'test'} for i in range(24) - ] - example_dataset_b = [ - {'taskname': 'task name B', 'question': 'This is a question', 'answer': 'test'} for i in range(13) - ] - example_dataset = example_dataset_a + example_dataset_b - temp_file_name = 'temp_dataset_file.jsonl' - - with open(temp_file_name, 'w') as temp: - for example in example_dataset: - temp.write(json.dumps(example) + '\n') - - return temp_file_name - - -def get_task_templates(): - task_templates = {} - task_templates['task name A'] = { - "prompt_template": "<|VIRTUAL_PROMPT_0|>{text}{answer}", - "prompt_template_fields": ['text', 'answer'], - "total_virtual_tokens": 5, - "virtual_token_splits": [5], - "truncate_field": None, - "answer_only_loss": True, - "answer_field": "answer", - "task_id_num": 0, - } - task_templates['task name B'] = { - "prompt_template": "<|VIRTUAL_PROMPT_0|>{question}<|VIRTUAL_PROMPT_1|>{answer}{extra}", - "prompt_template_fields": ['question', 'answer', 'extra'], - 
"total_virtual_tokens": 10, - "virtual_token_splits": [7, 3], - "truncate_field": None, - "answer_only_loss": False, - "answer_field": None, - "task_id_num": 1, - } - return task_templates - - -class TestMegatronGPTPromptLearningDataset: - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_init_prompt_learning_dataset(self): - tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer') - task_templates = get_task_templates() - dataset_path = create_temp_dataset() - - # Setup virtual token place holders - total_virtual_tokens = 10 - pseudo_tokens = get_pseudo_tokens(total_virtual_tokens) - tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens}) - - dataset = get_prompt_tuning_dataset( - dataset_path, tokenizer, VirtualPromptSource.PROMPT_ENCODER, task_templates, pseudo_tokens, - ) - - print(type(dataset)) - - assert isinstance(dataset, Dataset) - - os.remove(dataset_path) - - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_prompt_learning_dataset_collate_fn_prompt_encoder(self): - tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer') - task_templates = get_task_templates() - dataset_path = create_temp_dataset() - - # Setup virtual token place holders - total_virtual_tokens = 10 - pseudo_tokens = get_pseudo_tokens(total_virtual_tokens) - tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens}) - - dataset = get_prompt_tuning_dataset( - dataset_path, tokenizer, VirtualPromptSource.PROMPT_ENCODER, task_templates, pseudo_tokens, - ) - - batch = [dataset[i] for i in range(8)] - batch = dataset.collate_fn(batch) - - assert len(batch) == 6 - - _, _, _, _, _, taskname_ids = batch - - assert list(taskname_ids[0].numpy()) == tokenizer.text_to_ids("task name A") - - os.remove(dataset_path) - - -if __name__ == "__main__": - t = TestMegatronGPTPromptLearningDataset() - t.test_init_prompt_learning_dataset() - t.test_prompt_learning_dataset_collate_fn_prompt_encoder() - 
print('-' * 50 + '\nALL PROMPT TUNING UNIT TESTS PASS!\n' + '-' * 50) diff --git a/tests/collections/nlp/test_qna.py b/tests/collections/nlp/test_qna.py deleted file mode 100644 index 4a470cacb711..000000000000 --- a/tests/collections/nlp/test_qna.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections - -import pytest -import torch - -from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset -from nemo.collections.nlp.data.question_answering.dataset.qa_gpt_dataset import GPTQADataset -from nemo.collections.nlp.metrics.qa_metrics import QAMetrics - - -@pytest.mark.unit -def test_remove_articles(): - sentences = [ - "this is an apple", - "this is the apple", - "this is a fruit", - ] - - expected_article_removed_sents = ["this is apple", "this is apple", "this is fruit"] - - article_removed_sents = [QAMetrics.remove_articles(sent) for sent in sentences] - - assert article_removed_sents == expected_article_removed_sents - - -@pytest.mark.unit -def test_white_space_fix(): - sentences = [ - "sentence with a space", - "sentence with multiple spaces", - ] - - expected_white_space_fixed_sents = [ - "sentence with a space", - "sentence with multiple spaces", - ] - - white_space_fixed_sents = [QAMetrics.white_space_fix(sent) for sent in sentences] - - assert white_space_fixed_sents == expected_white_space_fixed_sents - - 
-@pytest.mark.unit -def test_remove_punc(): - sentence = "this, is. a! sentence: with; punctuations?" - expected_punc_removed_sent = "this is a sentence with punctuations" - - punc_removed_sent = QAMetrics.remove_punc(sentence) - - assert punc_removed_sent == expected_punc_removed_sent - - -@pytest.mark.unit -def test_get_normalized_tokens(): - sentence = 'I am happy' - tokens = ['i', 'am', 'happy'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - sentence = 'I am a person' - tokens = ['i', 'am', 'person'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - sentence = 'I am a person.' - tokens = ['i', 'am', 'person'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - -@pytest.mark.unit -def test_get_one_f1(): - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - f1 = QAMetrics.get_one_f1(generated_field, ground_truth_field) - assert f1 == 0.75 - - generated_field = '' - ground_truth_field = 'That' - - f1 = QAMetrics.get_one_f1(generated_field, ground_truth_field) - assert f1 == 0 - - -@pytest.mark.unit -def test_get_one_exact_match(): - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 0 - - generated_field = 'That is so good!' - ground_truth_field = 'That is so good.' 
- - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 1 - - generated_field = 'That is so good' - ground_truth_field = 'that is so good' - - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 1 - - -@pytest.mark.unit -def test_split_into_words(): - text = 'hi yo' - char_to_word_offset = [0, 0, 0, 1, 1] - doc_tokens = ["hi", "yo"] - output = QADataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - text = 'i am good' - char_to_word_offset = [0, 0, 1, 1, 1, 2, 2, 2, 2] - doc_tokens = ["i", "am", 'good'] - output = QADataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - -@pytest.mark.unit -def test_get_doc_spans(): - all_doc_tokens = ['a'] * 15 - max_tokens_for_doc = 10 - doc_stride = 5 - doc_spans = QADataset.get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride) - - assert len(doc_spans) == 2 - assert doc_spans[0].start == 0 - assert doc_spans[0].length == 10 - assert doc_spans[1].start == 5 - assert doc_spans[1].length == 10 - - -@pytest.mark.unit -def test_get_average_dist_to_tok_start_and_end(): - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_span = _DocSpan(start=0, length=5) - - tok_start_position = 1 - tok_end_position = 3 - - assert 2 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=5) - - tok_start_position = 1 - tok_end_position = 2 - - assert 6 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=4) - - tok_start_position = 1 - tok_end_position = 2 - - assert 5 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - -@pytest.mark.unit -def test_keep_relevant_docspans(): - - _DocSpan = collections.namedtuple("DocSpan", 
["start", "length"]) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'all' - assert doc_spans == QADataset.keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = -1 - tok_end_position = -1 - - mode = 'only_positive' - - expected_doc_spans = [] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'only_positive' - - expected_doc_spans = [_DocSpan(start=0, length=5), _DocSpan(start=1, length=5)] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'limited_negative' - - expected_doc_spans = [_DocSpan(start=start, length=5) for start in range(10)] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - -@pytest.mark.unit -def test_gpt_no_pad_loss_masking(): - input_ids = [1] * 15 + [50257] * 15 - input_ids = torch.tensor(input_ids) - - input_attn_mask = [1] * 16 + [0] * 14 - input_attn_mask = torch.Tensor(input_attn_mask) - - training_mask_end = 10 - - expected_labels = [-100] * 10 + [1] * 5 + [50257] + [-100] * 14 - expected_labels = torch.tensor(expected_labels) - - labels = GPTQADataset.update_labels_for_no_pad_loss(input_ids, training_mask_end, input_attn_mask) - - assert torch.all(labels.eq(expected_labels)) diff --git a/tests/collections/nlp/test_question_answering.py b/tests/collections/nlp/test_question_answering.py deleted file mode 100644 index c4aacf449c50..000000000000 --- 
a/tests/collections/nlp/test_question_answering.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -from pydoc import doc - -import pytest - -from nemo.collections.nlp.data.question_answering_squad.qa_dataset import SquadDataset -from nemo.collections.nlp.data.question_answering_squad.qa_squad_processing import ( - _get_tokens, - exact_match_score, - f1_score, -) - - -@pytest.mark.unit -def test_get_tokens(): - sentence = 'I am happy' - tokens = ['i', 'am', 'happy'] - assert tokens == _get_tokens(sentence) - - sentence = 'I am a person' - tokens = ['i', 'am', 'person'] - assert tokens == _get_tokens(sentence) - - sentence = 'I am a person.' - tokens = ['i', 'am', 'person'] - assert tokens == _get_tokens(sentence) - - -@pytest.mark.unit -def test_f1_score(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - f1 = f1_score(generated_field, ground_truth_field) - assert f1 == 0.75 - - generated_field = '' - ground_truth_field = 'That' - - f1 = f1_score(generated_field, ground_truth_field) - assert f1 == 0 - - -@pytest.mark.unit -def test_exact_match_score(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - em = exact_match_score(generated_field, ground_truth_field) - assert em == 0 - - generated_field = 'That is so good!' - ground_truth_field = 'That is so good.' 
- - em = exact_match_score(generated_field, ground_truth_field) - assert em == 1 - - generated_field = 'That is so good' - ground_truth_field = 'that is so good' - - em = exact_match_score(generated_field, ground_truth_field) - assert em == 1 - - -@pytest.mark.unit -def test_split_into_words(): - text = 'hi yo' - char_to_word_offset = [0, 0, 0, 1, 1] - doc_tokens = ["hi", "yo"] - output = SquadDataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - text = 'i am good' - char_to_word_offset = [0, 0, 1, 1, 1, 2, 2, 2, 2] - doc_tokens = ["i", "am", 'good'] - output = SquadDataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - -@pytest.mark.unit -def test_get_doc_spans(): - all_doc_tokens = ['a'] * 15 - max_tokens_for_doc = 10 - doc_stride = 5 - doc_spans = SquadDataset.get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride) - - assert len(doc_spans) == 2 - assert doc_spans[0].start == 0 - assert doc_spans[0].length == 10 - assert doc_spans[1].start == 5 - assert doc_spans[1].length == 10 - - -@pytest.mark.unit -def test_get_average_dist_to_tok_start_and_end(): - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_span = _DocSpan(start=0, length=5) - - tok_start_position = 1 - tok_end_position = 3 - - assert 2 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=5) - - tok_start_position = 1 - tok_end_position = 2 - - assert 6 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=4) - - tok_start_position = 1 - tok_end_position = 2 - - assert 5 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - -@pytest.mark.unit -def test_keep_relevant_docspans(): - - _DocSpan = collections.namedtuple("DocSpan", ["start", 
"length"]) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'all' - assert doc_spans == SquadDataset.keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = -1 - tok_end_position = -1 - - mode = 'only_positive' - - expected_doc_spans = [] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'only_positive' - - expected_doc_spans = [_DocSpan(start=0, length=5), _DocSpan(start=1, length=5)] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'limited_negative' - - expected_doc_spans = [_DocSpan(start=start, length=5) for start in range(10)] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) diff --git a/tests/collections/nlp/test_spellchecking_asr_customization.py b/tests/collections/nlp/test_spellchecking_asr_customization.py deleted file mode 100644 index 8e4d6e9a7b8f..000000000000 --- a/tests/collections/nlp/test_spellchecking_asr_customization.py +++ /dev/null @@ -1,1102 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from transformers import AutoTokenizer - -from nemo.collections.nlp.data.spellchecking_asr_customization.bert_example import BertExampleBuilder -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( - apply_replacements_to_text, - substitute_replacements_in_text, -) - - -@pytest.mark.unit -def test_substitute_replacements_in_text(): - text = "we began the further diversification of our revenue base with the protterra supply agreement and the navastar joint development agreement" - replacements = [(66, 75, 'pro-terra', 0.99986), (101, 109, 'navistar', 0.996)] - gold_text = "we began the further diversification of our revenue base with the pro-terra supply agreement and the navistar joint development agreement" - corrected_text = substitute_replacements_in_text(text, replacements, replace_hyphen_to_space=False) - assert corrected_text == gold_text - - gold_text_no_hyphen = "we began the further diversification of our revenue base with the pro terra supply agreement and the navistar joint development agreement" - corrected_text = substitute_replacements_in_text(text, replacements, replace_hyphen_to_space=True) - assert corrected_text == gold_text_no_hyphen - - -@pytest.mark.unit -def test_apply_replacements_to_text(): - - # min_prob = 0.5 - # dp_data = None, - # min_dp_score_per_symbol: float = -99.9 - - # test more than one fragment to replace, test multiple same replacements - text = "we began the further diversification of our revenue base with the protterra supply agreement and the navastar joint development 
agreement" - replacements = [ - (66, 75, 'proterra', 0.99986), - (66, 75, 'proterra', 0.9956), - (101, 109, 'navistar', 0.93), - (101, 109, 'navistar', 0.91), - (101, 109, 'navistar', 0.92), - ] - gold_text = "we began the further diversification of our revenue base with the proterra supply agreement and the navistar joint development agreement" - corrected_text = apply_replacements_to_text( - text, replacements, min_prob=0.5, replace_hyphen_to_space=False, dp_data=None - ) - assert corrected_text == gold_text - - # test that min_prob works - gold_text = "we began the further diversification of our revenue base with the proterra supply agreement and the navastar joint development agreement" - corrected_text = apply_replacements_to_text( - text, replacements, min_prob=0.95, replace_hyphen_to_space=False, dp_data=None - ) - assert corrected_text == gold_text - - -@pytest.fixture() -def bert_example_builder(): - tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_6L_768D") - label_map = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10} - semiotic_classes = {"PLAIN": 0, "CUSTOM": 1} - max_seq_len = 256 - builder = BertExampleBuilder(label_map, semiotic_classes, tokenizer, max_seq_len) - return builder - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_creation(bert_example_builder): - assert bert_example_builder._tokenizer is not None - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_spans(bert_example_builder): - span_info_parts = ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] - gold_sorted_spans = [(1, 1, 8), (1, 38, 42), (1, 43, 47), (1, 48, 53)] - spans = bert_example_builder._get_spans(span_info_parts) - spans.sort() - assert spans == gold_sorted_spans - - -@pytest.mark.skip("Doesn't 
work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_fragment_indices(bert_example_builder): - hyp = "a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w" - targets = [1] - # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w - # 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 - span_info_parts = ["CUSTOM 8 17"] - gold_sorted_fragment_indices = [(7, 18, 1), (11, 18, 1)] - fragment_indices = bert_example_builder._get_fragment_indices(hyp, targets, span_info_parts) - fragment_indices.sort() - assert fragment_indices == gold_sorted_fragment_indices - - # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w - # 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - span_info_parts = ["CUSTOM 10 16"] - gold_sorted_fragment_indices = [(11, 18, 1)] - fragment_indices = bert_example_builder._get_fragment_indices(hyp, targets, span_info_parts) - fragment_indices.sort() - assert fragment_indices == gold_sorted_fragment_indices - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_input_features(bert_example_builder): - hyp = "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - ref = "d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y" - targets = [1, 3] - span_info_parts = ["CUSTOM 12 23", "CUSTOM 28 41"] - - gold_tags = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - ] - gold_input_ids = [ - 101, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1041, - 1054, - 1055, - 1035, - 1040, - 1045, - 1040, - 1045, - 
1041, - 1035, - 1055, - 1051, - 1049, - 1051, - 1050, - 1035, - 1037, - 1050, - 1040, - 1035, - 1056, - 1054, - 1045, - 1055, - 1056, - 1045, - 1037, - 1050, - 1035, - 1043, - 1048, - 1048, - 1051, - 102, - 1040, - 1045, - 1040, - 1045, - 1041, - 1054, - 1035, - 1055, - 1037, - 1057, - 1049, - 1051, - 1050, - 102, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1045, - 1041, - 102, - 1056, - 1054, - 1045, - 1055, - 1056, - 1037, - 1050, - 1035, - 1043, - 1057, - 1045, - 1048, - 1048, - 1051, - 1056, - 102, - 1056, - 1054, - 1045, - 1055, - 1056, - 1041, - 1055, - 1055, - 1041, - 102, - 1049, - 1051, - 1050, - 1037, - 1040, - 1041, - 102, - 1039, - 1044, - 1054, - 1045, - 1055, - 1056, - 1045, - 1037, - 1050, - 102, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1041, - 1054, - 102, - 1055, - 1051, - 1048, - 1051, - 1049, - 1051, - 1050, - 102, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 102, - 1049, - 1041, - 1054, - 1039, - 1061, - 102, - ] - gold_input_maskgold_segment_ids = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 5, - 5, - 5, - 5, - 5, - 5, - 5, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 10, - 10, - 10, - 10, - 10, - 10, - ] - gold_labels_mask = [ - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 
1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - ] - gold_input_ids_for_subwords = [ - 101, - 26357, - 2106, - 2666, - 2061, - 8202, - 1998, - 13012, - 16643, - 2319, - 1043, - 7174, - 102, - 2106, - 3771, - 7842, - 2819, - 2239, - 102, - 28625, - 3630, - 9856, - 102, - 9822, - 26458, - 7174, - 2102, - 102, - 13012, - 13473, - 11393, - 102, - 13813, - 3207, - 102, - 3017, - 102, - 15211, - 102, - 9168, - 102, - 2106, - 28173, - 4305, - 4305, - 102, - 8673, - 102, - ] - gold_input_mask_for_subwords = [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - ] - gold_segment_ids_for_subwords = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 3, - 3, - 3, - 3, - 3, - 4, - 4, - 4, - 4, - 5, - 5, - 5, - 6, - 6, - 7, - 7, - 8, - 8, - 9, - 9, - 9, - 9, - 9, - 10, - 10, - ] - gold_character_pos_to_subword_pos = [ - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 3, - 3, - 3, - 4, - 4, - 5, - 5, - 5, - 5, - 6, - 6, - 6, - 6, - 7, - 7, - 7, - 8, - 8, - 8, - 9, - 9, - 9, - 10, - 11, - 11, - 11, - 12, - 13, - 13, - 13, - 14, - 14, - 14, - 14, - 15, - 15, - 16, - 16, - 17, - 17, - 18, - 19, - 19, - 19, - 19, - 19, - 20, - 20, - 21, - 21, - 21, - 22, - 23, - 23, - 23, - 23, - 23, 
- 23, - 23, - 23, - 24, - 24, - 24, - 25, - 25, - 25, - 26, - 27, - 28, - 28, - 28, - 29, - 29, - 29, - 30, - 30, - 30, - 31, - 32, - 32, - 32, - 32, - 33, - 33, - 34, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 36, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 38, - 39, - 39, - 39, - 39, - 39, - 39, - 39, - 40, - 41, - 41, - 41, - 42, - 42, - 42, - 43, - 43, - 44, - 44, - 45, - 46, - 46, - 46, - 46, - 46, - 47, - ] - - tags = [0 for _ in hyp.split()] - for p, t in zip(span_info_parts, targets): - c, start, end = p.split(" ") - start = int(start) - end = int(end) - tags[start:end] = [t for i in range(end - start)] - - # get input features for characters - (input_ids, input_mask, segment_ids, labels_mask, labels, _, _,) = bert_example_builder._get_input_features( - hyp=hyp, ref=ref, tags=tags - ) - - # get input features for words - hyp_with_words = hyp.replace(" ", "").replace("_", " ") - ref_with_words = ref.replace(" ", "").replace("_", " ") - ( - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - _, - _, - _, - _, - ) = bert_example_builder._get_input_features(hyp=hyp_with_words, ref=ref_with_words, tags=None) - - character_pos_to_subword_pos = bert_example_builder._map_characters_to_subwords(input_ids, input_ids_for_subwords) - - assert tags == gold_tags - assert input_ids == gold_input_ids - assert input_mask == gold_input_mask - assert segment_ids == gold_segment_ids - assert labels_mask == gold_labels_mask - assert input_ids_for_subwords == gold_input_ids_for_subwords - assert input_mask_for_subwords == gold_input_mask_for_subwords - assert segment_ids_for_subwords == gold_segment_ids_for_subwords - assert character_pos_to_subword_pos == gold_character_pos_to_subword_pos diff --git a/tests/deploy/pytriton_deploy.py b/tests/deploy/pytriton_deploy.py new file mode 100644 index 000000000000..3b722d2d7fec --- /dev/null +++ b/tests/deploy/pytriton_deploy.py @@ -0,0 +1,136 @@ +import argparse + +import 
numpy as np +from pytriton.client import ModelClient + +from nemo.deploy.deploy_pytriton import DeployPyTriton +from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable +from nemo.deploy.nlp.query_llm import NemoTritonQueryLLMPyTorch + + +def test_triton_deployable(args): + megatron_deployable = MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus) + + prompts = ["What is the biggest planet in the solar system?", "What is the fastest steam locomotive in history?"] + url = "localhost:8000" + model_name = args.model_name + init_timeout = 600.0 + + nm = DeployPyTriton( + model=megatron_deployable, + triton_model_name=model_name, + triton_model_version=1, + max_batch_size=8, + port=8000, + address="0.0.0.0", + streaming=False, + ) + nm.deploy() + nm.run() + + # run once with NemoTritonQueryLLMPyTorch + nemo_triton_query = NemoTritonQueryLLMPyTorch(url, model_name) + + result_dict = nemo_triton_query.query_llm( + prompts, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + max_length=args.max_output_token, + init_timeout=init_timeout, + ) + print("NemoTritonQueryLLMPyTriton result:") + print(result_dict) + + # run once with ModelClient, the results should be identical + str_ndarray = np.array(prompts)[..., np.newaxis] + prompts = np.char.encode(str_ndarray, "utf-8") + max_output_token = np.full(prompts.shape, args.max_output_token, dtype=np.int_) + top_k = np.full(prompts.shape, args.top_k, dtype=np.int_) + top_p = np.full(prompts.shape, args.top_p, dtype=np.single) + temperature = np.full(prompts.shape, args.temperature, dtype=np.single) + + with ModelClient(url, model_name, init_timeout_s=init_timeout) as client: + result_dict = client.infer_batch( + prompts=prompts, + max_length=max_output_token, + top_k=top_k, + top_p=top_p, + temperature=temperature, + ) + print("ModelClient result:") + print(result_dict) + + # test logprobs generation + # right now we don't support batches where output data is inconsistent in size, so 
submitting each prompt individually + all_probs = np.full(prompts.shape, True, dtype=np.bool_) + compute_logprob = np.full(prompts.shape, True, dtype=np.bool_) + with ModelClient(url, model_name, init_timeout_s=init_timeout) as client: + logprob_results = client.infer_batch( + prompts=prompts, + max_length=max_output_token, + top_k=top_k, + top_p=top_p, + temperature=temperature, + all_probs=all_probs, + compute_logprob=compute_logprob, + ) + print("Logprob results:") + print(logprob_results) + + nm.stop() + + +def get_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f"Deploy nemo models to Triton and benchmark the models", + ) + + parser.add_argument( + "--model_name", + type=str, + required=True, + ) + parser.add_argument( + "--num_gpus", + type=int, + default=1, + ) + parser.add_argument( + "--nemo_checkpoint", + type=str, + required=True, + ) + parser.add_argument( + "--max_batch_size", + type=int, + default=8, + ) + parser.add_argument( + "--max_output_token", + type=int, + default=128, + ) + parser.add_argument( + "--top_k", + type=int, + default=1, + ) + parser.add_argument( + "--top_p", + type=float, + default=0.0, + ) + parser.add_argument( + "--temperature", + type=float, + default=1.0, + ) + + return parser.parse_args() + + +if __name__ == '__main__': + args = get_args() + test_triton_deployable(args) diff --git a/tests/export/test_nemo_export.py b/tests/export/nemo_export.py similarity index 94% rename from tests/export/test_nemo_export.py rename to tests/export/nemo_export.py index bac592c90cc2..5541cc0f8673 100644 --- a/tests/export/test_nemo_export.py +++ b/tests/export/nemo_export.py @@ -128,6 +128,7 @@ def run_trt_llm_inference( trt_llm_model_dir, n_gpu=1, max_batch_size=8, + use_embedding_sharing=False, max_input_len=128, max_output_len=128, ptuning=False, @@ -216,6 +217,7 @@ def run_trt_llm_inference( lora_target_modules=lora_target_modules, max_num_tokens=int(max_input_len * 
max_batch_size * 0.2), opt_num_tokens=60, + use_embedding_sharing=use_embedding_sharing, save_nemo_model_config=True, ) @@ -237,6 +239,14 @@ def run_trt_llm_inference( stop_words_list=stop_words_list, ) + if not use_lora_plugin and not ptuning: + test_cpp_runtime( + engine_path=trt_llm_model_dir, + prompt=prompt, + max_output_len=max_output_len, + debug=True, + ) + nq = None nm = None output_deployed = "" @@ -290,6 +300,27 @@ def run_trt_llm_inference( raise Exception("Checkpoint {0} could not be found.".format(checkpoint_path)) +def test_cpp_runtime( + engine_path, + prompt, + max_output_len, + debug, +): + trt_llm_exporter = TensorRTLLM(engine_path, load_model=True) + output = trt_llm_exporter.forward( + input_texts=prompt, + max_output_len=max_output_len, + top_k=1, + top_p=0.0, + temperature=1.0, + ) + + if debug: + print("") + print("--- Output deployed with cpp runtime: ", output) + print("") + + def run_existing_checkpoints( model_name, n_gpus, @@ -332,6 +363,12 @@ def run_existing_checkpoints( else: raise Exception("There is not lora checkpoint path defined.") + if model_info["model_type"] == "gemma": + print("*********************") + use_embedding_sharing = True + else: + use_embedding_sharing = False + return run_trt_llm_inference( model_name=model_name, model_type=model_info["model_type"], @@ -340,6 +377,7 @@ def run_existing_checkpoints( trt_llm_model_dir=model_info["trt_llm_model_dir"], n_gpu=n_gpus, max_batch_size=model_info["max_batch_size"], + use_embedding_sharing=use_embedding_sharing, max_input_len=512, max_output_len=model_info["max_output_len"], ptuning=ptuning, diff --git a/tests/export/run.sh b/tests/export/run.sh index 0071b1351113..b3badd25a8f9 100644 --- a/tests/export/run.sh +++ b/tests/export/run.sh @@ -20,32 +20,32 @@ for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done set +x -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --max_gpus 2 -python 
tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --streaming -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 2 --tp_size 1 --pp_size 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 4 --tp_size 2 --pp_size 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 8 --tp_size 1 --pp_size 8 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --lora --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-code --existing_test_models --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-fp8 --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-int4 --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-int8 --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base-fp8 --existing_test_models --min_gpus 2 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base-int4 --existing_test_models --min_gpus 2 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base --existing_test_models --min_gpus 2 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base-fp8 --existing_test_models --min_gpus 8 --max_gpus 8 -python 
tests/export/test_nemo_export.py --model_name LLAMA2-70B-base-int4 --existing_test_models --min_gpus 8 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Base-4k --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-QA-4k --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-SFT --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-RLHF --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-SteerLM --existing_test_models --min_gpus 1 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name GPT-43B-Base --existing_test_models --min_gpus 2 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name FALCON-7B-base --existing_test_models --min_gpus 1 --max_gpus 2 -python tests/export/test_nemo_export.py --model_name FALCON-40B-base --existing_test_models --min_gpus 2 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name FALCON-180B-base --existing_test_models --min_gpus 8 --max_gpus 8 -python tests/export/test_nemo_export.py --model_name STARCODER1-15B-base --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name GEMMA-base --existing_test_models --min_gpus 1 --max_gpus 1 \ No newline at end of file +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 1 --streaming +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 2 --tp_size 1 --pp_size 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --min_gpus 4 --tp_size 2 --pp_size 2 +python tests/export/nemo_export.py --model_name 
LLAMA2-7B-base --existing_test_models --min_gpus 8 --tp_size 1 --pp_size 8 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --lora --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-code --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base-fp8 --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base-int4 --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/nemo_export.py --model_name LLAMA2-7B-base-int8 --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-13B-base-fp8 --existing_test_models --min_gpus 2 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-13B-base-int4 --existing_test_models --min_gpus 2 --max_gpus 2 +python tests/export/nemo_export.py --model_name LLAMA2-70B-base --existing_test_models --min_gpus 2 --max_gpus 8 +python tests/export/nemo_export.py --model_name LLAMA2-70B-base-fp8 --existing_test_models --min_gpus 8 --max_gpus 8 +python tests/export/nemo_export.py --model_name LLAMA2-70B-base-int4 --existing_test_models --min_gpus 8 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-Base-4k --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-QA-4k --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-SFT --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py 
--model_name NV-GPT-8B-Chat-4k-RLHF --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name NV-GPT-8B-Chat-4k-SteerLM --existing_test_models --min_gpus 1 --max_gpus 8 +python tests/export/nemo_export.py --model_name GPT-43B-Base --existing_test_models --min_gpus 2 --max_gpus 8 +python tests/export/nemo_export.py --model_name FALCON-7B-base --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/nemo_export.py --model_name FALCON-40B-base --existing_test_models --min_gpus 2 --max_gpus 8 +python tests/export/nemo_export.py --model_name FALCON-180B-base --existing_test_models --min_gpus 8 --max_gpus 8 +python tests/export/nemo_export.py --model_name STARCODER1-15B-base --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/nemo_export.py --model_name GEMMA-base --existing_test_models --min_gpus 1 --max_gpus 1 \ No newline at end of file diff --git a/tests/lightning/test_megatron_parallel.py b/tests/lightning/test_megatron_parallel.py index 31d20170c0b6..fafd25e49f5a 100644 --- a/tests/lightning/test_megatron_parallel.py +++ b/tests/lightning/test_megatron_parallel.py @@ -55,7 +55,7 @@ def test_init_with_defaults(self, mocker, mock_pipeline): mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_world_size', return_value=1) mocker.patch('megatron.core.parallel_state.model_parallel_is_initialized', return_value=False) - megatron_parallel = mp.MegatronParallel(pipeline=mock_pipeline) + megatron_parallel = mp.MegatronParallel(pipeline=mock_pipeline, cpu=True) assert megatron_parallel.pipeline == mock_pipeline assert megatron_parallel.precision_plugin is None @@ -85,6 +85,7 @@ def test_init_with_custom_parameters( data_step=mock_data_step, forward_step=mock_forward_step, loss_reduction=mock_loss_reduction, + cpu=True, ) assert megatron_parallel.pipeline == mock_pipeline diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb deleted file mode 100644 index 
ddd3bdd4f929..000000000000 --- a/tutorials/nlp/Dialogue.ipynb +++ /dev/null @@ -1,717 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "jaosjY4rGRNH" - }, - "source": [ - "# Installing NeMo from source\n", - "\n", - "\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run the cell below to set up dependencies.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "goQzOSflEq27" - }, - "outputs": [], - "source": [ - "import os \n", - "BRANCH = 'main'\n", - "!apt-get update && apt-get install -y libsndfile1 ffmpeg\n", - "!git clone https://github.com/NVIDIA/NeMo --branch $BRANCH\n", - "os.chdir('NeMo')\n", - "!./reinstall.sh\n", - "os.chdir('..')\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GjQ_z_xQMDIb" - }, - "source": [ - "# Overview\n", - "\n", - "There are three tasks as part of this tutorial\n", - "\n", - "1. Intent and Slot Classification using Assistant Dataset and a BERT model\n", - "2. Intent Classification using Schema Guided Dialogue Dataset and a GPT2 model\n", - "3. Answer Extender using MS Marco NLGen Dataset and a BART model\n", - "\n", - "Feel free to skip to the task that interests you most after installing NeMo from source." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AS-zwy8tEq2_" - }, - "source": [ - "# 1. 
Intent and Slot Classification using Assistant Dataset\n", - "\n", - "## 1.1 Task Description\n", - "\n", - "**Joint Intent and Slot classification** - is a task of classifying an Intent and detecting all relevant Slots (Entities)\n", - "for this Intent in a query.\n", - "For example, in the query: `What is the weather in Santa Clara tomorrow morning?`, we would like to classify the query\n", - "as a `weather` Intent, and detect `Santa Clara` as a `location` slot and `tomorrow morning` as a `date_time` slot.\n", - "Intents and Slots names are usually task specific and defined as labels in the training data.\n", - "This is a fundamental step that is executed in any task-driven Conversational Assistant.\n", - "\n", - "Our model enables to train and then detect both of these tasks together.\n", - "\n", - "Note: There is a similar model available at [Joint Intent Slot Classification Colab](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb). However, this model only support BERT style models while the model in this tutorial supports other types of models such as GPT2. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FJk_UAyeEq3B" - }, - "source": [ - "\n", - "## 1.2 Download Assistant dataset and convert to NeMo format\n", - "\n", - "This is a virtual assistant interaction data set that can be downloaded from here: https://github.com/xliuhw/NLU-Evaluation-Data.\n", - "There are about 10K training and 1K testing queries which cover 64 various Intents and 55 Slots. 
\n", - "\n", - "An example is:\n", - "\n", - "* utterance: what alarms have i set for tomorrow \n", - "* intent: alarm_query\n", - "* slots: date(tomorrow)\n", - "\n", - "\n", - "Note: While only the assistant dataset is used here, import_dataset.py is also compatible with ATIS and SNIPS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jjOVdGX2Eq3D" - }, - "outputs": [], - "source": [ - "# download and unzip the example dataset from github\n", - "!wget https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip\n", - "!unzip master.zip\n", - "# convert the dataset to the NeMo format\n", - "!python NeMo/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5n81deZsEq3G" - }, - "source": [ - "## 1.3 Training and/or Testing the model\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eoYc_8jhEq3G" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./assistant' \\\n", - " model.dataset.dialogues_example_dir='./assistant_bert_examples' \\\n", - " model.dataset.task='assistant' \\\n", - " model.language_model.pretrained_model_name='bert-base-uncased' \\\n", - " exp_manager.create_wandb_logger=False)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GaPmHjayEbg8" - }, - "source": [ - "**Results after 3 epochs**\n", - "\n", - "Intent report: \n", - "```\n", - " label precision recall f1 support \n", - " alarm_query (label_id: 0) 100.00 94.44 97.14 18\n", - " alarm_remove (label_id: 1) 100.00 90.91 95.24 11\n", - " alarm_set (label_id: 
2) 94.12 94.12 94.12 17\n", - " audio_volume_down (label_id: 3) 75.00 42.86 54.55 7\n", - " audio_volume_mute (label_id: 4) 100.00 92.86 96.30 14\n", - " audio_volume_up (label_id: 5) 72.22 100.00 83.87 13\n", - " calendar_query (label_id: 6) 87.50 77.78 82.35 18\n", - " calendar_remove (label_id: 7) 94.44 100.00 97.14 17\n", - " calendar_set (label_id: 8) 94.44 94.44 94.44 18\n", - " cooking_recipe (label_id: 9) 85.71 70.59 77.42 17\n", - " datetime_convert (label_id: 10) 88.89 100.00 94.12 8\n", - " datetime_query (label_id: 11) 89.47 100.00 94.44 17\n", - " email_addcontact (label_id: 12) 80.00 100.00 88.89 8\n", - " email_query (label_id: 13) 100.00 83.33 90.91 18\n", - " email_querycontact (label_id: 14) 78.95 88.24 83.33 17\n", - " email_sendemail (label_id: 15) 94.44 94.44 94.44 18\n", - " general_affirm (label_id: 16) 100.00 100.00 100.00 17\n", - " general_commandstop (label_id: 17) 100.00 100.00 100.00 18\n", - " general_confirm (label_id: 18) 100.00 100.00 100.00 17\n", - " general_dontcare (label_id: 19) 100.00 100.00 100.00 18\n", - " general_explain (label_id: 20) 100.00 100.00 100.00 17\n", - " general_joke (label_id: 21) 91.67 100.00 95.65 11\n", - " general_negate (label_id: 22) 100.00 100.00 100.00 18\n", - " general_praise (label_id: 23) 100.00 100.00 100.00 17\n", - " general_quirky (label_id: 24) 60.00 50.00 54.55 18\n", - " general_repeat (label_id: 25) 100.00 100.00 100.00 17\n", - " iot_cleaning (label_id: 26) 100.00 100.00 100.00 15\n", - " iot_coffee (label_id: 27) 85.71 100.00 92.31 18\n", - " iot_hue_lightchange (label_id: 28) 100.00 94.12 96.97 17\n", - " iot_hue_lightdim (label_id: 29) 100.00 100.00 100.00 12\n", - " iot_hue_lightoff (label_id: 30) 100.00 100.00 100.00 17\n", - " iot_hue_lighton (label_id: 31) 100.00 50.00 66.67 4\n", - " iot_hue_lightup (label_id: 32) 84.62 91.67 88.00 12\n", - " iot_wemo_off (label_id: 33) 100.00 100.00 100.00 9\n", - " iot_wemo_on (label_id: 34) 100.00 85.71 92.31 7\n", - " lists_createoradd 
(label_id: 35) 90.00 100.00 94.74 18\n", - " lists_query (label_id: 36) 100.00 94.12 96.97 17\n", - " lists_remove (label_id: 37) 88.89 88.89 88.89 18\n", - " music_likeness (label_id: 38) 100.00 93.75 96.77 16\n", - " music_query (label_id: 39) 100.00 100.00 100.00 17\n", - " music_settings (label_id: 40) 77.78 100.00 87.50 7\n", - " news_query (label_id: 41) 72.73 88.89 80.00 18\n", - " play_audiobook (label_id: 42) 100.00 100.00 100.00 17\n", - " play_game (label_id: 43) 93.75 83.33 88.24 18\n", - " play_music (label_id: 44) 85.00 100.00 91.89 17\n", - " play_podcasts (label_id: 45) 100.00 88.89 94.12 18\n", - " play_radio (label_id: 46) 84.21 94.12 88.89 17\n", - " qa_currency (label_id: 47) 85.00 94.44 89.47 18\n", - " qa_definition (label_id: 48) 89.47 100.00 94.44 17\n", - " qa_factoid (label_id: 49) 64.00 88.89 74.42 18\n", - " qa_maths (label_id: 50) 84.62 84.62 84.62 13\n", - " qa_stock (label_id: 51) 87.50 77.78 82.35 18\n", - " recommendation_events (label_id: 52) 87.50 82.35 84.85 17\n", - " recommendation_locations (label_id: 53) 83.33 83.33 83.33 18\n", - " recommendation_movies (label_id: 54) 100.00 60.00 75.00 10\n", - " social_post (label_id: 55) 100.00 94.12 96.97 17\n", - " social_query (label_id: 56) 100.00 82.35 90.32 17\n", - " takeaway_order (label_id: 57) 92.31 70.59 80.00 17\n", - " takeaway_query (label_id: 58) 93.75 83.33 88.24 18\n", - " transport_query (label_id: 59) 81.25 76.47 78.79 17\n", - " transport_taxi (label_id: 60) 100.00 100.00 100.00 16\n", - " transport_ticket (label_id: 61) 85.00 94.44 89.47 18\n", - " transport_traffic (label_id: 62) 93.75 88.24 90.91 17\n", - " weather_query (label_id: 63) 89.47 100.00 94.44 17\n", - " -------------------\n", - " micro avg 91.16 91.16 91.16 996\n", - " macro avg 91.66 90.44 90.48 996\n", - " weighted avg 91.72 91.16 91.04 996\n", - "```\n", - "Slot report: \n", - "```\n", - " label precision recall f1 support \n", - " alarm_type (label_id: 0) 0.00 0.00 0.00 2\n", - " app_name (label_id: 
1) 0.00 0.00 0.00 1\n", - " artist_name (label_id: 2) 17.39 80.00 28.57 5\n", - " audiobook_author (label_id: 3) 0.00 0.00 0.00 0\n", - " audiobook_name (label_id: 4) 64.52 74.07 68.97 27\n", - " business_name (label_id: 5) 81.48 84.62 83.02 52\n", - " business_type (label_id: 6) 80.00 80.00 80.00 20\n", - " change_amount (label_id: 7) 57.14 66.67 61.54 6\n", - " coffee_type (label_id: 8) 100.00 33.33 50.00 3\n", - " color_type (label_id: 9) 75.00 92.31 82.76 13\n", - " cooking_type (label_id: 10) 0.00 0.00 0.00 1\n", - " currency_name (label_id: 11) 100.00 96.43 98.18 28\n", - " date (label_id: 12) 87.88 87.22 87.55 133\n", - " definition_word (label_id: 13) 85.00 85.00 85.00 20\n", - " device_type (label_id: 14) 84.75 76.92 80.65 65\n", - " drink_type (label_id: 15) 0.00 0.00 0.00 0\n", - " email_address (label_id: 16) 64.29 100.00 78.26 9\n", - " email_folder (label_id: 17) 100.00 50.00 66.67 2\n", - " event_name (label_id: 18) 80.00 75.00 77.42 64\n", - " food_type (label_id: 19) 84.38 77.14 80.60 35\n", - " game_name (label_id: 20) 93.55 78.38 85.29 37\n", - " game_type (label_id: 21) 0.00 0.00 0.00 0\n", - " general_frequency (label_id: 22) 0.00 0.00 0.00 9\n", - " house_place (label_id: 23) 80.95 91.89 86.08 37\n", - " ingredient (label_id: 24) 0.00 0.00 0.00 1\n", - " joke_type (label_id: 25) 100.00 100.00 100.00 5\n", - " list_name (label_id: 26) 89.29 69.44 78.12 36\n", - " meal_type (label_id: 27) 0.00 0.00 0.00 3\n", - " media_type (label_id: 28) 78.95 83.33 81.08 36\n", - " movie_name (label_id: 29) 0.00 0.00 0.00 1\n", - " movie_type (label_id: 30) 0.00 0.00 0.00 0\n", - " music_album (label_id: 31) 0.00 0.00 0.00 0\n", - " music_descriptor (label_id: 32) 0.00 0.00 0.00 2\n", - " music_genre (label_id: 33) 81.82 90.00 85.71 10\n", - " news_topic (label_id: 34) 80.00 30.77 44.44 13\n", - " order_type (label_id: 35) 100.00 42.11 59.26 19\n", - " person (label_id: 36) 70.79 100.00 82.89 63\n", - " personal_info (label_id: 37) 76.19 94.12 84.21 17\n", - " 
place_name (label_id: 38) 82.86 84.47 83.65 103\n", - " player_setting (label_id: 39) 75.00 42.86 54.55 7\n", - " playlist_name (label_id: 40) 0.00 0.00 0.00 3\n", - " podcast_descriptor (label_id: 41) 92.31 54.55 68.57 22\n", - " podcast_name (label_id: 42) 66.67 16.67 26.67 12\n", - " radio_name (label_id: 43) 94.87 94.87 94.87 39\n", - " relation (label_id: 44) 90.91 90.91 90.91 11\n", - " song_name (label_id: 45) 100.00 6.67 12.50 15\n", - " time (label_id: 46) 77.57 84.69 80.98 98\n", - " time_zone (label_id: 47) 44.44 100.00 61.54 4\n", - " timeofday (label_id: 48) 86.96 80.00 83.33 25\n", - " transport_agency (label_id: 49) 80.00 57.14 66.67 7\n", - " transport_descriptor (label_id: 50) 0.00 0.00 0.00 5\n", - " transport_name (label_id: 51) 0.00 0.00 0.00 0\n", - " transport_type (label_id: 52) 88.89 100.00 94.12 40\n", - " weather_descriptor (label_id: 53) 87.50 87.50 87.50 8\n", - " O (label_id: 54) 97.07 97.52 97.30 5408\n", - " -------------------\n", - " micro avg 94.24 94.24 94.24 6582\n", - " macro avg 64.87 59.93 59.17 6582\n", - " weighted avg 94.23 94.24 93.95 6582\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-44x5PqyrOeQ" - }, - "source": [ - "## 1.4 (Optional) To train/ test a GPT2 model on the assistant dataset, run the cell below " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QyqQbpR4rNHT" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "# model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\": gpt2 doesn't specify a pad token, therefore using its EOS token as the pad token\n", - "# model.dataset.target_template=with_slots: this perform slot filling with intent classification\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./assistant' \\\n", - " 
model.dataset.dialogues_example_dir='./assistant_gpt2_examples' \\\n", - " model.dataset.task='assistant' \\\n", - " model.language_model.pretrained_model_name='gpt2' \\\n", - " trainer.max_epochs=1 \\\n", - " model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\" \\\n", - " model.dataset.target_template=with_slots \\\n", - " model.dataset.eval_mode=generation \\\n", - " exp_manager.create_wandb_logger=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FbQ-6TVM1yQg" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "More epochs would be helpful\n", - "\n", - "Intent report:\n", - "\n", - " ```\n", - " label precision recall f1 support \n", - " transport query (label_id: 0) 72.73 84.21 78.05 19\n", - " weather query (label_id: 1) 94.74 94.74 94.74 19\n", - " play game (label_id: 2) 92.86 68.42 78.79 19\n", - " qa currency (label_id: 3) 100.00 100.00 100.00 19\n", - " qa maths (label_id: 4) 100.00 100.00 100.00 14\n", - " iot wemo off (label_id: 5) 75.00 100.00 85.71 9\n", - " datetime convert (label_id: 6) 46.67 87.50 60.87 8\n", - " email addcontact (label_id: 7) 70.00 87.50 77.78 8\n", - " music likeness (label_id: 8) 57.89 61.11 59.46 18\n", - " music query (label_id: 9) 78.57 57.89 66.67 19\n", - " general negate (label_id: 10) 95.00 100.00 97.44 19\n", - " email sendemail (label_id: 11) 92.86 68.42 78.79 19\n", - " general affirm (label_id: 12) 95.00 100.00 97.44 19\n", - " play audiobook (label_id: 13) 57.69 78.95 66.67 19\n", - " general praise (label_id: 14) 100.00 94.74 97.30 19\n", - " alarm set (label_id: 15) 85.71 94.74 90.00 19\n", - " general explain (label_id: 16) 100.00 89.47 94.44 19\n", - " iot wemo on (label_id: 17) 83.33 71.43 76.92 7\n", - " cooking recipe (label_id: 18) 90.00 94.74 92.31 19\n", - " music settings (label_id: 19) 60.00 42.86 50.00 7\n", - " social post (label_id: 20) 84.21 84.21 84.21 19\n", - " recommendation events (label_id: 21) 72.73 84.21 78.05 19\n", - " audio volume up (label_id: 22) 
76.47 100.00 86.67 13\n", - " lists remove (label_id: 23) 73.08 100.00 84.44 19\n", - " transport ticket (label_id: 24) 94.74 94.74 94.74 19\n", - " general joke (label_id: 25) 100.00 100.00 100.00 12\n", - " play podcasts (label_id: 26) 94.12 84.21 88.89 19\n", - " iot hue lightchange (label_id: 27) 85.71 63.16 72.73 19\n", - " audio volume mute (label_id: 28) 84.62 73.33 78.57 15\n", - " general dontcare (label_id: 29) 95.00 100.00 97.44 19\n", - " qa definition (label_id: 30) 77.27 89.47 82.93 19\n", - " email querycontact (label_id: 31) 58.33 73.68 65.12 19\n", - " general commandstop (label_id: 32) 100.00 100.00 100.00 19\n", - " calendar remove (label_id: 33) 94.44 89.47 91.89 19\n", - " news query (label_id: 34) 100.00 57.89 73.33 19\n", - " calendar query (label_id: 35) 63.16 63.16 63.16 19\n", - " social query (label_id: 36) 88.24 83.33 85.71 18\n", - " transport traffic (label_id: 37) 90.48 100.00 95.00 19\n", - " transport taxi (label_id: 38) 100.00 94.44 97.14 18\n", - " alarm query (label_id: 39) 100.00 94.74 97.30 19\n", - " iot hue lightoff (label_id: 40) 88.89 84.21 86.49 19\n", - " takeaway order (label_id: 41) 81.25 68.42 74.29 19\n", - " iot coffee (label_id: 42) 100.00 94.74 97.30 19\n", - " recommendation movies (label_id: 43) 75.00 90.00 81.82 10\n", - " iot hue lightup (label_id: 44) 78.57 78.57 78.57 14\n", - " email query (label_id: 45) 85.71 94.74 90.00 19\n", - " lists createoradd (label_id: 46) 82.35 73.68 77.78 19\n", - " play radio (label_id: 47) 84.21 84.21 84.21 19\n", - " audio volume down (label_id: 48) 100.00 87.50 93.33 8\n", - " general quirky (label_id: 49) 30.00 15.79 20.69 19\n", - " play music (label_id: 50) 71.43 52.63 60.61 19\n", - " qa stock (label_id: 51) 90.48 100.00 95.00 19\n", - " iot cleaning (label_id: 52) 93.33 87.50 90.32 16\n", - " iot hue lightdim (label_id: 53) 100.00 100.00 100.00 12\n", - " recommendation locations (label_id: 54) 100.00 89.47 94.44 19\n", - " general repeat (label_id: 55) 100.00 100.00 
100.00 19\n", - " takeaway query (label_id: 56) 77.27 89.47 82.93 19\n", - " alarm remove (label_id: 57) 100.00 100.00 100.00 11\n", - " datetime query (label_id: 58) 75.00 63.16 68.57 19\n", - " iot hue lighton (label_id: 59) 60.00 100.00 75.00 3\n", - " qa factoid (label_id: 60) 50.00 57.89 53.66 19\n", - " calendar set (label_id: 61) 75.00 78.95 76.92 19\n", - " general confirm (label_id: 62) 100.00 100.00 100.00 19\n", - " lists query (label_id: 63) 66.67 73.68 70.00 19\n", - " label_id: 64 0.00 0.00 0.00 0\n", - " -------------------\n", - " micro avg 83.55 83.55 83.55 1076\n", - " macro avg 83.53 83.93 83.01 1076\n", - " weighted avg 84.26 83.55 83.30 1076\n", - " \n", - "```\n", - "\n", - "```\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " Test metric DataLoader 0\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " intent_f1 83.55018615722656\n", - " intent_precision 83.55018615722656\n", - " intent_recall 83.55018615722656\n", - " slot_f1 73.99985919756773\n", - "slot_joint_goal_accuracy 65.89219330855019\n", - " slot_precision 73.85223048327137\n", - " slot_recall 74.14807930607186\n", - " test_intent_accuracy 83.55018587360595\n", - " test_loss_epoch 0.019178826361894608\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gd42arYoEq3J" - }, - "source": [ - "# 2. 
Schema Guided Dialogue (SGD)\n", - "\n", - "## 2.1 Task Description\n", - "---\n", - "\n", - "SGD is a multi-domain intent classification dataset from Google with close to 100k examples.\n", - "\n", - "An example is:\n", - "\n", - "* utterance: I will be eating there at 11:30 am so make the reservation for then.\n", - "* intent: ReserveRestaurant\n", - "* slots: {\"time\": \"11:30 am\"}\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "neH8rXwjEq3J" - }, - "source": [ - "## 2.2 Download the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "IgD8eavfJ5pi" - }, - "outputs": [], - "source": [ - "!git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7G7uPrUpEq3J" - }, - "source": [ - "## 2.3 Training and/or Testing the model\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gqo-rwQlEq3K" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "# model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\": gpt2 doesn't specify a pad token, therefore using its EOS token as the pad token\n", - "\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./dstc8-schema-guided-dialogue' \\\n", - " model.dataset.dialogues_example_dir='./sgd_gpt2_predictions' \\\n", - " model.dataset.task='sgd' \\\n", - " model.language_model.pretrained_model_name='gpt2' \\\n", - " trainer.max_epochs=1 \\\n", - " model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\" \\\n", - " exp_manager.create_wandb_logger=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kGDlV5HvI2PQ" - }, - "outputs": [], - "source": [ - "!ls 
sgd_gpt2_predictions" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "p8g0f5KDTu9K" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "More epochs would needed to reach convergence.\n", - "\n", - "\n", - "```\n", - " label precision recall f1 support \n", - " check balance (label_id: 0) 0.00 0.00 0.00 0\n", - " find trains (label_id: 1) 80.20 91.95 85.68 348\n", - " make payment (label_id: 2) 83.12 28.07 41.97 228\n", - " book appointment (label_id: 3) 86.93 87.15 87.04 397\n", - " get cars available (label_id: 4) 96.88 90.51 93.58 274\n", - " get event dates (label_id: 5) 0.00 0.00 0.00 0\n", - " buy bus ticket (label_id: 6) 78.61 91.33 84.49 173\n", - " add event (label_id: 7) 0.00 0.00 0.00 0\n", - " get alarms (label_id: 8) 58.33 77.78 66.67 45\n", - " reserve car (label_id: 9) 83.75 72.43 77.68 185\n", - " get events (label_id: 10) 0.00 0.00 0.00 0\n", - " reserve roundtrip flights (label_id: 11) 0.00 0.00 0.00 0\n", - " lookup music (label_id: 12) 89.83 86.89 88.33 61\n", - " book house (label_id: 13) 91.13 92.50 91.81 200\n", - " search oneway flight (label_id: 14) 74.77 47.70 58.25 174\n", - " buy event tickets (label_id: 15) 72.19 95.31 82.15 128\n", - " find apartment (label_id: 16) 0.00 0.00 0.00 0\n", - " schedule visit (label_id: 17) 77.27 66.06 71.23 386\n", - " play media (label_id: 18) 92.94 86.81 89.77 91\n", - " get ride (label_id: 19) 99.41 98.82 99.12 170\n", - " reserve oneway flight (label_id: 20) 0.00 0.00 0.00 0\n", - " find bus (label_id: 21) 96.64 87.53 91.86 361\n", - " find restaurants (label_id: 22) 77.14 91.22 83.59 148\n", - " get times for movie (label_id: 23) 0.00 0.00 0.00 0\n", - " transfer money (label_id: 24) 0.00 0.00 0.00 0\n", - " request payment (label_id: 25) 46.71 63.39 53.79 112\n", - " play movie (label_id: 26) 100.00 65.11 78.87 321\n", - " search house (label_id: 27) 97.91 91.83 94.77 306\n", - " search roundtrip flights (label_id: 28) 67.49 82.41 74.21 199\n", - " find provider (label_id: 29) 
95.11 90.53 92.77 602\n", - " find attractions (label_id: 30) 100.00 89.01 94.19 91\n", - " reserve hotel (label_id: 31) 56.75 97.04 71.62 169\n", - " lookup song (label_id: 32) 0.00 0.00 0.00 0\n", - " add alarm (label_id: 33) 95.68 60.18 73.89 221\n", - " find home by area (label_id: 34) 48.95 59.79 53.83 194\n", - " get available time (label_id: 35) 0.00 0.00 0.00 0\n", - " buy movie tickets (label_id: 36) 100.00 29.39 45.42 473\n", - " reserve restaurant (label_id: 37) 95.71 84.80 89.92 342\n", - " find movies (label_id: 38) 62.40 97.61 76.14 335\n", - " get weather (label_id: 39) 100.00 87.69 93.44 195\n", - " search hotel (label_id: 40) 99.35 52.60 68.78 289\n", - " find events (label_id: 41) 99.57 82.56 90.27 281\n", - " play song (label_id: 42) 0.00 0.00 0.00 0\n", - " rent movie (label_id: 43) 0.00 0.00 0.00 0\n", - " get train tickets (label_id: 44) 45.83 5.56 9.91 198\n", - " none (label_id: 45) 55.77 98.90 71.32 728\n", - " label_id: 46 0.00 0.00 0.00 0\n", - " -------------------\n", - " micro avg 77.23 77.23 77.23 8425\n", - " macro avg 82.01 76.68 76.56 8425\n", - " weighted avg 83.23 77.23 76.86 8425\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jUJb-9VLLBXo" - }, - "source": [ - "# 3. 
MS Marco\n", - "\n", - "## Task Description\n", - "\n", - "MS Marco NLGen is a dataset from Microsoft that takes extracted answers and questions and output fluent answers.\n", - "\n", - "An example is \n", - "\n", - "\n", - "* question: What county is Nine Mile in?\n", - "* extracted_answer: Onondaga\n", - "* fluent_answer: Nine Mile is in Onondaga county.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VtXEKG_UQU9u" - }, - "source": [ - "## Download and unzip files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "b9avsZ1CEq3K" - }, - "outputs": [], - "source": [ - "!mkdir ms_marco\n", - "os.chdir('ms_marco')\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz\n", - "\n", - "!gunzip train_v2.1.json.gz\n", - "!gunzip dev_v2.1.json.gz\n", - "\n", - "!python ../NeMo/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py --filename train_v2.1.json \n", - "!python ../NeMo/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py --filename dev_v2.1.json \n", - "\n", - "os.chdir('..')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "h7UZ9R8gQTFo" - }, - "source": [ - "## Training and/or Testing the model\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fwGQCwbvRf2m" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.dialogues_example_dir='./marco_bart_predictions' \\\n", - " model.dataset.data_dir='./ms_marco' \\\n", - " model.save_model=True \\\n", - " model.dataset.debug_mode=True \\\n", - " model.dataset.task='ms_marco' \\\n", - " 
model.language_model.pretrained_model_name='facebook/bart-base' \\\n", - " trainer.max_epochs=1 \\\n", - " model.dataset.debug_mode=False \\\n", - " exp_manager.create_wandb_logger=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UL7ekAOZ2abi" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "Train more epochs for optimal performance\n", - "\n", - "```\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " Test metric DataLoader 0\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " bleu 65.46179962158203\n", - " f1 78.24439835896995\n", - " precision 81.92473076099847\n", - " recall 76.72508929408436\n", - " test_accuracy 25.563487607283225\n", - " test_loss 0.4419259166606655\n", - " test_loss_epoch 0.4420809745788574\n", - " test_ppl 1.5557004846779854\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - "```" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Dialogue.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/tutorials/nlp/Entity_Linking_Medical.ipynb b/tutorials/nlp/Entity_Linking_Medical.ipynb deleted file mode 100644 index dfdf594e6804..000000000000 --- a/tutorials/nlp/Entity_Linking_Medical.ipynb +++ /dev/null @@ -1,632 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": 
[], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "\n", - "## Install NeMo if using google collab or if its not installed locally\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Install dependencies\n", - "!pip install wget\n", - "!pip install faiss-gpu" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import faiss\n", - "import torch\n", - "import wget\n", - "import os\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from omegaconf import OmegaConf\n", - "from pytorch_lightning import Trainer\n", - "from IPython.display import display\n", - "from tqdm import tqdm\n", - "\n", - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Entity Linking" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Task Description\n", - "[Entity linking](https://en.wikipedia.org/wiki/Entity_linking) is the process of connecting concepts mentioned in natural language to their canonical forms stored in a knowledge base. 
For example, say a knowledge base contained the entity 'ID3452 influenza' and we wanted to process some natural language containing the sentence \"The patient has flu like symptoms\". An entity linking model would match the word 'flu' to the knowledge base entity 'ID3452 influenza', allowing for disambiguation and normalization of concepts referenced in text. Entity linking applications range from helping automate data ingestion to assisting in real time dialogue concept normalization. We will be focusing on entity linking in the medical domain for this demo, but the entity linking model, dataset, and training code within NVIDIA NeMo can be applied to other domains like finance and retail.\n", - "\n", - "Within NeMo and this tutorial we use the entity linking approach described in Liu et. al's NAACL 2021 \"[Self-alignment Pre-training for Biomedical Entity Representations](https://arxiv.org/abs/2010.11784v2)\". The main idea behind this approach is to reshape an initial concept embedding space such that synonyms of the same concept are pulled closer together and unrelated concepts are pushed further apart. The concept embeddings from this reshaped space can then be used to build a knowledge base embedding index. This index stores concept IDs mapped to their respective concept embeddings in a format conducive to efficient nearest neighbor search. We can link query concepts to their canonical forms in the knowledge base by performing a nearest neighbor search- matching concept query embeddings to the most similar concepts embeddings in the knowledge base index. \n", - "\n", - "In this tutorial we will be using the [faiss](https://github.com/facebookresearch/faiss) library to build our concept index." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Self Alignment Pretraining\n", - "Self-Alignment pretraining is a second stage pretraining of an existing encoder (called second stage because the encoder model can be further finetuned after this more general pretraining step). The dataset used during training consists of pairs of concept synonyms that map to the same ID. At each training iteration, we only select *hard* examples present in the mini batch to calculate the loss and update the model weights. In this context, a hard example is an example where a concept is closer to an unrelated concept in the mini batch than it is to the synonym concept it is paired with by some margin. I encourage you to take a look at [section 2 of the paper](https://arxiv.org/pdf/2010.11784.pdf) for a more formal and in depth description of how hard examples are selected.\n", - "\n", - "We then use a [metric learning loss](https://openaccess.thecvf.com/content_CVPR_2019/papers/Wang_Multi-Similarity_Loss_With_General_Pair_Weighting_for_Deep_Metric_Learning_CVPR_2019_paper.pdf) calculated from the hard examples selected. This loss helps reshape the embedding space. The concept representation space is rearranged to be more suitable for entity matching via embedding cosine similarity. \n", - "\n", - "Now that we have idea of what's going on, let's get started!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dataset Preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Download data into project directory\n", - "PROJECT_DIR = \".\" #Change if you don't want the current directory to be the project dir\n", - "DATA_DIR = os.path.join(PROJECT_DIR, \"tiny_example_data\")\n", - "\n", - "if not os.path.isdir(os.path.join(DATA_DIR)):\n", - " wget.download('https://dldata-public.s3.us-east-2.amazonaws.com/tiny_example_data.zip',\n", - " os.path.join(PROJECT_DIR, \"tiny_example_data.zip\"))\n", - "\n", - " !unzip {PROJECT_DIR}/tiny_example_data.zip -d {PROJECT_DIR}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial we will be using a tiny toy dataset to demonstrate how to use NeMo's entity linking model functionality. The dataset includes synonyms for 12 medical concepts. Entity phrases with the same ID are synonyms for the same concept. For example, \"*chronic kidney failure*\", \"*gradual loss of kidney function*\", and \"*CKD*\" are all synonyms of concept ID 5. Here's the dataset before preprocessing:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_data = pd.read_csv(os.path.join(DATA_DIR, \"tiny_example_dev_data.csv\"), names=[\"ID\", \"CONCEPT\"], index_col=False)\n", - "print(raw_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We've already paired off the concepts for this dataset with the format `ID concept_synonym1 concept_synonym2`. 
Here are the first ten rows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "training_data = pd.read_table(os.path.join(DATA_DIR, \"tiny_example_train_pairs.tsv\"), names=[\"ID\", \"CONCEPT_SYN1\", \"CONCEPT_SYN2\"], delimiter='\\t')\n", - "print(training_data.head(10))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the [Unified Medical Language System (UMLS)](https://www.nlm.nih.gov/research/umls/index.html) dataset for full medical domain entity linking training. The data contains over 9 million entities and is a table of medical concepts with their corresponding concept IDs (CUI). After [requesting a free license and making a UMLS Terminology Services (UTS) account](https://www.nlm.nih.gov/research/umls/index.html), the [entire UMLS dataset](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html) can be downloaded from the NIH's website. If you've cloned the NeMo repo you can run the data processing script located in `examples/nlp/entity_linking/data/umls_dataset_processing.py` on the full dataset. This script will take in the initial table of UMLS concepts and produce a .tsv file with each row formatted as `CUI\\tconcept_synonym1\\tconcept_synonym2`. Once the UMLS dataset .RRF file is downloaded, the script can be run from the `examples/nlp/entity_linking` directory like so: \n", - "```\n", - "python data/umls_dataset_processing.py\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model Training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Second stage pretrain a BERT Base encoder on the self-alignment pretraining task (SAP) for improved entity linking. Using a GPU, the model should take 5 minutes or less to train on this example dataset and training progress will be output below the cell." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Download config\n", - "wget.download(f\"https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml\",\n", - " os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Load in config file\n", - "cfg = OmegaConf.load(os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Set config file variables\n", - "cfg.project_dir = PROJECT_DIR\n", - "cfg.model.nemo_path = os.path.join(PROJECT_DIR, \"tiny_example_sap_bert_model.nemo\")\n", - "cfg.model.train_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_train_pairs.tsv\")\n", - "cfg.model.validation_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_validation_pairs.tsv\")\n", - "\n", - "# remove distributed training flags\n", - "cfg.trainer.strategy = 'auto'\n", - "cfg.trainer.accelerator = 'auto'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the trainer and model\n", - "trainer = Trainer(**cfg.trainer)\n", - "exp_manager(trainer, cfg.get(\"exp_manager\", None))\n", - "model = nemo_nlp.models.EntityLinkingModel(cfg=cfg.model, trainer=trainer)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Train and save the model\n", - "trainer.fit(model)\n", - "model.save_to(cfg.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can run the script at `examples/nlp/entity_linking/self_alignment_pretraining.py` to train a model on a larger dataset. Run\n", - "\n", - "```\n", - "python self_alignment_pretraining.py project_dir=.\n", - "```\n", - "from the `examples/nlp/entity_linking` directory." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model Evaluation\n", - "\n", - "Let's evaluate our freshly trained model and compare its performance with a BERT Base encoder that hasn't undergone self-alignment pretraining. We first need to restore our trained model and load our BERT Base Baseline model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", - "\n", - "# Restore second stage pretrained model\n", - "sap_model_cfg = cfg\n", - "sap_model_cfg.index.index_save_name = os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_index\")\n", - "sap_model_cfg.index.index_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_index_data.tsv\")\n", - "sap_model = nemo_nlp.models.EntityLinkingModel.restore_from(sap_model_cfg.model.nemo_path).to(device)\n", - "\n", - "# Load original model\n", - "base_model_cfg = OmegaConf.load(os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Set train/val datasets to None to avoid loading datasets associated with training\n", - "base_model_cfg.model.train_ds = None\n", - "base_model_cfg.model.validation_ds = None\n", - "base_model_cfg.index.index_save_name = os.path.join(PROJECT_DIR, \"base_model_index\")\n", - "base_model_cfg.index.index_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_index_data.tsv\")\n", - "base_model = nemo_nlp.models.EntityLinkingModel(base_model_cfg.model).to(device)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are going evaluate our model on a nearest neighbor task using top 1 and top 5 accuracies as our metric. We will be using a tiny example test knowledge base and test queries. For this evaluation we are going to be comparing every test query with every concept vector in our test set knowledge base. 
We will rank each item in the knowledge base by its cosine similarity with the test query. We'll then compare the IDs of the predicted most similar test knowledge base concepts with our ground truth query IDs to calculate top 1 and top 5 accuracies. For this metric higher is better." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Helper function to get data embeddings\n", - "def get_embeddings(model, dataloader):\n", - " embeddings, cids = [], []\n", - "\n", - " with torch.no_grad():\n", - " for batch in tqdm(dataloader):\n", - " input_ids, token_type_ids, attention_mask, batch_cids = batch\n", - " batch_embeddings = model.forward(input_ids=input_ids.to(device), \n", - " token_type_ids=token_type_ids.to(device), \n", - " attention_mask=attention_mask.to(device))\n", - "\n", - " # Accumulate index embeddings and their corresponding IDs\n", - " embeddings.extend(batch_embeddings.cpu().detach().numpy())\n", - " cids.extend(batch_cids)\n", - " \n", - " return embeddings, cids" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(model, test_kb, test_queries, ks):\n", - " # Initialize knowledge base and query data loaders\n", - " test_kb_dataloader = model.setup_dataloader(test_kb, is_index_data=True)\n", - " test_query_dataloader = model.setup_dataloader(test_queries, is_index_data=True)\n", - " \n", - " # Get knowledge base and query embeddings\n", - " test_kb_embs, test_kb_cids = get_embeddings(model, test_kb_dataloader)\n", - " test_query_embs, test_query_cids = get_embeddings(model, test_query_dataloader)\n", - "\n", - " # Calculate the cosine distance between each query and knowledge base concept\n", - " score_matrix = np.matmul(np.array(test_query_embs), np.array(test_kb_embs).T)\n", - " accs = {k : 0 for k in ks}\n", - " \n", - " # Compare the knowledge base IDs of the knowledge base entities with \n", - " # the 
smallest cosine distance from the query \n", - " for query_idx in tqdm(range(len(test_query_cids))):\n", - " query_emb = test_query_embs[query_idx]\n", - " query_cid = test_query_cids[query_idx]\n", - " query_scores = score_matrix[query_idx]\n", - "\n", - " for k in ks:\n", - " topk_idxs = np.argpartition(query_scores, -k)[-k:]\n", - " topk_cids = [test_kb_cids[idx] for idx in topk_idxs]\n", - " \n", - " # If the correct query ID is among the top k closest kb IDs\n", - " # the model correctly linked the entity\n", - " match = int(query_cid in topk_cids)\n", - " accs[k] += match\n", - "\n", - " for k in ks:\n", - " accs[k] /= len(test_query_cids)\n", - " \n", - " return accs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create configs for our test data\n", - "test_kb = OmegaConf.create({\n", - " \"data_file\": os.path.join(DATA_DIR, \"tiny_example_test_kb.tsv\"),\n", - " \"max_seq_length\": 128,\n", - " \"batch_size\": 10,\n", - " \"shuffle\": False,\n", - "})\n", - "\n", - "test_queries = OmegaConf.create({\n", - " \"data_file\": os.path.join(DATA_DIR, \"tiny_example_test_queries.tsv\"),\n", - " \"max_seq_length\": 128,\n", - " \"batch_size\": 10,\n", - " \"shuffle\": False,\n", - "})\n", - "\n", - "ks = [1, 5]\n", - "\n", - "# Evaluate both models on our test data\n", - "base_accs = evaluate(base_model, test_kb, test_queries, ks)\n", - "base_accs[\"Model\"] = \"BERT Base Baseline\"\n", - "\n", - "sap_accs = evaluate(sap_model, test_kb, test_queries, ks)\n", - "sap_accs[\"Model\"] = \"BERT + SAP\"\n", - "\n", - "print(\"Top 1 and Top 5 Accuracy Comparison:\")\n", - "results_df = pd.DataFrame([base_accs, sap_accs], columns=[\"Model\", 1, 5])\n", - "results_df = results_df.style.set_properties(**{'text-align': 'left', }).set_table_styles([dict(selector='th', props=[('text-align', 'left')])])\n", - "display(results_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The 
purpose of this section was to show an example of evaluating your entity linking model. This evaluation set contains very little data, and no serious conclusions should be drawn about model performance. Top 1 accuracy should be between 0.7 and 1.0 for both models and top 5 accuracy should be between 0.8 and 1.0. When evaluating a model trained on a larger dataset, you can use a nearest neighbors index to speed up the evaluation time." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building an Index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To qualitatively observe the improvement we gain from the second stage pretraining, let's build two indices. One will be built with BERT base embeddings before self-alignment pretraining and one will be built with the model we just trained. Our knowledge base in this tutorial will be in the same domain and have some overlapping concepts as the training set. This data file is formatted as `ID\\tconcept`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `EntityLinkingDataset` class can load the data used for training the entity linking encoder as well as for building the index if the `is_index_data` flag is set to true. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def build_index(cfg, model):\n", - " # Setup index dataset loader\n", - " index_dataloader = model.setup_dataloader(cfg.index.index_ds, is_index_data=True)\n", - " \n", - " # Get index dataset embeddings\n", - " embeddings, _ = get_embeddings(model, index_dataloader)\n", - " \n", - " # Train IVFFlat index using faiss\n", - " embeddings = np.array(embeddings)\n", - " quantizer = faiss.IndexFlatL2(cfg.index.dims)\n", - " index = faiss.IndexIVFFlat(quantizer, cfg.index.dims, cfg.index.nlist)\n", - " index = faiss.index_cpu_to_all_gpus(index)\n", - " index.train(embeddings)\n", - " \n", - " # Add concept embeddings to index\n", - " for i in tqdm(range(0, embeddings.shape[0], cfg.index.index_batch_size)):\n", - " index.add(embeddings[i:i+cfg.index.index_batch_size])\n", - "\n", - " # Save index\n", - " faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index.index_save_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "build_index(sap_model_cfg, sap_model.to(device))\n", - "build_index(base_model_cfg, base_model.to(device))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Entity Linking via Nearest Neighbor Search" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now it's time to query our indices! We are going to query both our index built with embeddings from BERT Base, and our index with embeddings built from the SAP BERT model we trained. Our sample query phrases will be \"*high blood sugar*\" and \"*head pain*\". \n", - "\n", - "To query our indices, we first need to get the embedding of each query from the corresponding encoder model. 
We can then pass these query embeddings into the faiss index which will perform a nearest neighbor search, using cosine distance to compare the query embedding with embeddings present in the index. Once we get a list of knowledge base index concept IDs most closely matching our query, all that is left to do is map the IDs to a representative string describing the concept. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def query_index(cfg, model, index, queries, id2string):\n", - " # Get query embeddings from our entity linking encoder model\n", - " query_embs = get_query_embedding(queries, model).cpu().detach().numpy()\n", - " \n", - " # Use query embedding to find closest concept embedding in knowledge base\n", - " distances, neighbors = index.search(query_embs, cfg.index.top_n)\n", - " \n", - " # Get the canonical strings corresponding to the IDs of the query's nearest neighbors in the kb \n", - " neighbor_concepts = [[id2string[concept_id] for concept_id in query_neighbor] \\\n", - " for query_neighbor in neighbors]\n", - " \n", - " # Display most similar concepts in the knowledge base. 
\n", - " for query_idx in range(len(queries)):\n", - " print(f\"\\nThe most similar concepts to {queries[query_idx]} are:\")\n", - " for cid, concept, dist in zip(neighbors[query_idx], neighbor_concepts[query_idx], distances[query_idx]):\n", - " print(cid, concept, 1 - dist)\n", - "\n", - " \n", - "def get_query_embedding(queries, model):\n", - " # Tokenize our queries\n", - " model_input = model.tokenizer(queries,\n", - " add_special_tokens = True,\n", - " padding = True,\n", - " truncation = True,\n", - " max_length = 512,\n", - " return_token_type_ids = True,\n", - " return_attention_mask = True)\n", - " \n", - " # Pass tokenized input into model\n", - " query_emb = model.forward(input_ids=torch.LongTensor(model_input[\"input_ids\"]).to(device),\n", - " token_type_ids=torch.LongTensor(model_input[\"token_type_ids\"]).to(device),\n", - " attention_mask=torch.LongTensor(model_input[\"attention_mask\"]).to(device))\n", - " \n", - " return query_emb" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load indices\n", - "sap_index = faiss.read_index(sap_model_cfg.index.index_save_name)\n", - "base_index = faiss.read_index(base_model_cfg.index.index_save_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Map concept IDs to one canonical string\n", - "index_data = open(sap_model_cfg.index.index_ds.data_file, \"r\", encoding='utf-8-sig')\n", - "id2string = {}\n", - "\n", - "for line in index_data:\n", - " cid, concept = line.split(\"\\t\")\n", - " id2string[int(cid) - 1] = concept.strip()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id2string" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Some sample queries\n", - "queries = [\"high blood sugar\", \"head pain\"]\n", - "\n", - "# Query BERT 
Base\n", - "print(\"BERT Base output before Self Alignment Pretraining:\")\n", - "query_index(base_model_cfg, base_model, base_index, queries, id2string)\n", - "print(\"\\n\" + \"-\" * 50 + \"\\n\")\n", - "\n", - "# Query SAP BERT\n", - "print(\"SAP BERT output after Self Alignment Pretraining:\")\n", - "query_index(sap_model_cfg, sap_model, sap_index, queries, id2string)\n", - "print(\"\\n\" + \"-\" * 50 + \"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Even after only training on this tiny amount of data, the qualitative performance boost from self-alignment pretraining is visible. The baseline model links \"*high blood sugar*\" to the entity \"*6 diabetes*\" while our SAP BERT model accurately links \"*high blood sugar*\" to \"*Hyperinsulinemia*\". Similarly, \"*head pain*\" and \"*Myocardial infraction*\" are not the same concept, but \"*head pain*\" and \"*Headache*\" are." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For larger knowledge bases keeping the default embedding size might be too large and cause out of memory issues. You can apply PCA or some other dimensionality reduction method to your data to reduce its memory footprint. Code for creating a text file of all the UMLS entities in the correct format needed to build an index and creating a dictionary mapping concept ids to canonical concept strings can be found here `examples/nlp/entity_linking/data/umls_dataset_processing.py`. \n", - "\n", - "The code for extracting knowledge base concept embeddings, training and applying a PCA transformation to the embeddings, building a faiss index and querying the index from the command line is located at `examples/nlp/entity_linking/build_index.py` and `examples/nlp/entity_linking/query_index.py`. 
\n", - "\n", - "If you've cloned the NeMo repo, both of these steps can be run as follows on the command line from the `examples/nlp/entity_linking/` directory.\n", - "\n", - "```\n", - "python data/umls_dataset_processing.py --index\n", - "python build_index.py --restore\n", - "python query_index.py --restore\n", - "```\n", - "By default the project directory will be \".\" but can be changed by adding the flag `--project_dir=` after each of the above commands. Intermediate steps of the index building process are saved. In the occurrence of an error, previously completed steps do not need to be rerun. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Command Recap" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is a recap of the commands and steps to repeat this process on the full UMLS dataset. \n", - "\n", - "1) Download the UMLS dataset file `MRCONSO.RRF` from the NIH website and place it in the `examples/nlp/entity_linking/data` directory.\n", - "\n", - "2) Run the following commands from the `examples/nlp/entity_linking` directory\n", - "```\n", - "python data/umls_dataset_processing.py\n", - "python self_alignment_pretraining.py project_dir=. \n", - "python data/umls_dataset_processing.py --index\n", - "python build_index.py --restore\n", - "python query_index.py --restore\n", - "```\n", - "The model will take ~24hrs to train on two GPUs and ~48hrs to train on one GPU. By default the project directory will be \".\" but can be changed by adding the flag `--project_dir=` after each of the above commands and changing `project_dir=` in the `self_alignment_pretraining.py` command. If you change the project directory, you should also move the `MRCONOSO.RRF` file to a `data` sub directory within the one you've specified. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As mentioned in the introduction, entity linking within NVIDIA NeMo is not limited to the medical domain. 
The same data processing and training steps can be applied to a variety of domains and use cases. You can edit the datasets used as well as training and loss function hyperparameters within your config file to better suit your domain." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tutorials/nlp/GLUE_Benchmark.ipynb b/tutorials/nlp/GLUE_Benchmark.ipynb deleted file mode 100644 index b77b3439b444..000000000000 --- a/tutorials/nlp/GLUE_Benchmark.ipynb +++ /dev/null @@ -1,566 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "GLUE_Benchmark.ipynb", - "provenance": [], - "private_outputs": true, - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU", - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "source": [], - "metadata": { - "collapsed": false - } - } - } - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "o_0K1lsW1dj9", - "colab_type": "code", - "colab": {} - }, - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. 
Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "BRANCH = 'main'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "id": "JFWG-jYCfvD7", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# If you're not using Colab, you might need to upgrade jupyter notebook to avoid the following error:\n", - "# 'ImportError: IProgress not found. Please update jupyter and ipywidgets.'\n", - "\n", - "! pip install ipywidgets\n", - "! jupyter nbextension enable --py widgetsnbextension\n", - "\n", - "# Please restart the kernel after running this cell" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "dzqD2WDFOIN-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "import os\n", - "import wget \n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "daYw_Xll2ZR9", - "colab_type": "text" - }, - "source": [ - "In this tutorial, we are going to describe how to finetune a BERT-like model based on [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) on [GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding](https://openreview.net/pdf?id=rJ4km2R5t7). 
\n", - "\n", - "# GLUE tasks\n", - "GLUE Benchmark includes 9 natural language understanding tasks:\n", - "\n", - "## Single-Sentence Tasks\n", - "\n", - "* CoLA - [The Corpus of Linguistic Acceptability](https://arxiv.org/abs/1805.12471) is a set of English sentences from published linguistics literature. The task is to predict whether a given sentence is grammatically correct or not.\n", - "* SST-2 - [The Stanford Sentiment Treebank](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf) consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence: positive or negative.\n", - "\n", - "## Similarity and Paraphrase tasks\n", - "\n", - "* MRPC - [The Microsoft Research Paraphrase Corpus](https://www.aclweb.org/anthology/I05-5002.pdf) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.\n", - "* QQP - [The Quora Question Pairs](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs) dataset is a collection of question pairs from the community question-answering website Quora. The task is to determine whether a pair of questions are semantically equivalent.\n", - "* STS-B - [The Semantic Textual Similarity Benchmark](https://arxiv.org/abs/1708.00055) is a collection of sentence pairs drawn from news headlines, video, and image captions, and natural language inference data. The task is to determine how similar two sentences are.\n", - "\n", - "## Inference Tasks\n", - "\n", - "* MNLI - [The Multi-Genre Natural Language Inference Corpus](https://cims.nyu.edu/~sbowman/multinli/multinli_0.9.pdf) is a crowdsourced collection of sentence pairs with textual entailment annotations. 
Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). The task has the matched (in-domain) and mismatched (cross-domain) sections.\n", - "* QNLI - [The Stanford Question Answering Dataset](https://nlp.stanford.edu/pubs/rajpurkar2016squad.pdf) is a question-answering dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding question. The task is to determine whether the context sentence contains the answer to the question.\n", - "* RTE The Recognizing Textual Entailment (RTE) datasets come from a series of annual [textual entailment challenges](https://aclweb.org/aclwiki/Recognizing_Textual_Entailment). The task is to determine whether the second sentence is the entailment of the first one or not.\n", - "* WNLI - The Winograd Schema Challenge is a reading comprehension task in which a system must read a sentence with a pronoun and select the referent of that pronoun from a list of choices (Hector Levesque, Ernest Davis, and Leora Morgenstern. The winograd schema challenge. In Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning. 2012).\n", - "\n", - "All tasks are classification tasks, except for the STS-B task which is a regression task. All classification tasks are 2-class problems, except for the MNLI task which has 3-classes.\n", - "\n", - "More details about GLUE benchmark could be found [here](https://gluebenchmark.com/)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZnuziSwJ1yEB", - "colab_type": "text" - }, - "source": [ - "# Datasets\n", - "\n", - "**To proceed further, you need to download the GLUE data.** For example, you can download [this script](https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py) using `wget` and then execute it by running:\n", - "\n", - "`python download_glue_data.py`\n", - "\n", - "use `--tasks TASK` if datasets for only selected GLUE tasks are needed\n", - "\n", - "After running the above commands, you will have a folder `glue_data` with data folders for every GLUE task. For example, data for MRPC task would be under glue_data/MRPC.\n", - "\n", - "This tutorial and [examples/nlp/glue_benchmark/glue_benchmark.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/nlp/glue_benchmark/glue_benchmark.py) work with all GLUE tasks without any modifications. For this tutorial, we are going to use MRPC task.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "--wJ2891aIIE", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# supported task names: [\"cola\", \"sst-2\", \"mrpc\", \"sts-b\", \"qqp\", \"mnli\", \"qnli\", \"rte\", \"wnli\"]\n", - "TASK = 'mrpc'\n", - "DATA_DIR = 'glue_data/MRPC'\n", - "WORK_DIR = \"WORK_DIR\"\n", - "MODEL_CONFIG = 'glue_benchmark_config.yaml'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "qB0oLE4R9EhJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "! ls -l $DATA_DIR" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gMWuU69pbUDe", - "colab_type": "text" - }, - "source": [ - "For each task, there are 3 files: `train.tsv, dev.tsv, and test.tsv`. Note, MNLI has 2 dev sets: matched and mismatched, evaluation on both dev sets will be done automatically." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6UDPgadLN6SG", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# let's take a look at the training data \n", - "! head -n 5 {DATA_DIR}/train.tsv" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_whKCxfTMo6Y", - "colab_type": "text" - }, - "source": [ - "# Model configuration\n", - "\n", - "Now, let's take a closer look at the model's configuration and learn to train the model.\n", - "\n", - "GLUE model is comprised of the pretrained [BERT](https://arxiv.org/pdf/1810.04805.pdf) model followed by a Sequence Regression module (for STS-B task) or Sequence classifier module (for the rest of the tasks).\n", - "\n", - "The model is defined in a config file which declares multiple important sections. They are:\n", - "- **model**: All arguments that are related to the Model - language model, a classifier, optimizer and schedulers, datasets and any other related information\n", - "\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "T1gA8PsJ13MJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# download the model's configuration file \n", - "config_dir = WORK_DIR + '/configs/'\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + MODEL_CONFIG):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/glue_benchmark/' + MODEL_CONFIG, config_dir)\n", - "else:\n", - " print ('config file is already exists')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "mX3KmWMvSUQw", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# this line will print the entire config of the model\n", - "config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n", - "print(config_path)\n", - "config = 
OmegaConf.load(config_path)\n", - "print(OmegaConf.to_yaml(config))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZCgWzNBkaQLZ", - "colab_type": "text" - }, - "source": [ - "# Model Training\n", - "## Setting up Data within the config\n", - "\n", - "Among other things, the config file contains dictionaries called **dataset**, **train_ds** and **validation_ds**. These are configurations used to setup the Dataset and DataLoaders of the corresponding config.\n", - "\n", - "We assume that both training and evaluation files are located in the same directory, and use the default names mentioned during the data download step. \n", - "So, to start model training, we simply need to specify `model.dataset.data_dir`, like we are going to do below.\n", - "\n", - "Also notice that some config lines, including `model.dataset.data_dir`, have `???` in place of paths, this means that values for these fields are required to be specified by the user.\n", - "\n", - "Let's now add the data directory path, task name and output directory for saving predictions to the config." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "LQHCJN-ZaoLp", - "colab_type": "code", - "colab": {} - }, - "source": [ - "config.model.task_name = TASK\n", - "config.model.output_dir = WORK_DIR\n", - "config.model.dataset.data_dir = DATA_DIR" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nB96-3sTc3yk", - "colab_type": "text" - }, - "source": [ - "## Building the PyTorch Lightning Trainer\n", - "\n", - "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem.\n", - "\n", - "Let's first instantiate a Trainer object" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "1tG4FzZ4Ui60", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(\"Trainer config - \\n\")\n", - "print(OmegaConf.to_yaml(config.trainer))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "knF6QeQQdMrH", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# lets modify some trainer configs\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "config.trainer.precision = 16 if torch.cuda.is_available() else 32\n", - "\n", - "# for mixed precision training, uncomment the line below (precision should be set to 16 and amp_level to O1):\n", - "# config.trainer.amp_level = O1\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "# setup max number of steps to reduce training time for demonstration purposes of this tutorial\n", - "config.trainer.max_steps = 128\n", - "\n", - "trainer = pl.Trainer(**config.trainer)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8IlEMdVxdr6p", - "colab_type": "text" - }, - 
"source": [ - "## Setting up a NeMo Experiment\n", - "\n", - "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "8uztqGAmdrYt", - "colab_type": "code", - "colab": {} - }, - "source": [ - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "\n", - "# the exp_dir provides a path to the current experiment for easy access\n", - "exp_dir = str(exp_dir)\n", - "exp_dir" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8tjLhUvL_o7_", - "colab_type": "text" - }, - "source": [ - "Before initializing the model, we might want to modify some of the model configs. For example, we might want to modify the pretrained BERT model and use [Megatron-LM BERT](https://arxiv.org/abs/1909.08053) or [AlBERT model](https://arxiv.org/abs/1909.11942):" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Xeuc2i7Y_nP5", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# get the list of supported BERT-like models, for the complete list of HugginFace models, see https://huggingface.co/models\n", - "print(nemo_nlp.modules.get_pretrained_lm_models_list(include_external=True))\n", - "\n", - "# specify BERT-like model, you want to use, for example, \"megatron-bert-345m-uncased\" or 'bert-base-uncased'\n", - "PRETRAINED_BERT_MODEL = \"albert-base-v1\"" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "RK2xglXyAUOO", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# add the specified above model parameters to the config\n", - "config.model.language_model.pretrained_model_name = PRETRAINED_BERT_MODEL" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fzNZNAVRjDD-", - "colab_type": "text" - }, - "source": [ - "Now, we are ready to initialize our model. 
During the model initialization call, the dataset and data loaders we'll be prepared for training and evaluation.\n", - "Also, the pretrained BERT model will be downloaded, note it can take up to a few minutes depending on the size of the chosen BERT model." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NgsGLydWo-6-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "model = nemo_nlp.models.GLUEModel(cfg=config.model, trainer=trainer)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kQ592Tx4pzyB", - "colab_type": "text" - }, - "source": [ - "## Monitoring training progress\n", - "Optionally, you can create a Tensorboard visualization to monitor training progress." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "mTJr16_pp0aS", - "colab_type": "code", - "colab": {} - }, - "source": [ - "try:\n", - " from google import colab\n", - " COLAB_ENV = True\n", - "except (ImportError, ModuleNotFoundError):\n", - " COLAB_ENV = False\n", - "\n", - "# Load the TensorBoard notebook extension\n", - "if COLAB_ENV:\n", - " %load_ext tensorboard\n", - " %tensorboard --logdir {exp_dir}\n", - "else:\n", - " print(\"To use tensorboard, please use this notebook in a Google Colab environment.\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CFgAlaIdndjW", - "colab_type": "text" - }, - "source": [ - "Note, it’s recommended to finetune the model on each task separately. Also, based on [GLUE Benchmark FAQ#12](https://gluebenchmark.com/faq), there are might be some differences in dev/test distributions for QQP task and in train/dev for WNLI task." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "hUvnSpyjp0Dh", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# start model training\n", - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ref1qSonGNhP", - "colab_type": "text" - }, - "source": [ - "## Training Script\n", - "\n", - "If you have NeMo installed locally, you can also train the model with [examples/nlp/glue_benchmark/glue_benchmark.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/nlp/glue_benchmark/glue_benchmark.py).\n", - "\n", - "To run training script, use:\n", - "\n", - "`python glue_benchmark.py \\\n", - " model.dataset.data_dir=PATH_TO_DATA_DIR \\\n", - " model.task_name=TASK`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KVPFofXaoKNE", - "colab_type": "text" - }, - "source": [ - "Average results after 3 runs:\n", - "\n", - "| Task | Metric | ALBERT-large | ALBERT-xlarge | Megatron-345m | BERT base paper | BERT large paper |\n", - "|-------|--------------------------|--------------|---------------|---------------|-----------------|------------------|\n", - "| CoLA | Matthew's correlation | 54.94 | 61.72 | 64.56 | 52.1 | 60.5 |\n", - "| SST-2 | Accuracy | 92.74 | 91.86 | 95.87 | 93.5 | 94.9 |\n", - "| MRPC | F1/Accuracy | 92.05/88.97 | 91.87/88.61 | 92.36/89.46 | 88.9/- | 89.3/- |\n", - "| STS-B | Person/Spearman corr. | 90.41/90.21 | 90.07/90.10 | 91.51/91.61 | -/85.8 | -/86.5 |\n", - "| QQP | F1/Accuracy | 88.26/91.26 | 88.80/91.65 | 89.18/91.91 | 71.2/- | 72.1/- |\n", - "| MNLI | Matched /Mismatched acc. 
| 86.69/86.81 | 88.66/88.73 | 89.86/89.81 | 84.6/83.4 | 86.7/85.9 |\n", - "| QNLI | Accuracy | 92.68 | 93.66 | 94.33 | 90.5 | 92.7 |\n", - "| RTE | Accuracy | 80.87 | 82.86 | 83.39 | 66.4 | 70.1 |\n", - "\n", - "WNLI task was excluded from the experiments due to the problematic WNLI set.\n", - "The dev sets were used for evaluation for ALBERT and Megatron models, and the test sets results for [the BERT paper](https://arxiv.org/abs/1810.04805).\n", - "\n", - "Hyperparameters used to get the results from the above table, could be found in the table below. Some tasks could be further finetuned to improve performance numbers, the tables are for a baseline reference only.\n", - "Each cell in the table represents the following parameters:\n", - "Number of GPUs used/ Batch Size/ Learning Rate/ Number of Epochs. For not specified parameters, please refer to the default parameters in the training script.\n", - "\n", - "| Task | ALBERT-large | ALBERT-xlarge | Megatron-345m |\n", - "|-------|--------------|---------------|---------------|\n", - "| CoLA | 1 / 32 / 1e-5 / 3 | 1 / 32 / 1e-5 / 10 | 4 / 16 / 2e-5 / 12 |\n", - "| SST-2 | 4 / 16 / 2e-5 / 5 | 4 / 16 / 2e-5 /12 | 4 / 16 / 2e-5 / 12 |\n", - "| MRPC | 1 / 32 / 1e-5 / 5 | 1 / 16 / 2e-5 / 5 | 1 / 16 / 2e-5 / 10 |\n", - "| STS-B | 1 / 16 / 2e-5 / 5 | 1 / 16 / 4e-5 / 12 | 4 / 16 / 3e-5 / 12 |\n", - "| QQP | 1 / 16 / 2e-5 / 5 | 4 / 16 / 1e-5 / 12 | 4 / 16 / 1e-5 / 12 |\n", - "| MNLI | 4 / 64 / 1e-5 / 5 | 4 / 32 / 1e-5 / 5 | 4 / 32 / 1e-5 / 5 | \n", - "| QNLI | 4 / 16 / 1e-5 / 5 | 4 / 16 / 1e-5 / 5 | 4 / 16 / 2e-5 / 5 | \n", - "| RTE | 1 / 16 / 1e-5 / 5 | 1 / 16 / 1e-5 / 12 | 4 / 16 / 3e-5 / 12 |\n" - ] - } - ] -} diff --git a/tutorials/nlp/MegatronBert_export.ipynb b/tutorials/nlp/MegatronBert_export.ipynb deleted file mode 100644 index c19c07b67005..000000000000 --- a/tutorials/nlp/MegatronBert_export.ipynb +++ /dev/null @@ -1,280 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": 
"8046e96a", - "metadata": {}, - "outputs": [], - "source": [ - "BRANCH='main'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38bfe8ea", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98c00a93", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import wget \n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ] - }, - { - "cell_type": "markdown", - "id": "e9fb1a66", - "metadata": {}, - "source": [ - "### Deprecation Notice\n", - "\n", - "This tutorial is deprecated as of r1.23.0 and will be removed in the next release.\n", - "\n", - "---\n", - "\n", - "# Task Description\n", - "In this tutorial, we are going to describe how to export NeMo NLP models with BERT based models as the pre-trained model." 
- ] - }, - { - "cell_type": "markdown", - "id": "dd0fb016", - "metadata": {}, - "source": [ - "## Convert the Megatron-LM Weights to Nemo file\n", - "\n", - "If you prefer to use the Huggingface BERT models, please skip this section and refer to `Setting up a NeMo Experiment` section to load a model from `nemo_nlp.modules.get_pretrained_lm_models_list()`\n", - "\n", - "NeMo Megatron BERT can [load from a pretrained model](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/core/core.html?highlight=nemo%20file#restore) using `.nemo` file. We can convert the Megatron-LM checkpoint to the `.nemo` file. Let's first download the pretrained model weights and vocabulary file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e451f219", - "metadata": {}, - "outputs": [], - "source": [ - "from nemo.collections.nlp.modules.common.megatron.megatron_utils import MEGATRON_CONFIG_MAP\n", - "import pathlib\n", - "\n", - "PRETRAINED_BERT_MODEL = \"megatron-bert-345m-uncased\" # specify BERT-like model from MEGATRON_CONFIG_MAP.keys()\n", - "nemo_out_path = \"qa_pretrained.nemo\" # the nemo output file name\n", - "\n", - "checkpoint_url = MEGATRON_CONFIG_MAP[PRETRAINED_BERT_MODEL]['checkpoint']\n", - "vocab_url = MEGATRON_CONFIG_MAP[PRETRAINED_BERT_MODEL]['vocab']\n", - "checkpoint_filename = pathlib.Path(checkpoint_url).name\n", - "vocab_filename = pathlib.Path(vocab_url).name\n", - "if not pathlib.Path(checkpoint_filename).exists():\n", - " print('downloading from checkpoint url', checkpoint_url)\n", - " !wget $checkpoint_url\n", - "if not pathlib.Path(vocab_filename).exists():\n", - " print('downloading from vocab url', vocab_url)\n", - " !wget $vocab_url" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7586b5c0", - "metadata": {}, - "outputs": [], - "source": [ - "WORK_DIR = \"WORK_DIR\"\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "\n", - "# Prepare the model parameters \n", - "# download the model's 
configuration file \n", - "config_dir = WORK_DIR + '/configs/'\n", - "MODEL_CONFIG = \"megatron_bert_config.yaml\"\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + MODEL_CONFIG):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/language_modeling/conf/' + MODEL_CONFIG, config_dir)\n", - "else:\n", - " print ('config file is already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0dd3124", - "metadata": {}, - "outputs": [], - "source": [ - "# this line will print the entire config of the model\n", - "config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "\n", - "config.model.megatron_legacy = True # set to true if you trained the NLP model on NeMo < 1.5.0\n", - "config.model.bias_gelu_fusion = False # set to true if you want the MegatronLM to NeMo conversion for training; and set to false to use the converted model at time of export \n", - "config.model.masked_softmax_fusion = False # set to true if you want the MegatronLM to NeMo conversion for training; and set to false to use the converted model at time of export\n", - "\n", - "config.model.num_layers = 24\n", - "config.model.hidden_size = 1024\n", - "config.model.ffn_hidden_size = 4096\n", - "config.model.num_attention_heads = 16\n", - "config.model.tokenizer.vocab_file = vocab_filename\n", - "config.model.tokenizer.type = 'BertWordPieceLowerCase' # change this to BertWordPieceCase if you are using a cased pretrained model\n", - "config.model.tensor_model_parallel_size = 1\n", - "config.model.data.data_prefix = ''\n", - "config.model.max_position_embeddings = 512\n", - "config.model.data.seq_length = 512\n", - "config.cfg = {}\n", - "config.cfg.cfg = config.model\n", - "with open('hparams.yaml', 'w') as f:\n", - " f.write(OmegaConf.to_yaml(config.cfg))\n", - 
"if(config.model.megatron_legacy):\n", - " checkpoint_filename = \"model_optim_rng_ca.pt\" #provide path to the pretrained pt file you used during training on NeMo < 1.5.0, for NeMo >= 1.5.0\n", - "print(checkpoint_filename)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47dca6de", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "PWD = os.getcwd()\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py')\n", - "!python -m torch.distributed.run --nproc_per_node=1 megatron_lm_ckpt_to_nemo.py --checkpoint_folder=$PWD --checkpoint_name=$checkpoint_filename --hparams_file=$PWD/hparams.yaml --nemo_file_path=$PWD/$nemo_out_path --model_type=bert --tensor_model_parallel_size=1" - ] - }, - { - "cell_type": "markdown", - "id": "1ae8d31b", - "metadata": {}, - "source": [ - "# Legacy NLP Bert based model conversion\n", - "\n", - "Step 1: Convert legacy nemo checkpoint to a checkpoint which is currently supported by nemo\n", - "\n", - "Step 2: Use the converted model from step 1 to export the nemo file to the required format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86639a3d", - "metadata": {}, - "outputs": [], - "source": [ - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/nemo_legacy_import/nlp_checkpoint_port.py')\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/export.py')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48820d57", - "metadata": {}, - "outputs": [], - "source": [ - "legacy_nemo_file_path = \"/NeMo/megatron_multiqa.nemo\" #path to you model trained on NeMo < 1.5\n", - "nemo_converted_out_path = \"converted_megatron_multiqa.nemo\"\n", - "megatron_absolute_language_model_path = \"/NeMo/tutorials/nlp/qa_pretrained.nemo\" # Give the absolute path of the model you obtained using megatron_lm_ckpt_to_nemo\n", - 
"onnx_export_out_path = \"onnx_megatron_multiqa.onnx\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7191e0cb", - "metadata": {}, - "outputs": [], - "source": [ - "os.system(f\"python nlp_checkpoint_port.py {legacy_nemo_file_path} {nemo_converted_out_path} --megatron-legacy=True --megatron-checkpoint {megatron_absolute_language_model_path}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ccc720ef", - "metadata": {}, - "outputs": [], - "source": [ - "os.system(f\"python export.py {nemo_converted_out_path} {onnx_export_out_path} --autocast --runtime-check\")" - ] - }, - { - "cell_type": "markdown", - "id": "f10461f2", - "metadata": {}, - "source": [ - "# Convert a NLP model with BERT based pre-trained model trained on NeMo >= 1.5.0\n", - "\n", - "For models trained on NeMo >= 1.5.0, you just run the export script and skip the legacy conversion part" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0514ab37", - "metadata": {}, - "outputs": [], - "source": [ - "nemo_file_path = \"\"\n", - "onnx_export_out_path = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d6b5db4", - "metadata": {}, - "outputs": [], - "source": [ - "python export.py $nemo_converted_out_path $onnx_export_out_path --autocast --runtime-check" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/nlp/Question_Answering.ipynb b/tutorials/nlp/Question_Answering.ipynb deleted file mode 100644 index 054928245d9d..000000000000 --- a/tutorials/nlp/Question_Answering.ipynb +++ /dev/null @@ -1,1163 +0,0 @@ 
-{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "tiIOhb7iVC3J" - }, - "source": [ - "# Overview" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PucJwfbhVC3L" - }, - "source": [ - "### Deprecation Notice\n", - "\n", - "This tutorial is deprecated as of r1.23.0 and will be removed in the next release.\n", - "\n", - "---\n", - "\n", - "This tutorial will demonstrate how to train, evaluate, and test three types of models for Question-Answering -\n", - "1. BERT-like models for Extractive Question-Answering\n", - "2. Sequence-to-Sequence (S2S) models for Generative Question-Answering (ex. T5/BART-like)\n", - "3. GPT-like models for Generative Question-Answering\n", - "\n", - "## Task Description\n", - "\n", - "- Given a context and a natural language query, we want to generate an answer for the query\n", - "- Depending on how the answer is generated, the task can be broadly divided into two types:\n", - " 1. Extractive Question Answering\n", - " 2. Generative Question Answering\n", - "\n", - "\n", - "### Extractive Question-Answering with BERT-like models\n", - "\n", - "Given a question and a context, both in natural language, predict the span within the context with a start and end position which indicates the answer to the question.\n", - "For every word in our training dataset we’re going to predict:\n", - "- likelihood this word is the start of the span \n", - "- likelihood this word is the end of the span\n", - "\n", - "We are using a BERT encoder with 2 span prediction heads for predicting start and end position of the answer. The span predictions are token classifiers consisting of a single linear layer.\n", - "\n", - "### Generative Question-Answering with S2S and GPT-like models\n", - "\n", - "Given a question and a context, both in natural language, generate an answer for the question. Unlike the BERT-like models, there is no constraint that the answer should be a span within the context." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IpX0w2PtVC3M" - }, - "source": [ - "# Installing NeMo" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "72XWYFQYVC3M" - }, - "source": [ - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run the cell below to set up dependencies." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_xQBtr0KVC3M" - }, - "outputs": [], - "source": [ - "BRANCH = 'main'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9R1D6W58VC3N" - }, - "outputs": [], - "source": [ - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fof5-57iVC3N" - }, - "source": [ - "# Imports and constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KqKD-wReVC3O" - }, - "outputs": [], - "source": [ - "import os\n", - "import wget\n", - "import gc\n", - "\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf\n", - "\n", - "from nemo.collections.nlp.models.question_answering.qa_bert_model import BERTQAModel\n", - "from nemo.collections.nlp.models.question_answering.qa_gpt_model import GPTQAModel\n", - "from nemo.collections.nlp.models.question_answering.qa_s2s_model import S2SQAModel\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "pl.seed_everything(42)\n", - "gc.disable()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": 
"xhPr9Jf_VC3O" - }, - "outputs": [], - "source": [ - "# set the following paths\n", - "DATA_DIR = \"data_dir\" # directory for storing datasets\n", - "WORK_DIR = \"work_dir\" # directory for storing trained models, logs, additionally downloaded scripts\n", - "\n", - "os.makedirs(DATA_DIR, exist_ok=True)\n", - "os.makedirs(WORK_DIR, exist_ok=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dWymW8e0VC3O" - }, - "source": [ - "# Configuration" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0YhKTkuXVC3P" - }, - "source": [ - "The model is defined in a config file which declares multiple important sections:\n", - "- **model**: All arguments that will relate to the Model - language model, span prediction, optimizer and schedulers, datasets and any other related information\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning\n", - "- **exp_manager**: All arguments used for setting up the experiment manager - target directory, name, logger information\n", - "\n", - "We will download the default config file provided at `NeMo/examples/nlp/question_answering/conf/qa_conf.yaml` and edit necessary values for training different models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WOIWJqQ0VC3P" - }, - "outputs": [], - "source": [ - "# download the model's default configuration file \n", - "config_dir = WORK_DIR + '/conf/'\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + \"qa_conf.yaml\"):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/conf/qa_conf.yaml', config_dir)\n", - "else:\n", - " print ('config file already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cvD-gv-FVC3P" - }, - "outputs": [], - "source": [ - "# this will print the entire default config of the model\n", - "config_path 
= f'{WORK_DIR}/conf/qa_conf.yaml'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "print(\"Default Config - \\n\")\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "E08e-ItPVC3P" - }, - "source": [ - "# Training and testing models on SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xn022MsKVC3Q" - }, - "source": [ - "## Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "c356CGL1VC3Q" - }, - "source": [ - "For this example, we are going to download the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset to showcase how to do training and inference. There are two datasets, SQuAD1.0 and SQuAD2.0. SQuAD 1.1, the previous version of the SQuAD dataset, contains 100,000+ question-answer pairs on 500+ articles. SQuAD2.0 dataset combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gaju1h_bVC3Q" - }, - "source": [ - "To download both datasets, we use `NeMo/examples/nlp/question_answering/get_squad.py`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nb840_bZVC3Q" - }, - "outputs": [], - "source": [ - "# download get_squad.py script to download and preprocess the SQuAD data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/get_squad.py'):\n", - " print('Downloading get_squad.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/get_squad.py', WORK_DIR)\n", - "else:\n", - " print ('get_squad.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sOgY0tRzVC3Q" - }, - "outputs": [], - "source": [ - "# download and preprocess the data\n", - "!python $WORK_DIR/get_squad.py --destDir $DATA_DIR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nprGkyvRVC3Q" - }, - "source": [ - "After execution of the above cell, your data folder will contain a subfolder \"squad\" the following four files for training and evaluation\n", - "\n", - "```\n", - "squad \n", - "β”‚\n", - "└───v1.1\n", - "β”‚ β”‚ - train-v1.1.json\n", - "β”‚ β”‚ - dev-v1.1.json\n", - "β”‚\n", - "└───v2.0\n", - " β”‚ - train-v2.0.json\n", - " β”‚ - dev-v2.0.json\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GX0KWQXKVC3Q" - }, - "outputs": [], - "source": [ - "!ls -LR {DATA_DIR}/squad" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RFVcvseOVC3R" - }, - "source": [ - "## Set dataset config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Grb0EeRqVC3R" - }, - "outputs": [], - "source": [ - "# if True, model will load features from cache if file is present, or\n", - "# create features and dump to cache 
file if not already present\n", - "config.model.dataset.use_cache = False\n", - "\n", - "# indicates whether the dataset has unanswerable questions\n", - "config.model.dataset.version_2_with_negative = True\n", - "\n", - "# indicates whether the dataset is of extractive nature or not\n", - "# if True, context spans/chunks that do not contain answer are treated as unanswerable \n", - "config.model.dataset.check_if_answer_in_context = True\n", - "\n", - "# set file paths for train, validation, and test datasets\n", - "config.model.train_ds.file = f\"{DATA_DIR}/squad/v2.0/train-v2.0.json\"\n", - "config.model.validation_ds.file = f\"{DATA_DIR}/squad/v2.0/dev-v2.0.json\"\n", - "config.model.test_ds.file = f\"{DATA_DIR}/squad/v2.0/dev-v2.0.json\"\n", - "\n", - "# set batch sizes for train, validation, and test datasets\n", - "config.model.train_ds.batch_size = 8\n", - "config.model.validation_ds.batch_size = 8\n", - "config.model.test_ds.batch_size = 8\n", - "\n", - "# set number of samples to be used from dataset. setting to -1 uses entire dataset\n", - "config.model.train_ds.num_samples = 5000\n", - "config.model.validation_ds.num_samples = 1000\n", - "config.model.test_ds.num_samples = 100" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rFWF41VwVC3R" - }, - "source": [ - "## Set trainer config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "42yif-GIVC3R" - }, - "outputs": [], - "source": [ - "config.trainer.max_epochs = 1\n", - "config.trainer.max_steps = -1 # takes precedence over max_epochs\n", - "config.trainer.precision = 16\n", - "config.trainer.devices = [0] # 0 for CPU, or list of the GPUs to use [0] this tutorial does not support multiple GPUs. 
If needed please use NeMo/examples/nlp/question_answering/question_answering.py\n", - "config.trainer.accelerator = \"gpu\"\n", - "config.trainer.strategy=\"auto\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EDQzMBlbVC3R" - }, - "source": [ - "## Set experiment manager config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pxY4rnJBVC3R" - }, - "outputs": [], - "source": [ - "config.exp_manager.exp_dir = WORK_DIR\n", - "config.exp_manager.name = \"QA-SQuAD2\"\n", - "config.exp_manager.create_wandb_logger=False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "N2_C8reNVC3R" - }, - "source": [ - "## BERT model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4Mf-_rioVC3R" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gtlGHzVJVC3R" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"bert-base-uncased\"\n", - "config.model.tokenizer.tokenizer_name = \"bert-base-uncased\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bert_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 3e-5" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RaM7fe8rVC3R" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ukLzGmy9VC3R" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(**config.trainer)\n", - "model = BERTQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qZIA69rlVC3R" - }, - "source": [ - "### Train, 
test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "asutB9ZzVC3R" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "n5AIv0SEVC3S" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7k5kD6tvVC3S" - }, - "outputs": [], - "source": [ - "model = BERTQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zyh0SNiyVC3S" - }, - "source": [ - "## S2S BART model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Sy9IYgVYVC3S" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PKNmHKV5VC3S" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# 
tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"facebook/bart-base\"\n", - "config.model.tokenizer.tokenizer_name = \"facebook/bart-base\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bart_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 5e-5\n", - "\n", - "#remove vocab_file from gpt model\n", - "config.model.tokenizer.vocab_file = None" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "S_0glS4yVC3S" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8jWyHY1oVC3S" - }, - "outputs": [], - "source": [ - "# uncomment below line and run if you get an error while initializing tokenizer on Colab (reference: https://github.com/huggingface/transformers/issues/8690)\n", - "# !rm -r /root/.cache/huggingface/\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "model = S2SQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xg-j39b4VC3S" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ocsf0EBDVC3S" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Vs3pl0VMVC3S" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NoW6_GO_VC3S" - }, - "outputs": [], - "source": [ - "model = S2SQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - 
"model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a7-iInbPVC3S" - }, - "source": [ - "## GPT2 model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VaIC0l2aVC3S" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5j6SVk6fVC3S" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"gpt2\"\n", - "config.model.tokenizer.tokenizer_name = \"gpt2\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/gpt2_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 1e-4" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rWhhEuvzVC3S" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vBtP3ukDVC3S" - }, - "outputs": [], - "source": [ - "# uncomment 
below line and run if you get an error while initializing tokenizer on Colab (reference: https://github.com/huggingface/transformers/issues/8690)\n", - "# !rm -r /root/.cache/huggingface/\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "model = GPTQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EApFrJh8VC3T" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zYo2JDdOVC3T" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6aNEt06fVC3T" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ioLT4DVbVC3T" - }, - "outputs": [], - "source": [ - "model = GPTQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - 
"metadata": { - "id": "hTWOlD9AVC3T" - }, - "source": [ - "# Training and testing models on MS-MARCO" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lZWsMwnGVC3T" - }, - "source": [ - "## Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pRUAwgAbVC3T" - }, - "source": [ - "### Downloading the data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qz3DO9JGVC3T" - }, - "source": [ - "MS-MARCO(Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. MS-MARCO consists of 1,010,916 queries generated from real, anonymized Bing user queries. The contexts are extracted from real web documents and the answers are generated by humans.\n", - "\n", - "Please agree to the Terms of Use at https://microsoft.github.io/msmarco/ before downloading the data\n", - "\n", - "The data can be downloaded at:\n", - "- https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz\n", - "- https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Fm5MzZ91inP5" - }, - "outputs": [], - "source": [ - "os.makedirs(os.path.join(DATA_DIR, \"msmarco\"), exist_ok=True)\n", - "\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz -P $DATA_DIR/msmarco\n", - "!gunzip $DATA_DIR/msmarco/train_v2.1.json.gz\n", - "\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz -P $DATA_DIR/msmarco\n", - "!gunzip $DATA_DIR/msmarco/dev_v2.1.json.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nDmFHzBtVC3T" - }, - "source": [ - "### Converting to SQuAD format\n", - "\n", - "The script for converting MS-MARCO dataset to SQuAD can be found at `NeMo/examples/nlp/question_answering/convert_msmarco_to_squad_format.py`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": 
"tJtNIzZQVC3T" - }, - "outputs": [], - "source": [ - "# download convert_msmarco_to_squad_format.py script to format the MS-MARCO data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/convert_msmarco_to_squad_format.py'):\n", - " print('Downloading convert_msmarco_to_squad_format.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/convert_msmarco_to_squad_format.py', WORK_DIR)\n", - "else:\n", - " print ('convert_msmarco_to_squad_format.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Io_esJPSuBcW" - }, - "outputs": [], - "source": [ - "# we will exclude examples from MS-MARCO dataset that do not have a wellFormedAnswer using a utility script\n", - "# download remove_ms_marco_samples_without_wellFormedAnswers.py script to format the MS-MARCO data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/remove_ms_marco_samples_without_wellFormedAnswers.py'):\n", - " print('Downloading remove_ms_marco_samples_without_wellFormedAnswers.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py', WORK_DIR)\n", - "else:\n", - " print ('remove_ms_marco_samples_without_wellFormedAnswers.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cs_CXkfXuYVQ" - }, - "outputs": [], - "source": [ - "!python $WORK_DIR/remove_ms_marco_samples_without_wellFormedAnswers.py --filename $DATA_DIR/msmarco/train_v2.1.json\n", - "!python $WORK_DIR/remove_ms_marco_samples_without_wellFormedAnswers.py --filename $DATA_DIR/msmarco/dev_v2.1.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AUAKI086VC3T" - }, - "outputs": [], - "source": [ - "!(python $WORK_DIR/convert_msmarco_to_squad_format.py \\\n", - " 
--msmarco_train_input_filepath=$DATA_DIR/msmarco/train_v2.1.json \\\n", - " --msmarco_dev_input_filepath=$DATA_DIR/msmarco/dev_v2.1.json \\\n", - " --converted_train_save_path=$DATA_DIR/msmarco/msmarco-squad-format-train-v2.1.json \\\n", - " --converted_dev_save_path=$DATA_DIR/msmarco/msmarco-squad-format-dev-v2.1.json \\\n", - " --exclude_negative_samples=False \\\n", - " --keep_only_relevant_passages=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AeHesaFcVC3T" - }, - "source": [ - "## Set dataset config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rhx-_1X3VC3T" - }, - "outputs": [], - "source": [ - "# if True, model will load features from cache if file is present, or\n", - "# create features and dump to cache file if not already present\n", - "config.model.dataset.use_cache = False\n", - "\n", - "# indicates whether the dataset has unanswerable questions\n", - "config.model.dataset.version_2_with_negative = True\n", - "\n", - "# if True, context spans/chunks that do not contain answer are treated as unanswerable \n", - "# should be False for MS-MARCO dataset, or other datasets of generative nature\n", - "config.model.dataset.check_if_answer_in_context = False\n", - "\n", - "# set file paths for train, validation, and test datasets\n", - "config.model.train_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-train-v2.1.json\"\n", - "config.model.validation_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-dev-v2.1.json\"\n", - "config.model.test_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-dev-v2.1.json\"\n", - "\n", - "# set batch sizes for train, validation, and test datasets\n", - "config.model.train_ds.batch_size = 16\n", - "config.model.validation_ds.batch_size = 16\n", - "config.model.test_ds.batch_size = 16\n", - "\n", - "# set number of samples to be used from dataset. 
setting to -1 uses entire dataset\n", - "config.model.train_ds.num_samples = 5000\n", - "config.model.validation_ds.num_samples = 1000\n", - "config.model.test_ds.num_samples = 100" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "X43k_EeqVC3T" - }, - "source": [ - "## Set trainer config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HavpkQLPVC3U" - }, - "outputs": [], - "source": [ - "config.trainer.max_epochs = 1\n", - "config.trainer.max_steps = -1 # takes precedence over max_epochs\n", - "config.trainer.precision = 16\n", - "config.trainer.devices = [0] # 0 for CPU, or list of the GPUs to use e.g. [0, 1] or [0]\n", - "config.trainer.accelerator = \"gpu\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R-_FIZE2VC3U" - }, - "source": [ - "## Set experiment manager config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "10TT3okiVC3U" - }, - "outputs": [], - "source": [ - "config.exp_manager.exp_dir = WORK_DIR\n", - "config.exp_manager.name = \"QA-MSMARCO\"\n", - "config.exp_manager.create_wandb_logger=False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MKIq6YT-VC3U" - }, - "source": [ - "## S2S BART model for MS-MARCO" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvf-QpYLVC3U" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DDVZ1a5fVC3U" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"facebook/bart-base\"\n", - "config.model.tokenizer.tokenizer_name = \"facebook/bart-base\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bart_msmarco_v2_0.nemo\"\n", - "\n", - 
"config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 5e-5" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3N75cdLRVC3U" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Bv9UMkfxVC3U" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(**config.trainer)\n", - "model = S2SQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BhVuV9sWVC3U" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1JeaJ_OgVC3U" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yj0dGexaVC3U" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "l1elN-WDVC3U" - }, - "outputs": [], - "source": [ - "model = S2SQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 
will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Question_Answering.ipynb", - "provenance": [] - }, - "gpuClass": "standard", - "kernelspec": { - "display_name": "Python 3.8.0 ('test_ptl_1.7')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "e987a19b1bc60996a600adb5d563aa4a4c022e7b31abb2e65c324714934e8ea9" - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb deleted file mode 100644 index 71c7ca505144..000000000000 --- a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb +++ /dev/null @@ -1,1412 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "PiRuohn_FQco" - }, - "source": [ - "# Overview\n", - "This tutorial demonstrates how to run inference with [SpellMapper](https://arxiv.org/abs/2306.02317) - a model for Spellchecking ASR (Automatic Speech Recognition) Customization.\n", - "\n", - "Estimated time: 10-15 min.\n", - "\n", - "SpellMapper is a non-autoregressive (NAR) model based on transformer architecture ([BERT](https://arxiv.org/pdf/1810.04805.pdf) with multiple separators).\n", - "It gets as input a single ASR hypothesis (text) and a **custom vocabulary** and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any.\n", - "\n", - "This model is an alternative to word boosting/shallow fusion approaches:\n", - " - does not require 
retraining ASR model;\n", - " - does not require beam-search/language model(LM);\n", - " - can be applied on top of any English ASR model output;" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qm5wmxVEGXgH" - }, - "source": [ - "## What is custom vocabulary?\n", - "**Custom vocabulary** is a list of words/phrases that are important for a particular user. For example, user's contact names, playlist, selected terminology and so on. The size of the custom vocabulary can vary from several hundreds to **several thousand entries** - but this is not an equivalent to ngram language model.\n", - "\n", - "![Scope of customization with user vocabulary](images/spellmapper_customization_vocabulary.png)\n", - "\n", - "Note that unlike traditional spellchecking approaches, which aim to correct known words using language models, the goal of contextual spelling correction is to correct highly specific user terms, most of which can be 1) out-of-vocabulary (OOV) words, 2) spelling variations (e.g., \"John Koehn\", \"Jon Cohen\") and language models cannot help much with that." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D5_XwuXDOKho" - }, - "source": [ - "## Tutorial Plan\n", - "\n", - "1. Create a sample custom vocabulary using some medical terminology.\n", - "2. Study what customization does - a detailed analysis of a small example.\n", - "3. 
Run a bigger example:\n", - " * Create sample ASR results by running TTS (text-to-speech synthesis) + ASR on some medical paper abstracts.\n", - " * Run SpellMapper inference and show how it can improve ASR results using custom vocabulary.\n", - "\n", - "TL;DR We reduce WER from `14.3%` to `11.4%` by correcting medical terms, e.g.\n", - "* `puramesin` => `puromycin`\n", - "* `parromsin` => `puromycin`\n", - "* `and hydrod` => `anhydride`\n", - "* `lesh night and` => `lesch-nyhan`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "agz8B2CxXBBG" - }, - "source": [ - "# Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "koRPpYISNPuH" - }, - "source": [ - "## Installing NeMo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HCnnz3cgVc4Q" - }, - "outputs": [], - "source": [ - "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n", - "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", - "GITHUB_ACCOUNT = \"NVIDIA\"\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/{GITHUB_ACCOUNT}/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]\n", - "\n", - "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", - "# comment out the below lines and set NEMO_DIR to your local path.\n", - "NEMO_DIR = 'nemo'\n", - "!git clone -b {BRANCH} https://github.com/{GITHUB_ACCOUNT}/NeMo.git $NEMO_DIR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_M92gCn_NW1_" - }, - "source": [ - "## Additional installs\n", - "We will use `sentence_splitter` to split abstracts to sentences." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ddyJA3NtGl9C" - }, - "outputs": [], - "source": [ - "!pip install sentence_splitter" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qVa91rGkeFje" - }, - "source": [ - "Clone the SpellMapper model from HuggingFace.\n", - "Note that we will need not only the checkpoint itself, but also the ngram mapping vocabulary `replacement_vocab_filt.txt` from the same folder." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JiI9dkEm5cpW" - }, - "outputs": [], - "source": [ - "!git clone https://huggingface.co/bene-ges/spellmapper_asr_customization_en" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8saqFOePVfFf" - }, - "source": [ - "## Imports\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tAJyiYn_VnrF" - }, - "outputs": [], - "source": [ - "import IPython.display as ipd\n", - "import json\n", - "import random\n", - "import re\n", - "import soundfile as sf\n", - "import torch\n", - "\n", - "from collections import Counter, defaultdict\n", - "from difflib import SequenceMatcher\n", - "from matplotlib.pyplot import imshow\n", - "from matplotlib import pyplot as plt\n", - "from sentence_splitter import SentenceSplitter\n", - "from typing import List, Set, Tuple\n", - "\n", - "from nemo.collections.tts.models import FastPitchModel\n", - "from nemo.collections.tts.models import HifiGanModel\n", - "\n", - "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest\n", - "\n", - "from nemo.collections.nlp.data.spellchecking_asr_customization.utils import (\n", - " get_all_candidates_coverage,\n", - " get_index,\n", - " load_ngram_mappings,\n", - " search_in_index,\n", - " get_candidates,\n", - " read_spellmapper_predictions,\n", - " apply_replacements_to_text,\n", - " load_ngram_mappings_for_dp,\n", - " get_alignment_by_dp,\n", - ")" - ] - }, - { - "cell_type": 
"markdown", - "metadata": { - "id": "mfAaOdAWUGUV" - }, - "source": [ - "Use seed to get a reproducible behaviour." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UlGnNKTuT_6A" - }, - "outputs": [], - "source": [ - "random.seed(0)\n", - "torch.manual_seed(0)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RPPHI7Zd_fDz" - }, - "source": [ - "## Download data\n", - "\n", - "File `pubmed24n0009.xml` taken from public ftp server of https://www.ncbi.nlm.nih.gov/pmc/ contains information about 5593 medical papers, from which we extract only their abstracts. We will feed sentences from there to TTS + ASR to get initial ASR results.\n", - "\n", - "File `wordlist.txt` contains 100k **single-word** medical terms.\n", - "\n", - "File `valid_adam.txt` contains 24k medical abbreviations with their full forms. We will use those full forms as examples of **multi-word** medical terms.\n", - "\n", - "File `count_1w.txt` contains 330k single words with their frequencies from Google Ngrams corpus. 
We will use this file to filter out frequent words from our custom vocabulary.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mX6cvE8xw2n1" - }, - "outputs": [], - "source": [ - "!wget https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n0009.xml.gz\n", - "!gunzip pubmed24n0009.xml.gz\n", - "!grep \"AbstractText\" pubmed24n0009.xml > abstract.txt\n", - "\n", - "!wget https://raw.githubusercontent.com/McGill-NLP/medal/master/toy_data/valid_adam.txt\n", - "!wget https://raw.githubusercontent.com/glutanimate/wordlist-medicalterms-en/master/wordlist.txt\n", - "!wget https://norvig.com/ngrams/count_1w.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mBm9BeqNaRlC" - }, - "source": [ - "## Auxiliary functions\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kVUKhSh48Ypi" - }, - "outputs": [], - "source": [ - "CHARS_TO_IGNORE_REGEX = re.compile(r\"[\\.\\,\\?\\:!;()«»…\\]\\[/\\*–‽+&_\\\\½√>€ℒ$β€’ΒΌ}{~β€”=β€œ\\\"β€β€³β€Ÿβ€ž]\")\n", - "\n", - "\n", - "def get_medical_vocabulary() -> Tuple[Set[str], Set[str]]:\n", - " \"\"\"This function builds a vocabulary of medical terms using downloaded sources:\n", - " wordlist.txt - 100k single-word medical terms.\n", - " valid_adam.txt - 24k medical abbreviations with their full forms. We use those full forms as examples of multi-word medical terms.\n", - " count_1w.txt - 330k single words with their frequencies from Google Ngrams corpus. 
We will use this file to filter out frequent words from our custom vocabulary.\n", - " \"\"\"\n", - " common_words = set()\n", - " with open(\"count_1w.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " word, freq = line.strip().casefold().split(\"\\t\")\n", - " if int(freq) < 500000:\n", - " break\n", - " common_words.add(word)\n", - " print(\"Size of common words vocabulary:\", len(common_words))\n", - "\n", - " abbreviations = defaultdict(set)\n", - " medical_vocabulary = set()\n", - " with open(\"valid_adam.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " lines = f.readlines()\n", - " # first line is header\n", - " for line in lines[1:]:\n", - " abbrev, _, phrase = line.strip().split(\"\\t\")\n", - " # skip phrases longer than 3 words because some of them are long explanations\n", - " if phrase.count(\" \") > 2:\n", - " continue\n", - " if phrase in common_words:\n", - " continue\n", - " medical_vocabulary.add(phrase)\n", - " abbrev = abbrev.lower()\n", - " abbreviations[abbrev].add(phrase)\n", - "\n", - " with open(\"wordlist.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " word = line.strip().casefold()\n", - " # skip words containing digits\n", - " if re.match(r\".*\\d.*\", word):\n", - " continue\n", - " if re.match(r\".*[\\[\\]\\(\\)\\+\\,\\.].*\", word):\n", - " continue\n", - " if word in common_words:\n", - " continue\n", - " medical_vocabulary.add(word)\n", - "\n", - " print(\"Size of medical vocabulary:\", len(medical_vocabulary))\n", - " print(\"Size of abbreviation vocabulary:\", len(abbreviations))\n", - " return medical_vocabulary, abbreviations\n", - "\n", - "\n", - "def read_abstracts(medical_vocabulary: Set[str]) -> Tuple[List[str], Set[str], Set[str]]:\n", - " \"\"\"This function reads the downloaded medical abstracts, and extracts sentences containing any word/phrase from the medical vocabulary.\n", - " Args:\n", - " medical_vocabulary: set of known medical words or phrases\n", - " Returns:\n", - " 
sentences: list of extracted sentences\n", - " all_found_singleword: set of single words from medical vocabulary that occurred at least in one sentence\n", - " all_found_multiword: set of multi-word phrases from medical vocabulary that occurred at least in one sentence\n", - " \"\"\"\n", - " splitter = SentenceSplitter(language='en')\n", - "\n", - " all_sentences = []\n", - " all_found_singleword = set()\n", - " all_found_multiword = set()\n", - " with open(\"abstract.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " text = line.strip().replace(\"\", \"\").replace(\"\", \"\")\n", - " sents = splitter.split(text)\n", - " found_singleword = set()\n", - " found_multiword = set()\n", - " for sent in sents:\n", - " # remove anything in brackets from text\n", - " sent = re.sub(r\"\\(.+\\)\", r\"\", sent)\n", - " # remove quotes from text\n", - " sent = sent.replace(\"\\\"\", \"\")\n", - " # skip sentences containing digits because normalization is out of scope of this tutorial\n", - " if re.match(r\".*\\d.*\", sent):\n", - " continue\n", - " # skip sentences containing abbreviations with period inside the sentence (for the same reason)\n", - " if \". 
\" in sent:\n", - " continue\n", - " # skip long sentences as they may cause OOM issues\n", - " if len(sent) > 150:\n", - " continue\n", - " # replace all punctuation to space and convert to lowercase\n", - " sent_clean = CHARS_TO_IGNORE_REGEX.sub(\" \", sent).lower()\n", - " sent_clean = \" \".join(sent_clean.split(\" \"))\n", - " words = sent_clean.split(\" \")\n", - "\n", - " found_phrases = set()\n", - " for begin in range(len(words)):\n", - " for end in range(begin + 1, min(begin + 4, len(words))):\n", - " phrase = \" \".join(words[begin:end])\n", - " if phrase in medical_vocabulary:\n", - " found_phrases.add(phrase)\n", - " if end - begin == 1:\n", - " found_singleword.add(phrase)\n", - " else:\n", - " found_multiword.add(phrase)\n", - " if len(found_phrases) > 0:\n", - " all_sentences.append((sent, \";\".join(found_phrases)))\n", - " all_found_singleword = all_found_singleword.union(found_singleword)\n", - " all_found_multiword = all_found_multiword.union(found_multiword)\n", - "\n", - " print(\"Sentences:\", len(all_sentences))\n", - " print(\"Unique single-word terms found:\", len(all_found_singleword))\n", - " print(\"Unique multi-word terms found:\", len(all_found_multiword))\n", - " print(\"Examples of multi-word terms\", str(list(all_found_multiword)[0:10]))\n", - " \n", - " return all_sentences, all_found_singleword, all_found_multiword" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XU3xeCBVpWOL" - }, - "outputs": [], - "source": [ - "def get_fragments(i_words: List[str], j_words: List[str]) -> List[Tuple[str, str, str, int, int, int, int]]:\n", - " \"\"\"This function is used to compare two word sequences to find minimal fragments that differ.\n", - " Args:\n", - " i_words: list of words in first sequence\n", - " j_words: list of words in second sequence\n", - " Returns:\n", - " list of tuples (difference_type, fragment1, fragment2, begin_of_fragment1, end_of_fragment1, begin_of_fragment2, 
end_of_fragment2)\n", - " \"\"\"\n", - " s = SequenceMatcher(None, i_words, j_words)\n", - " result = []\n", - " for tag, i1, i2, j1, j2 in s.get_opcodes():\n", - " result.append((tag, \" \".join(i_words[i1:i2]), \" \".join(j_words[j1:j2]), i1, i2, j1, j2))\n", - " result = sorted(result, key=lambda x: x[3])\n", - " return result" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2ydXp_pFYmYu" - }, - "source": [ - "## Read medical data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WAeauax0SV1-" - }, - "outputs": [], - "source": [ - "medical_vocabulary, _ = get_medical_vocabulary()\n", - "sentences, found_singleword, found_multiword = read_abstracts(medical_vocabulary)\n", - "# in case if we need random candidates from a big sample - we will use full medical vocabulary for that purpose.\n", - "big_sample = list(medical_vocabulary)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FRli7-Kx7sOO" - }, - "outputs": [], - "source": [ - "for sent, phrases in sentences[0:10]:\n", - " print(sent, \"\\t\", phrases)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rL1VqH2_dk93" - }, - "source": [ - "# SpellMapper ASR Customization\n", - "\n", - "SpellMapper model relies on two offline preparation steps:\n", - "1. Collecting n-gram mappings from a large corpus (this mappings vocabulary had been collected once on a large corpus and is supplied with the model).\n", - "2. Indexing of user vocabulary by n-grams.\n", - "\n", - "![Offline data preparation](images/spellmapper_data_preparation.png)\n", - "\n", - "At inference time we take as input an ASR hypothesis and an n-gram-indexed user vocabulary and perform following steps:\n", - "1. Retrieve the top 10 candidate phrases from the user vocabulary that are likely to be contained in the given ASR-hypothesis, possibly in a misspelled form.\n", - "2. 
Run the neural model that tags the input characters with correct candidate labels or 0 if no match is found.\n", - "3. Do post-processing to combine results.\n", - "\n", - "![Inference pipeline](images/spellmapper_inference_pipeline.png)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OeJpsMwslmrd" - }, - "source": [ - "## N-gram mappings\n", - "Note that n-gram mappings vocabulary had been collected from a large corpus and is supplied with the model. It is supposed to be \"universal\" for English language.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uH6p0mOd12pi" - }, - "source": [ - "Let's see what n-gram mappings are like, for example, for an n-gram `l u c`.\n", - "Note that n-grams in `replacement_vocab_filt.txt` preserve one-to-one correspondence between original letters and misspelled fragments (this additional markup is handled during loading). \n", - "* `+` means that adjacent letters are concatenated and correspond to a single source letter. \n", - "* `` means that the original letter is deleted. \n", - "This auxiliary markup will be removed automatically during loading.\n", - "\n", - "`_` is used instead of real space symbol.\n", - "\n", - "Last three columns are:\n", - "* joint frequency\n", - "* frequency of original n-gram\n", - "* frequency of misspelled n-gram\n", - "\n", - "$$\\frac{JointFrequency}{SourceFrequency}=TranslationProbability$$\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qul163dB1sKp" - }, - "outputs": [], - "source": [ - "!awk 'BEGIN {FS=\"\\t\"} ($1==\"l u c\"){print $0}' < spellmapper_asr_customization_en/replacement_vocab_filt.txt | sort -t$'\\t' -k3nr" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eWxcrVWZ3Pfq" - }, - "source": [ - "Now we read n-gram mappings from the file. Parameter `max_misspelled_freq` controls maximum frequency of misspelled n-grams. 
N-grams more frequent than that are put in the list of banned n-grams and won't be used in indexing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WHKhE945-N7o" - }, - "outputs": [], - "source": [ - "print(\"load n-gram mappings...\")\n", - "ngram_mapping_vocab, ban_ngram = load_ngram_mappings(\"spellmapper_asr_customization_en/replacement_vocab_filt.txt\", max_misspelled_freq=125000)\n", - "# CAUTION: entries in ban_ngram end with a space and can contain \"+\" \"=\"\n", - "print(\"Size of ngram mapping vocabulary:\", len(ngram_mapping_vocab))\n", - "print(\"Size of banned ngrams:\", len(ban_ngram))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "49IcMBfllvXN" - }, - "source": [ - "## Indexing of custom vocabulary" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b1K6paeee2Iu" - }, - "source": [ - "As we mentioned earlier, this model pipeline is intended to work with custom vocabularies up to several thousand entries. Since the whole medical vocabulary contains 110k entries, we restrict our custom vocabulary to 5000+ terms that occurred in given corpus of abstracts.\n", - "\n", - "The goal of indexing our custom vocabulary is to build an index where key is a letter n-gram and value is the whole phrase. The keys are n-grams in the given user phrase and their misspelled variants taken from our collection of n-\n", - "gram mappings (see Index of custom vocabulary in Fig. 
1)\n", - "\n", - "*Though it is possible to index and search the whole 110k vocabulary, it will require additional optimizations and is beyond the scope of this tutorial.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xWb0jGqw6Woi" - }, - "outputs": [], - "source": [ - "custom_phrases = []\n", - "for phrase in medical_vocabulary:\n", - " if phrase not in found_singleword and phrase not in found_multiword:\n", - " continue\n", - " custom_phrases.append(\" \".join(list(phrase.replace(\" \", \"_\"))))\n", - "print(\"Size of customization vocabulary:\", len(custom_phrases))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UHWor5pD2Eyb" - }, - "source": [ - "Now we build the index for our custom phrases.\n", - "\n", - "Parameter `min_log_prob` controls minimum log probability, after which we stop growing this n-gram.\n", - "\n", - "Parameter `max_phrases_per_ngram` controls maximum number of phrases that can be indexed by one ngram. 
N-grams exceeding this limit are also banned and not used in indexing.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hs4RDXj0-xW9" - }, - "outputs": [], - "source": [ - "phrases, ngram2phrases = get_index(custom_phrases, ngram_mapping_vocab, ban_ngram, min_log_prob=-4.0, max_phrases_per_ngram=600)\n", - "print(\"Size of phrases:\", len(phrases))\n", - "print(\"Size of ngram2phrases:\", len(ngram2phrases))\n", - "\n", - "# Save index to file - later we will use it in other script\n", - "with open(\"index.txt\", \"w\", encoding=\"utf-8\") as out:\n", - " for ngram in ngram2phrases:\n", - " for phrase_id, begin, size, logprob in ngram2phrases[ngram]:\n", - " phrase = phrases[phrase_id]\n", - " out.write(ngram + \"\\t\" + phrase + \"\\t\" + str(begin) + \"\\t\" + str(size) + \"\\t\" + str(logprob) + \"\\n\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RV1sdQ9rvar8" - }, - "source": [ - "## Small detailed example\n", - "\n", - "Let's consider, for example, one custom phrase `thoracic aorta` and an incorrect ASR-hypothesis `the tarasic oorda is a part of the aorta located in the thorax`, containing a misspelled phrase `tarasic_oorda`. \n", - "\n", - "We will see \n", - "1. How this custom phrase is indexed.\n", - "2. How candidate retrieval works, given ASR-hypothesis.\n", - "3. How inference and post-processing work.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kGBTTJXixnrG" - }, - "source": [ - "### N-grams in index" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ryfUlqNMl4vQ" - }, - "source": [ - "Let's look, for example, by what n-grams a custom phrase `thoracic aorta` is indexed. \n", - "Columns: \n", - "1. n-gram\n", - "2. beginning position in the phrase\n", - "3. length\n", - "4. log probability\n", - "\n", - "Note that many n-grams are not from n-gram mappings file. Those are derived by growing previous n-grams with new replacements. 
In this case log probabilities are summed up. Growing stops, when minimum log prob is exceeded.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "x0ZVsXGBo8pt" - }, - "outputs": [], - "source": [ - "for ngram in ngram2phrases:\n", - " for phrase_id, b, length, lprob in ngram2phrases[ngram]:\n", - " if phrases[phrase_id] == \"t h o r a c i c _ a o r t a\":\n", - " print(ngram.ljust(16) + \"\\t\" + str(b).rjust(4) + \"\\t\" + str(length).rjust(4) + \"\\t\" + str(lprob))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "20ov23ze4xeQ" - }, - "source": [ - "### Candidate retrieval\n", - "Candidate retrieval tasks are:\n", - " - Given an input sentence and an index of custom vocabulary find all n-grams from the index matching the sentence. \n", - " - Find which sentence fragments and which custom phrases have most \"hits\" - potential candidates.\n", - " - Find approximate starting position for each candidate phrase. \n", - "\n", - "\n", - "Let's look at the hits, that phrase \"thoracic aorta\" gets by searching all ngrams in the input text. We can see some hits in different part of the sentence, but a moving window can find a fragment with most hits." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "t_rhKQ3Xqa8A" - }, - "outputs": [], - "source": [ - "sent = \"the_tarasic_oorda_is_a_part_of_the_aorta_located_in_the_thorax\"\n", - "phrases2positions, position2ngrams = search_in_index(ngram2phrases, phrases, sent)\n", - "print(\" \".join(list(sent)))\n", - "print(\" \".join(list(map(str, phrases2positions[phrases.index(\"t h o r a c i c _ a o r t a\")].astype(int)))))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "orkRapbjF4aZ" - }, - "source": [ - "`phrases2positions` is a matrix of size (len(phrases), len(ASR_hypothesis)).\n", - "It is filled with 1.0 (hits) on intersection of letter n-grams and phrases that are indexed by these n-grams, 0.0 - elsewhere.\n", - "It is used to find phrases with many hits within a contiguous window - potential matching candidates.\n", - "\n", - "`position2ngrams` is a list of sets of ngrams. List index is the starting position in the ASR-hypothesis.\n", - "It is used later to check how well each found candidate is covered by n-grams (to avoid cases where some repeating n-gram gives many hits to a phrase, but the phrase itself is not well covered)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JF7u4_iiHLyI" - }, - "outputs": [], - "source": [ - "candidate2coverage, candidate2position = get_all_candidates_coverage(phrases, phrases2positions)\n", - "print(\"Coverage=\", candidate2coverage[phrases.index(\"t h o r a c i c _ a o r t a\")])\n", - "print(\"Starting position=\", candidate2position[phrases.index(\"t h o r a c i c _ a o r t a\")])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "45mvKg8ZyNbr" - }, - "source": [ - "`candidate2coverage` is a list of size len(phrases) containing coverage (0.0 to 1.0) in best window.\n", - "Coverage is a smoothed percentage of hits in the window of size of the given phrase.\n", - "\n", - "`candidate2position` is a list of size len(phrases) containing starting position of best window.\n", - "\n", - "Starting position is approximate, it's ok. If it is not at the beginning of some word, SpellMapper will try to adjust it later. In this particular example we get 5 as starting position instead of 4, missing the first letter." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Sjyn9I98udL9" - }, - "source": [ - "### Inference\n", - "\n", - "Now let's generate input for SpellMapper inference. \n", - "An input line should consist of 4 tab-separated columns:\n", - " - text of ASR-hypothesis\n", - " - texts of 10 candidates separated by semicolon\n", - " - 1-based ids of non-dummy candidates\n", - " - approximate start/end coordinates of non-dummy candidates (correspond to ids)\n", - "Note that candidate retrieval is done inside the function `get_candidates`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cJnusVfBRhRX" - }, - "outputs": [], - "source": [ - "out = open(\"spellmapper_input.txt\", \"w\", encoding=\"utf-8\")\n", - "letters = list(sent)\n", - "candidates = get_candidates(ngram2phrases, phrases, letters, big_sample)\n", - "# We add two columns with targets and span_info. \n", - "# They have same format as during training, but start and end positions are APPROXIMATE, they will be adjusted when constructing BertExample.\n", - "targets = []\n", - "span_info = []\n", - "for idx, c in enumerate(candidates):\n", - " if c[1] == -1:\n", - " continue\n", - " targets.append(str(idx + 1)) # targets are 1-based\n", - " start = c[1]\n", - " end = min(c[1] + c[2], len(letters)) # ensure that end is not outside sentence length (it can happen because c[2] is candidate length used as approximation)\n", - " span_info.append(\"CUSTOM \" + str(start) + \" \" + str(end))\n", - "\n", - "out.write(\" \".join(letters) + \"\\t\" + \";\".join([x[0] for x in candidates]) + \"\\t\" + \" \".join(targets) + \"\\t\" + \";\".join(span_info) + \"\\n\")\n", - "out.close()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Qpei5o89SmaU" - }, - "outputs": [], - "source": [ - "!cat spellmapper_input.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9rAmO15SS6go" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \\\n", - " pretrained_model=spellmapper_asr_customization_en/training_10m_5ep.nemo \\\n", - " model.max_sequence_len=512 \\\n", - " inference.from_file=spellmapper_input.txt \\\n", - " inference.out_file=spellmapper_output.txt \\\n", - " inference.batch_size=16 \\\n", - " lang=en\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wd2aq4T1N5cs" - }, - "source": [ - "Each line in SpellMapper output is tab-separated 
and consists of 4 columns:\n", - "1. ASR-hypothesis (same as in input)\n", - "2. 10 candidates separated with semicolon (same as in input)\n", - "3. fragment predictions, separated with semicolon, each prediction is a tuple (start, end, candidate_id, probability)\n", - "4. letter predictions - candidate_id predicted for each letter (this is only for debug purposes)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ravgEX8cTFty" - }, - "outputs": [], - "source": [ - "!cat spellmapper_output.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "az26364-PHb2" - }, - "source": [ - "We can use some utility functions to apply found replacements and get actual corrected text." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lPtFa_EhK8pb" - }, - "outputs": [], - "source": [ - "spellmapper_results = read_spellmapper_predictions(\"spellmapper_output.txt\")\n", - "text, replacements, _ = spellmapper_results[0]\n", - "corrected_text = apply_replacements_to_text(text, replacements, replace_hyphen_to_space=False)\n", - "print(\"Text before correction:\\n\", text)\n", - "print(\"Text after correction:\\n\", corrected_text)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "efF7O-D91FLX" - }, - "source": [ - "# Bigger customization example\n", - "\n", - "Let's test customization on more data. 
The plan is\n", - " * Get baseline ASR transcriptions by running TTS + ASR on some medical paper abstracts.\n", - " * Run SpellMapper inference and show how it can improve ASR results using custom vocabulary.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r_EFPnyDcXZt" - }, - "source": [ - "## Run TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "i9F5SBhmr8rk" - }, - "outputs": [], - "source": [ - "# create a folder for wav files (TTS output)\n", - "!rm -r audio\n", - "!mkdir audio" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JMbkNVt7YBAO" - }, - "outputs": [], - "source": [ - "if torch.cuda.is_available():\n", - " device = \"cuda\"\n", - "else:\n", - " device = \"cpu\"\n", - "\n", - "# Load FastPitch from HuggingFace\n", - "spectrogram_generator = FastPitchModel.from_pretrained(\"nvidia/tts_en_fastpitch\").eval().to(device)\n", - "# Load HifiGan vocoder from HuggingFace\n", - "vocoder = HifiGanModel.from_pretrained(model_name=\"nvidia/tts_hifigan\").eval().to(device)\n", - "\n", - "# Write sentences that we want to feed to TTS\n", - "with open(\"tts_input.txt\", \"w\", encoding=\"utf-8\") as out:\n", - " for sent, _ in sentences[0:100]:\n", - " out.write(sent + \"\\n\")\n", - "\n", - "out_manifest = open(\"manifest.json\", \"w\", encoding=\"utf-8\")\n", - "i = 0\n", - "with open(\"tts_input.txt\", \"r\", encoding=\"utf-8\") as inp:\n", - " for line in inp:\n", - " text = line.strip()\n", - " text_clean = CHARS_TO_IGNORE_REGEX.sub(\" \", text).lower() #replace all punctuation to space and convert to lowercase\n", - " text_clean = \" \".join(text_clean.split())\n", - "\n", - " parsed = spectrogram_generator.parse(text, normalize=True)\n", - "\n", - " spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)\n", - " audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)\n", - "\n", - " # Note that vocoder return a batch of audio. 
In this example, we just take the first and only sample.\n", - " filename = \"audio/\" + str(i) + \".wav\"\n", - " sf.write(filename, audio.to('cpu').detach().numpy()[0], 16000)\n", - " out_manifest.write(\n", - " \"{\\\"audio_filepath\\\": \\\"\" + filename + \"\\\", \\\"text\\\": \\\"\" + text_clean + \"\\\", \\\"orig_text\\\": \\\"\" + text + \"\\\"}\\n\"\n", - " )\n", - " i += 1\n", - "\n", - " # display some examples\n", - " if i < 10:\n", - " print(f'\"{text}\"\\n')\n", - " ipd.display(ipd.Audio(audio.to('cpu').detach(), rate=22050))\n", - "\n", - "out_manifest.close()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9T3CZcCAmxCz" - }, - "source": [ - "Now we have a folder with generated audios `audio/*.wav` and a nemo manifest with json records like `{\"audio_filepath\": \"audio/0.wav\", \"text\": \"no renal auditory or vestibular toxicity was observed\", \"orig_text\": \"No renal, auditory, or vestibular toxicity was observed.\"}`.", - "\n", - "Note that TTS model may mispronounce some unknown words, for example, abbreviations like `tRNAs`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pR_T1HnttVjm" - }, - "outputs": [], - "source": [ - "lines = []\n", - "with open(\"manifest.json\", \"r\", encoding=\"utf-8\") as f:\n", - " lines = f.readlines()\n", - "\n", - "for line in lines:\n", - " try:\n", - " data = json.loads(line.strip())\n", - " except:\n", - " print(line)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bt2TMLLvdUHm" - }, - "source": [ - "Free GPU memory to avoid OOM." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZwEpAOCaRH7s" - }, - "outputs": [], - "source": [ - "del spectrogram_generator\n", - "del vocoder\n", - "torch.cuda.empty_cache()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HrensakWdLkt" - }, - "source": [ - "## Run baseline ASR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IQNIo2M_mqJc" - }, - "source": [ - "Next we transcribe our .wav files with a general domain [ASR model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_conformer_ctc_large). It will generate an output file `ctc_baseline_transcript.json` where the predicted transcriptions are stored in the field `pred_text` of each record.\n", - "\n", - "Note that this ASR model was not trained or fine-tuned on medical domain, so we expect it to make mistakes on medical terms." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NMN63ux1mJiG" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/transcribe_speech.py \\\n", - " pretrained_name=\"stt_en_conformer_ctc_large\" \\\n", - " dataset_manifest=manifest.json \\\n", - " output_filename=ctc_baseline_transcript_tmp.json \\\n", - " batch_size=2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L3swQ8uqqgnp" - }, - "source": [ - "ATTENTION: SpellMapper relies on words to be separated by _single_ space\n", - "\n", - "There is a bug with multiple space, observed in ASR results produced by Conformer-CTC, probably connected to this issue: https://github.com/NVIDIA/NeMo/issues/4034.\n", - "\n", - "So we need to correct the manifests to ensure that all spaces are single." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "z17sxkmXrXpJ" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_baseline_transcript_tmp.json\")\n", - "\n", - "for i in range(len(test_data)):\n", - " # if there are multiple spaces in the string they will be merged to one\n", - " test_data[i][\"pred_text\"] = \" \".join(test_data[i][\"pred_text\"].split())\n", - "\n", - "with open(\"ctc_baseline_transcript.json\", \"w\", encoding=\"utf-8\") as out:\n", - " for d in test_data:\n", - " line = json.dumps(d)\n", - " out.write(line + \"\\n\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PuKtfhbVkVJY" - }, - "outputs": [], - "source": [ - "!head -n 4 ctc_baseline_transcript.json" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aCJw9NEXqRg8" - }, - "source": [ - "### Calculating WER of baseline transcript\n", - "We use the standard script from NeMo to calculate WER and CER of our baseline transcript. Internally it compares the text in `pred_text` (predicted transcript) to `text` (reference transcript). " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZmNEGVWQsGo2" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_baseline_transcript.json \\\n", - " only_score_manifest=True\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AvPwJr0ZqdkN" - }, - "source": [ - "### See fragments that differ\n", - "We use SequenceMatcher to see fragments that differ. 
(Another option is to use a more powerful analytics tool [Speech Data Explorer](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tools/speech_data_explorer.html))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RAeaVCpMv78y" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_baseline_transcript.json\")\n", - "pred_text = [data['pred_text'] for data in test_data]\n", - "ref_text = [data['text'] for data in test_data]\n", - "audio_filepath = [data['audio_filepath'] for data in test_data]\n", - "\n", - "diff_vocab = Counter()\n", - "\n", - "for i in range(len(test_data)):\n", - " ref_sent = \" \" + ref_text[i] + \" \"\n", - " pred_sent = \" \" + pred_text[i] + \" \"\n", - "\n", - " pred_words = pred_sent.strip().split()\n", - " ref_words = ref_sent.strip().split()\n", - "\n", - " for tag, hyp_fragment, ref_fragment, i1, i2, j1, j2 in get_fragments(pred_words, ref_words):\n", - " if tag != \"equal\":\n", - " diff_vocab[(tag, hyp_fragment, ref_fragment)] += 1\n", - "\n", - "sum_ = 0\n", - "print(\"PRED vs REF\")\n", - "for k, v in diff_vocab.most_common(1000000):\n", - " sum_ += v\n", - " print(k, v, \"sum=\", sum_)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dUSOF7iD1w_9" - }, - "source": [ - "## Run SpellMapper" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "x39BQhYB6_Fr" - }, - "source": [ - "Now we run retrieval on our input manifest and prepare input for SpellMapper inference. Note that we use index of custom vocabulary (file `index.txt` that we saved earlier)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y8x-yT5WqfFz" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py \\\n", - " --manifest ctc_baseline_transcript.json \\\n", - " --custom_vocab_index index.txt \\\n", - " --big_sample spellmapper_asr_customization_en/big_sample.txt \\\n", - " --short2full_name short2full.txt \\\n", - " --output_name spellmapper_input.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ueq_JAPWGs_Y" - }, - "source": [ - "Run the inference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zgkqiiZtJjcB" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \\\n", - " pretrained_model=spellmapper_asr_customization_en/training_10m_5ep.nemo \\\n", - " model.max_sequence_len=512 \\\n", - " inference.from_file=spellmapper_input.txt \\\n", - " inference.out_file=spellmapper_output.txt \\\n", - " inference.batch_size=16 \\\n", - " lang=en\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RPQWJX8dFLfX" - }, - "source": [ - "Now we postprocess SpellMapper output and create output corrected manifest." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3eFU515yKvXP" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \\\n", - " --input_manifest ctc_baseline_transcript.json \\\n", - " --short2full_name short2full.txt \\\n", - " --output_manifest ctc_corrected_transcript.json \\\n", - " --spellmapper_result spellmapper_output.txt \\\n", - " --replace_hyphen_to_space \\\n", - " --field_name pred_text \\\n", - " --ngram_mappings \"\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hRoIhhGh17tp" - }, - "source": [ - "### Calculating WER of corrected transcript." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qIT957bGo9AY" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_corrected_transcript.json \\\n", - " only_score_manifest=True\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NYXIPusupqOQ" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_corrected_transcript.json\")\n", - "pred_text = [data['pred_text'] for data in test_data]\n", - "ref_text = [data['pred_text_before_correction'] for data in test_data]\n", - "\n", - "diff_vocab = Counter()\n", - "\n", - "for i in range(len(test_data)):\n", - " ref_sent = \" \" + ref_text[i] + \" \"\n", - " pred_sent = \" \" + pred_text[i] + \" \"\n", - "\n", - " pred_words = pred_sent.strip().split()\n", - " ref_words = ref_sent.strip().split()\n", - "\n", - " for tag, hyp_fragment, ref_fragment, i1, i2, j1, j2 in get_fragments(pred_words, ref_words):\n", - " if tag != \"equal\":\n", - " diff_vocab[(tag, hyp_fragment, ref_fragment)] += 1\n", - "\n", - "sum_ = 0\n", - "print(\"Corrected vs baseline\")\n", - "for k, v in diff_vocab.most_common(1000000):\n", - " sum_ += v\n", - " print(k, v, \"sum=\", sum_)\n" - ] - }, - { - 
"cell_type": "markdown", - "metadata": { - "id": "DJtXlqXbTD6M" - }, - "source": [ - "### Filtering by Dynamic Programming(DP) score\n", - "\n", - "What else can be done?\n", - "Given a fragment and its potential replacement, we can apply **dynamic programming** to find the most probable \"translation\" path between them. We will use the same n-gram mapping vocabulary, because its frequencies give us \"translation probability\" of each n-gram pair. The final path score can be calculated as maximum sum of log probabilities of matching n-grams along this path.\n", - "Let's look at an example. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "05Qf9wgHU_UR" - }, - "outputs": [], - "source": [ - "joint_vocab, orig_vocab, misspelled_vocab, max_len = load_ngram_mappings_for_dp(\"spellmapper_asr_customization_en/replacement_vocab_filt.txt\")\n", - "\n", - "fragment = \"and hydrod\"\n", - "replacement = \"anhydride\"\n", - "fragment_spaced = \" \".join(list(fragment.replace(\" \", \"_\")))\n", - "replacement_spaced = \" \".join(list(replacement.replace(\" \", \"_\")))\n", - "path = get_alignment_by_dp(\n", - " replacement_spaced,\n", - " fragment_spaced,\n", - " dp_data=(joint_vocab, orig_vocab, misspelled_vocab, max_len)\n", - ")\n", - "print(\"Dynamic Programming path:\")\n", - "for fragment_ngram, replacement_ngram, score, sum_score, joint_freq, orig_freq, misspelled_freq in path:\n", - " print(\n", - " \"\\t\",\n", - " \"frag=\",\n", - " fragment_ngram,\n", - " \"; repl=\",\n", - " replacement_ngram,\n", - " \"; score=\",\n", - " score,\n", - " \"; sum_score=\",\n", - " sum_score,\n", - " \"; joint_freq=\",\n", - " joint_freq,\n", - " \"; orig_freq=\",\n", - " orig_freq,\n", - " \"; misspelled_freq=\",\n", - " misspelled_freq,\n", - " )\n", - "\n", - "print(\"Final path score is in path[-1][3]: \", path[-1][3])\n", - "print(\"Dynamic programming(DP) score per symbol is final score divided by len(fragment): \", path[-1][3] / 
(len(fragment)))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hgfKPKckaLnc" - }, - "source": [ - "The idea is that we can skip replacements whose average DP score per symbol is below some predefined minimum, say -1.5.\n", - "Note that dynamic programming works slow because of quadratic complexity, but it allows to get rid of some false positives. Let's apply it on the same test set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UhSXh7ht_JRn" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \\\n", - " --input_manifest ctc_baseline_transcript.json \\\n", - " --short2full_name short2full.txt \\\n", - " --output_manifest ctc_corrected_transcript_dp.json \\\n", - " --spellmapper_result spellmapper_output.txt \\\n", - " --replace_hyphen_to_space \\\n", - " --field_name pred_text \\\n", - " --use_dp \\\n", - " --ngram_mappings spellmapper_asr_customization_en/replacement_vocab_filt.txt \\\n", - " --min_dp_score_per_symbol -1.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "u8R5YHB3vPC8" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_corrected_transcript_dp.json \\\n", - " only_score_manifest=True" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "upvTbkFAeYtR" - }, - "source": [ - "# Final notes\n", - "1. Bash-script with example of inference pipeline [run_infer.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_infer.sh)\n", - "\n", - "2. Check our paper: [SpellMapper: A non-autoregressive neural spellchecker for ASR customization with candidate retrieval based on n-gram mappings](https://arxiv.org/abs/2306.02317)\n", - "\n", - "3. 
To reproduce evaluation experiments from this paper see these scripts:\n", - " - [test_on_kensho.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - " - [test_on_userlibri.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - " - [test_on_spoken_wikipedia.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - "\n", - "4. To reproduce creation of training data see [README.md](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/README.md)\n", - "\n", - "5. To run training see [run_training.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_training.sh)\n", - "\n", - "6. Promising future research directions would be:\n", - " - add a simple trainable classifier on top of SpellMapper predictions instead of using multiple thresholds\n", - " - retrain with adding more various false positives to the training data" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -}